src/5013011/orig/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89
  90 #ifdef op
  91 #undef op
  92 #endif /* op */
  93
  94 #ifdef MSDOS
  95 #  if defined(BUGGY_MSC6)
  96  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  97 #    pragma optimize("a",off)
  98  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  99 #    pragma optimize("w",on )
 100 #  endif /* BUGGY_MSC6 */
 101 #endif /* MSDOS */
 102
 103 #ifndef STATIC
 104 #define STATIC  static
 105 #endif
 106
 107 typedef struct RExC_state_t {
 108     U32         flags;                  /* are we folding, multilining? */
 109     char        *precomp;               /* uncompiled string. */
 110     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 111     regexp      *rx;                    /* perl core regexp structure */
 112     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 113     char        *start;                 /* Start of input for compile */
 114     char        *end;                   /* End of input for compile */
 115     char        *parse;                 /* Input-scan pointer. */
 116     I32         whilem_seen;            /* number of WHILEM in this expr */
 117     regnode     *emit_start;            /* Start of emitted-code area */
 118     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 120     I32         naughty;                /* How bad is this pattern? */
 121     I32         sawback;                /* Did we see \1, ...? */
 122     U32         seen;
 123     I32         size;                   /* Code size. */
 124     I32         npar;                   /* Capture buffer count, (OPEN). */
 125     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 126     I32         nestroot;               /* root parens we are in - used by accept */
 127     I32         extralen;
 128     I32         seen_zerolen;
 129     I32         seen_evals;
 130     regnode     **open_parens;          /* pointers to open parens */
 131     regnode     **close_parens;         /* pointers to close parens */
 132     regnode     *opend;                 /* END node in program */
 133     I32         utf8;           /* whether the pattern is utf8 or not */
 134     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 135                                 /* XXX use this for future optimisation of case
 136                                  * where pattern must be upgraded to utf8. */
 137     I32         uni_semantics;  /* If a d charset modifier should use unicode
 138                                    rules, even if the pattern is not in
 139                                    utf8 */
 140     HV          *paren_names;           /* Paren names */
 141
 142     regnode     **recurse;              /* Recurse regops */
 143     I32         recurse_count;          /* Number of recurse regops */
 144     I32         in_lookbehind;
 145     I32         contains_locale;
 146 #if ADD_TO_REGEXEC
 147     char        *starttry;              /* -Dr: where regtry was called. */
 148 #define RExC_starttry   (pRExC_state->starttry)
 149 #endif
 150 #ifdef DEBUGGING
 151     const char  *lastparse;
 152     I32         lastnum;
 153     AV          *paren_name_list;       /* idx -> name */
 154 #define RExC_lastparse  (pRExC_state->lastparse)
 155 #define RExC_lastnum    (pRExC_state->lastnum)
 156 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 157 #endif
 158 } RExC_state_t;
 159
 160 #define RExC_flags      (pRExC_state->flags)
 161 #define RExC_precomp    (pRExC_state->precomp)
 162 #define RExC_rx_sv      (pRExC_state->rx_sv)
 163 #define RExC_rx         (pRExC_state->rx)
 164 #define RExC_rxi        (pRExC_state->rxi)
 165 #define RExC_start      (pRExC_state->start)
 166 #define RExC_end        (pRExC_state->end)
 167 #define RExC_parse      (pRExC_state->parse)
 168 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 169 #ifdef RE_TRACK_PATTERN_OFFSETS
 170 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 171 #endif
 172 #define RExC_emit       (pRExC_state->emit)
 173 #define RExC_emit_start (pRExC_state->emit_start)
 174 #define RExC_emit_bound (pRExC_state->emit_bound)
 175 #define RExC_naughty    (pRExC_state->naughty)
 176 #define RExC_sawback    (pRExC_state->sawback)
 177 #define RExC_seen       (pRExC_state->seen)
 178 #define RExC_size       (pRExC_state->size)
 179 #define RExC_npar       (pRExC_state->npar)
 180 #define RExC_nestroot   (pRExC_state->nestroot)
 181 #define RExC_extralen   (pRExC_state->extralen)
 182 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 183 #define RExC_seen_evals (pRExC_state->seen_evals)
 184 #define RExC_utf8       (pRExC_state->utf8)
 185 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 186 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 187 #define RExC_open_parens        (pRExC_state->open_parens)
 188 #define RExC_close_parens       (pRExC_state->close_parens)
 189 #define RExC_opend      (pRExC_state->opend)
 190 #define RExC_paren_names        (pRExC_state->paren_names)
 191 #define RExC_recurse    (pRExC_state->recurse)
 192 #define RExC_recurse_count      (pRExC_state->recurse_count)
 193 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 194 #define RExC_contains_locale    (pRExC_state->contains_locale)
 195
 196
 197 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 198 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 199         ((*s) == '{' && regcurly(s)))
 200
 201 #ifdef SPSTART
 202 #undef SPSTART          /* dratted cpp namespace... */
 203 #endif
 204 /*
 205  * Flags to be passed up and down.
 206  */
 207 #define WORST           0       /* Worst case. */
 208 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 209
 210 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 211  * character, and if utf8, must be invariant.  Note that this is not the same thing as REGNODE_SIMPLE */
 212 #define SIMPLE          0x02
 213 #define SPSTART         0x04    /* Starts with * or +. */
 214 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 215 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 216
 217 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 218
 219 /* whether trie related optimizations are enabled */
 220 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 221 #define TRIE_STUDY_OPT
 222 #define FULL_TRIE_STUDY
 223 #define TRIE_STCLASS
 224 #endif
 225
 226
 227
 228 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 229 #define PBITVAL(paren) (1 << ((paren) & 7))
 230 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 231 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 232 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 233
 234 /* If not already in utf8, do a longjmp back to the beginning */
 235 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 236 #define REQUIRE_UTF8    STMT_START {                                       \
 237                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 238                         } STMT_END
 239
 240 /* About scan_data_t.
 241
 242   During optimisation we recurse through the regexp program performing
 243   various inplace (keyhole style) optimisations. In addition study_chunk
 244   and scan_commit populate this data structure with information about
 245   what strings MUST appear in the pattern. We look for the longest
 246   string that must appear at a fixed location, and we look for the
 247   longest string that may appear at a floating location. So for instance
 248   in the pattern:
 249
 250     /FOO[xX]A.*B[xX]BAR/
 251
 252   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 253   strings (because they follow a .* construct). study_chunk will identify
 254   both FOO and BAR as being the longest fixed and floating strings respectively.
 255
 256   The strings can be composites, for instance
 257
 258      /(f)(o)(o)/
 259
 260   will result in a composite fixed substring 'foo'.
 261
 262   For each string some basic information is maintained:
 263
 264   - offset or min_offset
 265     This is the position the string must appear at, or not before.
 266     It also implicitly (when combined with minlenp) tells us how many
 267     characters must match before the string we are searching for.
 268     Likewise when combined with minlenp and the length of the string it
 269     tells us how many characters must appear after the string we have
 270     found.
 271
 272   - max_offset
 273     Only used for floating strings. This is the rightmost point that
 274     the string can appear at. If set to I32 max it indicates that the
 275     string can occur infinitely far to the right.
 276
 277   - minlenp
 278     A pointer to the minimum length of the pattern that the string
 279     was found inside. This is important as in the case of positive
 280     lookahead or positive lookbehind we can have multiple patterns
 281     involved. Consider
 282
 283     /(?=FOO).*F/
 284
 285     The minimum length of the pattern overall is 3, the minimum length
 286     of the lookahead part is 3, but the minimum length of the part that
 287     will actually match is 1. So 'FOO's minimum length is 3, but the
 288     minimum length for the F is 1. This is important as the minimum length
 289     is used to determine offsets in front of and behind the string being
 290     looked for.  Since strings can be composites this is the length of the
 291     pattern at the time it was committed with a scan_commit. Note that
 292     the length is calculated by study_chunk, so that the minimum lengths
 293     are not known until the full pattern has been compiled, thus the
 294     pointer to the value.
 295
 296   - lookbehind
 297
 298     In the case of lookbehind the string being searched for can be
 299     offset past the start point of the final matching string.
 300     If this value was just blithely removed from the min_offset it would
 301     invalidate some of the calculations for how many chars must match
 302     before or after (as they are derived from min_offset and minlen and
 303     the length of the string being searched for).
 304     When the final pattern is compiled and the data is moved from the
 305     scan_data_t structure into the regexp structure the information
 306     about lookbehind is factored in, with the information that would
 307     have been lost precalculated in the end_shift field for the
 308     associated string.
 309
 310   The fields pos_min and pos_delta are used to store the minimum offset
 311   and the delta to the maximum offset at the current point in the pattern.
 312
 313 */
 314
 315 typedef struct scan_data_t {
 316     /*I32 len_min;      unused */
 317     /*I32 len_delta;    unused */
 318     I32 pos_min;
 319     I32 pos_delta;
 320     SV *last_found;
 321     I32 last_end;           /* min value, <0 unless valid. */
 322     I32 last_start_min;
 323     I32 last_start_max;
 324     SV **longest;           /* Either &l_fixed, or &l_float. */
 325     SV *longest_fixed;      /* longest fixed string found in pattern */
 326     I32 offset_fixed;       /* offset where it starts */
 327     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 328     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 329     SV *longest_float;      /* longest floating string found in pattern */
 330     I32 offset_float_min;   /* earliest point in string it can appear */
 331     I32 offset_float_max;   /* latest point in string it can appear */
 332     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 333     I32 lookbehind_float;   /* is the position of the string modified by LB */
 334     I32 flags;
 335     I32 whilem_c;
 336     I32 *last_closep;
 337     struct regnode_charclass_class *start_class;
 338 } scan_data_t;
 339
 340 /*
 341  * Forward declarations for pregcomp()'s friends.
 342  */
 343
 344 static const scan_data_t zero_scan_data =
 345   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 346
 347 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 348 #define SF_BEFORE_SEOL          0x0001
 349 #define SF_BEFORE_MEOL          0x0002
 350 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 351 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 352
 353 #ifdef NO_UNARY_PLUS
 354 #  define SF_FIX_SHIFT_EOL      (0+2)
 355 #  define SF_FL_SHIFT_EOL               (0+4)
 356 #else
 357 #  define SF_FIX_SHIFT_EOL      (+2)
 358 #  define SF_FL_SHIFT_EOL               (+4)
 359 #endif
 360
 361 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 362 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 363
 364 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 365 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 366 #define SF_IS_INF               0x0040
 367 #define SF_HAS_PAR              0x0080
 368 #define SF_IN_PAR               0x0100
 369 #define SF_HAS_EVAL             0x0200
 370 #define SCF_DO_SUBSTR           0x0400
 371 #define SCF_DO_STCLASS_AND      0x0800
 372 #define SCF_DO_STCLASS_OR       0x1000
 373 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 374 #define SCF_WHILEM_VISITED_POS  0x2000
 375
 376 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 377 #define SCF_SEEN_ACCEPT         0x8000
 378
 379 #define UTF cBOOL(RExC_utf8)
 380 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 381 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 382 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 383 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 384 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 385 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 386 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 387
 388 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 389
 390 #define OOB_UNICODE             12345678
 391 #define OOB_NAMEDCLASS          -1
 392
 393 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 394 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 395
 396
 397 /* length of regex to show in messages that don't mark a position within */
 398 #define RegexLengthToShowInErrorMessages 127
 399
 400 /*
 401  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 402  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 403  * op/pragma/warn/regcomp.
 404  */
 405 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 406 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 407
 408 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 409
 410 /*
 411  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 412  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 413  * "...".
 414  */
 415 #define _FAIL(code) STMT_START {                                        \
 416     const char *ellipses = "";                                          \
 417     IV len = RExC_end - RExC_precomp;                                   \
 418                                                                         \
 419     if (!SIZE_ONLY)                                                     \
 420         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 421     if (len > RegexLengthToShowInErrorMessages) {                       \
 422         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 423         len = RegexLengthToShowInErrorMessages - 10;                    \
 424         ellipses = "...";                                               \
 425     }                                                                   \
 426     code;                                                               \
 427 } STMT_END
 428
 429 #define FAIL(msg) _FAIL(                            \
 430     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 431             msg, (int)len, RExC_precomp, ellipses))
 432
 433 #define FAIL2(msg,arg) _FAIL(                       \
 434     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 435             arg, (int)len, RExC_precomp, ellipses))
 436
 437 /*
 438  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 439  */
 440 #define Simple_vFAIL(m) STMT_START {                                    \
 441     const IV offset = RExC_parse - RExC_precomp;                        \
 442     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 443             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 444 } STMT_END
 445
 446 /*
 447  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 448  */
 449 #define vFAIL(m) STMT_START {                           \
 450     if (!SIZE_ONLY)                                     \
 451         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 452     Simple_vFAIL(m);                                    \
 453 } STMT_END
 454
 455 /*
 456  * Like Simple_vFAIL(), but accepts two arguments.
 457  */
 458 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 459     const IV offset = RExC_parse - RExC_precomp;                        \
 460     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 461             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 462 } STMT_END
 463
 464 /*
 465  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 466  */
 467 #define vFAIL2(m,a1) STMT_START {                       \
 468     if (!SIZE_ONLY)                                     \
 469         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 470     Simple_vFAIL2(m, a1);                               \
 471 } STMT_END
 472
 473
 474 /*
 475  * Like Simple_vFAIL(), but accepts three arguments.
 476  */
 477 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 478     const IV offset = RExC_parse - RExC_precomp;                \
 479     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 480             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 481 } STMT_END
 482
 483 /*
 484  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 485  */
 486 #define vFAIL3(m,a1,a2) STMT_START {                    \
 487     if (!SIZE_ONLY)                                     \
 488         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 489     Simple_vFAIL3(m, a1, a2);                           \
 490 } STMT_END
 491
 492 /*
 493  * Like Simple_vFAIL(), but accepts four arguments.
 494  */
 495 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 496     const IV offset = RExC_parse - RExC_precomp;                \
 497     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 498             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 499 } STMT_END
 500
 501 #define ckWARNreg(loc,m) STMT_START {                                   \
 502     const IV offset = loc - RExC_precomp;                               \
 503     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 504             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 505 } STMT_END
 506
 507 #define ckWARNregdep(loc,m) STMT_START {                                \
 508     const IV offset = loc - RExC_precomp;                               \
 509     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 510             m REPORT_LOCATION,                                          \
 511             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 512 } STMT_END
 513
 514 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 515     const IV offset = loc - RExC_precomp;                               \
 516     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 517             m REPORT_LOCATION,                                          \
 518             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 519 } STMT_END
 520
 521 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 522     const IV offset = loc - RExC_precomp;                               \
 523     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 524             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 525 } STMT_END
 526
 527 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 528     const IV offset = loc - RExC_precomp;                               \
 529     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 530             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 531 } STMT_END
 532
 533 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 534     const IV offset = loc - RExC_precomp;                               \
 535     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 536             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 537 } STMT_END
 538
 539 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 540     const IV offset = loc - RExC_precomp;                               \
 541     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 542             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 543 } STMT_END
 544
 545 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 546     const IV offset = loc - RExC_precomp;                               \
 547     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 548             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 549 } STMT_END
 550
 551 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 552     const IV offset = loc - RExC_precomp;                               \
 553     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 554             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 555 } STMT_END
 556
 557
 558 /* Allow for side effects in s */
 559 #define REGC(c,s) STMT_START {                  \
 560     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 561 } STMT_END
 562
 563 /* Macros for recording node offsets.   20001227 mjd@plover.com
 564  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 565  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 566  * Element 0 holds the number n.
 567  * Position is 1 indexed.
 568  */
 569 #ifndef RE_TRACK_PATTERN_OFFSETS
 570 #define Set_Node_Offset_To_R(node,byte)
 571 #define Set_Node_Offset(node,byte)
 572 #define Set_Cur_Node_Offset
 573 #define Set_Node_Length_To_R(node,len)
 574 #define Set_Node_Length(node,len)
 575 #define Set_Node_Cur_Length(node)
 576 #define Node_Offset(n)
 577 #define Node_Length(n)
 578 #define Set_Node_Offset_Length(node,offset,len)
 579 #define ProgLen(ri) ri->u.proglen
 580 #define SetProgLen(ri,x) ri->u.proglen = x
 581 #else
 582 #define ProgLen(ri) ri->u.offsets[0]
 583 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 584 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 585     if (! SIZE_ONLY) {                                                  \
 586         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 587                     __LINE__, (int)(node), (int)(byte)));               \
 588         if((node) < 0) {                                                \
 589             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 590         } else {                                                        \
 591             RExC_offsets[2*(node)-1] = (byte);                          \
 592         }                                                               \
 593     }                                                                   \
 594 } STMT_END
 595
 596 #define Set_Node_Offset(node,byte) \
 597     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 598 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 599
 600 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 601     if (! SIZE_ONLY) {                                                  \
 602         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 603                 __LINE__, (int)(node), (int)(len)));                    \
 604         if((node) < 0) {                                                \
 605             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 606         } else {                                                        \
 607             RExC_offsets[2*(node)] = (len);                             \
 608         }                                                               \
 609     }                                                                   \
 610 } STMT_END
 611
 612 #define Set_Node_Length(node,len) \
 613     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 614 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 615 #define Set_Node_Cur_Length(node) \
 616     Set_Node_Length(node, RExC_parse - parse_start)
 617
 618 /* Get offsets and lengths */
 619 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 620 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 621
 622 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 623     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 624     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 625 } STMT_END
 626 #endif
 627
 628 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 629 #define EXPERIMENTAL_INPLACESCAN
 630 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 631
 632 #define DEBUG_STUDYDATA(str,data,depth)                              \
 633 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 634     PerlIO_printf(Perl_debug_log,                                    \
 635         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 636         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 637         (int)(depth)*2, "",                                          \
 638         (IV)((data)->pos_min),                                       \
 639         (IV)((data)->pos_delta),                                     \
 640         (UV)((data)->flags),                                         \
 641         (IV)((data)->whilem_c),                                      \
 642         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 643         is_inf ? "INF " : ""                                         \
 644     );                                                               \
 645     if ((data)->last_found)                                          \
 646         PerlIO_printf(Perl_debug_log,                                \
 647             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 648             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 649             SvPVX_const((data)->last_found),                         \
 650             (IV)((data)->last_end),                                  \
 651             (IV)((data)->last_start_min),                            \
 652             (IV)((data)->last_start_max),                            \
 653             ((data)->longest &&                                      \
 654              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 655             SvPVX_const((data)->longest_fixed),                      \
 656             (IV)((data)->offset_fixed),                              \
 657             ((data)->longest &&                                      \
 658              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 659             SvPVX_const((data)->longest_float),                      \
 660             (IV)((data)->offset_float_min),                          \
 661             (IV)((data)->offset_float_max)                           \
 662         );                                                           \
 663     PerlIO_printf(Perl_debug_log,"\n");                              \
 664 });
 665
 666 static void clear_re(pTHX_ void *r);
 667
 668 /* Mark that we cannot extend a found fixed substring at this point.
 669    Update the longest found anchored substring and the longest found
 670    floating substrings if needed. */
 671
 672 STATIC void
 673 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 674 {
 675     const STRLEN l = CHR_SVLEN(data->last_found);
 676     const STRLEN old_l = CHR_SVLEN(*data->longest);
 677     GET_RE_DEBUG_FLAGS_DECL;
 678
 679     PERL_ARGS_ASSERT_SCAN_COMMIT;
 680
 681     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 682         SvSetMagicSV(*data->longest, data->last_found);
 683         if (*data->longest == data->longest_fixed) {
 684             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 685             if (data->flags & SF_BEFORE_EOL)
 686                 data->flags
 687                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 688             else
 689                 data->flags &= ~SF_FIX_BEFORE_EOL;
 690             data->minlen_fixed=minlenp;
 691             data->lookbehind_fixed=0;
 692         }
 693         else { /* *data->longest == data->longest_float */
 694             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 695             data->offset_float_max = (l
 696                                       ? data->last_start_max
 697                                       : data->pos_min + data->pos_delta);
 698             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 699                 data->offset_float_max = I32_MAX;
 700             if (data->flags & SF_BEFORE_EOL)
 701                 data->flags
 702                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 703             else
 704                 data->flags &= ~SF_FL_BEFORE_EOL;
 705             data->minlen_float=minlenp;
 706             data->lookbehind_float=0;
 707         }
 708     }
 709     SvCUR_set(data->last_found, 0);
 710     {
 711         SV * const sv = data->last_found;
 712         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 713             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 714             if (mg)
 715                 mg->mg_len = 0;
 716         }
 717     }
 718     data->last_end = -1;
 719     data->flags &= ~SF_BEFORE_EOL;
 720     DEBUG_STUDYDATA("commit: ",data,0);
 721 }
 722
 723 /* Can match anything (initialization) */
 724 STATIC void
 725 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 726 {
 727     PERL_ARGS_ASSERT_CL_ANYTHING;
 728
 729     ANYOF_BITMAP_SETALL(cl);
 730     cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 731                 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL
 732                     /* Even though no bitmap is in use here, we need to set
 733                      * the flag below so an AND with a node that does have one
 734                      * doesn't lose that one.  The flag should get cleared if
 735                      * the other one doesn't; and the code in regexec.c is
 736                      * structured so this being set when not needed does no
 737                      * harm.  It seemed a little cleaner to set it here than do
 738                      * a special case in cl_and() */
 739                 |ANYOF_NONBITMAP_NON_UTF8;
 740
 741     /* If any portion of the regex is to operate under locale rules,
 742      * initialization includes it.  The reason this isn't done for all regexes
 743      * is that the optimizer was written under the assumption that locale was
 744      * all-or-nothing.  Given the complexity and lack of documentation in the
 745      * optimizer, and that there are inadequate test cases for locale, so many
 746      * parts of it may not work properly, it is safest to avoid locale unless
 747      * necessary. */
 748     if (RExC_contains_locale) {
 749         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 750         cl->flags |= ANYOF_LOCALE;
 751     }
 752     else {
 753         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 754     }
 755 }
 756
 757 /* Can match anything (initialization) */
 758 STATIC int
 759 S_cl_is_anything(const struct regnode_charclass_class *cl)
 760 {
 761     int value;
 762
 763     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 764
 765     for (value = 0; value <= ANYOF_MAX; value += 2)
 766         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 767             return 1;
 768     if (!(cl->flags & ANYOF_UNICODE_ALL))
 769         return 0;
 770     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 771         return 0;
 772     return 1;
 773 }
 774
 775 /* Can match anything (initialization) */
 776 STATIC void
 777 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 778 {
 779     PERL_ARGS_ASSERT_CL_INIT;
 780
 781     Zero(cl, 1, struct regnode_charclass_class);
 782     cl->type = ANYOF;
 783     cl_anything(pRExC_state, cl);
 784     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 785 }
 786
 787 /* These two functions currently do the exact same thing */
 788 #define cl_init_zero            S_cl_init
 789
 790 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 791  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 792  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 793 STATIC void
 794 S_cl_and(struct regnode_charclass_class *cl,
 795         const struct regnode_charclass_class *and_with)
 796 {
 797     PERL_ARGS_ASSERT_CL_AND;
 798
 799     assert(and_with->type == ANYOF);
 800
 801     /* I (khw) am not sure all these restrictions are necessary XXX */
 802     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 803         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 804         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 805         && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 806         && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 807         int i;
 808
 809         if (and_with->flags & ANYOF_INVERT)
 810             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 811                 cl->bitmap[i] &= ~and_with->bitmap[i];
 812         else
 813             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 814                 cl->bitmap[i] &= and_with->bitmap[i];
 815     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 816
 817     if (and_with->flags & ANYOF_INVERT) {
 818
 819         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 820          * aren't affected by the inversion.  Those that are affected are
 821          * handled individually below */
 822         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 823         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 824         cl->flags |= affected_flags;
 825
 826         /* We currently don't know how to deal with things that aren't in the
 827          * bitmap, but we know that the intersection is no greater than what
 828          * is already in cl, so let there be false positives that get sorted
 829          * out after the synthetic start class succeeds, and the node is
 830          * matched for real. */
 831
 832         /* The inversion of these two flags indicate that the resulting
 833          * intersection doesn't have them */
 834         if (and_with->flags & ANYOF_UNICODE_ALL) {
 835             cl->flags &= ~ANYOF_UNICODE_ALL;
 836         }
 837         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 838             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 839         }
 840     }
 841     else {   /* and'd node is not inverted */
 842         if (! ANYOF_NONBITMAP(and_with)) {
 843
 844             /* Here 'and_with' doesn't match anything outside the bitmap
 845              * (except possibly ANYOF_UNICODE_ALL), which means the
 846              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 847              * which case we don't know what the intersection is, but it's no
 848              * greater than what cl already has, so can just leave it alone,
 849              * with possible false positives */
 850             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 851                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 852                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 853             }
 854         }
 855         else if (! ANYOF_NONBITMAP(cl)) {
 856
 857             /* Here, 'and_with' does match something outside the bitmap, and cl
 858              * doesn't have a list of things to match outside the bitmap.  If
 859              * cl can match all code points above 255, the intersection will
 860              * be those above-255 code points that 'and_with' matches.  There
 861              * may be false positives from code points in 'and_with' that are
 862              * outside the bitmap but below 256, but those get sorted out
 863              * after the synthetic start class succeeds).  If cl can't match
 864              * all Unicode code points, it means here that it can't match *
 865              * anything outside the bitmap, so we leave the bitmap empty */
 866             if (cl->flags & ANYOF_UNICODE_ALL) {
 867                 ARG_SET(cl, ARG(and_with));
 868             }
 869         }
 870         else {
 871             /* Here, both 'and_with' and cl match something outside the
 872              * bitmap.  Currently we do not do the intersection, so just match
 873              * whatever cl had at the beginning.  */
 874         }
 875
 876
 877         /* Take the intersection of the two sets of flags */
 878         cl->flags &= and_with->flags;
 879     }
 880 }
 881
 882 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 883  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 884  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 885 STATIC void
 886 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 887 {
 888     PERL_ARGS_ASSERT_CL_OR;
 889
 890     if (or_with->flags & ANYOF_INVERT) {
 891
 892         /* Here, the or'd node is to be inverted.  This means we take the
 893          * complement of everything not in the bitmap, but currently we don't
 894          * know what that is, so give up and match anything */
 895         if (ANYOF_NONBITMAP(or_with)) {
 896             cl_anything(pRExC_state, cl);
 897         }
 898         /* We do not use
 899          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 900          *   <= (B1 | !B2) | (CL1 | !CL2)
 901          * which is wasteful if CL2 is small, but we ignore CL2:
 902          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 903          * XXXX Can we handle case-fold?  Unclear:
 904          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 905          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 906          */
 907         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 908              && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 909              && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 910             int i;
 911
 912             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 913                 cl->bitmap[i] |= ~or_with->bitmap[i];
 914         } /* XXXX: logic is complicated otherwise */
 915         else {
 916             cl_anything(pRExC_state, cl);
 917         }
 918
 919         /* And, we can just take the union of the flags that aren't affected
 920          * by the inversion */
 921         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 922
 923         /* For the remaining flags:
 924             ANYOF_UNICODE_ALL and inverted means to not match anything above
 925                     255, which means that the union with cl should just be
 926                     what cl has in it, so can ignore this flag
 927             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 928                     is 127-255 to match them, but then invert that, so the
 929                     union with cl should just be what cl has in it, so can
 930                     ignore this flag
 931          */
 932     } else {    /* 'or_with' is not inverted */
 933         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 934         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 935              && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 936                  || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 937             int i;
 938
 939             /* OR char bitmap and class bitmap separately */
 940             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 941                 cl->bitmap[i] |= or_with->bitmap[i];
 942             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 943                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 944                     cl->classflags[i] |= or_with->classflags[i];
 945                 cl->flags |= ANYOF_CLASS;
 946             }
 947         }
 948         else { /* XXXX: logic is complicated, leave it along for a moment. */
 949             cl_anything(pRExC_state, cl);
 950         }
 951
 952         if (ANYOF_NONBITMAP(or_with)) {
 953
 954             /* Use the added node's outside-the-bit-map match if there isn't a
 955              * conflict.  If there is a conflict (both nodes match something
 956              * outside the bitmap, but what they match outside is not the same
 957              * pointer, and hence not easily compared until XXX we extend
 958              * inversion lists this far), give up and allow the start class to
 959              * match everything outside the bitmap.  If that stuff is all above
 960              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
 961             if (! ANYOF_NONBITMAP(cl)) {
 962                 ARG_SET(cl, ARG(or_with));
 963             }
 964             else if (ARG(cl) != ARG(or_with)) {
 965
 966                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
 967                     cl_anything(pRExC_state, cl);
 968                 }
 969                 else {
 970                     cl->flags |= ANYOF_UNICODE_ALL;
 971                 }
 972             }
 973
 974         /* Take the union */
 975         cl->flags |= or_with->flags;
 976         }
 977     }
 978 }
 979
 980 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
 981 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
 982 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
 983 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
 984
 985
 986 #ifdef DEBUGGING
 987 /*
 988    dump_trie(trie,widecharmap,revcharmap)
 989    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
 990    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
 991
 992    These routines dump out a trie in a somewhat readable format.
 993    The _interim_ variants are used for debugging the interim
 994    tables that are used to generate the final compressed
 995    representation which is what dump_trie expects.
 996
 997    Part of the reason for their existence is to provide a form
 998    of documentation as to how the different representations function.
 999
1000 */
1001
1002 /*
1003   Dumps the final compressed table form of the trie to Perl_debug_log.
1004   Used for debugging make_trie().
1005 */
1006
1007 STATIC void
1008 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1009             AV *revcharmap, U32 depth)
1010 {
1011     U32 state;
1012     SV *sv=sv_newmortal();
1013     int colwidth= widecharmap ? 6 : 4;
1014     U16 word;
1015     GET_RE_DEBUG_FLAGS_DECL;
1016
1017     PERL_ARGS_ASSERT_DUMP_TRIE;
1018
1019     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1020         (int)depth * 2 + 2,"",
1021         "Match","Base","Ofs" );
1022
1023     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1024         SV ** const tmp = av_fetch( revcharmap, state, 0);
1025         if ( tmp ) {
1026             PerlIO_printf( Perl_debug_log, "%*s",
1027                 colwidth,
1028                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1029                             PL_colors[0], PL_colors[1],
1030                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1031                             PERL_PV_ESCAPE_FIRSTCHAR
1032                 )
1033             );
1034         }
1035     }
1036     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1037         (int)depth * 2 + 2,"");
1038
1039     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1040         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1041     PerlIO_printf( Perl_debug_log, "\n");
1042
1043     for( state = 1 ; state < trie->statecount ; state++ ) {
1044         const U32 base = trie->states[ state ].trans.base;
1045
1046         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1047
1048         if ( trie->states[ state ].wordnum ) {
1049             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1050         } else {
1051             PerlIO_printf( Perl_debug_log, "%6s", "" );
1052         }
1053
1054         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1055
1056         if ( base ) {
1057             U32 ofs = 0;
1058
1059             while( ( base + ofs  < trie->uniquecharcount ) ||
1060                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1061                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1062                     ofs++;
1063
1064             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1065
1066             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1067                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1068                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1069                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1070                 {
1071                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1072                     colwidth,
1073                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1074                 } else {
1075                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1076                 }
1077             }
1078
1079             PerlIO_printf( Perl_debug_log, "]");
1080
1081         }
1082         PerlIO_printf( Perl_debug_log, "\n" );
1083     }
1084     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1085     for (word=1; word <= trie->wordcount; word++) {
1086         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1087             (int)word, (int)(trie->wordinfo[word].prev),
1088             (int)(trie->wordinfo[word].len));
1089     }
1090     PerlIO_printf(Perl_debug_log, "\n" );
1091 }
1092 /*
1093   Dumps a fully constructed but uncompressed trie in list form.
1094   List tries normally only are used for construction when the number of
1095   possible chars (trie->uniquecharcount) is very high.
1096   Used for debugging make_trie().
1097 */
1098 STATIC void
1099 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1100                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1101                          U32 depth)
1102 {
1103     U32 state;
1104     SV *sv=sv_newmortal();
1105     int colwidth= widecharmap ? 6 : 4;
1106     GET_RE_DEBUG_FLAGS_DECL;
1107
1108     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1109
1110     /* print out the table precompression.  */
1111     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1112         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1113         "------:-----+-----------------\n" );
1114
1115     for( state=1 ; state < next_alloc ; state ++ ) {
1116         U16 charid;
1117
1118         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1119             (int)depth * 2 + 2,"", (UV)state  );
1120         if ( ! trie->states[ state ].wordnum ) {
1121             PerlIO_printf( Perl_debug_log, "%5s| ","");
1122         } else {
1123             PerlIO_printf( Perl_debug_log, "W%4x| ",
1124                 trie->states[ state ].wordnum
1125             );
1126         }
1127         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1128             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1129             if ( tmp ) {
1130                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1131                     colwidth,
1132                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1133                             PL_colors[0], PL_colors[1],
1134                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1135                             PERL_PV_ESCAPE_FIRSTCHAR
1136                     ) ,
1137                     TRIE_LIST_ITEM(state,charid).forid,
1138                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1139                 );
1140                 if (!(charid % 10))
1141                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1142                         (int)((depth * 2) + 14), "");
1143             }
1144         }
1145         PerlIO_printf( Perl_debug_log, "\n");
1146     }
1147 }
1148
1149 /*
1150   Dumps a fully constructed but uncompressed trie in table form.
1151   This is the normal DFA style state transition table, with a few
1152   twists to facilitate compression later.
1153   Used for debugging make_trie().
1154 */
1155 STATIC void
1156 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1157                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1158                           U32 depth)
1159 {
1160     U32 state;
1161     U16 charid;
1162     SV *sv=sv_newmortal();
1163     int colwidth= widecharmap ? 6 : 4;
1164     GET_RE_DEBUG_FLAGS_DECL;
1165
1166     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1167
1168     /*
1169        print out the table precompression so that we can do a visual check
1170        that they are identical.
1171      */
1172
1173     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1174
1175     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1176         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1177         if ( tmp ) {
1178             PerlIO_printf( Perl_debug_log, "%*s",
1179                 colwidth,
1180                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1181                             PL_colors[0], PL_colors[1],
1182                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1183                             PERL_PV_ESCAPE_FIRSTCHAR
1184                 )
1185             );
1186         }
1187     }
1188
1189     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1190
1191     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1192         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1193     }
1194
1195     PerlIO_printf( Perl_debug_log, "\n" );
1196
1197     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1198
1199         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1200             (int)depth * 2 + 2,"",
1201             (UV)TRIE_NODENUM( state ) );
1202
1203         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1204             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1205             if (v)
1206                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1207             else
1208                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1209         }
1210         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1211             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1212         } else {
1213             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1214             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1215         }
1216     }
1217 }
1218
1219 #endif
1220
1221
1222 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1223   startbranch: the first branch in the whole branch sequence
1224   first      : start branch of sequence of branch-exact nodes.
1225                May be the same as startbranch
1226   last       : Thing following the last branch.
1227                May be the same as tail.
1228   tail       : item following the branch sequence
1229   count      : words in the sequence
1230   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1231   depth      : indent depth
1232
1233 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1234
1235 A trie is an N'ary tree where the branches are determined by digital
1236 decomposition of the key. IE, at the root node you look up the 1st character and
1237 follow that branch repeat until you find the end of the branches. Nodes can be
1238 marked as "accepting" meaning they represent a complete word. Eg:
1239
1240   /he|she|his|hers/
1241
1242 would convert into the following structure. Numbers represent states, letters
1243 following numbers represent valid transitions on the letter from that state, if
1244 the number is in square brackets it represents an accepting state, otherwise it
1245 will be in parenthesis.
1246
1247       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1248       |    |
1249       |   (2)
1250       |    |
1251      (1)   +-i->(6)-+-s->[7]
1252       |
1253       +-s->(3)-+-h->(4)-+-e->[5]
1254
1255       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1256
1257 This shows that when matching against the string 'hers' we will begin at state 1
1258 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1259 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1260 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1261 single traverse. We store a mapping from accepting to state to which word was
1262 matched, and then when we have multiple possibilities we try to complete the
1263 rest of the regex in the order in which they occured in the alternation.
1264
1265 The only prior NFA like behaviour that would be changed by the TRIE support is
1266 the silent ignoring of duplicate alternations which are of the form:
1267
1268  / (DUPE|DUPE) X? (?{ ... }) Y /x
1269
1270 Thus EVAL blocks following a trie may be called a different number of times with
1271 and without the optimisation. With the optimisations dupes will be silently
1272 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1273 the following demonstrates:
1274
1275  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1276
1277 which prints out 'word' three times, but
1278
1279  'words'=~/(word|word|word)(?{ print $1 })S/
1280
1281 which doesnt print it out at all. This is due to other optimisations kicking in.
1282
1283 Example of what happens on a structural level:
1284
1285 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1286
1287    1: CURLYM[1] {1,32767}(18)
1288    5:   BRANCH(8)
1289    6:     EXACT <ac>(16)
1290    8:   BRANCH(11)
1291    9:     EXACT <ad>(16)
1292   11:   BRANCH(14)
1293   12:     EXACT <ab>(16)
1294   16:   SUCCEED(0)
1295   17:   NOTHING(18)
1296   18: END(0)
1297
1298 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1299 and should turn into:
1300
1301    1: CURLYM[1] {1,32767}(18)
1302    5:   TRIE(16)
1303         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1304           <ac>
1305           <ad>
1306           <ab>
1307   16:   SUCCEED(0)
1308   17:   NOTHING(18)
1309   18: END(0)
1310
1311 Cases where tail != last would be like /(?foo|bar)baz/:
1312
1313    1: BRANCH(4)
1314    2:   EXACT <foo>(8)
1315    4: BRANCH(7)
1316    5:   EXACT <bar>(8)
1317    7: TAIL(8)
1318    8: EXACT <baz>(10)
1319   10: END(0)
1320
1321 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1322 and would end up looking like:
1323
1324     1: TRIE(8)
1325       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1326         <foo>
1327         <bar>
1328    7: TAIL(8)
1329    8: EXACT <baz>(10)
1330   10: END(0)
1331
1332     d = uvuni_to_utf8_flags(d, uv, 0);
1333
1334 is the recommended Unicode-aware way of saying
1335
1336     *(d++) = uv;
1337 */
1338
1339 #define TRIE_STORE_REVCHAR                                                 \
1340     STMT_START {                                                           \
1341         if (UTF) {                                                         \
1342             SV *zlopp = newSV(2);                                          \
1343             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1344             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1345             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1346             SvPOK_on(zlopp);                                               \
1347             SvUTF8_on(zlopp);                                              \
1348             av_push(revcharmap, zlopp);                                    \
1349         } else {                                                           \
1350             char ooooff = (char)uvc;                                               \
1351             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1352         }                                                                  \
1353         } STMT_END
1354
1355 #define TRIE_READ_CHAR STMT_START {                                           \
1356     wordlen++;                                                                \
1357     if ( UTF ) {                                                              \
1358         if ( folder ) {                                                       \
1359             if ( foldlen > 0 ) {                                              \
1360                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1361                foldlen -= len;                                                \
1362                scan += len;                                                   \
1363                len = 0;                                                       \
1364             } else {                                                          \
1365                 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1366                 uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
1367                 foldlen -= UNISKIP( uvc );                                    \
1368                 scan = foldbuf + UNISKIP( uvc );                              \
1369             }                                                                 \
1370         } else {                                                              \
1371             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1372         }                                                                     \
1373     } else {                                                                  \
1374         uvc = (U32)*uc;                                                       \
1375         len = 1;                                                              \
1376     }                                                                         \
1377 } STMT_END
1378
1379
1380
1381 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1382     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1383         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1384         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1385     }                                                           \
1386     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1387     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1388     TRIE_LIST_CUR( state )++;                                   \
1389 } STMT_END
1390
1391 #define TRIE_LIST_NEW(state) STMT_START {                       \
1392     Newxz( trie->states[ state ].trans.list,               \
1393         4, reg_trie_trans_le );                                 \
1394      TRIE_LIST_CUR( state ) = 1;                                \
1395      TRIE_LIST_LEN( state ) = 4;                                \
1396 } STMT_END
1397
1398 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1399     U16 dupe= trie->states[ state ].wordnum;                    \
1400     regnode * const noper_next = regnext( noper );              \
1401                                                                 \
1402     DEBUG_r({                                                   \
1403         /* store the word for dumping */                        \
1404         SV* tmp;                                                \
1405         if (OP(noper) != NOTHING)                               \
1406             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1407         else                                                    \
1408             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1409         av_push( trie_words, tmp );                             \
1410     });                                                         \
1411                                                                 \
1412     curword++;                                                  \
1413     trie->wordinfo[curword].prev   = 0;                         \
1414     trie->wordinfo[curword].len    = wordlen;                   \
1415     trie->wordinfo[curword].accept = state;                     \
1416                                                                 \
1417     if ( noper_next < tail ) {                                  \
1418         if (!trie->jump)                                        \
1419             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1420         trie->jump[curword] = (U16)(noper_next - convert);      \
1421         if (!jumper)                                            \
1422             jumper = noper_next;                                \
1423         if (!nextbranch)                                        \
1424             nextbranch= regnext(cur);                           \
1425     }                                                           \
1426                                                                 \
1427     if ( dupe ) {                                               \
1428         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1429         /* chain, so that when the bits of chain are later    */\
1430         /* linked together, the dups appear in the chain      */\
1431         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1432         trie->wordinfo[dupe].prev = curword;                    \
1433     } else {                                                    \
1434         /* we haven't inserted this word yet.                */ \
1435         trie->states[ state ].wordnum = curword;                \
1436     }                                                           \
1437 } STMT_END
1438
1439
1440 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1441      ( ( base + charid >=  ucharcount                                   \
1442          && base + charid < ubound                                      \
1443          && state == trie->trans[ base - ucharcount + charid ].check    \
1444          && trie->trans[ base - ucharcount + charid ].next )            \
1445            ? trie->trans[ base - ucharcount + charid ].next             \
1446            : ( state==1 ? special : 0 )                                 \
1447       )
1448
1449 #define MADE_TRIE       1
1450 #define MADE_JUMP_TRIE  2
1451 #define MADE_EXACT_TRIE 4
1452
1453 STATIC I32
1454 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1455 {
1456     dVAR;
1457     /* first pass, loop through and scan words */
1458     reg_trie_data *trie;
1459     HV *widecharmap = NULL;
1460     AV *revcharmap = newAV();
1461     regnode *cur;
1462     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1463     STRLEN len = 0;
1464     UV uvc = 0;
1465     U16 curword = 0;
1466     U32 next_alloc = 0;
1467     regnode *jumper = NULL;
1468     regnode *nextbranch = NULL;
1469     regnode *convert = NULL;
1470     U32 *prev_states; /* temp array mapping each state to previous one */
1471     /* we just use folder as a flag in utf8 */
1472     const U8 * folder = NULL;
1473
1474 #ifdef DEBUGGING
1475     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1476     AV *trie_words = NULL;
1477     /* along with revcharmap, this only used during construction but both are
1478      * useful during debugging so we store them in the struct when debugging.
1479      */
1480 #else
1481     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1482     STRLEN trie_charcount=0;
1483 #endif
1484     SV *re_trie_maxbuff;
1485     GET_RE_DEBUG_FLAGS_DECL;
1486
1487     PERL_ARGS_ASSERT_MAKE_TRIE;
1488 #ifndef DEBUGGING
1489     PERL_UNUSED_ARG(depth);
1490 #endif
1491
1492     switch (flags) {
1493         case EXACTFA:
1494         case EXACTFU: folder = PL_fold_latin1; break;
1495         case EXACTF:  folder = PL_fold; break;
1496         case EXACTFL: folder = PL_fold_locale; break;
1497     }
1498
1499     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1500     trie->refcount = 1;
1501     trie->startstate = 1;
1502     trie->wordcount = word_count;
1503     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1504     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1505     if (!(UTF && folder))
1506         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1507     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1508                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1509
1510     DEBUG_r({
1511         trie_words = newAV();
1512     });
1513
1514     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1515     if (!SvIOK(re_trie_maxbuff)) {
1516         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1517     }
1518     DEBUG_OPTIMISE_r({
1519                 PerlIO_printf( Perl_debug_log,
1520                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1521                   (int)depth * 2 + 2, "",
1522                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1523                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1524                   (int)depth);
1525     });
1526
1527    /* Find the node we are going to overwrite */
1528     if ( first == startbranch && OP( last ) != BRANCH ) {
1529         /* whole branch chain */
1530         convert = first;
1531     } else {
1532         /* branch sub-chain */
1533         convert = NEXTOPER( first );
1534     }
1535
1536     /*  -- First loop and Setup --
1537
1538        We first traverse the branches and scan each word to determine if it
1539        contains widechars, and how many unique chars there are, this is
1540        important as we have to build a table with at least as many columns as we
1541        have unique chars.
1542
1543        We use an array of integers to represent the character codes 0..255
1544        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1545        native representation of the character value as the key and IV's for the
1546        coded index.
1547
1548        *TODO* If we keep track of how many times each character is used we can
1549        remap the columns so that the table compression later on is more
1550        efficient in terms of memory by ensuring the most common value is in the
1551        middle and the least common are on the outside.  IMO this would be better
1552        than a most to least common mapping as theres a decent chance the most
1553        common letter will share a node with the least common, meaning the node
1554        will not be compressible. With a middle is most common approach the worst
1555        case is when we have the least common nodes twice.
1556
1557      */
1558
1559     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1560         regnode * const noper = NEXTOPER( cur );
1561         const U8 *uc = (U8*)STRING( noper );
1562         const U8 * const e  = uc + STR_LEN( noper );
1563         STRLEN foldlen = 0;
1564         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1565         const U8 *scan = (U8*)NULL;
1566         U32 wordlen      = 0;         /* required init */
1567         STRLEN chars = 0;
1568         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1569
1570         if (OP(noper) == NOTHING) {
1571             trie->minlen= 0;
1572             continue;
1573         }
1574         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1575             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1576                                           regardless of encoding */
1577
1578         for ( ; uc < e ; uc += len ) {
1579             TRIE_CHARCOUNT(trie)++;
1580             TRIE_READ_CHAR;
1581             chars++;
1582             if ( uvc < 256 ) {
1583                 if ( !trie->charmap[ uvc ] ) {
1584                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1585                     if ( folder )
1586                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1587                     TRIE_STORE_REVCHAR;
1588                 }
1589                 if ( set_bit ) {
1590                     /* store the codepoint in the bitmap, and its folded
1591                      * equivalent. */
1592                     TRIE_BITMAP_SET(trie,uvc);
1593
1594                     /* store the folded codepoint */
1595                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1596
1597                     if ( !UTF ) {
1598                         /* store first byte of utf8 representation of
1599                            variant codepoints */
1600                         if (! UNI_IS_INVARIANT(uvc)) {
1601                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1602                         }
1603                     }
1604                     set_bit = 0; /* We've done our bit :-) */
1605                 }
1606             } else {
1607                 SV** svpp;
1608                 if ( !widecharmap )
1609                     widecharmap = newHV();
1610
1611                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1612
1613                 if ( !svpp )
1614                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1615
1616                 if ( !SvTRUE( *svpp ) ) {
1617                     sv_setiv( *svpp, ++trie->uniquecharcount );
1618                     TRIE_STORE_REVCHAR;
1619                 }
1620             }
1621         }
1622         if( cur == first ) {
1623             trie->minlen=chars;
1624             trie->maxlen=chars;
1625         } else if (chars < trie->minlen) {
1626             trie->minlen=chars;
1627         } else if (chars > trie->maxlen) {
1628             trie->maxlen=chars;
1629         }
1630
1631     } /* end first pass */
1632     DEBUG_TRIE_COMPILE_r(
1633         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1634                 (int)depth * 2 + 2,"",
1635                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1636                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1637                 (int)trie->minlen, (int)trie->maxlen )
1638     );
1639
1640     /*
1641         We now know what we are dealing with in terms of unique chars and
1642         string sizes so we can calculate how much memory a naive
1643         representation using a flat table  will take. If it's over a reasonable
1644         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1645         conservative but potentially much slower representation using an array
1646         of lists.
1647
1648         At the end we convert both representations into the same compressed
1649         form that will be used in regexec.c for matching with. The latter
1650         is a form that cannot be used to construct with but has memory
1651         properties similar to the list form and access properties similar
1652         to the table form making it both suitable for fast searches and
1653         small enough that its feasable to store for the duration of a program.
1654
1655         See the comment in the code where the compressed table is produced
1656         inplace from the flat tabe representation for an explanation of how
1657         the compression works.
1658
1659     */
1660
1661
1662     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1663     prev_states[1] = 0;
1664
1665     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1666         /*
1667             Second Pass -- Array Of Lists Representation
1668
1669             Each state will be represented by a list of charid:state records
1670             (reg_trie_trans_le) the first such element holds the CUR and LEN
1671             points of the allocated array. (See defines above).
1672
1673             We build the initial structure using the lists, and then convert
1674             it into the compressed table form which allows faster lookups
1675             (but cant be modified once converted).
1676         */
1677
1678         STRLEN transcount = 1;
1679
1680         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1681             "%*sCompiling trie using list compiler\n",
1682             (int)depth * 2 + 2, ""));
1683
1684         trie->states = (reg_trie_state *)
1685             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1686                                   sizeof(reg_trie_state) );
1687         TRIE_LIST_NEW(1);
1688         next_alloc = 2;
1689
1690         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1691
1692             regnode * const noper = NEXTOPER( cur );
1693             U8 *uc           = (U8*)STRING( noper );
1694             const U8 * const e = uc + STR_LEN( noper );
1695             U32 state        = 1;         /* required init */
1696             U16 charid       = 0;         /* sanity init */
1697             U8 *scan         = (U8*)NULL; /* sanity init */
1698             STRLEN foldlen   = 0;         /* required init */
1699             U32 wordlen      = 0;         /* required init */
1700             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1701
1702             if (OP(noper) != NOTHING) {
1703                 for ( ; uc < e ; uc += len ) {
1704
1705                     TRIE_READ_CHAR;
1706
1707                     if ( uvc < 256 ) {
1708                         charid = trie->charmap[ uvc ];
1709                     } else {
1710                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1711                         if ( !svpp ) {
1712                             charid = 0;
1713                         } else {
1714                             charid=(U16)SvIV( *svpp );
1715                         }
1716                     }
1717                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1718                     if ( charid ) {
1719
1720                         U16 check;
1721                         U32 newstate = 0;
1722
1723                         charid--;
1724                         if ( !trie->states[ state ].trans.list ) {
1725                             TRIE_LIST_NEW( state );
1726                         }
1727                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1728                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1729                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1730                                 break;
1731                             }
1732                         }
1733                         if ( ! newstate ) {
1734                             newstate = next_alloc++;
1735                             prev_states[newstate] = state;
1736                             TRIE_LIST_PUSH( state, charid, newstate );
1737                             transcount++;
1738                         }
1739                         state = newstate;
1740                     } else {
1741                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1742                     }
1743                 }
1744             }
1745             TRIE_HANDLE_WORD(state);
1746
1747         } /* end second pass */
1748
1749         /* next alloc is the NEXT state to be allocated */
1750         trie->statecount = next_alloc;
1751         trie->states = (reg_trie_state *)
1752             PerlMemShared_realloc( trie->states,
1753                                    next_alloc
1754                                    * sizeof(reg_trie_state) );
1755
1756         /* and now dump it out before we compress it */
1757         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1758                                                          revcharmap, next_alloc,
1759                                                          depth+1)
1760         );
1761
1762         trie->trans = (reg_trie_trans *)
1763             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1764         {
1765             U32 state;
1766             U32 tp = 0;
1767             U32 zp = 0;
1768
1769
1770             for( state=1 ; state < next_alloc ; state ++ ) {
1771                 U32 base=0;
1772
1773                 /*
1774                 DEBUG_TRIE_COMPILE_MORE_r(
1775                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1776                 );
1777                 */
1778
1779                 if (trie->states[state].trans.list) {
1780                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1781                     U16 maxid=minid;
1782                     U16 idx;
1783
1784                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1785                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1786                         if ( forid < minid ) {
1787                             minid=forid;
1788                         } else if ( forid > maxid ) {
1789                             maxid=forid;
1790                         }
1791                     }
1792                     if ( transcount < tp + maxid - minid + 1) {
1793                         transcount *= 2;
1794                         trie->trans = (reg_trie_trans *)
1795                             PerlMemShared_realloc( trie->trans,
1796                                                      transcount
1797                                                      * sizeof(reg_trie_trans) );
1798                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1799                     }
1800                     base = trie->uniquecharcount + tp - minid;
1801                     if ( maxid == minid ) {
1802                         U32 set = 0;
1803                         for ( ; zp < tp ; zp++ ) {
1804                             if ( ! trie->trans[ zp ].next ) {
1805                                 base = trie->uniquecharcount + zp - minid;
1806                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1807                                 trie->trans[ zp ].check = state;
1808                                 set = 1;
1809                                 break;
1810                             }
1811                         }
1812                         if ( !set ) {
1813                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1814                             trie->trans[ tp ].check = state;
1815                             tp++;
1816                             zp = tp;
1817                         }
1818                     } else {
1819                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1820                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1821                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1822                             trie->trans[ tid ].check = state;
1823                         }
1824                         tp += ( maxid - minid + 1 );
1825                     }
1826                     Safefree(trie->states[ state ].trans.list);
1827                 }
1828                 /*
1829                 DEBUG_TRIE_COMPILE_MORE_r(
1830                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1831                 );
1832                 */
1833                 trie->states[ state ].trans.base=base;
1834             }
1835             trie->lasttrans = tp + 1;
1836         }
1837     } else {
1838         /*
1839            Second Pass -- Flat Table Representation.
1840
1841            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1842            We know that we will need Charcount+1 trans at most to store the data
1843            (one row per char at worst case) So we preallocate both structures
1844            assuming worst case.
1845
1846            We then construct the trie using only the .next slots of the entry
1847            structs.
1848
1849            We use the .check field of the first entry of the node temporarily to
1850            make compression both faster and easier by keeping track of how many non
1851            zero fields are in the node.
1852
1853            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1854            transition.
1855
1856            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1857            number representing the first entry of the node, and state as a
1858            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1859            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1860            are 2 entrys per node. eg:
1861
1862              A B       A B
1863           1. 2 4    1. 3 7
1864           2. 0 3    3. 0 5
1865           3. 0 0    5. 0 0
1866           4. 0 0    7. 0 0
1867
1868            The table is internally in the right hand, idx form. However as we also
1869            have to deal with the states array which is indexed by nodenum we have to
1870            use TRIE_NODENUM() to convert.
1871
1872         */
1873         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1874             "%*sCompiling trie using table compiler\n",
1875             (int)depth * 2 + 2, ""));
1876
1877         trie->trans = (reg_trie_trans *)
1878             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1879                                   * trie->uniquecharcount + 1,
1880                                   sizeof(reg_trie_trans) );
1881         trie->states = (reg_trie_state *)
1882             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1883                                   sizeof(reg_trie_state) );
1884         next_alloc = trie->uniquecharcount + 1;
1885
1886
1887         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1888
1889             regnode * const noper   = NEXTOPER( cur );
1890             const U8 *uc     = (U8*)STRING( noper );
1891             const U8 * const e = uc + STR_LEN( noper );
1892
1893             U32 state        = 1;         /* required init */
1894
1895             U16 charid       = 0;         /* sanity init */
1896             U32 accept_state = 0;         /* sanity init */
1897             U8 *scan         = (U8*)NULL; /* sanity init */
1898
1899             STRLEN foldlen   = 0;         /* required init */
1900             U32 wordlen      = 0;         /* required init */
1901             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1902
1903             if ( OP(noper) != NOTHING ) {
1904                 for ( ; uc < e ; uc += len ) {
1905
1906                     TRIE_READ_CHAR;
1907
1908                     if ( uvc < 256 ) {
1909                         charid = trie->charmap[ uvc ];
1910                     } else {
1911                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1912                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1913                     }
1914                     if ( charid ) {
1915                         charid--;
1916                         if ( !trie->trans[ state + charid ].next ) {
1917                             trie->trans[ state + charid ].next = next_alloc;
1918                             trie->trans[ state ].check++;
1919                             prev_states[TRIE_NODENUM(next_alloc)]
1920                                     = TRIE_NODENUM(state);
1921                             next_alloc += trie->uniquecharcount;
1922                         }
1923                         state = trie->trans[ state + charid ].next;
1924                     } else {
1925                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1926                     }
1927                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1928                 }
1929             }
1930             accept_state = TRIE_NODENUM( state );
1931             TRIE_HANDLE_WORD(accept_state);
1932
1933         } /* end second pass */
1934
1935         /* and now dump it out before we compress it */
1936         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1937                                                           revcharmap,
1938                                                           next_alloc, depth+1));
1939
1940         {
1941         /*
1942            * Inplace compress the table.*
1943
1944            For sparse data sets the table constructed by the trie algorithm will
1945            be mostly 0/FAIL transitions or to put it another way mostly empty.
1946            (Note that leaf nodes will not contain any transitions.)
1947
1948            This algorithm compresses the tables by eliminating most such
1949            transitions, at the cost of a modest bit of extra work during lookup:
1950
1951            - Each states[] entry contains a .base field which indicates the
1952            index in the state[] array wheres its transition data is stored.
1953
1954            - If .base is 0 there are no valid transitions from that node.
1955
1956            - If .base is nonzero then charid is added to it to find an entry in
1957            the trans array.
1958
1959            -If trans[states[state].base+charid].check!=state then the
1960            transition is taken to be a 0/Fail transition. Thus if there are fail
1961            transitions at the front of the node then the .base offset will point
1962            somewhere inside the previous nodes data (or maybe even into a node
1963            even earlier), but the .check field determines if the transition is
1964            valid.
1965
1966            XXX - wrong maybe?
1967            The following process inplace converts the table to the compressed
1968            table: We first do not compress the root node 1,and mark all its
1969            .check pointers as 1 and set its .base pointer as 1 as well. This
1970            allows us to do a DFA construction from the compressed table later,
1971            and ensures that any .base pointers we calculate later are greater
1972            than 0.
1973
1974            - We set 'pos' to indicate the first entry of the second node.
1975
1976            - We then iterate over the columns of the node, finding the first and
1977            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1978            and set the .check pointers accordingly, and advance pos
1979            appropriately and repreat for the next node. Note that when we copy
1980            the next pointers we have to convert them from the original
1981            NODEIDX form to NODENUM form as the former is not valid post
1982            compression.
1983
1984            - If a node has no transitions used we mark its base as 0 and do not
1985            advance the pos pointer.
1986
1987            - If a node only has one transition we use a second pointer into the
1988            structure to fill in allocated fail transitions from other states.
1989            This pointer is independent of the main pointer and scans forward
1990            looking for null transitions that are allocated to a state. When it
1991            finds one it writes the single transition into the "hole".  If the
1992            pointer doesnt find one the single transition is appended as normal.
1993
1994            - Once compressed we can Renew/realloc the structures to release the
1995            excess space.
1996
1997            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1998            specifically Fig 3.47 and the associated pseudocode.
1999
2000            demq
2001         */
2002         const U32 laststate = TRIE_NODENUM( next_alloc );
2003         U32 state, charid;
2004         U32 pos = 0, zp=0;
2005         trie->statecount = laststate;
2006
2007         for ( state = 1 ; state < laststate ; state++ ) {
2008             U8 flag = 0;
2009             const U32 stateidx = TRIE_NODEIDX( state );
2010             const U32 o_used = trie->trans[ stateidx ].check;
2011             U32 used = trie->trans[ stateidx ].check;
2012             trie->trans[ stateidx ].check = 0;
2013
2014             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2015                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2016                     if ( trie->trans[ stateidx + charid ].next ) {
2017                         if (o_used == 1) {
2018                             for ( ; zp < pos ; zp++ ) {
2019                                 if ( ! trie->trans[ zp ].next ) {
2020                                     break;
2021                                 }
2022                             }
2023                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2024                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2025                             trie->trans[ zp ].check = state;
2026                             if ( ++zp > pos ) pos = zp;
2027                             break;
2028                         }
2029                         used--;
2030                     }
2031                     if ( !flag ) {
2032                         flag = 1;
2033                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2034                     }
2035                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2036                     trie->trans[ pos ].check = state;
2037                     pos++;
2038                 }
2039             }
2040         }
2041         trie->lasttrans = pos + 1;
2042         trie->states = (reg_trie_state *)
2043             PerlMemShared_realloc( trie->states, laststate
2044                                    * sizeof(reg_trie_state) );
2045         DEBUG_TRIE_COMPILE_MORE_r(
2046                 PerlIO_printf( Perl_debug_log,
2047                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2048                     (int)depth * 2 + 2,"",
2049                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2050                     (IV)next_alloc,
2051                     (IV)pos,
2052                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2053             );
2054
2055         } /* end table compress */
2056     }
2057     DEBUG_TRIE_COMPILE_MORE_r(
2058             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2059                 (int)depth * 2 + 2, "",
2060                 (UV)trie->statecount,
2061                 (UV)trie->lasttrans)
2062     );
2063     /* resize the trans array to remove unused space */
2064     trie->trans = (reg_trie_trans *)
2065         PerlMemShared_realloc( trie->trans, trie->lasttrans
2066                                * sizeof(reg_trie_trans) );
2067
2068     {   /* Modify the program and insert the new TRIE node */
2069         U8 nodetype =(U8)(flags & 0xFF);
2070         char *str=NULL;
2071
2072 #ifdef DEBUGGING
2073         regnode *optimize = NULL;
2074 #ifdef RE_TRACK_PATTERN_OFFSETS
2075
2076         U32 mjd_offset = 0;
2077         U32 mjd_nodelen = 0;
2078 #endif /* RE_TRACK_PATTERN_OFFSETS */
2079 #endif /* DEBUGGING */
2080         /*
2081            This means we convert either the first branch or the first Exact,
2082            depending on whether the thing following (in 'last') is a branch
2083            or not and whther first is the startbranch (ie is it a sub part of
2084            the alternation or is it the whole thing.)
2085            Assuming its a sub part we convert the EXACT otherwise we convert
2086            the whole branch sequence, including the first.
2087          */
2088         /* Find the node we are going to overwrite */
2089         if ( first != startbranch || OP( last ) == BRANCH ) {
2090             /* branch sub-chain */
2091             NEXT_OFF( first ) = (U16)(last - first);
2092 #ifdef RE_TRACK_PATTERN_OFFSETS
2093             DEBUG_r({
2094                 mjd_offset= Node_Offset((convert));
2095                 mjd_nodelen= Node_Length((convert));
2096             });
2097 #endif
2098             /* whole branch chain */
2099         }
2100 #ifdef RE_TRACK_PATTERN_OFFSETS
2101         else {
2102             DEBUG_r({
2103                 const  regnode *nop = NEXTOPER( convert );
2104                 mjd_offset= Node_Offset((nop));
2105                 mjd_nodelen= Node_Length((nop));
2106             });
2107         }
2108         DEBUG_OPTIMISE_r(
2109             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2110                 (int)depth * 2 + 2, "",
2111                 (UV)mjd_offset, (UV)mjd_nodelen)
2112         );
2113 #endif
2114         /* But first we check to see if there is a common prefix we can
2115            split out as an EXACT and put in front of the TRIE node.  */
2116         trie->startstate= 1;
2117         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2118             U32 state;
2119             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2120                 U32 ofs = 0;
2121                 I32 idx = -1;
2122                 U32 count = 0;
2123                 const U32 base = trie->states[ state ].trans.base;
2124
2125                 if ( trie->states[state].wordnum )
2126                         count = 1;
2127
2128                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2129                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2130                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2131                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2132                     {
2133                         if ( ++count > 1 ) {
2134                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2135                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2136                             if ( state == 1 ) break;
2137                             if ( count == 2 ) {
2138                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2139                                 DEBUG_OPTIMISE_r(
2140                                     PerlIO_printf(Perl_debug_log,
2141                                         "%*sNew Start State=%"UVuf" Class: [",
2142                                         (int)depth * 2 + 2, "",
2143                                         (UV)state));
2144                                 if (idx >= 0) {
2145                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2146                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2147
2148                                     TRIE_BITMAP_SET(trie,*ch);
2149                                     if ( folder )
2150                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2151                                     DEBUG_OPTIMISE_r(
2152                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2153                                     );
2154                                 }
2155                             }
2156                             TRIE_BITMAP_SET(trie,*ch);
2157                             if ( folder )
2158                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2159                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2160                         }
2161                         idx = ofs;
2162                     }
2163                 }
2164                 if ( count == 1 ) {
2165                     SV **tmp = av_fetch( revcharmap, idx, 0);
2166                     STRLEN len;
2167                     char *ch = SvPV( *tmp, len );
2168                     DEBUG_OPTIMISE_r({
2169                         SV *sv=sv_newmortal();
2170                         PerlIO_printf( Perl_debug_log,
2171                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2172                             (int)depth * 2 + 2, "",
2173                             (UV)state, (UV)idx,
2174                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2175                                 PL_colors[0], PL_colors[1],
2176                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2177                                 PERL_PV_ESCAPE_FIRSTCHAR
2178                             )
2179                         );
2180                     });
2181                     if ( state==1 ) {
2182                         OP( convert ) = nodetype;
2183                         str=STRING(convert);
2184                         STR_LEN(convert)=0;
2185                     }
2186                     STR_LEN(convert) += len;
2187                     while (len--)
2188                         *str++ = *ch++;
2189                 } else {
2190 #ifdef DEBUGGING
2191                     if (state>1)
2192                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2193 #endif
2194                     break;
2195                 }
2196             }
2197             trie->prefixlen = (state-1);
2198             if (str) {
2199                 regnode *n = convert+NODE_SZ_STR(convert);
2200                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2201                 trie->startstate = state;
2202                 trie->minlen -= (state - 1);
2203                 trie->maxlen -= (state - 1);
2204 #ifdef DEBUGGING
2205                /* At least the UNICOS C compiler choked on this
2206                 * being argument to DEBUG_r(), so let's just have
2207                 * it right here. */
2208                if (
2209 #ifdef PERL_EXT_RE_BUILD
2210                    1
2211 #else
2212                    DEBUG_r_TEST
2213 #endif
2214                    ) {
2215                    regnode *fix = convert;
2216                    U32 word = trie->wordcount;
2217                    mjd_nodelen++;
2218                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2219                    while( ++fix < n ) {
2220                        Set_Node_Offset_Length(fix, 0, 0);
2221                    }
2222                    while (word--) {
2223                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2224                        if (tmp) {
2225                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2226                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2227                            else
2228                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2229                        }
2230                    }
2231                }
2232 #endif
2233                 if (trie->maxlen) {
2234                     convert = n;
2235                 } else {
2236                     NEXT_OFF(convert) = (U16)(tail - convert);
2237                     DEBUG_r(optimize= n);
2238                 }
2239             }
2240         }
2241         if (!jumper)
2242             jumper = last;
2243         if ( trie->maxlen ) {
2244             NEXT_OFF( convert ) = (U16)(tail - convert);
2245             ARG_SET( convert, data_slot );
2246             /* Store the offset to the first unabsorbed branch in
2247                jump[0], which is otherwise unused by the jump logic.
2248                We use this when dumping a trie and during optimisation. */
2249             if (trie->jump)
2250                 trie->jump[0] = (U16)(nextbranch - convert);
2251
2252             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2253              *   and there is a bitmap
2254              *   and the first "jump target" node we found leaves enough room
2255              * then convert the TRIE node into a TRIEC node, with the bitmap
2256              * embedded inline in the opcode - this is hypothetically faster.
2257              */
2258             if ( !trie->states[trie->startstate].wordnum
2259                  && trie->bitmap
2260                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2261             {
2262                 OP( convert ) = TRIEC;
2263                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2264                 PerlMemShared_free(trie->bitmap);
2265                 trie->bitmap= NULL;
2266             } else
2267                 OP( convert ) = TRIE;
2268
2269             /* store the type in the flags */
2270             convert->flags = nodetype;
2271             DEBUG_r({
2272             optimize = convert
2273                       + NODE_STEP_REGNODE
2274                       + regarglen[ OP( convert ) ];
2275             });
2276             /* XXX We really should free up the resource in trie now,
2277                    as we won't use them - (which resources?) dmq */
2278         }
2279         /* needed for dumping*/
2280         DEBUG_r(if (optimize) {
2281             regnode *opt = convert;
2282
2283             while ( ++opt < optimize) {
2284                 Set_Node_Offset_Length(opt,0,0);
2285             }
2286             /*
2287                 Try to clean up some of the debris left after the
2288                 optimisation.
2289              */
2290             while( optimize < jumper ) {
2291                 mjd_nodelen += Node_Length((optimize));
2292                 OP( optimize ) = OPTIMIZED;
2293                 Set_Node_Offset_Length(optimize,0,0);
2294                 optimize++;
2295             }
2296             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2297         });
2298     } /* end node insert */
2299
2300     /*  Finish populating the prev field of the wordinfo array.  Walk back
2301      *  from each accept state until we find another accept state, and if
2302      *  so, point the first word's .prev field at the second word. If the
2303      *  second already has a .prev field set, stop now. This will be the
2304      *  case either if we've already processed that word's accept state,
2305      *  or that state had multiple words, and the overspill words were
2306      *  already linked up earlier.
2307      */
2308     {
2309         U16 word;
2310         U32 state;
2311         U16 prev;
2312
2313         for (word=1; word <= trie->wordcount; word++) {
2314             prev = 0;
2315             if (trie->wordinfo[word].prev)
2316                 continue;
2317             state = trie->wordinfo[word].accept;
2318             while (state) {
2319                 state = prev_states[state];
2320                 if (!state)
2321                     break;
2322                 prev = trie->states[state].wordnum;
2323                 if (prev)
2324                     break;
2325             }
2326             trie->wordinfo[word].prev = prev;
2327         }
2328         Safefree(prev_states);
2329     }
2330
2331
2332     /* and now dump out the compressed format */
2333     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2334
2335     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2336 #ifdef DEBUGGING
2337     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2338     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2339 #else
2340     SvREFCNT_dec(revcharmap);
2341 #endif
2342     return trie->jump
2343            ? MADE_JUMP_TRIE
2344            : trie->startstate>1
2345              ? MADE_EXACT_TRIE
2346              : MADE_TRIE;
2347 }
2348
2349 STATIC void
2350 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2351 {
2352 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2353
2354    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2355    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2356    ISBN 0-201-10088-6
2357
2358    We find the fail state for each state in the trie, this state is the longest proper
2359    suffix of the current state's 'word' that is also a proper prefix of another word in our
2360    trie. State 1 represents the word '' and is thus the default fail state. This allows
2361    the DFA not to have to restart after its tried and failed a word at a given point, it
2362    simply continues as though it had been matching the other word in the first place.
2363    Consider
2364       'abcdgu'=~/abcdefg|cdgu/
2365    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2366    fail, which would bring us to the state representing 'd' in the second word where we would
2367    try 'g' and succeed, proceeding to match 'cdgu'.
2368  */
2369  /* add a fail transition */
2370     const U32 trie_offset = ARG(source);
2371     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2372     U32 *q;
2373     const U32 ucharcount = trie->uniquecharcount;
2374     const U32 numstates = trie->statecount;
2375     const U32 ubound = trie->lasttrans + ucharcount;
2376     U32 q_read = 0;
2377     U32 q_write = 0;
2378     U32 charid;
2379     U32 base = trie->states[ 1 ].trans.base;
2380     U32 *fail;
2381     reg_ac_data *aho;
2382     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2383     GET_RE_DEBUG_FLAGS_DECL;
2384
2385     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2386 #ifndef DEBUGGING
2387     PERL_UNUSED_ARG(depth);
2388 #endif
2389
2390
2391     ARG_SET( stclass, data_slot );
2392     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2393     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2394     aho->trie=trie_offset;
2395     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2396     Copy( trie->states, aho->states, numstates, reg_trie_state );
2397     Newxz( q, numstates, U32);
2398     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2399     aho->refcount = 1;
2400     fail = aho->fail;
2401     /* initialize fail[0..1] to be 1 so that we always have
2402        a valid final fail state */
2403     fail[ 0 ] = fail[ 1 ] = 1;
2404
2405     for ( charid = 0; charid < ucharcount ; charid++ ) {
2406         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2407         if ( newstate ) {
2408             q[ q_write ] = newstate;
2409             /* set to point at the root */
2410             fail[ q[ q_write++ ] ]=1;
2411         }
2412     }
2413     while ( q_read < q_write) {
2414         const U32 cur = q[ q_read++ % numstates ];
2415         base = trie->states[ cur ].trans.base;
2416
2417         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2418             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2419             if (ch_state) {
2420                 U32 fail_state = cur;
2421                 U32 fail_base;
2422                 do {
2423                     fail_state = fail[ fail_state ];
2424                     fail_base = aho->states[ fail_state ].trans.base;
2425                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2426
2427                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2428                 fail[ ch_state ] = fail_state;
2429                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2430                 {
2431                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2432                 }
2433                 q[ q_write++ % numstates] = ch_state;
2434             }
2435         }
2436     }
2437     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2438        when we fail in state 1, this allows us to use the
2439        charclass scan to find a valid start char. This is based on the principle
2440        that theres a good chance the string being searched contains lots of stuff
2441        that cant be a start char.
2442      */
2443     fail[ 0 ] = fail[ 1 ] = 0;
2444     DEBUG_TRIE_COMPILE_r({
2445         PerlIO_printf(Perl_debug_log,
2446                       "%*sStclass Failtable (%"UVuf" states): 0",
2447                       (int)(depth * 2), "", (UV)numstates
2448         );
2449         for( q_read=1; q_read<numstates; q_read++ ) {
2450             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2451         }
2452         PerlIO_printf(Perl_debug_log, "\n");
2453     });
2454     Safefree(q);
2455     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2456 }
2457
2458
2459 /*
2460  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2461  * These need to be revisited when a newer toolchain becomes available.
2462  */
2463 #if defined(__sparc64__) && defined(__GNUC__)
2464 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2465 #       undef  SPARC64_GCC_WORKAROUND
2466 #       define SPARC64_GCC_WORKAROUND 1
2467 #   endif
2468 #endif
2469
2470 #define DEBUG_PEEP(str,scan,depth) \
2471     DEBUG_OPTIMISE_r({if (scan){ \
2472        SV * const mysv=sv_newmortal(); \
2473        regnode *Next = regnext(scan); \
2474        regprop(RExC_rx, mysv, scan); \
2475        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2476        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2477        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2478    }});
2479
2480
2481
2482
2483
2484 #define JOIN_EXACT(scan,min,flags) \
2485     if (PL_regkind[OP(scan)] == EXACT) \
2486         join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2487
2488 STATIC U32
2489 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2490     /* Merge several consecutive EXACTish nodes into one. */
2491     regnode *n = regnext(scan);
2492     U32 stringok = 1;
2493     regnode *next = scan + NODE_SZ_STR(scan);
2494     U32 merged = 0;
2495     U32 stopnow = 0;
2496 #ifdef DEBUGGING
2497     regnode *stop = scan;
2498     GET_RE_DEBUG_FLAGS_DECL;
2499 #else
2500     PERL_UNUSED_ARG(depth);
2501 #endif
2502
2503     PERL_ARGS_ASSERT_JOIN_EXACT;
2504 #ifndef EXPERIMENTAL_INPLACESCAN
2505     PERL_UNUSED_ARG(flags);
2506     PERL_UNUSED_ARG(val);
2507 #endif
2508     DEBUG_PEEP("join",scan,depth);
2509
2510     /* Skip NOTHING, merge EXACT*. */
2511     while (n &&
2512            ( PL_regkind[OP(n)] == NOTHING ||
2513              (stringok && (OP(n) == OP(scan))))
2514            && NEXT_OFF(n)
2515            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2516
2517         if (OP(n) == TAIL || n > next)
2518             stringok = 0;
2519         if (PL_regkind[OP(n)] == NOTHING) {
2520             DEBUG_PEEP("skip:",n,depth);
2521             NEXT_OFF(scan) += NEXT_OFF(n);
2522             next = n + NODE_STEP_REGNODE;
2523 #ifdef DEBUGGING
2524             if (stringok)
2525                 stop = n;
2526 #endif
2527             n = regnext(n);
2528         }
2529         else if (stringok) {
2530             const unsigned int oldl = STR_LEN(scan);
2531             regnode * const nnext = regnext(n);
2532
2533             DEBUG_PEEP("merg",n,depth);
2534
2535             merged++;
2536             if (oldl + STR_LEN(n) > U8_MAX)
2537                 break;
2538             NEXT_OFF(scan) += NEXT_OFF(n);
2539             STR_LEN(scan) += STR_LEN(n);
2540             next = n + NODE_SZ_STR(n);
2541             /* Now we can overwrite *n : */
2542             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2543 #ifdef DEBUGGING
2544             stop = next - 1;
2545 #endif
2546             n = nnext;
2547             if (stopnow) break;
2548         }
2549
2550 #ifdef EXPERIMENTAL_INPLACESCAN
2551         if (flags && !NEXT_OFF(n)) {
2552             DEBUG_PEEP("atch", val, depth);
2553             if (reg_off_by_arg[OP(n)]) {
2554                 ARG_SET(n, val - n);
2555             }
2556             else {
2557                 NEXT_OFF(n) = val - n;
2558             }
2559             stopnow = 1;
2560         }
2561 #endif
2562     }
2563 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS   0x0390
2564 #define IOTA_D_T        GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2565 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS     0x03B0
2566 #define UPSILON_D_T     GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2567
2568     if (UTF
2569         && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
2570         && ( STR_LEN(scan) >= 6 ) )
2571     {
2572     /*
2573     Two problematic code points in Unicode casefolding of EXACT nodes:
2574
2575     U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2576     U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2577
2578     which casefold to
2579
2580     Unicode                      UTF-8
2581
2582     U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2583     U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2584
2585     This means that in case-insensitive matching (or "loose matching",
2586     as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2587     length of the above casefolded versions) can match a target string
2588     of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2589     This would rather mess up the minimum length computation.
2590
2591     What we'll do is to look for the tail four bytes, and then peek
2592     at the preceding two bytes to see whether we need to decrease
2593     the minimum length by four (six minus two).
2594
2595     Thanks to the design of UTF-8, there cannot be false matches:
2596     A sequence of valid UTF-8 bytes cannot be a subsequence of
2597     another valid sequence of UTF-8 bytes.
2598
2599     */
2600          char * const s0 = STRING(scan), *s, *t;
2601          char * const s1 = s0 + STR_LEN(scan) - 1;
2602          char * const s2 = s1 - 4;
2603 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2604          const char t0[] = "\xaf\x49\xaf\x42";
2605 #else
2606          const char t0[] = "\xcc\x88\xcc\x81";
2607 #endif
2608          const char * const t1 = t0 + 3;
2609
2610          for (s = s0 + 2;
2611               s < s2 && (t = ninstr(s, s1, t0, t1));
2612               s = t + 4) {
2613 #ifdef EBCDIC
2614               if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2615                   ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2616 #else
2617               if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2618                   ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2619 #endif
2620                    *min -= 4;
2621          }
2622     }
2623
2624 #ifdef DEBUGGING
2625     /* Allow dumping */
2626     n = scan + NODE_SZ_STR(scan);
2627     while (n <= stop) {
2628         if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2629             OP(n) = OPTIMIZED;
2630             NEXT_OFF(n) = 0;
2631         }
2632         n++;
2633     }
2634 #endif
2635     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2636     return stopnow;
2637 }
2638
2639 /* REx optimizer.  Converts nodes into quicker variants "in place".
2640    Finds fixed substrings.  */
2641
2642 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2643    to the position after last scanned or to NULL. */
2644
2645 #define INIT_AND_WITHP \
2646     assert(!and_withp); \
2647     Newx(and_withp,1,struct regnode_charclass_class); \
2648     SAVEFREEPV(and_withp)
2649
2650 /* this is a chain of data about sub patterns we are processing that
2651    need to be handled separately/specially in study_chunk. Its so
2652    we can simulate recursion without losing state.  */
2653 struct scan_frame;
2654 typedef struct scan_frame {
2655     regnode *last;  /* last node to process in this frame */
2656     regnode *next;  /* next node to process when last is reached */
2657     struct scan_frame *prev; /*previous frame*/
2658     I32 stop; /* what stopparen do we use */
2659 } scan_frame;
2660
2661
2662 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2663
2664 #define CASE_SYNST_FNC(nAmE)                                       \
2665 case nAmE:                                                         \
2666     if (flags & SCF_DO_STCLASS_AND) {                              \
2667             for (value = 0; value < 256; value++)                  \
2668                 if (!is_ ## nAmE ## _cp(value))                       \
2669                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2670     }                                                              \
2671     else {                                                         \
2672             for (value = 0; value < 256; value++)                  \
2673                 if (is_ ## nAmE ## _cp(value))                        \
2674                     ANYOF_BITMAP_SET(data->start_class, value);    \
2675     }                                                              \
2676     break;                                                         \
2677 case N ## nAmE:                                                    \
2678     if (flags & SCF_DO_STCLASS_AND) {                              \
2679             for (value = 0; value < 256; value++)                   \
2680                 if (is_ ## nAmE ## _cp(value))                         \
2681                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2682     }                                                               \
2683     else {                                                          \
2684             for (value = 0; value < 256; value++)                   \
2685                 if (!is_ ## nAmE ## _cp(value))                        \
2686                     ANYOF_BITMAP_SET(data->start_class, value);     \
2687     }                                                               \
2688     break
2689
2690
2691
2692 STATIC I32
2693 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2694                         I32 *minlenp, I32 *deltap,
2695                         regnode *last,
2696                         scan_data_t *data,
2697                         I32 stopparen,
2698                         U8* recursed,
2699                         struct regnode_charclass_class *and_withp,
2700                         U32 flags, U32 depth)
2701                         /* scanp: Start here (read-write). */
2702                         /* deltap: Write maxlen-minlen here. */
2703                         /* last: Stop before this one. */
2704                         /* data: string data about the pattern */
2705                         /* stopparen: treat close N as END */
2706                         /* recursed: which subroutines have we recursed into */
2707                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2708 {
2709     dVAR;
2710     I32 min = 0, pars = 0, code;
2711     regnode *scan = *scanp, *next;
2712     I32 delta = 0;
2713     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2714     int is_inf_internal = 0;            /* The studied chunk is infinite */
2715     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2716     scan_data_t data_fake;
2717     SV *re_trie_maxbuff = NULL;
2718     regnode *first_non_open = scan;
2719     I32 stopmin = I32_MAX;
2720     scan_frame *frame = NULL;
2721     GET_RE_DEBUG_FLAGS_DECL;
2722
2723     PERL_ARGS_ASSERT_STUDY_CHUNK;
2724
2725 #ifdef DEBUGGING
2726     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2727 #endif
2728
2729     if ( depth == 0 ) {
2730         while (first_non_open && OP(first_non_open) == OPEN)
2731             first_non_open=regnext(first_non_open);
2732     }
2733
2734
2735   fake_study_recurse:
2736     while ( scan && OP(scan) != END && scan < last ){
2737         /* Peephole optimizer: */
2738         DEBUG_STUDYDATA("Peep:", data,depth);
2739         DEBUG_PEEP("Peep",scan,depth);
2740         JOIN_EXACT(scan,&min,0);
2741
2742         /* Follow the next-chain of the current node and optimize
2743            away all the NOTHINGs from it.  */
2744         if (OP(scan) != CURLYX) {
2745             const int max = (reg_off_by_arg[OP(scan)]
2746                        ? I32_MAX
2747                        /* I32 may be smaller than U16 on CRAYs! */
2748                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2749             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2750             int noff;
2751             regnode *n = scan;
2752
2753             /* Skip NOTHING and LONGJMP. */
2754             while ((n = regnext(n))
2755                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2756                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2757                    && off + noff < max)
2758                 off += noff;
2759             if (reg_off_by_arg[OP(scan)])
2760                 ARG(scan) = off;
2761             else
2762                 NEXT_OFF(scan) = off;
2763         }
2764
2765
2766
2767         /* The principal pseudo-switch.  Cannot be a switch, since we
2768            look into several different things.  */
2769         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2770                    || OP(scan) == IFTHEN) {
2771             next = regnext(scan);
2772             code = OP(scan);
2773             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2774
2775             if (OP(next) == code || code == IFTHEN) {
2776                 /* NOTE - There is similar code to this block below for handling
2777                    TRIE nodes on a re-study.  If you change stuff here check there
2778                    too. */
2779                 I32 max1 = 0, min1 = I32_MAX, num = 0;
2780                 struct regnode_charclass_class accum;
2781                 regnode * const startbranch=scan;
2782
2783                 if (flags & SCF_DO_SUBSTR)
2784                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2785                 if (flags & SCF_DO_STCLASS)
2786                     cl_init_zero(pRExC_state, &accum);
2787
2788                 while (OP(scan) == code) {
2789                     I32 deltanext, minnext, f = 0, fake;
2790                     struct regnode_charclass_class this_class;
2791
2792                     num++;
2793                     data_fake.flags = 0;
2794                     if (data) {
2795                         data_fake.whilem_c = data->whilem_c;
2796                         data_fake.last_closep = data->last_closep;
2797                     }
2798                     else
2799                         data_fake.last_closep = &fake;
2800
2801                     data_fake.pos_delta = delta;
2802                     next = regnext(scan);
2803                     scan = NEXTOPER(scan);
2804                     if (code != BRANCH)
2805                         scan = NEXTOPER(scan);
2806                     if (flags & SCF_DO_STCLASS) {
2807                         cl_init(pRExC_state, &this_class);
2808                         data_fake.start_class = &this_class;
2809                         f = SCF_DO_STCLASS_AND;
2810                     }
2811                     if (flags & SCF_WHILEM_VISITED_POS)
2812                         f |= SCF_WHILEM_VISITED_POS;
2813
2814                     /* we suppose the run is continuous, last=next...*/
2815                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2816                                           next, &data_fake,
2817                                           stopparen, recursed, NULL, f,depth+1);
2818                     if (min1 > minnext)
2819                         min1 = minnext;
2820                     if (max1 < minnext + deltanext)
2821                         max1 = minnext + deltanext;
2822                     if (deltanext == I32_MAX)
2823                         is_inf = is_inf_internal = 1;
2824                     scan = next;
2825                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2826                         pars++;
2827                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
2828                         if ( stopmin > minnext)
2829                             stopmin = min + min1;
2830                         flags &= ~SCF_DO_SUBSTR;
2831                         if (data)
2832                             data->flags |= SCF_SEEN_ACCEPT;
2833                     }
2834                     if (data) {
2835                         if (data_fake.flags & SF_HAS_EVAL)
2836                             data->flags |= SF_HAS_EVAL;
2837                         data->whilem_c = data_fake.whilem_c;
2838                     }
2839                     if (flags & SCF_DO_STCLASS)
2840                         cl_or(pRExC_state, &accum, &this_class);
2841                 }
2842                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2843                     min1 = 0;
2844                 if (flags & SCF_DO_SUBSTR) {
2845                     data->pos_min += min1;
2846                     data->pos_delta += max1 - min1;
2847                     if (max1 != min1 || is_inf)
2848                         data->longest = &(data->longest_float);
2849                 }
2850                 min += min1;
2851                 delta += max1 - min1;
2852                 if (flags & SCF_DO_STCLASS_OR) {
2853                     cl_or(pRExC_state, data->start_class, &accum);
2854                     if (min1) {
2855                         cl_and(data->start_class, and_withp);
2856                         flags &= ~SCF_DO_STCLASS;
2857                     }
2858                 }
2859                 else if (flags & SCF_DO_STCLASS_AND) {
2860                     if (min1) {
2861                         cl_and(data->start_class, &accum);
2862                         flags &= ~SCF_DO_STCLASS;
2863                     }
2864                     else {
2865                         /* Switch to OR mode: cache the old value of
2866                          * data->start_class */
2867                         INIT_AND_WITHP;
2868                         StructCopy(data->start_class, and_withp,
2869                                    struct regnode_charclass_class);
2870                         flags &= ~SCF_DO_STCLASS_AND;
2871                         StructCopy(&accum, data->start_class,
2872                                    struct regnode_charclass_class);
2873                         flags |= SCF_DO_STCLASS_OR;
2874                         data->start_class->flags |= ANYOF_EOS;
2875                     }
2876                 }
2877
2878                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2879                 /* demq.
2880
2881                    Assuming this was/is a branch we are dealing with: 'scan' now
2882                    points at the item that follows the branch sequence, whatever
2883                    it is. We now start at the beginning of the sequence and look
2884                    for subsequences of
2885
2886                    BRANCH->EXACT=>x1
2887                    BRANCH->EXACT=>x2
2888                    tail
2889
2890                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
2891
2892                    If we can find such a subsequence we need to turn the first
2893                    element into a trie and then add the subsequent branch exact
2894                    strings to the trie.
2895
2896                    We have two cases
2897
2898                      1. patterns where the whole set of branches can be converted.
2899
2900                      2. patterns where only a subset can be converted.
2901
2902                    In case 1 we can replace the whole set with a single regop
2903                    for the trie. In case 2 we need to keep the start and end
2904                    branches so
2905
2906                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2907                      becomes BRANCH TRIE; BRANCH X;
2908
2909                   There is an additional case, that being where there is a
2910                   common prefix, which gets split out into an EXACT like node
2911                   preceding the TRIE node.
2912
2913                   If x(1..n)==tail then we can do a simple trie, if not we make
2914                   a "jump" trie, such that when we match the appropriate word
2915                   we "jump" to the appropriate tail node. Essentially we turn
2916                   a nested if into a case structure of sorts.
2917
2918                 */
2919
2920                     int made=0;
2921                     if (!re_trie_maxbuff) {
2922                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2923                         if (!SvIOK(re_trie_maxbuff))
2924                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2925                     }
2926                     if ( SvIV(re_trie_maxbuff)>=0  ) {
2927                         regnode *cur;
2928                         regnode *first = (regnode *)NULL;
2929                         regnode *last = (regnode *)NULL;
2930                         regnode *tail = scan;
2931                         U8 optype = 0;
2932                         U32 count=0;
2933
2934 #ifdef DEBUGGING
2935                         SV * const mysv = sv_newmortal();       /* for dumping */
2936 #endif
2937                         /* var tail is used because there may be a TAIL
2938                            regop in the way. Ie, the exacts will point to the
2939                            thing following the TAIL, but the last branch will
2940                            point at the TAIL. So we advance tail. If we
2941                            have nested (?:) we may have to move through several
2942                            tails.
2943                          */
2944
2945                         while ( OP( tail ) == TAIL ) {
2946                             /* this is the TAIL generated by (?:) */
2947                             tail = regnext( tail );
2948                         }
2949
2950
2951                         DEBUG_OPTIMISE_r({
2952                             regprop(RExC_rx, mysv, tail );
2953                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2954                                 (int)depth * 2 + 2, "",
2955                                 "Looking for TRIE'able sequences. Tail node is: ",
2956                                 SvPV_nolen_const( mysv )
2957                             );
2958                         });
2959
2960                         /*
2961
2962                            step through the branches, cur represents each
2963                            branch, noper is the first thing to be matched
2964                            as part of that branch and noper_next is the
2965                            regnext() of that node. if noper is an EXACT
2966                            and noper_next is the same as scan (our current
2967                            position in the regex) then the EXACT branch is
2968                            a possible optimization target. Once we have
2969                            two or more consecutive such branches we can
2970                            create a trie of the EXACT's contents and stich
2971                            it in place. If the sequence represents all of
2972                            the branches we eliminate the whole thing and
2973                            replace it with a single TRIE. If it is a
2974                            subsequence then we need to stitch it in. This
2975                            means the first branch has to remain, and needs
2976                            to be repointed at the item on the branch chain
2977                            following the last branch optimized. This could
2978                            be either a BRANCH, in which case the
2979                            subsequence is internal, or it could be the
2980                            item following the branch sequence in which
2981                            case the subsequence is at the end.
2982
2983                         */
2984
2985                         /* dont use tail as the end marker for this traverse */
2986                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2987                             regnode * const noper = NEXTOPER( cur );
2988 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2989                             regnode * const noper_next = regnext( noper );
2990 #endif
2991
2992                             DEBUG_OPTIMISE_r({
2993                                 regprop(RExC_rx, mysv, cur);
2994                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2995                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2996
2997                                 regprop(RExC_rx, mysv, noper);
2998                                 PerlIO_printf( Perl_debug_log, " -> %s",
2999                                     SvPV_nolen_const(mysv));
3000
3001                                 if ( noper_next ) {
3002                                   regprop(RExC_rx, mysv, noper_next );
3003                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3004                                     SvPV_nolen_const(mysv));
3005                                 }
3006                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3007                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3008                             });
3009                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
3010                                          : PL_regkind[ OP( noper ) ] == EXACT )
3011                                   || OP(noper) == NOTHING )
3012 #ifdef NOJUMPTRIE
3013                                   && noper_next == tail
3014 #endif
3015                                   && count < U16_MAX)
3016                             {
3017                                 count++;
3018                                 if ( !first || optype == NOTHING ) {
3019                                     if (!first) first = cur;
3020                                     optype = OP( noper );
3021                                 } else {
3022                                     last = cur;
3023                                 }
3024                             } else {
3025 /*
3026     Currently we do not believe that the trie logic can
3027     handle case insensitive matching properly when the
3028     pattern is not unicode (thus forcing unicode semantics).
3029
3030     If/when this is fixed the following define can be swapped
3031     in below to fully enable trie logic.
3032
3033     XXX It may work if not UTF and/or /a (AT_LEAST_UNI_SEMANTICS) but perhaps
3034     not /aa
3035
3036 #define TRIE_TYPE_IS_SAFE 1
3037
3038 */
3039 #define TRIE_TYPE_IS_SAFE ((UTF && UNI_SEMANTICS) || optype==EXACT)
3040
3041                                 if ( last && TRIE_TYPE_IS_SAFE ) {
3042                                     make_trie( pRExC_state,
3043                                             startbranch, first, cur, tail, count,
3044                                             optype, depth+1 );
3045                                 }
3046                                 if ( PL_regkind[ OP( noper ) ] == EXACT
3047 #ifdef NOJUMPTRIE
3048                                      && noper_next == tail
3049 #endif
3050                                 ){
3051                                     count = 1;
3052                                     first = cur;
3053                                     optype = OP( noper );
3054                                 } else {
3055                                     count = 0;
3056                                     first = NULL;
3057                                     optype = 0;
3058                                 }
3059                                 last = NULL;
3060                             }
3061                         }
3062                         DEBUG_OPTIMISE_r({
3063                             regprop(RExC_rx, mysv, cur);
3064                             PerlIO_printf( Perl_debug_log,
3065                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3066                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3067
3068                         });
3069
3070                         if ( last && TRIE_TYPE_IS_SAFE ) {
3071                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3072 #ifdef TRIE_STUDY_OPT
3073                             if ( ((made == MADE_EXACT_TRIE &&
3074                                  startbranch == first)
3075                                  || ( first_non_open == first )) &&
3076                                  depth==0 ) {
3077                                 flags |= SCF_TRIE_RESTUDY;
3078                                 if ( startbranch == first
3079                                      && scan == tail )
3080                                 {
3081                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3082                                 }
3083                             }
3084 #endif
3085                         }
3086                     }
3087
3088                 } /* do trie */
3089
3090             }
3091             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3092                 scan = NEXTOPER(NEXTOPER(scan));
3093             } else                      /* single branch is optimized. */
3094                 scan = NEXTOPER(scan);
3095             continue;
3096         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3097             scan_frame *newframe = NULL;
3098             I32 paren;
3099             regnode *start;
3100             regnode *end;
3101
3102             if (OP(scan) != SUSPEND) {
3103             /* set the pointer */
3104                 if (OP(scan) == GOSUB) {
3105                     paren = ARG(scan);
3106                     RExC_recurse[ARG2L(scan)] = scan;
3107                     start = RExC_open_parens[paren-1];
3108                     end   = RExC_close_parens[paren-1];
3109                 } else {
3110                     paren = 0;
3111                     start = RExC_rxi->program + 1;
3112                     end   = RExC_opend;
3113                 }
3114                 if (!recursed) {
3115                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3116                     SAVEFREEPV(recursed);
3117                 }
3118                 if (!PAREN_TEST(recursed,paren+1)) {
3119                     PAREN_SET(recursed,paren+1);
3120                     Newx(newframe,1,scan_frame);
3121                 } else {
3122                     if (flags & SCF_DO_SUBSTR) {
3123                         SCAN_COMMIT(pRExC_state,data,minlenp);
3124                         data->longest = &(data->longest_float);
3125                     }
3126                     is_inf = is_inf_internal = 1;
3127                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3128                         cl_anything(pRExC_state, data->start_class);
3129                     flags &= ~SCF_DO_STCLASS;
3130                 }
3131             } else {
3132                 Newx(newframe,1,scan_frame);
3133                 paren = stopparen;
3134                 start = scan+2;
3135                 end = regnext(scan);
3136             }
3137             if (newframe) {
3138                 assert(start);
3139                 assert(end);
3140                 SAVEFREEPV(newframe);
3141                 newframe->next = regnext(scan);
3142                 newframe->last = last;
3143                 newframe->stop = stopparen;
3144                 newframe->prev = frame;
3145
3146                 frame = newframe;
3147                 scan =  start;
3148                 stopparen = paren;
3149                 last = end;
3150
3151                 continue;
3152             }
3153         }
3154         else if (OP(scan) == EXACT) {
3155             I32 l = STR_LEN(scan);
3156             UV uc;
3157             if (UTF) {
3158                 const U8 * const s = (U8*)STRING(scan);
3159                 l = utf8_length(s, s + l);
3160                 uc = utf8_to_uvchr(s, NULL);
3161             } else {
3162                 uc = *((U8*)STRING(scan));
3163             }
3164             min += l;
3165             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3166                 /* The code below prefers earlier match for fixed
3167                    offset, later match for variable offset.  */
3168                 if (data->last_end == -1) { /* Update the start info. */
3169                     data->last_start_min = data->pos_min;
3170                     data->last_start_max = is_inf
3171                         ? I32_MAX : data->pos_min + data->pos_delta;
3172                 }
3173                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3174                 if (UTF)
3175                     SvUTF8_on(data->last_found);
3176                 {
3177                     SV * const sv = data->last_found;
3178                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3179                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3180                     if (mg && mg->mg_len >= 0)
3181                         mg->mg_len += utf8_length((U8*)STRING(scan),
3182                                                   (U8*)STRING(scan)+STR_LEN(scan));
3183                 }
3184                 data->last_end = data->pos_min + l;
3185                 data->pos_min += l; /* As in the first entry. */
3186                 data->flags &= ~SF_BEFORE_EOL;
3187             }
3188             if (flags & SCF_DO_STCLASS_AND) {
3189                 /* Check whether it is compatible with what we know already! */
3190                 int compat = 1;
3191
3192
3193                 /* If compatible, we or it in below.  It is compatible if is
3194                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3195                  * it's for a locale.  Even if there isn't unicode semantics
3196                  * here, at runtime there may be because of matching against a
3197                  * utf8 string, so accept a possible false positive for
3198                  * latin1-range folds */
3199                 if (uc >= 0x100 ||
3200                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3201                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3202                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3203                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3204                     )
3205                 {
3206                     compat = 0;
3207                 }
3208                 ANYOF_CLASS_ZERO(data->start_class);
3209                 ANYOF_BITMAP_ZERO(data->start_class);
3210                 if (compat)
3211                     ANYOF_BITMAP_SET(data->start_class, uc);
3212                 else if (uc >= 0x100) {
3213                     int i;
3214
3215                     /* Some Unicode code points fold to the Latin1 range; as
3216                      * XXX temporary code, instead of figuring out if this is
3217                      * one, just assume it is and set all the start class bits
3218                      * that could be some such above 255 code point's fold
3219                      * which will generate fals positives.  As the code
3220                      * elsewhere that does compute the fold settles down, it
3221                      * can be extracted out and re-used here */
3222                     for (i = 0; i < 256; i++){
3223                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3224                             ANYOF_BITMAP_SET(data->start_class, i);
3225                         }
3226                     }
3227                 }
3228                 data->start_class->flags &= ~ANYOF_EOS;
3229                 if (uc < 0x100)
3230                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3231             }
3232             else if (flags & SCF_DO_STCLASS_OR) {
3233                 /* false positive possible if the class is case-folded */
3234                 if (uc < 0x100)
3235                     ANYOF_BITMAP_SET(data->start_class, uc);
3236                 else
3237                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3238                 data->start_class->flags &= ~ANYOF_EOS;
3239                 cl_and(data->start_class, and_withp);
3240             }
3241             flags &= ~SCF_DO_STCLASS;
3242         }
3243         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3244             I32 l = STR_LEN(scan);
3245             UV uc = *((U8*)STRING(scan));
3246
3247             /* Search for fixed substrings supports EXACT only. */
3248             if (flags & SCF_DO_SUBSTR) {
3249                 assert(data);
3250                 SCAN_COMMIT(pRExC_state, data, minlenp);
3251             }
3252             if (UTF) {
3253                 const U8 * const s = (U8 *)STRING(scan);
3254                 l = utf8_length(s, s + l);
3255                 uc = utf8_to_uvchr(s, NULL);
3256             }
3257             min += l;
3258             if (flags & SCF_DO_SUBSTR)
3259                 data->pos_min += l;
3260             if (flags & SCF_DO_STCLASS_AND) {
3261                 /* Check whether it is compatible with what we know already! */
3262                 int compat = 1;
3263                 if (uc >= 0x100 ||
3264                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3265                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3266                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3267                 {
3268                     compat = 0;
3269                 }
3270                 ANYOF_CLASS_ZERO(data->start_class);
3271                 ANYOF_BITMAP_ZERO(data->start_class);
3272                 if (compat) {
3273                     ANYOF_BITMAP_SET(data->start_class, uc);
3274                     data->start_class->flags &= ~ANYOF_EOS;
3275                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3276                     if (OP(scan) == EXACTFL) {
3277                         /* XXX This set is probably no longer necessary, and
3278                          * probably wrong as LOCALE now is on in the initial
3279                          * state */
3280                         data->start_class->flags |= ANYOF_LOCALE;
3281                     }
3282                     else {
3283
3284                         /* Also set the other member of the fold pair.  In case
3285                          * that unicode semantics is called for at runtime, use
3286                          * the full latin1 fold.  (Can't do this for locale,
3287                          * because not known until runtime */
3288                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3289                     }
3290                 }
3291                 else if (uc >= 0x100) {
3292                     int i;
3293                     for (i = 0; i < 256; i++){
3294                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3295                             ANYOF_BITMAP_SET(data->start_class, i);
3296                         }
3297                     }
3298                 }
3299             }
3300             else if (flags & SCF_DO_STCLASS_OR) {
3301                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3302                     /* false positive possible if the class is case-folded.
3303                        Assume that the locale settings are the same... */
3304                     if (uc < 0x100) {
3305                         ANYOF_BITMAP_SET(data->start_class, uc);
3306                         if (OP(scan) != EXACTFL) {
3307
3308                             /* And set the other member of the fold pair, but
3309                              * can't do that in locale because not known until
3310                              * run-time */
3311                             ANYOF_BITMAP_SET(data->start_class,
3312                                              PL_fold_latin1[uc]);
3313                         }
3314                     }
3315                     data->start_class->flags &= ~ANYOF_EOS;
3316                 }
3317                 cl_and(data->start_class, and_withp);
3318             }
3319             flags &= ~SCF_DO_STCLASS;
3320         }
3321         else if (REGNODE_VARIES(OP(scan))) {
3322             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3323             I32 f = flags, pos_before = 0;
3324             regnode * const oscan = scan;
3325             struct regnode_charclass_class this_class;
3326             struct regnode_charclass_class *oclass = NULL;
3327             I32 next_is_eval = 0;
3328
3329             switch (PL_regkind[OP(scan)]) {
3330             case WHILEM:                /* End of (?:...)* . */
3331                 scan = NEXTOPER(scan);
3332                 goto finish;
3333             case PLUS:
3334                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3335                     next = NEXTOPER(scan);
3336                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3337                         mincount = 1;
3338                         maxcount = REG_INFTY;
3339                         next = regnext(scan);
3340                         scan = NEXTOPER(scan);
3341                         goto do_curly;
3342                     }
3343                 }
3344                 if (flags & SCF_DO_SUBSTR)
3345                     data->pos_min++;
3346                 min++;
3347                 /* Fall through. */
3348             case STAR:
3349                 if (flags & SCF_DO_STCLASS) {
3350                     mincount = 0;
3351                     maxcount = REG_INFTY;
3352                     next = regnext(scan);
3353                     scan = NEXTOPER(scan);
3354                     goto do_curly;
3355                 }
3356                 is_inf = is_inf_internal = 1;
3357                 scan = regnext(scan);
3358                 if (flags & SCF_DO_SUBSTR) {
3359                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3360                     data->longest = &(data->longest_float);
3361                 }
3362                 goto optimize_curly_tail;
3363             case CURLY:
3364                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3365                     && (scan->flags == stopparen))
3366                 {
3367                     mincount = 1;
3368                     maxcount = 1;
3369                 } else {
3370                     mincount = ARG1(scan);
3371                     maxcount = ARG2(scan);
3372                 }
3373                 next = regnext(scan);
3374                 if (OP(scan) == CURLYX) {
3375                     I32 lp = (data ? *(data->last_closep) : 0);
3376                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3377                 }
3378                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3379                 next_is_eval = (OP(scan) == EVAL);
3380               do_curly:
3381                 if (flags & SCF_DO_SUBSTR) {
3382                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3383                     pos_before = data->pos_min;
3384                 }
3385                 if (data) {
3386                     fl = data->flags;
3387                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3388                     if (is_inf)
3389                         data->flags |= SF_IS_INF;
3390                 }
3391                 if (flags & SCF_DO_STCLASS) {
3392                     cl_init(pRExC_state, &this_class);
3393                     oclass = data->start_class;
3394                     data->start_class = &this_class;
3395                     f |= SCF_DO_STCLASS_AND;
3396                     f &= ~SCF_DO_STCLASS_OR;
3397                 }
3398                 /* Exclude from super-linear cache processing any {n,m}
3399                    regops for which the combination of input pos and regex
3400                    pos is not enough information to determine if a match
3401                    will be possible.
3402
3403                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3404                    regex pos at the \s*, the prospects for a match depend not
3405                    only on the input position but also on how many (bar\s*)
3406                    repeats into the {4,8} we are. */
3407                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3408                     f &= ~SCF_WHILEM_VISITED_POS;
3409
3410                 /* This will finish on WHILEM, setting scan, or on NULL: */
3411                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3412                                       last, data, stopparen, recursed, NULL,
3413                                       (mincount == 0
3414                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3415
3416                 if (flags & SCF_DO_STCLASS)
3417                     data->start_class = oclass;
3418                 if (mincount == 0 || minnext == 0) {
3419                     if (flags & SCF_DO_STCLASS_OR) {
3420                         cl_or(pRExC_state, data->start_class, &this_class);
3421                     }
3422                     else if (flags & SCF_DO_STCLASS_AND) {
3423                         /* Switch to OR mode: cache the old value of
3424                          * data->start_class */
3425                         INIT_AND_WITHP;
3426                         StructCopy(data->start_class, and_withp,
3427                                    struct regnode_charclass_class);
3428                         flags &= ~SCF_DO_STCLASS_AND;
3429                         StructCopy(&this_class, data->start_class,
3430                                    struct regnode_charclass_class);
3431                         flags |= SCF_DO_STCLASS_OR;
3432                         data->start_class->flags |= ANYOF_EOS;
3433                     }
3434                 } else {                /* Non-zero len */
3435                     if (flags & SCF_DO_STCLASS_OR) {
3436                         cl_or(pRExC_state, data->start_class, &this_class);
3437                         cl_and(data->start_class, and_withp);
3438                     }
3439                     else if (flags & SCF_DO_STCLASS_AND)
3440                         cl_and(data->start_class, &this_class);
3441                     flags &= ~SCF_DO_STCLASS;
3442                 }
3443                 if (!scan)              /* It was not CURLYX, but CURLY. */
3444                     scan = next;
3445                 if ( /* ? quantifier ok, except for (?{ ... }) */
3446                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3447                     && (minnext == 0) && (deltanext == 0)
3448                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3449                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3450                 {
3451                     ckWARNreg(RExC_parse,
3452                               "Quantifier unexpected on zero-length expression");
3453                 }
3454
3455                 min += minnext * mincount;
3456                 is_inf_internal |= ((maxcount == REG_INFTY
3457                                      && (minnext + deltanext) > 0)
3458                                     || deltanext == I32_MAX);
3459                 is_inf |= is_inf_internal;
3460                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3461
3462                 /* Try powerful optimization CURLYX => CURLYN. */
3463                 if (  OP(oscan) == CURLYX && data
3464                       && data->flags & SF_IN_PAR
3465                       && !(data->flags & SF_HAS_EVAL)
3466                       && !deltanext && minnext == 1 ) {
3467                     /* Try to optimize to CURLYN.  */
3468                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3469                     regnode * const nxt1 = nxt;
3470 #ifdef DEBUGGING
3471                     regnode *nxt2;
3472 #endif
3473
3474                     /* Skip open. */
3475                     nxt = regnext(nxt);
3476                     if (!REGNODE_SIMPLE(OP(nxt))
3477                         && !(PL_regkind[OP(nxt)] == EXACT
3478                              && STR_LEN(nxt) == 1))
3479                         goto nogo;
3480 #ifdef DEBUGGING
3481                     nxt2 = nxt;
3482 #endif
3483                     nxt = regnext(nxt);
3484                     if (OP(nxt) != CLOSE)
3485                         goto nogo;
3486                     if (RExC_open_parens) {
3487                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3488                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3489                     }
3490                     /* Now we know that nxt2 is the only contents: */
3491                     oscan->flags = (U8)ARG(nxt);
3492                     OP(oscan) = CURLYN;
3493                     OP(nxt1) = NOTHING; /* was OPEN. */
3494
3495 #ifdef DEBUGGING
3496                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3497                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3498                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3499                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3500                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3501                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3502 #endif
3503                 }
3504               nogo:
3505
3506                 /* Try optimization CURLYX => CURLYM. */
3507                 if (  OP(oscan) == CURLYX && data
3508                       && !(data->flags & SF_HAS_PAR)
3509                       && !(data->flags & SF_HAS_EVAL)
3510                       && !deltanext     /* atom is fixed width */
3511                       && minnext != 0   /* CURLYM can't handle zero width */
3512                 ) {
3513                     /* XXXX How to optimize if data == 0? */
3514                     /* Optimize to a simpler form.  */
3515                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3516                     regnode *nxt2;
3517
3518                     OP(oscan) = CURLYM;
3519                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3520                             && (OP(nxt2) != WHILEM))
3521                         nxt = nxt2;
3522                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3523                     /* Need to optimize away parenths. */
3524                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3525                         /* Set the parenth number.  */
3526                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3527
3528                         oscan->flags = (U8)ARG(nxt);
3529                         if (RExC_open_parens) {
3530                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3531                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3532                         }
3533                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3534                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3535
3536 #ifdef DEBUGGING
3537                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3538                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3539                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3540                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3541 #endif
3542 #if 0
3543                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3544                             regnode *nnxt = regnext(nxt1);
3545                             if (nnxt == nxt) {
3546                                 if (reg_off_by_arg[OP(nxt1)])
3547                                     ARG_SET(nxt1, nxt2 - nxt1);
3548                                 else if (nxt2 - nxt1 < U16_MAX)
3549                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3550                                 else
3551                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3552                             }
3553                             nxt1 = nnxt;
3554                         }
3555 #endif
3556                         /* Optimize again: */
3557                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3558                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3559                     }
3560                     else
3561                         oscan->flags = 0;
3562                 }
3563                 else if ((OP(oscan) == CURLYX)
3564                          && (flags & SCF_WHILEM_VISITED_POS)
3565                          /* See the comment on a similar expression above.
3566                             However, this time it's not a subexpression
3567                             we care about, but the expression itself. */
3568                          && (maxcount == REG_INFTY)
3569                          && data && ++data->whilem_c < 16) {
3570                     /* This stays as CURLYX, we can put the count/of pair. */
3571                     /* Find WHILEM (as in regexec.c) */
3572                     regnode *nxt = oscan + NEXT_OFF(oscan);
3573
3574                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3575                         nxt += ARG(nxt);
3576                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3577                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3578                 }
3579                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3580                     pars++;
3581                 if (flags & SCF_DO_SUBSTR) {
3582                     SV *last_str = NULL;
3583                     int counted = mincount != 0;
3584
3585                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3586 #if defined(SPARC64_GCC_WORKAROUND)
3587                         I32 b = 0;
3588                         STRLEN l = 0;
3589                         const char *s = NULL;
3590                         I32 old = 0;
3591
3592                         if (pos_before >= data->last_start_min)
3593                             b = pos_before;
3594                         else
3595                             b = data->last_start_min;
3596
3597                         l = 0;
3598                         s = SvPV_const(data->last_found, l);
3599                         old = b - data->last_start_min;
3600
3601 #else
3602                         I32 b = pos_before >= data->last_start_min
3603                             ? pos_before : data->last_start_min;
3604                         STRLEN l;
3605                         const char * const s = SvPV_const(data->last_found, l);
3606                         I32 old = b - data->last_start_min;
3607 #endif
3608
3609                         if (UTF)
3610                             old = utf8_hop((U8*)s, old) - (U8*)s;
3611                         l -= old;
3612                         /* Get the added string: */
3613                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3614                         if (deltanext == 0 && pos_before == b) {
3615                             /* What was added is a constant string */
3616                             if (mincount > 1) {
3617                                 SvGROW(last_str, (mincount * l) + 1);
3618                                 repeatcpy(SvPVX(last_str) + l,
3619                                           SvPVX_const(last_str), l, mincount - 1);
3620                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3621                                 /* Add additional parts. */
3622                                 SvCUR_set(data->last_found,
3623                                           SvCUR(data->last_found) - l);
3624                                 sv_catsv(data->last_found, last_str);
3625                                 {
3626                                     SV * sv = data->last_found;
3627                                     MAGIC *mg =
3628                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3629                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3630                                     if (mg && mg->mg_len >= 0)
3631                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3632                                 }
3633                                 data->last_end += l * (mincount - 1);
3634                             }
3635                         } else {
3636                             /* start offset must point into the last copy */
3637                             data->last_start_min += minnext * (mincount - 1);
3638                             data->last_start_max += is_inf ? I32_MAX
3639                                 : (maxcount - 1) * (minnext + data->pos_delta);
3640                         }
3641                     }
3642                     /* It is counted once already... */
3643                     data->pos_min += minnext * (mincount - counted);
3644                     data->pos_delta += - counted * deltanext +
3645                         (minnext + deltanext) * maxcount - minnext * mincount;
3646                     if (mincount != maxcount) {
3647                          /* Cannot extend fixed substrings found inside
3648                             the group.  */
3649                         SCAN_COMMIT(pRExC_state,data,minlenp);
3650                         if (mincount && last_str) {
3651                             SV * const sv = data->last_found;
3652                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3653                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3654
3655                             if (mg)
3656                                 mg->mg_len = -1;
3657                             sv_setsv(sv, last_str);
3658                             data->last_end = data->pos_min;
3659                             data->last_start_min =
3660                                 data->pos_min - CHR_SVLEN(last_str);
3661                             data->last_start_max = is_inf
3662                                 ? I32_MAX
3663                                 : data->pos_min + data->pos_delta
3664                                 - CHR_SVLEN(last_str);
3665                         }
3666                         data->longest = &(data->longest_float);
3667                     }
3668                     SvREFCNT_dec(last_str);
3669                 }
3670                 if (data && (fl & SF_HAS_EVAL))
3671                     data->flags |= SF_HAS_EVAL;
3672               optimize_curly_tail:
3673                 if (OP(oscan) != CURLYX) {
3674                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3675                            && NEXT_OFF(next))
3676                         NEXT_OFF(oscan) += NEXT_OFF(next);
3677                 }
3678                 continue;
3679             default:                    /* REF, ANYOFV, and CLUMP only? */
3680                 if (flags & SCF_DO_SUBSTR) {
3681                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3682                     data->longest = &(data->longest_float);
3683                 }
3684                 is_inf = is_inf_internal = 1;
3685                 if (flags & SCF_DO_STCLASS_OR)
3686                     cl_anything(pRExC_state, data->start_class);
3687                 flags &= ~SCF_DO_STCLASS;
3688                 break;
3689             }
3690         }
3691         else if (OP(scan) == LNBREAK) {
3692             if (flags & SCF_DO_STCLASS) {
3693                 int value = 0;
3694                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3695                 if (flags & SCF_DO_STCLASS_AND) {
3696                     for (value = 0; value < 256; value++)
3697                         if (!is_VERTWS_cp(value))
3698                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3699                 }
3700                 else {
3701                     for (value = 0; value < 256; value++)
3702                         if (is_VERTWS_cp(value))
3703                             ANYOF_BITMAP_SET(data->start_class, value);
3704                 }
3705                 if (flags & SCF_DO_STCLASS_OR)
3706                     cl_and(data->start_class, and_withp);
3707                 flags &= ~SCF_DO_STCLASS;
3708             }
3709             min += 1;
3710             delta += 1;
3711             if (flags & SCF_DO_SUBSTR) {
3712                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3713                 data->pos_min += 1;
3714                 data->pos_delta += 1;
3715                 data->longest = &(data->longest_float);
3716             }
3717         }
3718         else if (OP(scan) == FOLDCHAR) {
3719             int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3720             flags &= ~SCF_DO_STCLASS;
3721             min += 1;
3722             delta += d;
3723             if (flags & SCF_DO_SUBSTR) {
3724                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
3725                 data->pos_min += 1;
3726                 data->pos_delta += d;
3727                 data->longest = &(data->longest_float);
3728             }
3729         }
3730         else if (REGNODE_SIMPLE(OP(scan))) {
3731             int value = 0;
3732
3733             if (flags & SCF_DO_SUBSTR) {
3734                 SCAN_COMMIT(pRExC_state,data,minlenp);
3735                 data->pos_min++;
3736             }
3737             min++;
3738             if (flags & SCF_DO_STCLASS) {
3739                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3740
3741                 /* Some of the logic below assumes that switching
3742                    locale on will only add false positives. */
3743                 switch (PL_regkind[OP(scan)]) {
3744                 case SANY:
3745                 default:
3746                   do_default:
3747                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3748                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3749                         cl_anything(pRExC_state, data->start_class);
3750                     break;
3751                 case REG_ANY:
3752                     if (OP(scan) == SANY)
3753                         goto do_default;
3754                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3755                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3756                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3757                         cl_anything(pRExC_state, data->start_class);
3758                     }
3759                     if (flags & SCF_DO_STCLASS_AND || !value)
3760                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3761                     break;
3762                 case ANYOF:
3763                     if (flags & SCF_DO_STCLASS_AND)
3764                         cl_and(data->start_class,
3765                                (struct regnode_charclass_class*)scan);
3766                     else
3767                         cl_or(pRExC_state, data->start_class,
3768                               (struct regnode_charclass_class*)scan);
3769                     break;
3770                 case ALNUM:
3771                     if (flags & SCF_DO_STCLASS_AND) {
3772                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3773                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3774                             if (OP(scan) == ALNUMU) {
3775                                 for (value = 0; value < 256; value++) {
3776                                     if (!isWORDCHAR_L1(value)) {
3777                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3778                                     }
3779                                 }
3780                             } else {
3781                                 for (value = 0; value < 256; value++) {
3782                                     if (!isALNUM(value)) {
3783                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3784                                     }
3785                                 }
3786                             }
3787                         }
3788                     }
3789                     else {
3790                         if (data->start_class->flags & ANYOF_LOCALE)
3791                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3792
3793                         /* Even if under locale, set the bits for non-locale
3794                          * in case it isn't a true locale-node.  This will
3795                          * create false positives if it truly is locale */
3796                         if (OP(scan) == ALNUMU) {
3797                             for (value = 0; value < 256; value++) {
3798                                 if (isWORDCHAR_L1(value)) {
3799                                     ANYOF_BITMAP_SET(data->start_class, value);
3800                                 }
3801                             }
3802                         } else {
3803                             for (value = 0; value < 256; value++) {
3804                                 if (isALNUM(value)) {
3805                                     ANYOF_BITMAP_SET(data->start_class, value);
3806                                 }
3807                             }
3808                         }
3809                     }
3810                     break;
3811                 case NALNUM:
3812                     if (flags & SCF_DO_STCLASS_AND) {
3813                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3814                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3815                             if (OP(scan) == NALNUMU) {
3816                                 for (value = 0; value < 256; value++) {
3817                                     if (isWORDCHAR_L1(value)) {
3818                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3819                                     }
3820                                 }
3821                             } else {
3822                                 for (value = 0; value < 256; value++) {
3823                                     if (isALNUM(value)) {
3824                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3825                                     }
3826                                 }
3827                             }
3828                         }
3829                     }
3830                     else {
3831                         if (data->start_class->flags & ANYOF_LOCALE)
3832                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3833
3834                         /* Even if under locale, set the bits for non-locale in
3835                          * case it isn't a true locale-node.  This will create
3836                          * false positives if it truly is locale */
3837                         if (OP(scan) == NALNUMU) {
3838                             for (value = 0; value < 256; value++) {
3839                                 if (! isWORDCHAR_L1(value)) {
3840                                     ANYOF_BITMAP_SET(data->start_class, value);
3841                                 }
3842                             }
3843                         } else {
3844                             for (value = 0; value < 256; value++) {
3845                                 if (! isALNUM(value)) {
3846                                     ANYOF_BITMAP_SET(data->start_class, value);
3847                                 }
3848                             }
3849                         }
3850                     }
3851                     break;
3852                 case SPACE:
3853                     if (flags & SCF_DO_STCLASS_AND) {
3854                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3855                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3856                             if (OP(scan) == SPACEU) {
3857                                 for (value = 0; value < 256; value++) {
3858                                     if (!isSPACE_L1(value)) {
3859                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3860                                     }
3861                                 }
3862                             } else {
3863                                 for (value = 0; value < 256; value++) {
3864                                     if (!isSPACE(value)) {
3865                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3866                                     }
3867                                 }
3868                             }
3869                         }
3870                     }
3871                     else {
3872                         if (data->start_class->flags & ANYOF_LOCALE) {
3873                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3874                         }
3875                         if (OP(scan) == SPACEU) {
3876                             for (value = 0; value < 256; value++) {
3877                                 if (isSPACE_L1(value)) {
3878                                     ANYOF_BITMAP_SET(data->start_class, value);
3879                                 }
3880                             }
3881                         } else {
3882                             for (value = 0; value < 256; value++) {
3883                                 if (isSPACE(value)) {
3884                                     ANYOF_BITMAP_SET(data->start_class, value);
3885                                 }
3886                             }
3887                         }
3888                     }
3889                     break;
3890                 case NSPACE:
3891                     if (flags & SCF_DO_STCLASS_AND) {
3892                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3893                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3894                             if (OP(scan) == NSPACEU) {
3895                                 for (value = 0; value < 256; value++) {
3896                                     if (isSPACE_L1(value)) {
3897                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3898                                     }
3899                                 }
3900                             } else {
3901                                 for (value = 0; value < 256; value++) {
3902                                     if (isSPACE(value)) {
3903                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
3904                                     }
3905                                 }
3906                             }
3907                         }
3908                     }
3909                     else {
3910                         if (data->start_class->flags & ANYOF_LOCALE)
3911                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3912                         if (OP(scan) == NSPACEU) {
3913                             for (value = 0; value < 256; value++) {
3914                                 if (!isSPACE_L1(value)) {
3915                                     ANYOF_BITMAP_SET(data->start_class, value);
3916                                 }
3917                             }
3918                         }
3919                         else {
3920                             for (value = 0; value < 256; value++) {
3921                                 if (!isSPACE(value)) {
3922                                     ANYOF_BITMAP_SET(data->start_class, value);
3923                                 }
3924                             }
3925                         }
3926                     }
3927                     break;
3928                 case DIGIT:
3929                     if (flags & SCF_DO_STCLASS_AND) {
3930                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
3931                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3932                             for (value = 0; value < 256; value++)
3933                                 if (!isDIGIT(value))
3934                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
3935                         }
3936                     }
3937                     else {
3938                         if (data->start_class->flags & ANYOF_LOCALE)
3939                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3940                         for (value = 0; value < 256; value++)
3941                             if (isDIGIT(value))
3942                                 ANYOF_BITMAP_SET(data->start_class, value);
3943                     }
3944                     break;
3945                 case NDIGIT:
3946                     if (flags & SCF_DO_STCLASS_AND) {
3947                         if (!(data->start_class->flags & ANYOF_LOCALE))
3948                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3949                         for (value = 0; value < 256; value++)
3950                             if (isDIGIT(value))
3951                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
3952                     }
3953                     else {
3954                         if (data->start_class->flags & ANYOF_LOCALE)
3955                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3956                         for (value = 0; value < 256; value++)
3957                             if (!isDIGIT(value))
3958                                 ANYOF_BITMAP_SET(data->start_class, value);
3959                     }
3960                     break;
3961                 CASE_SYNST_FNC(VERTWS);
3962                 CASE_SYNST_FNC(HORIZWS);
3963
3964                 }
3965                 if (flags & SCF_DO_STCLASS_OR)
3966                     cl_and(data->start_class, and_withp);
3967                 flags &= ~SCF_DO_STCLASS;
3968             }
3969         }
3970         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3971             data->flags |= (OP(scan) == MEOL
3972                             ? SF_BEFORE_MEOL
3973                             : SF_BEFORE_SEOL);
3974         }
3975         else if (  PL_regkind[OP(scan)] == BRANCHJ
3976                  /* Lookbehind, or need to calculate parens/evals/stclass: */
3977                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
3978                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3979             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3980                 || OP(scan) == UNLESSM )
3981             {
3982                 /* Negative Lookahead/lookbehind
3983                    In this case we can't do fixed string optimisation.
3984                 */
3985
3986                 I32 deltanext, minnext, fake = 0;
3987                 regnode *nscan;
3988                 struct regnode_charclass_class intrnl;
3989                 int f = 0;
3990
3991                 data_fake.flags = 0;
3992                 if (data) {
3993                     data_fake.whilem_c = data->whilem_c;
3994                     data_fake.last_closep = data->last_closep;
3995                 }
3996                 else
3997                     data_fake.last_closep = &fake;
3998                 data_fake.pos_delta = delta;
3999                 if ( flags & SCF_DO_STCLASS && !scan->flags
4000                      && OP(scan) == IFMATCH ) { /* Lookahead */
4001                     cl_init(pRExC_state, &intrnl);
4002                     data_fake.start_class = &intrnl;
4003                     f |= SCF_DO_STCLASS_AND;
4004                 }
4005                 if (flags & SCF_WHILEM_VISITED_POS)
4006                     f |= SCF_WHILEM_VISITED_POS;
4007                 next = regnext(scan);
4008                 nscan = NEXTOPER(NEXTOPER(scan));
4009                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4010                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4011                 if (scan->flags) {
4012                     if (deltanext) {
4013                         FAIL("Variable length lookbehind not implemented");
4014                     }
4015                     else if (minnext > (I32)U8_MAX) {
4016                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4017                     }
4018                     scan->flags = (U8)minnext;
4019                 }
4020                 if (data) {
4021                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4022                         pars++;
4023                     if (data_fake.flags & SF_HAS_EVAL)
4024                         data->flags |= SF_HAS_EVAL;
4025                     data->whilem_c = data_fake.whilem_c;
4026                 }
4027                 if (f & SCF_DO_STCLASS_AND) {
4028                     if (flags & SCF_DO_STCLASS_OR) {
4029                         /* OR before, AND after: ideally we would recurse with
4030                          * data_fake to get the AND applied by study of the
4031                          * remainder of the pattern, and then derecurse;
4032                          * *** HACK *** for now just treat as "no information".
4033                          * See [perl #56690].
4034                          */
4035                         cl_init(pRExC_state, data->start_class);
4036                     }  else {
4037                         /* AND before and after: combine and continue */
4038                         const int was = (data->start_class->flags & ANYOF_EOS);
4039
4040                         cl_and(data->start_class, &intrnl);
4041                         if (was)
4042                             data->start_class->flags |= ANYOF_EOS;
4043                     }
4044                 }
4045             }
4046 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4047             else {
4048                 /* Positive Lookahead/lookbehind
4049                    In this case we can do fixed string optimisation,
4050                    but we must be careful about it. Note in the case of
4051                    lookbehind the positions will be offset by the minimum
4052                    length of the pattern, something we won't know about
4053                    until after the recurse.
4054                 */
4055                 I32 deltanext, fake = 0;
4056                 regnode *nscan;
4057                 struct regnode_charclass_class intrnl;
4058                 int f = 0;
4059                 /* We use SAVEFREEPV so that when the full compile
4060                     is finished perl will clean up the allocated
4061                     minlens when it's all done. This way we don't
4062                     have to worry about freeing them when we know
4063                     they wont be used, which would be a pain.
4064                  */
4065                 I32 *minnextp;
4066                 Newx( minnextp, 1, I32 );
4067                 SAVEFREEPV(minnextp);
4068
4069                 if (data) {
4070                     StructCopy(data, &data_fake, scan_data_t);
4071                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4072                         f |= SCF_DO_SUBSTR;
4073                         if (scan->flags)
4074                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4075                         data_fake.last_found=newSVsv(data->last_found);
4076                     }
4077                 }
4078                 else
4079                     data_fake.last_closep = &fake;
4080                 data_fake.flags = 0;
4081                 data_fake.pos_delta = delta;
4082                 if (is_inf)
4083                     data_fake.flags |= SF_IS_INF;
4084                 if ( flags & SCF_DO_STCLASS && !scan->flags
4085                      && OP(scan) == IFMATCH ) { /* Lookahead */
4086                     cl_init(pRExC_state, &intrnl);
4087                     data_fake.start_class = &intrnl;
4088                     f |= SCF_DO_STCLASS_AND;
4089                 }
4090                 if (flags & SCF_WHILEM_VISITED_POS)
4091                     f |= SCF_WHILEM_VISITED_POS;
4092                 next = regnext(scan);
4093                 nscan = NEXTOPER(NEXTOPER(scan));
4094
4095                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4096                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4097                 if (scan->flags) {
4098                     if (deltanext) {
4099                         FAIL("Variable length lookbehind not implemented");
4100                     }
4101                     else if (*minnextp > (I32)U8_MAX) {
4102                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4103                     }
4104                     scan->flags = (U8)*minnextp;
4105                 }
4106
4107                 *minnextp += min;
4108
4109                 if (f & SCF_DO_STCLASS_AND) {
4110                     const int was = (data->start_class->flags & ANYOF_EOS);
4111
4112                     cl_and(data->start_class, &intrnl);
4113                     if (was)
4114                         data->start_class->flags |= ANYOF_EOS;
4115                 }
4116                 if (data) {
4117                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4118                         pars++;
4119                     if (data_fake.flags & SF_HAS_EVAL)
4120                         data->flags |= SF_HAS_EVAL;
4121                     data->whilem_c = data_fake.whilem_c;
4122                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4123                         if (RExC_rx->minlen<*minnextp)
4124                             RExC_rx->minlen=*minnextp;
4125                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4126                         SvREFCNT_dec(data_fake.last_found);
4127
4128                         if ( data_fake.minlen_fixed != minlenp )
4129                         {
4130                             data->offset_fixed= data_fake.offset_fixed;
4131                             data->minlen_fixed= data_fake.minlen_fixed;
4132                             data->lookbehind_fixed+= scan->flags;
4133                         }
4134                         if ( data_fake.minlen_float != minlenp )
4135                         {
4136                             data->minlen_float= data_fake.minlen_float;
4137                             data->offset_float_min=data_fake.offset_float_min;
4138                             data->offset_float_max=data_fake.offset_float_max;
4139                             data->lookbehind_float+= scan->flags;
4140                         }
4141                     }
4142                 }
4143
4144
4145             }
4146 #endif
4147         }
4148         else if (OP(scan) == OPEN) {
4149             if (stopparen != (I32)ARG(scan))
4150                 pars++;
4151         }
4152         else if (OP(scan) == CLOSE) {
4153             if (stopparen == (I32)ARG(scan)) {
4154                 break;
4155             }
4156             if ((I32)ARG(scan) == is_par) {
4157                 next = regnext(scan);
4158
4159                 if ( next && (OP(next) != WHILEM) && next < last)
4160                     is_par = 0;         /* Disable optimization */
4161             }
4162             if (data)
4163                 *(data->last_closep) = ARG(scan);
4164         }
4165         else if (OP(scan) == EVAL) {
4166                 if (data)
4167                     data->flags |= SF_HAS_EVAL;
4168         }
4169         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4170             if (flags & SCF_DO_SUBSTR) {
4171                 SCAN_COMMIT(pRExC_state,data,minlenp);
4172                 flags &= ~SCF_DO_SUBSTR;
4173             }
4174             if (data && OP(scan)==ACCEPT) {
4175                 data->flags |= SCF_SEEN_ACCEPT;
4176                 if (stopmin > min)
4177                     stopmin = min;
4178             }
4179         }
4180         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4181         {
4182                 if (flags & SCF_DO_SUBSTR) {
4183                     SCAN_COMMIT(pRExC_state,data,minlenp);
4184                     data->longest = &(data->longest_float);
4185                 }
4186                 is_inf = is_inf_internal = 1;
4187                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4188                     cl_anything(pRExC_state, data->start_class);
4189                 flags &= ~SCF_DO_STCLASS;
4190         }
4191         else if (OP(scan) == GPOS) {
4192             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4193                 !(delta || is_inf || (data && data->pos_delta)))
4194             {
4195                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4196                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4197                 if (RExC_rx->gofs < (U32)min)
4198                     RExC_rx->gofs = min;
4199             } else {
4200                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4201                 RExC_rx->gofs = 0;
4202             }
4203         }
4204 #ifdef TRIE_STUDY_OPT
4205 #ifdef FULL_TRIE_STUDY
4206         else if (PL_regkind[OP(scan)] == TRIE) {
4207             /* NOTE - There is similar code to this block above for handling
4208                BRANCH nodes on the initial study.  If you change stuff here
4209                check there too. */
4210             regnode *trie_node= scan;
4211             regnode *tail= regnext(scan);
4212             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4213             I32 max1 = 0, min1 = I32_MAX;
4214             struct regnode_charclass_class accum;
4215
4216             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4217                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4218             if (flags & SCF_DO_STCLASS)
4219                 cl_init_zero(pRExC_state, &accum);
4220
4221             if (!trie->jump) {
4222                 min1= trie->minlen;
4223                 max1= trie->maxlen;
4224             } else {
4225                 const regnode *nextbranch= NULL;
4226                 U32 word;
4227
4228                 for ( word=1 ; word <= trie->wordcount ; word++)
4229                 {
4230                     I32 deltanext=0, minnext=0, f = 0, fake;
4231                     struct regnode_charclass_class this_class;
4232
4233                     data_fake.flags = 0;
4234                     if (data) {
4235                         data_fake.whilem_c = data->whilem_c;
4236                         data_fake.last_closep = data->last_closep;
4237                     }
4238                     else
4239                         data_fake.last_closep = &fake;
4240                     data_fake.pos_delta = delta;
4241                     if (flags & SCF_DO_STCLASS) {
4242                         cl_init(pRExC_state, &this_class);
4243                         data_fake.start_class = &this_class;
4244                         f = SCF_DO_STCLASS_AND;
4245                     }
4246                     if (flags & SCF_WHILEM_VISITED_POS)
4247                         f |= SCF_WHILEM_VISITED_POS;
4248
4249                     if (trie->jump[word]) {
4250                         if (!nextbranch)
4251                             nextbranch = trie_node + trie->jump[0];
4252                         scan= trie_node + trie->jump[word];
4253                         /* We go from the jump point to the branch that follows
4254                            it. Note this means we need the vestigal unused branches
4255                            even though they arent otherwise used.
4256                          */
4257                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4258                             &deltanext, (regnode *)nextbranch, &data_fake,
4259                             stopparen, recursed, NULL, f,depth+1);
4260                     }
4261                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4262                         nextbranch= regnext((regnode*)nextbranch);
4263
4264                     if (min1 > (I32)(minnext + trie->minlen))
4265                         min1 = minnext + trie->minlen;
4266                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4267                         max1 = minnext + deltanext + trie->maxlen;
4268                     if (deltanext == I32_MAX)
4269                         is_inf = is_inf_internal = 1;
4270
4271                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4272                         pars++;
4273                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4274                         if ( stopmin > min + min1)
4275                             stopmin = min + min1;
4276                         flags &= ~SCF_DO_SUBSTR;
4277                         if (data)
4278                             data->flags |= SCF_SEEN_ACCEPT;
4279                     }
4280                     if (data) {
4281                         if (data_fake.flags & SF_HAS_EVAL)
4282                             data->flags |= SF_HAS_EVAL;
4283                         data->whilem_c = data_fake.whilem_c;
4284                     }
4285                     if (flags & SCF_DO_STCLASS)
4286                         cl_or(pRExC_state, &accum, &this_class);
4287                 }
4288             }
4289             if (flags & SCF_DO_SUBSTR) {
4290                 data->pos_min += min1;
4291                 data->pos_delta += max1 - min1;
4292                 if (max1 != min1 || is_inf)
4293                     data->longest = &(data->longest_float);
4294             }
4295             min += min1;
4296             delta += max1 - min1;
4297             if (flags & SCF_DO_STCLASS_OR) {
4298                 cl_or(pRExC_state, data->start_class, &accum);
4299                 if (min1) {
4300                     cl_and(data->start_class, and_withp);
4301                     flags &= ~SCF_DO_STCLASS;
4302                 }
4303             }
4304             else if (flags & SCF_DO_STCLASS_AND) {
4305                 if (min1) {
4306                     cl_and(data->start_class, &accum);
4307                     flags &= ~SCF_DO_STCLASS;
4308                 }
4309                 else {
4310                     /* Switch to OR mode: cache the old value of
4311                      * data->start_class */
4312                     INIT_AND_WITHP;
4313                     StructCopy(data->start_class, and_withp,
4314                                struct regnode_charclass_class);
4315                     flags &= ~SCF_DO_STCLASS_AND;
4316                     StructCopy(&accum, data->start_class,
4317                                struct regnode_charclass_class);
4318                     flags |= SCF_DO_STCLASS_OR;
4319                     data->start_class->flags |= ANYOF_EOS;
4320                 }
4321             }
4322             scan= tail;
4323             continue;
4324         }
4325 #else
4326         else if (PL_regkind[OP(scan)] == TRIE) {
4327             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4328             U8*bang=NULL;
4329
4330             min += trie->minlen;
4331             delta += (trie->maxlen - trie->minlen);
4332             flags &= ~SCF_DO_STCLASS; /* xxx */
4333             if (flags & SCF_DO_SUBSTR) {
4334                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4335                 data->pos_min += trie->minlen;
4336                 data->pos_delta += (trie->maxlen - trie->minlen);
4337                 if (trie->maxlen != trie->minlen)
4338                     data->longest = &(data->longest_float);
4339             }
4340             if (trie->jump) /* no more substrings -- for now /grr*/
4341                 flags &= ~SCF_DO_SUBSTR;
4342         }
4343 #endif /* old or new */
4344 #endif /* TRIE_STUDY_OPT */
4345
4346         /* Else: zero-length, ignore. */
4347         scan = regnext(scan);
4348     }
4349     if (frame) {
4350         last = frame->last;
4351         scan = frame->next;
4352         stopparen = frame->stop;
4353         frame = frame->prev;
4354         goto fake_study_recurse;
4355     }
4356
4357   finish:
4358     assert(!frame);
4359     DEBUG_STUDYDATA("pre-fin:",data,depth);
4360
4361     *scanp = scan;
4362     *deltap = is_inf_internal ? I32_MAX : delta;
4363     if (flags & SCF_DO_SUBSTR && is_inf)
4364         data->pos_delta = I32_MAX - data->pos_min;
4365     if (is_par > (I32)U8_MAX)
4366         is_par = 0;
4367     if (is_par && pars==1 && data) {
4368         data->flags |= SF_IN_PAR;
4369         data->flags &= ~SF_HAS_PAR;
4370     }
4371     else if (pars && data) {
4372         data->flags |= SF_HAS_PAR;
4373         data->flags &= ~SF_IN_PAR;
4374     }
4375     if (flags & SCF_DO_STCLASS_OR)
4376         cl_and(data->start_class, and_withp);
4377     if (flags & SCF_TRIE_RESTUDY)
4378         data->flags |=  SCF_TRIE_RESTUDY;
4379
4380     DEBUG_STUDYDATA("post-fin:",data,depth);
4381
4382     return min < stopmin ? min : stopmin;
4383 }
4384
4385 STATIC U32
4386 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4387 {
4388     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4389
4390     PERL_ARGS_ASSERT_ADD_DATA;
4391
4392     Renewc(RExC_rxi->data,
4393            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4394            char, struct reg_data);
4395     if(count)
4396         Renew(RExC_rxi->data->what, count + n, U8);
4397     else
4398         Newx(RExC_rxi->data->what, n, U8);
4399     RExC_rxi->data->count = count + n;
4400     Copy(s, RExC_rxi->data->what + count, n, U8);
4401     return count;
4402 }
4403
4404 /*XXX: todo make this not included in a non debugging perl */
4405 #ifndef PERL_IN_XSUB_RE
4406 void
4407 Perl_reginitcolors(pTHX)
4408 {
4409     dVAR;
4410     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4411     if (s) {
4412         char *t = savepv(s);
4413         int i = 0;
4414         PL_colors[0] = t;
4415         while (++i < 6) {
4416             t = strchr(t, '\t');
4417             if (t) {
4418                 *t = '\0';
4419                 PL_colors[i] = ++t;
4420             }
4421             else
4422                 PL_colors[i] = t = (char *)"";
4423         }
4424     } else {
4425         int i = 0;
4426         while (i < 6)
4427             PL_colors[i++] = (char *)"";
4428     }
4429     PL_colorset = 1;
4430 }
4431 #endif
4432
4433
4434 #ifdef TRIE_STUDY_OPT
4435 #define CHECK_RESTUDY_GOTO                                  \
4436         if (                                                \
4437               (data.flags & SCF_TRIE_RESTUDY)               \
4438               && ! restudied++                              \
4439         )     goto reStudy
4440 #else
4441 #define CHECK_RESTUDY_GOTO
4442 #endif
4443
4444 /*
4445  - pregcomp - compile a regular expression into internal code
4446  *
4447  * We can't allocate space until we know how big the compiled form will be,
4448  * but we can't compile it (and thus know how big it is) until we've got a
4449  * place to put the code.  So we cheat:  we compile it twice, once with code
4450  * generation turned off and size counting turned on, and once "for real".
4451  * This also means that we don't allocate space until we are sure that the
4452  * thing really will compile successfully, and we never have to move the
4453  * code and thus invalidate pointers into it.  (Note that it has to be in
4454  * one piece because free() must be able to free it all.) [NB: not true in perl]
4455  *
4456  * Beware that the optimization-preparation code in here knows about some
4457  * of the structure of the compiled regexp.  [I'll say.]
4458  */
4459
4460
4461
4462 #ifndef PERL_IN_XSUB_RE
4463 #define RE_ENGINE_PTR &PL_core_reg_engine
4464 #else
4465 extern const struct regexp_engine my_reg_engine;
4466 #define RE_ENGINE_PTR &my_reg_engine
4467 #endif
4468
4469 #ifndef PERL_IN_XSUB_RE
4470 REGEXP *
4471 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4472 {
4473     dVAR;
4474     HV * const table = GvHV(PL_hintgv);
4475
4476     PERL_ARGS_ASSERT_PREGCOMP;
4477
4478     /* Dispatch a request to compile a regexp to correct
4479        regexp engine. */
4480     if (table) {
4481         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4482         GET_RE_DEBUG_FLAGS_DECL;
4483         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4484             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4485             DEBUG_COMPILE_r({
4486                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4487                     SvIV(*ptr));
4488             });
4489             return CALLREGCOMP_ENG(eng, pattern, flags);
4490         }
4491     }
4492     return Perl_re_compile(aTHX_ pattern, flags);
4493 }
4494 #endif
4495
4496 REGEXP *
4497 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4498 {
4499     dVAR;
4500     REGEXP *rx;
4501     struct regexp *r;
4502     register regexp_internal *ri;
4503     STRLEN plen;
4504     char  *exp;
4505     char* xend;
4506     regnode *scan;
4507     I32 flags;
4508     I32 minlen = 0;
4509     U32 pm_flags;
4510
4511     /* these are all flags - maybe they should be turned
4512      * into a single int with different bit masks */
4513     I32 sawlookahead = 0;
4514     I32 sawplus = 0;
4515     I32 sawopen = 0;
4516     bool used_setjump = FALSE;
4517     regex_charset initial_charset = get_regex_charset(orig_pm_flags);
4518
4519     U8 jump_ret = 0;
4520     dJMPENV;
4521     scan_data_t data;
4522     RExC_state_t RExC_state;
4523     RExC_state_t * const pRExC_state = &RExC_state;
4524 #ifdef TRIE_STUDY_OPT
4525     int restudied;
4526     RExC_state_t copyRExC_state;
4527 #endif
4528     GET_RE_DEBUG_FLAGS_DECL;
4529
4530     PERL_ARGS_ASSERT_RE_COMPILE;
4531
4532     DEBUG_r(if (!PL_colorset) reginitcolors());
4533
4534     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4535     RExC_uni_semantics = 0;
4536     RExC_contains_locale = 0;
4537
4538     /****************** LONG JUMP TARGET HERE***********************/
4539     /* Longjmp back to here if have to switch in midstream to utf8 */
4540     if (! RExC_orig_utf8) {
4541         JMPENV_PUSH(jump_ret);
4542         used_setjump = TRUE;
4543     }
4544
4545     if (jump_ret == 0) {    /* First time through */
4546         exp = SvPV(pattern, plen);
4547         xend = exp + plen;
4548         /* ignore the utf8ness if the pattern is 0 length */
4549         if (plen == 0) {
4550             RExC_utf8 = RExC_orig_utf8 = 0;
4551         }
4552
4553         DEBUG_COMPILE_r({
4554             SV *dsv= sv_newmortal();
4555             RE_PV_QUOTED_DECL(s, RExC_utf8,
4556                 dsv, exp, plen, 60);
4557             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4558                            PL_colors[4],PL_colors[5],s);
4559         });
4560     }
4561     else {  /* longjumped back */
4562         STRLEN len = plen;
4563
4564         /* If the cause for the longjmp was other than changing to utf8, pop
4565          * our own setjmp, and longjmp to the correct handler */
4566         if (jump_ret != UTF8_LONGJMP) {
4567             JMPENV_POP;
4568             JMPENV_JUMP(jump_ret);
4569         }
4570
4571         GET_RE_DEBUG_FLAGS;
4572
4573         /* It's possible to write a regexp in ascii that represents Unicode
4574         codepoints outside of the byte range, such as via \x{100}. If we
4575         detect such a sequence we have to convert the entire pattern to utf8
4576         and then recompile, as our sizing calculation will have been based
4577         on 1 byte == 1 character, but we will need to use utf8 to encode
4578         at least some part of the pattern, and therefore must convert the whole
4579         thing.
4580         -- dmq */
4581         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4582             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4583         exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4584         xend = exp + len;
4585         RExC_orig_utf8 = RExC_utf8 = 1;
4586         SAVEFREEPV(exp);
4587     }
4588
4589 #ifdef TRIE_STUDY_OPT
4590     restudied = 0;
4591 #endif
4592
4593     pm_flags = orig_pm_flags;
4594
4595     if (initial_charset == REGEX_LOCALE_CHARSET) {
4596         RExC_contains_locale = 1;
4597     }
4598     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
4599
4600         /* Set to use unicode semantics if the pattern is in utf8 and has the
4601          * 'depends' charset specified, as it means unicode when utf8  */
4602         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4603     }
4604
4605     RExC_precomp = exp;
4606     RExC_flags = pm_flags;
4607     RExC_sawback = 0;
4608
4609     RExC_seen = 0;
4610     RExC_in_lookbehind = 0;
4611     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4612     RExC_seen_evals = 0;
4613     RExC_extralen = 0;
4614
4615     /* First pass: determine size, legality. */
4616     RExC_parse = exp;
4617     RExC_start = exp;
4618     RExC_end = xend;
4619     RExC_naughty = 0;
4620     RExC_npar = 1;
4621     RExC_nestroot = 0;
4622     RExC_size = 0L;
4623     RExC_emit = &PL_regdummy;
4624     RExC_whilem_seen = 0;
4625     RExC_open_parens = NULL;
4626     RExC_close_parens = NULL;
4627     RExC_opend = NULL;
4628     RExC_paren_names = NULL;
4629 #ifdef DEBUGGING
4630     RExC_paren_name_list = NULL;
4631 #endif
4632     RExC_recurse = NULL;
4633     RExC_recurse_count = 0;
4634
4635 #if 0 /* REGC() is (currently) a NOP at the first pass.
4636        * Clever compilers notice this and complain. --jhi */
4637     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4638 #endif
4639     DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4640     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4641         RExC_precomp = NULL;
4642         return(NULL);
4643     }
4644
4645     /* Here, finished first pass.  Get rid of any added setjmp */
4646     if (used_setjump) {
4647         JMPENV_POP;
4648     }
4649
4650     DEBUG_PARSE_r({
4651         PerlIO_printf(Perl_debug_log,
4652             "Required size %"IVdf" nodes\n"
4653             "Starting second pass (creation)\n",
4654             (IV)RExC_size);
4655         RExC_lastnum=0;
4656         RExC_lastparse=NULL;
4657     });
4658
4659     /* The first pass could have found things that force Unicode semantics */
4660     if ((RExC_utf8 || RExC_uni_semantics)
4661          && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
4662     {
4663         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4664     }
4665
4666     /* Small enough for pointer-storage convention?
4667        If extralen==0, this means that we will not need long jumps. */
4668     if (RExC_size >= 0x10000L && RExC_extralen)
4669         RExC_size += RExC_extralen;
4670     else
4671         RExC_extralen = 0;
4672     if (RExC_whilem_seen > 15)
4673         RExC_whilem_seen = 15;
4674
4675     /* Allocate space and zero-initialize. Note, the two step process
4676        of zeroing when in debug mode, thus anything assigned has to
4677        happen after that */
4678     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4679     r = (struct regexp*)SvANY(rx);
4680     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4681          char, regexp_internal);
4682     if ( r == NULL || ri == NULL )
4683         FAIL("Regexp out of space");
4684 #ifdef DEBUGGING
4685     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4686     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4687 #else
4688     /* bulk initialize base fields with 0. */
4689     Zero(ri, sizeof(regexp_internal), char);
4690 #endif
4691
4692     /* non-zero initialization begins here */
4693     RXi_SET( r, ri );
4694     r->engine= RE_ENGINE_PTR;
4695     r->extflags = pm_flags;
4696     {
4697         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4698         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
4699
4700         /* The caret is output if there are any defaults: if not all the STD
4701          * flags are set, or if no character set specifier is needed */
4702         bool has_default =
4703                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4704                     || ! has_charset);
4705         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4706         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4707                             >> RXf_PMf_STD_PMMOD_SHIFT);
4708         const char *fptr = STD_PAT_MODS;        /*"msix"*/
4709         char *p;
4710         /* Allocate for the worst case, which is all the std flags are turned
4711          * on.  If more precision is desired, we could do a population count of
4712          * the flags set.  This could be done with a small lookup table, or by
4713          * shifting, masking and adding, or even, when available, assembly
4714          * language for a machine-language population count.
4715          * We never output a minus, as all those are defaults, so are
4716          * covered by the caret */
4717         const STRLEN wraplen = plen + has_p + has_runon
4718             + has_default       /* If needs a caret */
4719
4720                 /* If needs a character set specifier */
4721             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
4722             + (sizeof(STD_PAT_MODS) - 1)
4723             + (sizeof("(?:)") - 1);
4724
4725         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4726         SvPOK_on(rx);
4727         SvFLAGS(rx) |= SvUTF8(pattern);
4728         *p++='('; *p++='?';
4729
4730         /* If a default, cover it using the caret */
4731         if (has_default) {
4732             *p++= DEFAULT_PAT_MOD;
4733         }
4734         if (has_charset) {
4735             STRLEN len;
4736             const char* const name = get_regex_charset_name(r->extflags, &len);
4737             Copy(name, p, len, char);
4738             p += len;
4739         }
4740         if (has_p)
4741             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4742         {
4743             char ch;
4744             while((ch = *fptr++)) {
4745                 if(reganch & 1)
4746                     *p++ = ch;
4747                 reganch >>= 1;
4748             }
4749         }
4750
4751         *p++ = ':';
4752         Copy(RExC_precomp, p, plen, char);
4753         assert ((RX_WRAPPED(rx) - p) < 16);
4754         r->pre_prefix = p - RX_WRAPPED(rx);
4755         p += plen;
4756         if (has_runon)
4757             *p++ = '\n';
4758         *p++ = ')';
4759         *p = 0;
4760         SvCUR_set(rx, p - SvPVX_const(rx));
4761     }
4762
4763     r->intflags = 0;
4764     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4765
4766     if (RExC_seen & REG_SEEN_RECURSE) {
4767         Newxz(RExC_open_parens, RExC_npar,regnode *);
4768         SAVEFREEPV(RExC_open_parens);
4769         Newxz(RExC_close_parens,RExC_npar,regnode *);
4770         SAVEFREEPV(RExC_close_parens);
4771     }
4772
4773     /* Useful during FAIL. */
4774 #ifdef RE_TRACK_PATTERN_OFFSETS
4775     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4776     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4777                           "%s %"UVuf" bytes for offset annotations.\n",
4778                           ri->u.offsets ? "Got" : "Couldn't get",
4779                           (UV)((2*RExC_size+1) * sizeof(U32))));
4780 #endif
4781     SetProgLen(ri,RExC_size);
4782     RExC_rx_sv = rx;
4783     RExC_rx = r;
4784     RExC_rxi = ri;
4785
4786     /* Second pass: emit code. */
4787     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
4788     RExC_parse = exp;
4789     RExC_end = xend;
4790     RExC_naughty = 0;
4791     RExC_npar = 1;
4792     RExC_emit_start = ri->program;
4793     RExC_emit = ri->program;
4794     RExC_emit_bound = ri->program + RExC_size + 1;
4795
4796     /* Store the count of eval-groups for security checks: */
4797     RExC_rx->seen_evals = RExC_seen_evals;
4798     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4799     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4800         ReREFCNT_dec(rx);
4801         return(NULL);
4802     }
4803     /* XXXX To minimize changes to RE engine we always allocate
4804        3-units-long substrs field. */
4805     Newx(r->substrs, 1, struct reg_substr_data);
4806     if (RExC_recurse_count) {
4807         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4808         SAVEFREEPV(RExC_recurse);
4809     }
4810
4811 reStudy:
4812     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4813     Zero(r->substrs, 1, struct reg_substr_data);
4814
4815 #ifdef TRIE_STUDY_OPT
4816     if (!restudied) {
4817         StructCopy(&zero_scan_data, &data, scan_data_t);
4818         copyRExC_state = RExC_state;
4819     } else {
4820         U32 seen=RExC_seen;
4821         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4822
4823         RExC_state = copyRExC_state;
4824         if (seen & REG_TOP_LEVEL_BRANCHES)
4825             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4826         else
4827             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4828         if (data.last_found) {
4829             SvREFCNT_dec(data.longest_fixed);
4830             SvREFCNT_dec(data.longest_float);
4831             SvREFCNT_dec(data.last_found);
4832         }
4833         StructCopy(&zero_scan_data, &data, scan_data_t);
4834     }
4835 #else
4836     StructCopy(&zero_scan_data, &data, scan_data_t);
4837 #endif
4838
4839     /* Dig out information for optimizations. */
4840     r->extflags = RExC_flags; /* was pm_op */
4841     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4842
4843     if (UTF)
4844         SvUTF8_on(rx);  /* Unicode in it? */
4845     ri->regstclass = NULL;
4846     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
4847         r->intflags |= PREGf_NAUGHTY;
4848     scan = ri->program + 1;             /* First BRANCH. */
4849
4850     /* testing for BRANCH here tells us whether there is "must appear"
4851        data in the pattern. If there is then we can use it for optimisations */
4852     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
4853         I32 fake;
4854         STRLEN longest_float_length, longest_fixed_length;
4855         struct regnode_charclass_class ch_class; /* pointed to by data */
4856         int stclass_flag;
4857         I32 last_close = 0; /* pointed to by data */
4858         regnode *first= scan;
4859         regnode *first_next= regnext(first);
4860         /*
4861          * Skip introductions and multiplicators >= 1
4862          * so that we can extract the 'meat' of the pattern that must
4863          * match in the large if() sequence following.
4864          * NOTE that EXACT is NOT covered here, as it is normally
4865          * picked up by the optimiser separately.
4866          *
4867          * This is unfortunate as the optimiser isnt handling lookahead
4868          * properly currently.
4869          *
4870          */
4871         while ((OP(first) == OPEN && (sawopen = 1)) ||
4872                /* An OR of *one* alternative - should not happen now. */
4873             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4874             /* for now we can't handle lookbehind IFMATCH*/
4875             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4876             (OP(first) == PLUS) ||
4877             (OP(first) == MINMOD) ||
4878                /* An {n,m} with n>0 */
4879             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4880             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4881         {
4882                 /*
4883                  * the only op that could be a regnode is PLUS, all the rest
4884                  * will be regnode_1 or regnode_2.
4885                  *
4886                  */
4887                 if (OP(first) == PLUS)
4888                     sawplus = 1;
4889                 else
4890                     first += regarglen[OP(first)];
4891
4892                 first = NEXTOPER(first);
4893                 first_next= regnext(first);
4894         }
4895
4896         /* Starting-point info. */
4897       again:
4898         DEBUG_PEEP("first:",first,0);
4899         /* Ignore EXACT as we deal with it later. */
4900         if (PL_regkind[OP(first)] == EXACT) {
4901             if (OP(first) == EXACT)
4902                 NOOP;   /* Empty, get anchored substr later. */
4903             else
4904                 ri->regstclass = first;
4905         }
4906 #ifdef TRIE_STCLASS
4907         else if (PL_regkind[OP(first)] == TRIE &&
4908                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4909         {
4910             regnode *trie_op;
4911             /* this can happen only on restudy */
4912             if ( OP(first) == TRIE ) {
4913                 struct regnode_1 *trieop = (struct regnode_1 *)
4914                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
4915                 StructCopy(first,trieop,struct regnode_1);
4916                 trie_op=(regnode *)trieop;
4917             } else {
4918                 struct regnode_charclass *trieop = (struct regnode_charclass *)
4919                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4920                 StructCopy(first,trieop,struct regnode_charclass);
4921                 trie_op=(regnode *)trieop;
4922             }
4923             OP(trie_op)+=2;
4924             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4925             ri->regstclass = trie_op;
4926         }
4927 #endif
4928         else if (REGNODE_SIMPLE(OP(first)))
4929             ri->regstclass = first;
4930         else if (PL_regkind[OP(first)] == BOUND ||
4931                  PL_regkind[OP(first)] == NBOUND)
4932             ri->regstclass = first;
4933         else if (PL_regkind[OP(first)] == BOL) {
4934             r->extflags |= (OP(first) == MBOL
4935                            ? RXf_ANCH_MBOL
4936                            : (OP(first) == SBOL
4937                               ? RXf_ANCH_SBOL
4938                               : RXf_ANCH_BOL));
4939             first = NEXTOPER(first);
4940             goto again;
4941         }
4942         else if (OP(first) == GPOS) {
4943             r->extflags |= RXf_ANCH_GPOS;
4944             first = NEXTOPER(first);
4945             goto again;
4946         }
4947         else if ((!sawopen || !RExC_sawback) &&
4948             (OP(first) == STAR &&
4949             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4950             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4951         {
4952             /* turn .* into ^.* with an implied $*=1 */
4953             const int type =
4954                 (OP(NEXTOPER(first)) == REG_ANY)
4955                     ? RXf_ANCH_MBOL
4956                     : RXf_ANCH_SBOL;
4957             r->extflags |= type;
4958             r->intflags |= PREGf_IMPLICIT;
4959             first = NEXTOPER(first);
4960             goto again;
4961         }
4962         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4963             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4964             /* x+ must match at the 1st pos of run of x's */
4965             r->intflags |= PREGf_SKIP;
4966
4967         /* Scan is after the zeroth branch, first is atomic matcher. */
4968 #ifdef TRIE_STUDY_OPT
4969         DEBUG_PARSE_r(
4970             if (!restudied)
4971                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4972                               (IV)(first - scan + 1))
4973         );
4974 #else
4975         DEBUG_PARSE_r(
4976             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4977                 (IV)(first - scan + 1))
4978         );
4979 #endif
4980
4981
4982         /*
4983         * If there's something expensive in the r.e., find the
4984         * longest literal string that must appear and make it the
4985         * regmust.  Resolve ties in favor of later strings, since
4986         * the regstart check works with the beginning of the r.e.
4987         * and avoiding duplication strengthens checking.  Not a
4988         * strong reason, but sufficient in the absence of others.
4989         * [Now we resolve ties in favor of the earlier string if
4990         * it happens that c_offset_min has been invalidated, since the
4991         * earlier string may buy us something the later one won't.]
4992         */
4993
4994         data.longest_fixed = newSVpvs("");
4995         data.longest_float = newSVpvs("");
4996         data.last_found = newSVpvs("");
4997         data.longest = &(data.longest_fixed);
4998         first = scan;
4999         if (!ri->regstclass) {
5000             cl_init(pRExC_state, &ch_class);
5001             data.start_class = &ch_class;
5002             stclass_flag = SCF_DO_STCLASS_AND;
5003         } else                          /* XXXX Check for BOUND? */
5004             stclass_flag = 0;
5005         data.last_closep = &last_close;
5006
5007         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5008             &data, -1, NULL, NULL,
5009             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5010
5011
5012         CHECK_RESTUDY_GOTO;
5013
5014
5015         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5016              && data.last_start_min == 0 && data.last_end > 0
5017              && !RExC_seen_zerolen
5018              && !(RExC_seen & REG_SEEN_VERBARG)
5019              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5020             r->extflags |= RXf_CHECK_ALL;
5021         scan_commit(pRExC_state, &data,&minlen,0);
5022         SvREFCNT_dec(data.last_found);
5023
5024         /* Note that code very similar to this but for anchored string
5025            follows immediately below, changes may need to be made to both.
5026            Be careful.
5027          */
5028         longest_float_length = CHR_SVLEN(data.longest_float);
5029         if (longest_float_length
5030             || (data.flags & SF_FL_BEFORE_EOL
5031                 && (!(data.flags & SF_FL_BEFORE_MEOL)
5032                     || (RExC_flags & RXf_PMf_MULTILINE))))
5033         {
5034             I32 t,ml;
5035
5036             if (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
5037                 && data.offset_fixed == data.offset_float_min
5038                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
5039                     goto remove_float;          /* As in (a)+. */
5040
5041             /* copy the information about the longest float from the reg_scan_data
5042                over to the program. */
5043             if (SvUTF8(data.longest_float)) {
5044                 r->float_utf8 = data.longest_float;
5045                 r->float_substr = NULL;
5046             } else {
5047                 r->float_substr = data.longest_float;
5048                 r->float_utf8 = NULL;
5049             }
5050             /* float_end_shift is how many chars that must be matched that
5051                follow this item. We calculate it ahead of time as once the
5052                lookbehind offset is added in we lose the ability to correctly
5053                calculate it.*/
5054             ml = data.minlen_float ? *(data.minlen_float)
5055                                    : (I32)longest_float_length;
5056             r->float_end_shift = ml - data.offset_float_min
5057                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5058                 + data.lookbehind_float;
5059             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5060             r->float_max_offset = data.offset_float_max;
5061             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5062                 r->float_max_offset -= data.lookbehind_float;
5063
5064             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5065                        && (!(data.flags & SF_FL_BEFORE_MEOL)
5066                            || (RExC_flags & RXf_PMf_MULTILINE)));
5067             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5068         }
5069         else {
5070           remove_float:
5071             r->float_substr = r->float_utf8 = NULL;
5072             SvREFCNT_dec(data.longest_float);
5073             longest_float_length = 0;
5074         }
5075
5076         /* Note that code very similar to this but for floating string
5077            is immediately above, changes may need to be made to both.
5078            Be careful.
5079          */
5080         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5081         if (longest_fixed_length
5082             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5083                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5084                     || (RExC_flags & RXf_PMf_MULTILINE))))
5085         {
5086             I32 t,ml;
5087
5088             /* copy the information about the longest fixed
5089                from the reg_scan_data over to the program. */
5090             if (SvUTF8(data.longest_fixed)) {
5091                 r->anchored_utf8 = data.longest_fixed;
5092                 r->anchored_substr = NULL;
5093             } else {
5094                 r->anchored_substr = data.longest_fixed;
5095                 r->anchored_utf8 = NULL;
5096             }
5097             /* fixed_end_shift is how many chars that must be matched that
5098                follow this item. We calculate it ahead of time as once the
5099                lookbehind offset is added in we lose the ability to correctly
5100                calculate it.*/
5101             ml = data.minlen_fixed ? *(data.minlen_fixed)
5102                                    : (I32)longest_fixed_length;
5103             r->anchored_end_shift = ml - data.offset_fixed
5104                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5105                 + data.lookbehind_fixed;
5106             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5107
5108             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5109                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
5110                      || (RExC_flags & RXf_PMf_MULTILINE)));
5111             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5112         }
5113         else {
5114             r->anchored_substr = r->anchored_utf8 = NULL;
5115             SvREFCNT_dec(data.longest_fixed);
5116             longest_fixed_length = 0;
5117         }
5118         if (ri->regstclass
5119             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5120             ri->regstclass = NULL;
5121
5122         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5123             && stclass_flag
5124             && !(data.start_class->flags & ANYOF_EOS)
5125             && !cl_is_anything(data.start_class))
5126         {
5127             const U32 n = add_data(pRExC_state, 1, "f");
5128             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5129
5130             Newx(RExC_rxi->data->data[n], 1,
5131                 struct regnode_charclass_class);
5132             StructCopy(data.start_class,
5133                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5134                        struct regnode_charclass_class);
5135             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5136             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5137             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5138                       regprop(r, sv, (regnode*)data.start_class);
5139                       PerlIO_printf(Perl_debug_log,
5140                                     "synthetic stclass \"%s\".\n",
5141                                     SvPVX_const(sv));});
5142         }
5143
5144         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5145         if (longest_fixed_length > longest_float_length) {
5146             r->check_end_shift = r->anchored_end_shift;
5147             r->check_substr = r->anchored_substr;
5148             r->check_utf8 = r->anchored_utf8;
5149             r->check_offset_min = r->check_offset_max = r->anchored_offset;
5150             if (r->extflags & RXf_ANCH_SINGLE)
5151                 r->extflags |= RXf_NOSCAN;
5152         }
5153         else {
5154             r->check_end_shift = r->float_end_shift;
5155             r->check_substr = r->float_substr;
5156             r->check_utf8 = r->float_utf8;
5157             r->check_offset_min = r->float_min_offset;
5158             r->check_offset_max = r->float_max_offset;
5159         }
5160         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5161            This should be changed ASAP!  */
5162         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5163             r->extflags |= RXf_USE_INTUIT;
5164             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5165                 r->extflags |= RXf_INTUIT_TAIL;
5166         }
5167         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5168         if ( (STRLEN)minlen < longest_float_length )
5169             minlen= longest_float_length;
5170         if ( (STRLEN)minlen < longest_fixed_length )
5171             minlen= longest_fixed_length;
5172         */
5173     }
5174     else {
5175         /* Several toplevels. Best we can is to set minlen. */
5176         I32 fake;
5177         struct regnode_charclass_class ch_class;
5178         I32 last_close = 0;
5179
5180         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5181
5182         scan = ri->program + 1;
5183         cl_init(pRExC_state, &ch_class);
5184         data.start_class = &ch_class;
5185         data.last_closep = &last_close;
5186
5187
5188         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5189             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5190
5191         CHECK_RESTUDY_GOTO;
5192
5193         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5194                 = r->float_substr = r->float_utf8 = NULL;
5195
5196         if (!(data.start_class->flags & ANYOF_EOS)
5197             && !cl_is_anything(data.start_class))
5198         {
5199             const U32 n = add_data(pRExC_state, 1, "f");
5200             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5201
5202             Newx(RExC_rxi->data->data[n], 1,
5203                 struct regnode_charclass_class);
5204             StructCopy(data.start_class,
5205                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5206                        struct regnode_charclass_class);
5207             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5208             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5209             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5210                       regprop(r, sv, (regnode*)data.start_class);
5211                       PerlIO_printf(Perl_debug_log,
5212                                     "synthetic stclass \"%s\".\n",
5213                                     SvPVX_const(sv));});
5214         }
5215     }
5216
5217     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5218        the "real" pattern. */
5219     DEBUG_OPTIMISE_r({
5220         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5221                       (IV)minlen, (IV)r->minlen);
5222     });
5223     r->minlenret = minlen;
5224     if (r->minlen < minlen)
5225         r->minlen = minlen;
5226
5227     if (RExC_seen & REG_SEEN_GPOS)
5228         r->extflags |= RXf_GPOS_SEEN;
5229     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5230         r->extflags |= RXf_LOOKBEHIND_SEEN;
5231     if (RExC_seen & REG_SEEN_EVAL)
5232         r->extflags |= RXf_EVAL_SEEN;
5233     if (RExC_seen & REG_SEEN_CANY)
5234         r->extflags |= RXf_CANY_SEEN;
5235     if (RExC_seen & REG_SEEN_VERBARG)
5236         r->intflags |= PREGf_VERBARG_SEEN;
5237     if (RExC_seen & REG_SEEN_CUTGROUP)
5238         r->intflags |= PREGf_CUTGROUP_SEEN;
5239     if (RExC_paren_names)
5240         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5241     else
5242         RXp_PAREN_NAMES(r) = NULL;
5243
5244 #ifdef STUPID_PATTERN_CHECKS
5245     if (RX_PRELEN(rx) == 0)
5246         r->extflags |= RXf_NULL;
5247     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5248         /* XXX: this should happen BEFORE we compile */
5249         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5250     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5251         r->extflags |= RXf_WHITE;
5252     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5253         r->extflags |= RXf_START_ONLY;
5254 #else
5255     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5256             /* XXX: this should happen BEFORE we compile */
5257             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5258     else {
5259         regnode *first = ri->program + 1;
5260         U8 fop = OP(first);
5261
5262         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5263             r->extflags |= RXf_NULL;
5264         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5265             r->extflags |= RXf_START_ONLY;
5266         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5267                              && OP(regnext(first)) == END)
5268             r->extflags |= RXf_WHITE;
5269     }
5270 #endif
5271 #ifdef DEBUGGING
5272     if (RExC_paren_names) {
5273         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5274         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5275     } else
5276 #endif
5277         ri->name_list_idx = 0;
5278
5279     if (RExC_recurse_count) {
5280         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5281             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5282             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5283         }
5284     }
5285     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5286     /* assume we don't need to swap parens around before we match */
5287
5288     DEBUG_DUMP_r({
5289         PerlIO_printf(Perl_debug_log,"Final program:\n");
5290         regdump(r);
5291     });
5292 #ifdef RE_TRACK_PATTERN_OFFSETS
5293     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5294         const U32 len = ri->u.offsets[0];
5295         U32 i;
5296         GET_RE_DEBUG_FLAGS_DECL;
5297         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5298         for (i = 1; i <= len; i++) {
5299             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5300                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5301                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5302             }
5303         PerlIO_printf(Perl_debug_log, "\n");
5304     });
5305 #endif
5306     return rx;
5307 }
5308
5309 #undef RE_ENGINE_PTR
5310
5311
5312 SV*
5313 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5314                     const U32 flags)
5315 {
5316     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5317
5318     PERL_UNUSED_ARG(value);
5319
5320     if (flags & RXapif_FETCH) {
5321         return reg_named_buff_fetch(rx, key, flags);
5322     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5323         Perl_croak_no_modify(aTHX);
5324         return NULL;
5325     } else if (flags & RXapif_EXISTS) {
5326         return reg_named_buff_exists(rx, key, flags)
5327             ? &PL_sv_yes
5328             : &PL_sv_no;
5329     } else if (flags & RXapif_REGNAMES) {
5330         return reg_named_buff_all(rx, flags);
5331     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5332         return reg_named_buff_scalar(rx, flags);
5333     } else {
5334         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5335         return NULL;
5336     }
5337 }
5338
5339 SV*
5340 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5341                          const U32 flags)
5342 {
5343     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5344     PERL_UNUSED_ARG(lastkey);
5345
5346     if (flags & RXapif_FIRSTKEY)
5347         return reg_named_buff_firstkey(rx, flags);
5348     else if (flags & RXapif_NEXTKEY)
5349         return reg_named_buff_nextkey(rx, flags);
5350     else {
5351         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5352         return NULL;
5353     }
5354 }
5355
5356 SV*
5357 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5358                           const U32 flags)
5359 {
5360     AV *retarray = NULL;
5361     SV *ret;
5362     struct regexp *const rx = (struct regexp *)SvANY(r);
5363
5364     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5365
5366     if (flags & RXapif_ALL)
5367         retarray=newAV();
5368
5369     if (rx && RXp_PAREN_NAMES(rx)) {
5370         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5371         if (he_str) {
5372             IV i;
5373             SV* sv_dat=HeVAL(he_str);
5374             I32 *nums=(I32*)SvPVX(sv_dat);
5375             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5376                 if ((I32)(rx->nparens) >= nums[i]
5377                     && rx->offs[nums[i]].start != -1
5378                     && rx->offs[nums[i]].end != -1)
5379                 {
5380                     ret = newSVpvs("");
5381                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5382                     if (!retarray)
5383                         return ret;
5384                 } else {
5385                     ret = newSVsv(&PL_sv_undef);
5386                 }
5387                 if (retarray)
5388                     av_push(retarray, ret);
5389             }
5390             if (retarray)
5391                 return newRV_noinc(MUTABLE_SV(retarray));
5392         }
5393     }
5394     return NULL;
5395 }
5396
5397 bool
5398 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5399                            const U32 flags)
5400 {
5401     struct regexp *const rx = (struct regexp *)SvANY(r);
5402
5403     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5404
5405     if (rx && RXp_PAREN_NAMES(rx)) {
5406         if (flags & RXapif_ALL) {
5407             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5408         } else {
5409             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5410             if (sv) {
5411                 SvREFCNT_dec(sv);
5412                 return TRUE;
5413             } else {
5414                 return FALSE;
5415             }
5416         }
5417     } else {
5418         return FALSE;
5419     }
5420 }
5421
5422 SV*
5423 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5424 {
5425     struct regexp *const rx = (struct regexp *)SvANY(r);
5426
5427     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5428
5429     if ( rx && RXp_PAREN_NAMES(rx) ) {
5430         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5431
5432         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5433     } else {
5434         return FALSE;
5435     }
5436 }
5437
5438 SV*
5439 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5440 {
5441     struct regexp *const rx = (struct regexp *)SvANY(r);
5442     GET_RE_DEBUG_FLAGS_DECL;
5443
5444     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5445
5446     if (rx && RXp_PAREN_NAMES(rx)) {
5447         HV *hv = RXp_PAREN_NAMES(rx);
5448         HE *temphe;
5449         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5450             IV i;
5451             IV parno = 0;
5452             SV* sv_dat = HeVAL(temphe);
5453             I32 *nums = (I32*)SvPVX(sv_dat);
5454             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5455                 if ((I32)(rx->lastparen) >= nums[i] &&
5456                     rx->offs[nums[i]].start != -1 &&
5457                     rx->offs[nums[i]].end != -1)
5458                 {
5459                     parno = nums[i];
5460                     break;
5461                 }
5462             }
5463             if (parno || flags & RXapif_ALL) {
5464                 return newSVhek(HeKEY_hek(temphe));
5465             }
5466         }
5467     }
5468     return NULL;
5469 }
5470
5471 SV*
5472 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5473 {
5474     SV *ret;
5475     AV *av;
5476     I32 length;
5477     struct regexp *const rx = (struct regexp *)SvANY(r);
5478
5479     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5480
5481     if (rx && RXp_PAREN_NAMES(rx)) {
5482         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5483             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5484         } else if (flags & RXapif_ONE) {
5485             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5486             av = MUTABLE_AV(SvRV(ret));
5487             length = av_len(av);
5488             SvREFCNT_dec(ret);
5489             return newSViv(length + 1);
5490         } else {
5491             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5492             return NULL;
5493         }
5494     }
5495     return &PL_sv_undef;
5496 }
5497
5498 SV*
5499 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5500 {
5501     struct regexp *const rx = (struct regexp *)SvANY(r);
5502     AV *av = newAV();
5503
5504     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5505
5506     if (rx && RXp_PAREN_NAMES(rx)) {
5507         HV *hv= RXp_PAREN_NAMES(rx);
5508         HE *temphe;
5509         (void)hv_iterinit(hv);
5510         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5511             IV i;
5512             IV parno = 0;
5513             SV* sv_dat = HeVAL(temphe);
5514             I32 *nums = (I32*)SvPVX(sv_dat);
5515             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5516                 if ((I32)(rx->lastparen) >= nums[i] &&
5517                     rx->offs[nums[i]].start != -1 &&
5518                     rx->offs[nums[i]].end != -1)
5519                 {
5520                     parno = nums[i];
5521                     break;
5522                 }
5523             }
5524             if (parno || flags & RXapif_ALL) {
5525                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5526             }
5527         }
5528     }
5529
5530     return newRV_noinc(MUTABLE_SV(av));
5531 }
5532
5533 void
5534 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5535                              SV * const sv)
5536 {
5537     struct regexp *const rx = (struct regexp *)SvANY(r);
5538     char *s = NULL;
5539     I32 i = 0;
5540     I32 s1, t1;
5541
5542     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5543
5544     if (!rx->subbeg) {
5545         sv_setsv(sv,&PL_sv_undef);
5546         return;
5547     }
5548     else
5549     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5550         /* $` */
5551         i = rx->offs[0].start;
5552         s = rx->subbeg;
5553     }
5554     else
5555     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5556         /* $' */
5557         s = rx->subbeg + rx->offs[0].end;
5558         i = rx->sublen - rx->offs[0].end;
5559     }
5560     else
5561     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5562         (s1 = rx->offs[paren].start) != -1 &&
5563         (t1 = rx->offs[paren].end) != -1)
5564     {
5565         /* $& $1 ... */
5566         i = t1 - s1;
5567         s = rx->subbeg + s1;
5568     } else {
5569         sv_setsv(sv,&PL_sv_undef);
5570         return;
5571     }
5572     assert(rx->sublen >= (s - rx->subbeg) + i );
5573     if (i >= 0) {
5574         const int oldtainted = PL_tainted;
5575         TAINT_NOT;
5576         sv_setpvn(sv, s, i);
5577         PL_tainted = oldtainted;
5578         if ( (rx->extflags & RXf_CANY_SEEN)
5579             ? (RXp_MATCH_UTF8(rx)
5580                         && (!i || is_utf8_string((U8*)s, i)))
5581             : (RXp_MATCH_UTF8(rx)) )
5582         {
5583             SvUTF8_on(sv);
5584         }
5585         else
5586             SvUTF8_off(sv);
5587         if (PL_tainting) {
5588             if (RXp_MATCH_TAINTED(rx)) {
5589                 if (SvTYPE(sv) >= SVt_PVMG) {
5590                     MAGIC* const mg = SvMAGIC(sv);
5591                     MAGIC* mgt;
5592                     PL_tainted = 1;
5593                     SvMAGIC_set(sv, mg->mg_moremagic);
5594                     SvTAINT(sv);
5595                     if ((mgt = SvMAGIC(sv))) {
5596                         mg->mg_moremagic = mgt;
5597                         SvMAGIC_set(sv, mg);
5598                     }
5599                 } else {
5600                     PL_tainted = 1;
5601                     SvTAINT(sv);
5602                 }
5603             } else
5604                 SvTAINTED_off(sv);
5605         }
5606     } else {
5607         sv_setsv(sv,&PL_sv_undef);
5608         return;
5609     }
5610 }
5611
5612 void
5613 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5614                                                          SV const * const value)
5615 {
5616     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5617
5618     PERL_UNUSED_ARG(rx);
5619     PERL_UNUSED_ARG(paren);
5620     PERL_UNUSED_ARG(value);
5621
5622     if (!PL_localizing)
5623         Perl_croak_no_modify(aTHX);
5624 }
5625
5626 I32
5627 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5628                               const I32 paren)
5629 {
5630     struct regexp *const rx = (struct regexp *)SvANY(r);
5631     I32 i;
5632     I32 s1, t1;
5633
5634     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5635
5636     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5637         switch (paren) {
5638       /* $` / ${^PREMATCH} */
5639       case RX_BUFF_IDX_PREMATCH:
5640         if (rx->offs[0].start != -1) {
5641                         i = rx->offs[0].start;
5642                         if (i > 0) {
5643                                 s1 = 0;
5644                                 t1 = i;
5645                                 goto getlen;
5646                         }
5647             }
5648         return 0;
5649       /* $' / ${^POSTMATCH} */
5650       case RX_BUFF_IDX_POSTMATCH:
5651             if (rx->offs[0].end != -1) {
5652                         i = rx->sublen - rx->offs[0].end;
5653                         if (i > 0) {
5654                                 s1 = rx->offs[0].end;
5655                                 t1 = rx->sublen;
5656                                 goto getlen;
5657                         }
5658             }
5659         return 0;
5660       /* $& / ${^MATCH}, $1, $2, ... */
5661       default:
5662             if (paren <= (I32)rx->nparens &&
5663             (s1 = rx->offs[paren].start) != -1 &&
5664             (t1 = rx->offs[paren].end) != -1)
5665             {
5666             i = t1 - s1;
5667             goto getlen;
5668         } else {
5669             if (ckWARN(WARN_UNINITIALIZED))
5670                 report_uninit((const SV *)sv);
5671             return 0;
5672         }
5673     }
5674   getlen:
5675     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5676         const char * const s = rx->subbeg + s1;
5677         const U8 *ep;
5678         STRLEN el;
5679
5680         i = t1 - s1;
5681         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5682                         i = el;
5683     }
5684     return i;
5685 }
5686
5687 SV*
5688 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5689 {
5690     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5691         PERL_UNUSED_ARG(rx);
5692         if (0)
5693             return NULL;
5694         else
5695             return newSVpvs("Regexp");
5696 }
5697
5698 /* Scans the name of a named buffer from the pattern.
5699  * If flags is REG_RSN_RETURN_NULL returns null.
5700  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5701  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5702  * to the parsed name as looked up in the RExC_paren_names hash.
5703  * If there is an error throws a vFAIL().. type exception.
5704  */
5705
5706 #define REG_RSN_RETURN_NULL    0
5707 #define REG_RSN_RETURN_NAME    1
5708 #define REG_RSN_RETURN_DATA    2
5709
5710 STATIC SV*
5711 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5712 {
5713     char *name_start = RExC_parse;
5714
5715     PERL_ARGS_ASSERT_REG_SCAN_NAME;
5716
5717     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5718          /* skip IDFIRST by using do...while */
5719         if (UTF)
5720             do {
5721                 RExC_parse += UTF8SKIP(RExC_parse);
5722             } while (isALNUM_utf8((U8*)RExC_parse));
5723         else
5724             do {
5725                 RExC_parse++;
5726             } while (isALNUM(*RExC_parse));
5727     }
5728
5729     if ( flags ) {
5730         SV* sv_name
5731             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5732                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5733         if ( flags == REG_RSN_RETURN_NAME)
5734             return sv_name;
5735         else if (flags==REG_RSN_RETURN_DATA) {
5736             HE *he_str = NULL;
5737             SV *sv_dat = NULL;
5738             if ( ! sv_name )      /* should not happen*/
5739                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5740             if (RExC_paren_names)
5741                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5742             if ( he_str )
5743                 sv_dat = HeVAL(he_str);
5744             if ( ! sv_dat )
5745                 vFAIL("Reference to nonexistent named group");
5746             return sv_dat;
5747         }
5748         else {
5749             Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5750         }
5751         /* NOT REACHED */
5752     }
5753     return NULL;
5754 }
5755
5756 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
5757     int rem=(int)(RExC_end - RExC_parse);                       \
5758     int cut;                                                    \
5759     int num;                                                    \
5760     int iscut=0;                                                \
5761     if (rem>10) {                                               \
5762         rem=10;                                                 \
5763         iscut=1;                                                \
5764     }                                                           \
5765     cut=10-rem;                                                 \
5766     if (RExC_lastparse!=RExC_parse)                             \
5767         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
5768             rem, RExC_parse,                                    \
5769             cut + 4,                                            \
5770             iscut ? "..." : "<"                                 \
5771         );                                                      \
5772     else                                                        \
5773         PerlIO_printf(Perl_debug_log,"%16s","");                \
5774                                                                 \
5775     if (SIZE_ONLY)                                              \
5776        num = RExC_size + 1;                                     \
5777     else                                                        \
5778        num=REG_NODE_NUM(RExC_emit);                             \
5779     if (RExC_lastnum!=num)                                      \
5780        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
5781     else                                                        \
5782        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
5783     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
5784         (int)((depth*2)), "",                                   \
5785         (funcname)                                              \
5786     );                                                          \
5787     RExC_lastnum=num;                                           \
5788     RExC_lastparse=RExC_parse;                                  \
5789 })
5790
5791
5792
5793 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
5794     DEBUG_PARSE_MSG((funcname));                            \
5795     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
5796 })
5797 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
5798     DEBUG_PARSE_MSG((funcname));                            \
5799     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
5800 })
5801
5802 /* This section of code defines the inversion list object and its methods.  The
5803  * interfaces are highly subject to change, so as much as possible is static to
5804  * this file.  An inversion list is here implemented as a malloc'd C array with
5805  * some added info.  More will be coming when functionality is added later.
5806  *
5807  * Some of the methods should always be private to the implementation, and some
5808  * should eventually be made public */
5809
5810 #define INVLIST_INITIAL_LEN 10
5811 #define INVLIST_ARRAY_KEY "array"
5812 #define INVLIST_MAX_KEY "max"
5813 #define INVLIST_LEN_KEY "len"
5814
5815 PERL_STATIC_INLINE UV*
5816 S_invlist_array(pTHX_ HV* const invlist)
5817 {
5818     /* Returns the pointer to the inversion list's array.  Every time the
5819      * length changes, this needs to be called in case malloc or realloc moved
5820      * it */
5821
5822     SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5823
5824     PERL_ARGS_ASSERT_INVLIST_ARRAY;
5825
5826     if (list_ptr == NULL) {
5827         Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5828                                                             INVLIST_ARRAY_KEY);
5829     }
5830
5831     return INT2PTR(UV *, SvUV(*list_ptr));
5832 }
5833
5834 PERL_STATIC_INLINE void
5835 S_invlist_set_array(pTHX_ HV* const invlist, const UV* const array)
5836 {
5837     PERL_ARGS_ASSERT_INVLIST_SET_ARRAY;
5838
5839     /* Sets the array stored in the inversion list to the memory beginning with
5840      * the parameter */
5841
5842     if (hv_stores(invlist, INVLIST_ARRAY_KEY, newSVuv(PTR2UV(array))) == NULL) {
5843         Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5844                                                             INVLIST_ARRAY_KEY);
5845     }
5846 }
5847
5848 PERL_STATIC_INLINE UV
5849 S_invlist_len(pTHX_ HV* const invlist)
5850 {
5851     /* Returns the current number of elements in the inversion list's array */
5852
5853     SV** len_ptr = hv_fetchs(invlist, INVLIST_LEN_KEY, FALSE);
5854
5855     PERL_ARGS_ASSERT_INVLIST_LEN;
5856
5857     if (len_ptr == NULL) {
5858         Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5859                                                             INVLIST_LEN_KEY);
5860     }
5861
5862     return SvUV(*len_ptr);
5863 }
5864
5865 PERL_STATIC_INLINE UV
5866 S_invlist_max(pTHX_ HV* const invlist)
5867 {
5868     /* Returns the maximum number of elements storable in the inversion list's
5869      * array, without having to realloc() */
5870
5871     SV** max_ptr = hv_fetchs(invlist, INVLIST_MAX_KEY, FALSE);
5872
5873     PERL_ARGS_ASSERT_INVLIST_MAX;
5874
5875     if (max_ptr == NULL) {
5876         Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5877                                                             INVLIST_MAX_KEY);
5878     }
5879
5880     return SvUV(*max_ptr);
5881 }
5882
5883 PERL_STATIC_INLINE void
5884 S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
5885 {
5886     /* Sets the current number of elements stored in the inversion list */
5887
5888     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
5889
5890     if (len != 0 && len > invlist_max(invlist)) {
5891         Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' more than %s=%"UVuf" in inversion list", INVLIST_LEN_KEY, len, INVLIST_MAX_KEY, invlist_max(invlist));
5892     }
5893
5894     if (hv_stores(invlist, INVLIST_LEN_KEY, newSVuv(len)) == NULL) {
5895         Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5896                                                             INVLIST_LEN_KEY);
5897     }
5898 }
5899
5900 PERL_STATIC_INLINE void
5901 S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
5902 {
5903
5904     /* Sets the maximum number of elements storable in the inversion list
5905      * without having to realloc() */
5906
5907     PERL_ARGS_ASSERT_INVLIST_SET_MAX;
5908
5909     if (max < invlist_len(invlist)) {
5910         Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' less than %s=%"UVuf" in inversion list", INVLIST_MAX_KEY, invlist_len(invlist), INVLIST_LEN_KEY, invlist_max(invlist));
5911     }
5912
5913     if (hv_stores(invlist, INVLIST_MAX_KEY, newSVuv(max)) == NULL) {
5914         Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5915                                                             INVLIST_LEN_KEY);
5916     }
5917 }
5918
5919 #ifndef PERL_IN_XSUB_RE
5920 HV*
5921 Perl__new_invlist(pTHX_ IV initial_size)
5922 {
5923
5924     /* Return a pointer to a newly constructed inversion list, with enough
5925      * space to store 'initial_size' elements.  If that number is negative, a
5926      * system default is used instead */
5927
5928     HV* invlist = newHV();
5929     UV* list;
5930
5931     if (initial_size < 0) {
5932         initial_size = INVLIST_INITIAL_LEN;
5933     }
5934
5935     /* Allocate the initial space */
5936     Newx(list, initial_size, UV);
5937     invlist_set_array(invlist, list);
5938
5939     /* set_len has to come before set_max, as the latter inspects the len */
5940     invlist_set_len(invlist, 0);
5941     invlist_set_max(invlist, initial_size);
5942
5943     return invlist;
5944 }
5945 #endif
5946
5947 PERL_STATIC_INLINE void
5948 S_invlist_destroy(pTHX_ HV* const invlist)
5949 {
5950    /* Inversion list destructor */
5951
5952     SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5953
5954     PERL_ARGS_ASSERT_INVLIST_DESTROY;
5955
5956     if (list_ptr != NULL) {
5957         UV *list = INT2PTR(UV *, SvUV(*list_ptr)); /* PERL_POISON needs lvalue */
5958         Safefree(list);
5959     }
5960 }
5961
5962 STATIC void
5963 S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
5964 {
5965     /* Change the maximum size of an inversion list (up or down) */
5966
5967     UV* orig_array;
5968     UV* array;
5969     const UV old_max = invlist_max(invlist);
5970
5971     PERL_ARGS_ASSERT_INVLIST_EXTEND;
5972
5973     if (old_max == new_max) {   /* If a no-op */
5974         return;
5975     }
5976
5977     array = orig_array = invlist_array(invlist);
5978     Renew(array, new_max, UV);
5979
5980     /* If the size change moved the list in memory, set the new one */
5981     if (array != orig_array) {
5982         invlist_set_array(invlist, array);
5983     }
5984
5985     invlist_set_max(invlist, new_max);
5986
5987 }
5988
5989 PERL_STATIC_INLINE void
5990 S_invlist_trim(pTHX_ HV* const invlist)
5991 {
5992     PERL_ARGS_ASSERT_INVLIST_TRIM;
5993
5994     /* Change the length of the inversion list to how many entries it currently
5995      * has */
5996
5997     invlist_extend(invlist, invlist_len(invlist));
5998 }
5999
6000 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6001  * etc */
6002
6003 #define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
6004
6005 #ifndef PERL_IN_XSUB_RE
6006 void
6007 Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
6008 {
6009    /* Subject to change or removal.  Append the range from 'start' to 'end' at
6010     * the end of the inversion list.  The range must be above any existing
6011     * ones. */
6012
6013     UV* array = invlist_array(invlist);
6014     UV max = invlist_max(invlist);
6015     UV len = invlist_len(invlist);
6016
6017     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6018
6019     if (len > 0) {
6020
6021         /* Here, the existing list is non-empty. The current max entry in the
6022          * list is generally the first value not in the set, except when the
6023          * set extends to the end of permissible values, in which case it is
6024          * the first entry in that final set, and so this call is an attempt to
6025          * append out-of-order */
6026
6027         UV final_element = len - 1;
6028         if (array[final_element] > start
6029             || ELEMENT_IN_INVLIST_SET(final_element))
6030         {
6031             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
6032         }
6033
6034         /* Here, it is a legal append.  If the new range begins with the first
6035          * value not in the set, it is extending the set, so the new first
6036          * value not in the set is one greater than the newly extended range.
6037          * */
6038         if (array[final_element] == start) {
6039             if (end != UV_MAX) {
6040                 array[final_element] = end + 1;
6041             }
6042             else {
6043                 /* But if the end is the maximum representable on the machine,
6044                  * just let the range that this would extend have no end */
6045                 invlist_set_len(invlist, len - 1);
6046             }
6047             return;
6048         }
6049     }
6050
6051     /* Here the new range doesn't extend any existing set.  Add it */
6052
6053     len += 2;   /* Includes an element each for the start and end of range */
6054
6055     /* If overflows the existing space, extend, which may cause the array to be
6056      * moved */
6057     if (max < len) {
6058         invlist_extend(invlist, len);
6059         array = invlist_array(invlist);
6060     }
6061
6062     invlist_set_len(invlist, len);
6063
6064     /* The next item on the list starts the range, the one after that is
6065      * one past the new range.  */
6066     array[len - 2] = start;
6067     if (end != UV_MAX) {
6068         array[len - 1] = end + 1;
6069     }
6070     else {
6071         /* But if the end is the maximum representable on the machine, just let
6072          * the range have no end */
6073         invlist_set_len(invlist, len - 1);
6074     }
6075 }
6076 #endif
6077
6078 STATIC HV*
6079 S_invlist_union(pTHX_ HV* const a, HV* const b)
6080 {
6081     /* Return a new inversion list which is the union of two inversion lists.
6082      * The basis for this comes from "Unicode Demystified" Chapter 13 by
6083      * Richard Gillam, published by Addison-Wesley, and explained at some
6084      * length there.  The preface says to incorporate its examples into your
6085      * code at your own risk.
6086      *
6087      * The algorithm is like a merge sort.
6088      *
6089      * XXX A potential performance improvement is to keep track as we go along
6090      * if only one of the inputs contributes to the result, meaning the other
6091      * is a subset of that one.  In that case, we can skip the final copy and
6092      * return the larger of the input lists */
6093
6094     UV* array_a = invlist_array(a);   /* a's array */
6095     UV* array_b = invlist_array(b);
6096     UV len_a = invlist_len(a);  /* length of a's array */
6097     UV len_b = invlist_len(b);
6098
6099     HV* u;                      /* the resulting union */
6100     UV* array_u;
6101     UV len_u;
6102
6103     UV i_a = 0;             /* current index into a's array */
6104     UV i_b = 0;
6105     UV i_u = 0;
6106
6107     /* running count, as explained in the algorithm source book; items are
6108      * stopped accumulating and are output when the count changes to/from 0.
6109      * The count is incremented when we start a range that's in the set, and
6110      * decremented when we start a range that's not in the set.  So its range
6111      * is 0 to 2.  Only when the count is zero is something not in the set.
6112      */
6113     UV count = 0;
6114
6115     PERL_ARGS_ASSERT_INVLIST_UNION;
6116
6117     /* Size the union for the worst case: that the sets are completely
6118      * disjoint */
6119     u = _new_invlist(len_a + len_b);
6120     array_u = invlist_array(u);
6121
6122     /* Go through each list item by item, stopping when exhausted one of
6123      * them */
6124     while (i_a < len_a && i_b < len_b) {
6125         UV cp;      /* The element to potentially add to the union's array */
6126         bool cp_in_set;   /* is it in the the input list's set or not */
6127
6128         /* We need to take one or the other of the two inputs for the union.
6129          * Since we are merging two sorted lists, we take the smaller of the
6130          * next items.  In case of a tie, we take the one that is in its set
6131          * first.  If we took one not in the set first, it would decrement the
6132          * count, possibly to 0 which would cause it to be output as ending the
6133          * range, and the next time through we would take the same number, and
6134          * output it again as beginning the next range.  By doing it the
6135          * opposite way, there is no possibility that the count will be
6136          * momentarily decremented to 0, and thus the two adjoining ranges will
6137          * be seamlessly merged.  (In a tie and both are in the set or both not
6138          * in the set, it doesn't matter which we take first.) */
6139         if (array_a[i_a] < array_b[i_b]
6140             || (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
6141         {
6142             cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6143             cp= array_a[i_a++];
6144         }
6145         else {
6146             cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6147             cp= array_b[i_b++];
6148         }
6149
6150         /* Here, have chosen which of the two inputs to look at.  Only output
6151          * if the running count changes to/from 0, which marks the
6152          * beginning/end of a range in that's in the set */
6153         if (cp_in_set) {
6154             if (count == 0) {
6155                 array_u[i_u++] = cp;
6156             }
6157             count++;
6158         }
6159         else {
6160             count--;
6161             if (count == 0) {
6162                 array_u[i_u++] = cp;
6163             }
6164         }
6165     }
6166
6167     /* Here, we are finished going through at least one of the lists, which
6168      * means there is something remaining in at most one.  We check if the list
6169      * that hasn't been exhausted is positioned such that we are in the middle
6170      * of a range in its set or not.  (We are in the set if the next item in
6171      * the array marks the beginning of something not in the set)   If in the
6172      * set, we decrement 'count'; if 0, there is potentially more to output.
6173      * There are four cases:
6174      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
6175      *     in the union is entirely from the non-exhausted set.
6176      *  2) Both were in their sets, count is 2.  Nothing further should
6177      *     be output, as everything that remains will be in the exhausted
6178      *     list's set, hence in the union; decrementing to 1 but not 0 insures
6179      *     that
6180      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
6181      *     Nothing further should be output because the union includes
6182      *     everything from the exhausted set.  Not decrementing insures that.
6183      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6184      *     decrementing to 0 insures that we look at the remainder of the
6185      *     non-exhausted set */
6186     if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6187         || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6188     {
6189         count--;
6190     }
6191
6192     /* The final length is what we've output so far, plus what else is about to
6193      * be output.  (If 'count' is non-zero, then the input list we exhausted
6194      * has everything remaining up to the machine's limit in its set, and hence
6195      * in the union, so there will be no further output. */
6196     len_u = i_u;
6197     if (count == 0) {
6198         /* At most one of the subexpressions will be non-zero */
6199         len_u += (len_a - i_a) + (len_b - i_b);
6200     }
6201
6202     /* Set result to final length, which can change the pointer to array_u, so
6203      * re-find it */
6204     if (len_u != invlist_len(u)) {
6205         invlist_set_len(u, len_u);
6206         invlist_trim(u);
6207         array_u = invlist_array(u);
6208     }
6209
6210     /* When 'count' is 0, the list that was exhausted (if one was shorter than
6211      * the other) ended with everything above it not in its set.  That means
6212      * that the remaining part of the union is precisely the same as the
6213      * non-exhausted list, so can just copy it unchanged.  (If both list were
6214      * exhausted at the same time, then the operations below will be both 0.)
6215      */
6216     if (count == 0) {
6217         IV copy_count; /* At most one will have a non-zero copy count */
6218         if ((copy_count = len_a - i_a) > 0) {
6219             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6220         }
6221         else if ((copy_count = len_b - i_b) > 0) {
6222             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6223         }
6224     }
6225
6226     return u;
6227 }
6228
6229 STATIC HV*
6230 S_invlist_intersection(pTHX_ HV* const a, HV* const b)
6231 {
6232     /* Return the intersection of two inversion lists.  The basis for this
6233      * comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
6234      * by Addison-Wesley, and explained at some length there.  The preface says
6235      * to incorporate its examples into your code at your own risk.
6236      *
6237      * The algorithm is like a merge sort, and is essentially the same as the
6238      * union above
6239      */
6240
6241     UV* array_a = invlist_array(a);   /* a's array */
6242     UV* array_b = invlist_array(b);
6243     UV len_a = invlist_len(a);  /* length of a's array */
6244     UV len_b = invlist_len(b);
6245
6246     HV* r;                   /* the resulting intersection */
6247     UV* array_r;
6248     UV len_r;
6249
6250     UV i_a = 0;             /* current index into a's array */
6251     UV i_b = 0;
6252     UV i_r = 0;
6253
6254     /* running count, as explained in the algorithm source book; items are
6255      * stopped accumulating and are output when the count changes to/from 2.
6256      * The count is incremented when we start a range that's in the set, and
6257      * decremented when we start a range that's not in the set.  So its range
6258      * is 0 to 2.  Only when the count is 2 is something in the intersection.
6259      */
6260     UV count = 0;
6261
6262     PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
6263
6264     /* Size the intersection for the worst case: that the intersection ends up
6265      * fragmenting everything to be completely disjoint */
6266     r= _new_invlist(len_a + len_b);
6267     array_r = invlist_array(r);
6268
6269     /* Go through each list item by item, stopping when exhausted one of
6270      * them */
6271     while (i_a < len_a && i_b < len_b) {
6272         UV cp;      /* The element to potentially add to the intersection's
6273                        array */
6274         bool cp_in_set; /* Is it in the input list's set or not */
6275
6276         /* We need to take one or the other of the two inputs for the union.
6277          * Since we are merging two sorted lists, we take the smaller of the
6278          * next items.  In case of a tie, we take the one that is not in its
6279          * set first (a difference from the union algorithm).  If we took one
6280          * in the set first, it would increment the count, possibly to 2 which
6281          * would cause it to be output as starting a range in the intersection,
6282          * and the next time through we would take that same number, and output
6283          * it again as ending the set.  By doing it the opposite of this, we
6284          * there is no possibility that the count will be momentarily
6285          * incremented to 2.  (In a tie and both are in the set or both not in
6286          * the set, it doesn't matter which we take first.) */
6287         if (array_a[i_a] < array_b[i_b]
6288             || (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
6289         {
6290             cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6291             cp= array_a[i_a++];
6292         }
6293         else {
6294             cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6295             cp= array_b[i_b++];
6296         }
6297
6298         /* Here, have chosen which of the two inputs to look at.  Only output
6299          * if the running count changes to/from 2, which marks the
6300          * beginning/end of a range that's in the intersection */
6301         if (cp_in_set) {
6302             count++;
6303             if (count == 2) {
6304                 array_r[i_r++] = cp;
6305             }
6306         }
6307         else {
6308             if (count == 2) {
6309                 array_r[i_r++] = cp;
6310             }
6311             count--;
6312         }
6313     }
6314
6315     /* Here, we are finished going through at least one of the sets, which
6316      * means there is something remaining in at most one.  See the comments in
6317      * the union code */
6318     if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6319         || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6320     {
6321         count--;
6322     }
6323
6324     /* The final length is what we've output so far plus what else is in the
6325      * intersection.  Only one of the subexpressions below will be non-zero */
6326     len_r = i_r;
6327     if (count == 2) {
6328         len_r += (len_a - i_a) + (len_b - i_b);
6329     }
6330
6331     /* Set result to final length, which can change the pointer to array_r, so
6332      * re-find it */
6333     if (len_r != invlist_len(r)) {
6334         invlist_set_len(r, len_r);
6335         invlist_trim(r);
6336         array_r = invlist_array(r);
6337     }
6338
6339     /* Finish outputting any remaining */
6340     if (count == 2) { /* Only one of will have a non-zero copy count */
6341         IV copy_count;
6342         if ((copy_count = len_a - i_a) > 0) {
6343             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
6344         }
6345         else if ((copy_count = len_b - i_b) > 0) {
6346             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
6347         }
6348     }
6349
6350     return r;
6351 }
6352
6353 STATIC HV*
6354 S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end)
6355 {
6356     /* Add the range from 'start' to 'end' inclusive to the inversion list's
6357      * set.  A pointer to the inversion list is returned.  This may actually be
6358      * a new list, in which case the passed in one has been destroyed.  The
6359      * passed in inversion list can be NULL, in which case a new one is created
6360      * with just the one range in it */
6361
6362     HV* range_invlist;
6363     HV* added_invlist;
6364     UV len;
6365
6366     if (invlist == NULL) {
6367         invlist = _new_invlist(2);
6368         len = 0;
6369     }
6370     else {
6371         len = invlist_len(invlist);
6372     }
6373
6374     /* If comes after the final entry, can just append it to the end */
6375     if (len == 0
6376         || start >= invlist_array(invlist)
6377                                     [invlist_len(invlist) - 1])
6378     {
6379         _append_range_to_invlist(invlist, start, end);
6380         return invlist;
6381     }
6382
6383     /* Here, can't just append things, create and return a new inversion list
6384      * which is the union of this range and the existing inversion list */
6385     range_invlist = _new_invlist(2);
6386     _append_range_to_invlist(range_invlist, start, end);
6387
6388     added_invlist = invlist_union(invlist, range_invlist);
6389
6390     /* The passed in list can be freed, as well as our temporary */
6391     invlist_destroy(range_invlist);
6392     if (invlist != added_invlist) {
6393         invlist_destroy(invlist);
6394     }
6395
6396     return added_invlist;
6397 }
6398
6399 PERL_STATIC_INLINE HV*
6400 S_add_cp_to_invlist(pTHX_ HV* invlist, const UV cp) {
6401     return add_range_to_invlist(invlist, cp, cp);
6402 }
6403
6404 /* End of inversion list object */
6405
6406 /*
6407  - reg - regular expression, i.e. main body or parenthesized thing
6408  *
6409  * Caller must absorb opening parenthesis.
6410  *
6411  * Combining parenthesis handling with the base level of regular expression
6412  * is a trifle forced, but the need to tie the tails of the branches to what
6413  * follows makes it hard to avoid.
6414  */
6415 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
6416 #ifdef DEBUGGING
6417 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
6418 #else
6419 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
6420 #endif
6421
6422 STATIC regnode *
6423 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
6424     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
6425 {
6426     dVAR;
6427     register regnode *ret;              /* Will be the head of the group. */
6428     register regnode *br;
6429     register regnode *lastbr;
6430     register regnode *ender = NULL;
6431     register I32 parno = 0;
6432     I32 flags;
6433     U32 oregflags = RExC_flags;
6434     bool have_branch = 0;
6435     bool is_open = 0;
6436     I32 freeze_paren = 0;
6437     I32 after_freeze = 0;
6438
6439     /* for (?g), (?gc), and (?o) warnings; warning
6440        about (?c) will warn about (?g) -- japhy    */
6441
6442 #define WASTED_O  0x01
6443 #define WASTED_G  0x02
6444 #define WASTED_C  0x04
6445 #define WASTED_GC (0x02|0x04)
6446     I32 wastedflags = 0x00;
6447
6448     char * parse_start = RExC_parse; /* MJD */
6449     char * const oregcomp_parse = RExC_parse;
6450
6451     GET_RE_DEBUG_FLAGS_DECL;
6452
6453     PERL_ARGS_ASSERT_REG;
6454     DEBUG_PARSE("reg ");
6455
6456     *flagp = 0;                         /* Tentatively. */
6457
6458
6459     /* Make an OPEN node, if parenthesized. */
6460     if (paren) {
6461         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
6462             char *start_verb = RExC_parse;
6463             STRLEN verb_len = 0;
6464             char *start_arg = NULL;
6465             unsigned char op = 0;
6466             int argok = 1;
6467             int internal_argval = 0; /* internal_argval is only useful if !argok */
6468             while ( *RExC_parse && *RExC_parse != ')' ) {
6469                 if ( *RExC_parse == ':' ) {
6470                     start_arg = RExC_parse + 1;
6471                     break;
6472                 }
6473                 RExC_parse++;
6474             }
6475             ++start_verb;
6476             verb_len = RExC_parse - start_verb;
6477             if ( start_arg ) {
6478                 RExC_parse++;
6479                 while ( *RExC_parse && *RExC_parse != ')' )
6480                     RExC_parse++;
6481                 if ( *RExC_parse != ')' )
6482                     vFAIL("Unterminated verb pattern argument");
6483                 if ( RExC_parse == start_arg )
6484                     start_arg = NULL;
6485             } else {
6486                 if ( *RExC_parse != ')' )
6487                     vFAIL("Unterminated verb pattern");
6488             }
6489
6490             switch ( *start_verb ) {
6491             case 'A':  /* (*ACCEPT) */
6492                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
6493                     op = ACCEPT;
6494                     internal_argval = RExC_nestroot;
6495                 }
6496                 break;
6497             case 'C':  /* (*COMMIT) */
6498                 if ( memEQs(start_verb,verb_len,"COMMIT") )
6499                     op = COMMIT;
6500                 break;
6501             case 'F':  /* (*FAIL) */
6502                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
6503                     op = OPFAIL;
6504                     argok = 0;
6505                 }
6506                 break;
6507             case ':':  /* (*:NAME) */
6508             case 'M':  /* (*MARK:NAME) */
6509                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
6510                     op = MARKPOINT;
6511                     argok = -1;
6512                 }
6513                 break;
6514             case 'P':  /* (*PRUNE) */
6515                 if ( memEQs(start_verb,verb_len,"PRUNE") )
6516                     op = PRUNE;
6517                 break;
6518             case 'S':   /* (*SKIP) */
6519                 if ( memEQs(start_verb,verb_len,"SKIP") )
6520                     op = SKIP;
6521                 break;
6522             case 'T':  /* (*THEN) */
6523                 /* [19:06] <TimToady> :: is then */
6524                 if ( memEQs(start_verb,verb_len,"THEN") ) {
6525                     op = CUTGROUP;
6526                     RExC_seen |= REG_SEEN_CUTGROUP;
6527                 }
6528                 break;
6529             }
6530             if ( ! op ) {
6531                 RExC_parse++;
6532                 vFAIL3("Unknown verb pattern '%.*s'",
6533                     verb_len, start_verb);
6534             }
6535             if ( argok ) {
6536                 if ( start_arg && internal_argval ) {
6537                     vFAIL3("Verb pattern '%.*s' may not have an argument",
6538                         verb_len, start_verb);
6539                 } else if ( argok < 0 && !start_arg ) {
6540                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
6541                         verb_len, start_verb);
6542                 } else {
6543                     ret = reganode(pRExC_state, op, internal_argval);
6544                     if ( ! internal_argval && ! SIZE_ONLY ) {
6545                         if (start_arg) {
6546                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
6547                             ARG(ret) = add_data( pRExC_state, 1, "S" );
6548                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
6549                             ret->flags = 0;
6550                         } else {
6551                             ret->flags = 1;
6552                         }
6553                     }
6554                 }
6555                 if (!internal_argval)
6556                     RExC_seen |= REG_SEEN_VERBARG;
6557             } else if ( start_arg ) {
6558                 vFAIL3("Verb pattern '%.*s' may not have an argument",
6559                         verb_len, start_verb);
6560             } else {
6561                 ret = reg_node(pRExC_state, op);
6562             }
6563             nextchar(pRExC_state);
6564             return ret;
6565         } else
6566         if (*RExC_parse == '?') { /* (?...) */
6567             bool is_logical = 0;
6568             const char * const seqstart = RExC_parse;
6569             bool has_use_defaults = FALSE;
6570
6571             RExC_parse++;
6572             paren = *RExC_parse++;
6573             ret = NULL;                 /* For look-ahead/behind. */
6574             switch (paren) {
6575
6576             case 'P':   /* (?P...) variants for those used to PCRE/Python */
6577                 paren = *RExC_parse++;
6578                 if ( paren == '<')         /* (?P<...>) named capture */
6579                     goto named_capture;
6580                 else if (paren == '>') {   /* (?P>name) named recursion */
6581                     goto named_recursion;
6582                 }
6583                 else if (paren == '=') {   /* (?P=...)  named backref */
6584                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
6585                        you change this make sure you change that */
6586                     char* name_start = RExC_parse;
6587                     U32 num = 0;
6588                     SV *sv_dat = reg_scan_name(pRExC_state,
6589                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6590                     if (RExC_parse == name_start || *RExC_parse != ')')
6591                         vFAIL2("Sequence %.3s... not terminated",parse_start);
6592
6593                     if (!SIZE_ONLY) {
6594                         num = add_data( pRExC_state, 1, "S" );
6595                         RExC_rxi->data->data[num]=(void*)sv_dat;
6596                         SvREFCNT_inc_simple_void(sv_dat);
6597                     }
6598                     RExC_sawback = 1;
6599                     ret = reganode(pRExC_state,
6600                                    ((! FOLD)
6601                                      ? NREF
6602                                      : (MORE_ASCII_RESTRICTED)
6603                                        ? NREFFA
6604                                        : (AT_LEAST_UNI_SEMANTICS)
6605                                          ? NREFFU
6606                                          : (LOC)
6607                                            ? NREFFL
6608                                            : NREFF),
6609                                     num);
6610                     *flagp |= HASWIDTH;
6611
6612                     Set_Node_Offset(ret, parse_start+1);
6613                     Set_Node_Cur_Length(ret); /* MJD */
6614
6615                     nextchar(pRExC_state);
6616                     return ret;
6617                 }
6618                 RExC_parse++;
6619                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6620                 /*NOTREACHED*/
6621             case '<':           /* (?<...) */
6622                 if (*RExC_parse == '!')
6623                     paren = ',';
6624                 else if (*RExC_parse != '=')
6625               named_capture:
6626                 {               /* (?<...>) */
6627                     char *name_start;
6628                     SV *svname;
6629                     paren= '>';
6630             case '\'':          /* (?'...') */
6631                     name_start= RExC_parse;
6632                     svname = reg_scan_name(pRExC_state,
6633                         SIZE_ONLY ?  /* reverse test from the others */
6634                         REG_RSN_RETURN_NAME :
6635                         REG_RSN_RETURN_NULL);
6636                     if (RExC_parse == name_start) {
6637                         RExC_parse++;
6638                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6639                         /*NOTREACHED*/
6640                     }
6641                     if (*RExC_parse != paren)
6642                         vFAIL2("Sequence (?%c... not terminated",
6643                             paren=='>' ? '<' : paren);
6644                     if (SIZE_ONLY) {
6645                         HE *he_str;
6646                         SV *sv_dat = NULL;
6647                         if (!svname) /* shouldn't happen */
6648                             Perl_croak(aTHX_
6649                                 "panic: reg_scan_name returned NULL");
6650                         if (!RExC_paren_names) {
6651                             RExC_paren_names= newHV();
6652                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
6653 #ifdef DEBUGGING
6654                             RExC_paren_name_list= newAV();
6655                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
6656 #endif
6657                         }
6658                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
6659                         if ( he_str )
6660                             sv_dat = HeVAL(he_str);
6661                         if ( ! sv_dat ) {
6662                             /* croak baby croak */
6663                             Perl_croak(aTHX_
6664                                 "panic: paren_name hash element allocation failed");
6665                         } else if ( SvPOK(sv_dat) ) {
6666                             /* (?|...) can mean we have dupes so scan to check
6667                                its already been stored. Maybe a flag indicating
6668                                we are inside such a construct would be useful,
6669                                but the arrays are likely to be quite small, so
6670                                for now we punt -- dmq */
6671                             IV count = SvIV(sv_dat);
6672                             I32 *pv = (I32*)SvPVX(sv_dat);
6673                             IV i;
6674                             for ( i = 0 ; i < count ; i++ ) {
6675                                 if ( pv[i] == RExC_npar ) {
6676                                     count = 0;
6677                                     break;
6678                                 }
6679                             }
6680                             if ( count ) {
6681                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
6682                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
6683                                 pv[count] = RExC_npar;
6684                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
6685                             }
6686                         } else {
6687                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
6688                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
6689                             SvIOK_on(sv_dat);
6690                             SvIV_set(sv_dat, 1);
6691                         }
6692 #ifdef DEBUGGING
6693                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
6694                             SvREFCNT_dec(svname);
6695 #endif
6696
6697                         /*sv_dump(sv_dat);*/
6698                     }
6699                     nextchar(pRExC_state);
6700                     paren = 1;
6701                     goto capturing_parens;
6702                 }
6703                 RExC_seen |= REG_SEEN_LOOKBEHIND;
6704                 RExC_in_lookbehind++;
6705                 RExC_parse++;
6706             case '=':           /* (?=...) */
6707                 RExC_seen_zerolen++;
6708                 break;
6709             case '!':           /* (?!...) */
6710                 RExC_seen_zerolen++;
6711                 if (*RExC_parse == ')') {
6712                     ret=reg_node(pRExC_state, OPFAIL);
6713                     nextchar(pRExC_state);
6714                     return ret;
6715                 }
6716                 break;
6717             case '|':           /* (?|...) */
6718                 /* branch reset, behave like a (?:...) except that
6719                    buffers in alternations share the same numbers */
6720                 paren = ':';
6721                 after_freeze = freeze_paren = RExC_npar;
6722                 break;
6723             case ':':           /* (?:...) */
6724             case '>':           /* (?>...) */
6725                 break;
6726             case '$':           /* (?$...) */
6727             case '@':           /* (?@...) */
6728                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
6729                 break;
6730             case '#':           /* (?#...) */
6731                 while (*RExC_parse && *RExC_parse != ')')
6732                     RExC_parse++;
6733                 if (*RExC_parse != ')')
6734                     FAIL("Sequence (?#... not terminated");
6735                 nextchar(pRExC_state);
6736                 *flagp = TRYAGAIN;
6737                 return NULL;
6738             case '0' :           /* (?0) */
6739             case 'R' :           /* (?R) */
6740                 if (*RExC_parse != ')')
6741                     FAIL("Sequence (?R) not terminated");
6742                 ret = reg_node(pRExC_state, GOSTART);
6743                 *flagp |= POSTPONED;
6744                 nextchar(pRExC_state);
6745                 return ret;
6746                 /*notreached*/
6747             { /* named and numeric backreferences */
6748                 I32 num;
6749             case '&':            /* (?&NAME) */
6750                 parse_start = RExC_parse - 1;
6751               named_recursion:
6752                 {
6753                     SV *sv_dat = reg_scan_name(pRExC_state,
6754                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6755                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6756                 }
6757                 goto gen_recurse_regop;
6758                 /* NOT REACHED */
6759             case '+':
6760                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6761                     RExC_parse++;
6762                     vFAIL("Illegal pattern");
6763                 }
6764                 goto parse_recursion;
6765                 /* NOT REACHED*/
6766             case '-': /* (?-1) */
6767                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6768                     RExC_parse--; /* rewind to let it be handled later */
6769                     goto parse_flags;
6770                 }
6771                 /*FALLTHROUGH */
6772             case '1': case '2': case '3': case '4': /* (?1) */
6773             case '5': case '6': case '7': case '8': case '9':
6774                 RExC_parse--;
6775               parse_recursion:
6776                 num = atoi(RExC_parse);
6777                 parse_start = RExC_parse - 1; /* MJD */
6778                 if (*RExC_parse == '-')
6779                     RExC_parse++;
6780                 while (isDIGIT(*RExC_parse))
6781                         RExC_parse++;
6782                 if (*RExC_parse!=')')
6783                     vFAIL("Expecting close bracket");
6784
6785               gen_recurse_regop:
6786                 if ( paren == '-' ) {
6787                     /*
6788                     Diagram of capture buffer numbering.
6789                     Top line is the normal capture buffer numbers
6790                     Bottom line is the negative indexing as from
6791                     the X (the (?-2))
6792
6793                     +   1 2    3 4 5 X          6 7
6794                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6795                     -   5 4    3 2 1 X          x x
6796
6797                     */
6798                     num = RExC_npar + num;
6799                     if (num < 1)  {
6800                         RExC_parse++;
6801                         vFAIL("Reference to nonexistent group");
6802                     }
6803                 } else if ( paren == '+' ) {
6804                     num = RExC_npar + num - 1;
6805                 }
6806
6807                 ret = reganode(pRExC_state, GOSUB, num);
6808                 if (!SIZE_ONLY) {
6809                     if (num > (I32)RExC_rx->nparens) {
6810                         RExC_parse++;
6811                         vFAIL("Reference to nonexistent group");
6812                     }
6813                     ARG2L_SET( ret, RExC_recurse_count++);
6814                     RExC_emit++;
6815                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6816                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6817                 } else {
6818                     RExC_size++;
6819                 }
6820                 RExC_seen |= REG_SEEN_RECURSE;
6821                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6822                 Set_Node_Offset(ret, parse_start); /* MJD */
6823
6824                 *flagp |= POSTPONED;
6825                 nextchar(pRExC_state);
6826                 return ret;
6827             } /* named and numeric backreferences */
6828             /* NOT REACHED */
6829
6830             case '?':           /* (??...) */
6831                 is_logical = 1;
6832                 if (*RExC_parse != '{') {
6833                     RExC_parse++;
6834                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6835                     /*NOTREACHED*/
6836                 }
6837                 *flagp |= POSTPONED;
6838                 paren = *RExC_parse++;
6839                 /* FALL THROUGH */
6840             case '{':           /* (?{...}) */
6841             {
6842                 I32 count = 1;
6843                 U32 n = 0;
6844                 char c;
6845                 char *s = RExC_parse;
6846
6847                 RExC_seen_zerolen++;
6848                 RExC_seen |= REG_SEEN_EVAL;
6849                 while (count && (c = *RExC_parse)) {
6850                     if (c == '\\') {
6851                         if (RExC_parse[1])
6852                             RExC_parse++;
6853                     }
6854                     else if (c == '{')
6855                         count++;
6856                     else if (c == '}')
6857                         count--;
6858                     RExC_parse++;
6859                 }
6860                 if (*RExC_parse != ')') {
6861                     RExC_parse = s;
6862                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6863                 }
6864                 if (!SIZE_ONLY) {
6865                     PAD *pad;
6866                     OP_4tree *sop, *rop;
6867                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6868
6869                     ENTER;
6870                     Perl_save_re_context(aTHX);
6871                     rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6872                     sop->op_private |= OPpREFCOUNTED;
6873                     /* re_dup will OpREFCNT_inc */
6874                     OpREFCNT_set(sop, 1);
6875                     LEAVE;
6876
6877                     n = add_data(pRExC_state, 3, "nop");
6878                     RExC_rxi->data->data[n] = (void*)rop;
6879                     RExC_rxi->data->data[n+1] = (void*)sop;
6880                     RExC_rxi->data->data[n+2] = (void*)pad;
6881                     SvREFCNT_dec(sv);
6882                 }
6883                 else {                                          /* First pass */
6884                     if (PL_reginterp_cnt < ++RExC_seen_evals
6885                         && IN_PERL_RUNTIME)
6886                         /* No compiled RE interpolated, has runtime
6887                            components ===> unsafe.  */
6888                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
6889                     if (PL_tainting && PL_tainted)
6890                         FAIL("Eval-group in insecure regular expression");
6891 #if PERL_VERSION > 8
6892                     if (IN_PERL_COMPILETIME)
6893                         PL_cv_has_eval = 1;
6894 #endif
6895                 }
6896
6897                 nextchar(pRExC_state);
6898                 if (is_logical) {
6899                     ret = reg_node(pRExC_state, LOGICAL);
6900                     if (!SIZE_ONLY)
6901                         ret->flags = 2;
6902                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6903                     /* deal with the length of this later - MJD */
6904                     return ret;
6905                 }
6906                 ret = reganode(pRExC_state, EVAL, n);
6907                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6908                 Set_Node_Offset(ret, parse_start);
6909                 return ret;
6910             }
6911             case '(':           /* (?(?{...})...) and (?(?=...)...) */
6912             {
6913                 int is_define= 0;
6914                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
6915                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6916                         || RExC_parse[1] == '<'
6917                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
6918                         I32 flag;
6919
6920                         ret = reg_node(pRExC_state, LOGICAL);
6921                         if (!SIZE_ONLY)
6922                             ret->flags = 1;
6923                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6924                         goto insert_if;
6925                     }
6926                 }
6927                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
6928                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6929                 {
6930                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
6931                     char *name_start= RExC_parse++;
6932                     U32 num = 0;
6933                     SV *sv_dat=reg_scan_name(pRExC_state,
6934                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6935                     if (RExC_parse == name_start || *RExC_parse != ch)
6936                         vFAIL2("Sequence (?(%c... not terminated",
6937                             (ch == '>' ? '<' : ch));
6938                     RExC_parse++;
6939                     if (!SIZE_ONLY) {
6940                         num = add_data( pRExC_state, 1, "S" );
6941                         RExC_rxi->data->data[num]=(void*)sv_dat;
6942                         SvREFCNT_inc_simple_void(sv_dat);
6943                     }
6944                     ret = reganode(pRExC_state,NGROUPP,num);
6945                     goto insert_if_check_paren;
6946                 }
6947                 else if (RExC_parse[0] == 'D' &&
6948                          RExC_parse[1] == 'E' &&
6949                          RExC_parse[2] == 'F' &&
6950                          RExC_parse[3] == 'I' &&
6951                          RExC_parse[4] == 'N' &&
6952                          RExC_parse[5] == 'E')
6953                 {
6954                     ret = reganode(pRExC_state,DEFINEP,0);
6955                     RExC_parse +=6 ;
6956                     is_define = 1;
6957                     goto insert_if_check_paren;
6958                 }
6959                 else if (RExC_parse[0] == 'R') {
6960                     RExC_parse++;
6961                     parno = 0;
6962                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6963                         parno = atoi(RExC_parse++);
6964                         while (isDIGIT(*RExC_parse))
6965                             RExC_parse++;
6966                     } else if (RExC_parse[0] == '&') {
6967                         SV *sv_dat;
6968                         RExC_parse++;
6969                         sv_dat = reg_scan_name(pRExC_state,
6970                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6971                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6972                     }
6973                     ret = reganode(pRExC_state,INSUBP,parno);
6974                     goto insert_if_check_paren;
6975                 }
6976                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6977                     /* (?(1)...) */
6978                     char c;
6979                     parno = atoi(RExC_parse++);
6980
6981                     while (isDIGIT(*RExC_parse))
6982                         RExC_parse++;
6983                     ret = reganode(pRExC_state, GROUPP, parno);
6984
6985                  insert_if_check_paren:
6986                     if ((c = *nextchar(pRExC_state)) != ')')
6987                         vFAIL("Switch condition not recognized");
6988                   insert_if:
6989                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6990                     br = regbranch(pRExC_state, &flags, 1,depth+1);
6991                     if (br == NULL)
6992                         br = reganode(pRExC_state, LONGJMP, 0);
6993                     else
6994                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6995                     c = *nextchar(pRExC_state);
6996                     if (flags&HASWIDTH)
6997                         *flagp |= HASWIDTH;
6998                     if (c == '|') {
6999                         if (is_define)
7000                             vFAIL("(?(DEFINE)....) does not allow branches");
7001                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
7002                         regbranch(pRExC_state, &flags, 1,depth+1);
7003                         REGTAIL(pRExC_state, ret, lastbr);
7004                         if (flags&HASWIDTH)
7005                             *flagp |= HASWIDTH;
7006                         c = *nextchar(pRExC_state);
7007                     }
7008                     else
7009                         lastbr = NULL;
7010                     if (c != ')')
7011                         vFAIL("Switch (?(condition)... contains too many branches");
7012                     ender = reg_node(pRExC_state, TAIL);
7013                     REGTAIL(pRExC_state, br, ender);
7014                     if (lastbr) {
7015                         REGTAIL(pRExC_state, lastbr, ender);
7016                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
7017                     }
7018                     else
7019                         REGTAIL(pRExC_state, ret, ender);
7020                     RExC_size++; /* XXX WHY do we need this?!!
7021                                     For large programs it seems to be required
7022                                     but I can't figure out why. -- dmq*/
7023                     return ret;
7024                 }
7025                 else {
7026                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
7027                 }
7028             }
7029             case 0:
7030                 RExC_parse--; /* for vFAIL to print correctly */
7031                 vFAIL("Sequence (? incomplete");
7032                 break;
7033             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
7034                                        that follow */
7035                 has_use_defaults = TRUE;
7036                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
7037                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
7038                                                 ? REGEX_UNICODE_CHARSET
7039                                                 : REGEX_DEPENDS_CHARSET);
7040                 goto parse_flags;
7041             default:
7042                 --RExC_parse;
7043                 parse_flags:      /* (?i) */
7044             {
7045                 U32 posflags = 0, negflags = 0;
7046                 U32 *flagsp = &posflags;
7047                 bool has_charset_modifier = 0;
7048                 regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
7049                                     ? REGEX_UNICODE_CHARSET
7050                                     : REGEX_DEPENDS_CHARSET;
7051
7052                 while (*RExC_parse) {
7053                     /* && strchr("iogcmsx", *RExC_parse) */
7054                     /* (?g), (?gc) and (?o) are useless here
7055                        and must be globally applied -- japhy */
7056                     switch (*RExC_parse) {
7057                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
7058                     case LOCALE_PAT_MOD:
7059                         if (has_charset_modifier || flagsp == &negflags) {
7060                             goto fail_modifiers;
7061                         }
7062                         cs = REGEX_LOCALE_CHARSET;
7063                         has_charset_modifier = 1;
7064                         RExC_contains_locale = 1;
7065                         break;
7066                     case UNICODE_PAT_MOD:
7067                         if (has_charset_modifier || flagsp == &negflags) {
7068                             goto fail_modifiers;
7069                         }
7070                         cs = REGEX_UNICODE_CHARSET;
7071                         has_charset_modifier = 1;
7072                         break;
7073                     case ASCII_RESTRICT_PAT_MOD:
7074                         if (has_charset_modifier || flagsp == &negflags) {
7075                             goto fail_modifiers;
7076                         }
7077                         if (*(RExC_parse + 1) == ASCII_RESTRICT_PAT_MOD) {
7078                             /* Doubled modifier implies more restricted */
7079                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
7080                             RExC_parse++;
7081                         }
7082                         else {
7083                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
7084                         }
7085                         has_charset_modifier = 1;
7086                         break;
7087                     case DEPENDS_PAT_MOD:
7088                         if (has_use_defaults
7089                             || has_charset_modifier
7090                             || flagsp == &negflags)
7091                         {
7092                             goto fail_modifiers;
7093                         }
7094
7095                         /* The dual charset means unicode semantics if the
7096                          * pattern (or target, not known until runtime) are
7097                          * utf8, or something in the pattern indicates unicode
7098                          * semantics */
7099                         cs = (RExC_utf8 || RExC_uni_semantics)
7100                              ? REGEX_UNICODE_CHARSET
7101                              : REGEX_DEPENDS_CHARSET;
7102                         has_charset_modifier = 1;
7103                         break;
7104                     case ONCE_PAT_MOD: /* 'o' */
7105                     case GLOBAL_PAT_MOD: /* 'g' */
7106                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7107                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
7108                             if (! (wastedflags & wflagbit) ) {
7109                                 wastedflags |= wflagbit;
7110                                 vWARN5(
7111                                     RExC_parse + 1,
7112                                     "Useless (%s%c) - %suse /%c modifier",
7113                                     flagsp == &negflags ? "?-" : "?",
7114                                     *RExC_parse,
7115                                     flagsp == &negflags ? "don't " : "",
7116                                     *RExC_parse
7117                                 );
7118                             }
7119                         }
7120                         break;
7121
7122                     case CONTINUE_PAT_MOD: /* 'c' */
7123                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7124                             if (! (wastedflags & WASTED_C) ) {
7125                                 wastedflags |= WASTED_GC;
7126                                 vWARN3(
7127                                     RExC_parse + 1,
7128                                     "Useless (%sc) - %suse /gc modifier",
7129                                     flagsp == &negflags ? "?-" : "?",
7130                                     flagsp == &negflags ? "don't " : ""
7131                                 );
7132                             }
7133                         }
7134                         break;
7135                     case KEEPCOPY_PAT_MOD: /* 'p' */
7136                         if (flagsp == &negflags) {
7137                             if (SIZE_ONLY)
7138                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
7139                         } else {
7140                             *flagsp |= RXf_PMf_KEEPCOPY;
7141                         }
7142                         break;
7143                     case '-':
7144                         /* A flag is a default iff it is following a minus, so
7145                          * if there is a minus, it means will be trying to
7146                          * re-specify a default which is an error */
7147                         if (has_use_defaults || flagsp == &negflags) {
7148             fail_modifiers:
7149                             RExC_parse++;
7150                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7151                             /*NOTREACHED*/
7152                         }
7153                         flagsp = &negflags;
7154                         wastedflags = 0;  /* reset so (?g-c) warns twice */
7155                         break;
7156                     case ':':
7157                         paren = ':';
7158                         /*FALLTHROUGH*/
7159                     case ')':
7160                         RExC_flags |= posflags;
7161                         RExC_flags &= ~negflags;
7162                         set_regex_charset(&RExC_flags, cs);
7163                         if (paren != ':') {
7164                             oregflags |= posflags;
7165                             oregflags &= ~negflags;
7166                             set_regex_charset(&oregflags, cs);
7167                         }
7168                         nextchar(pRExC_state);
7169                         if (paren != ':') {
7170                             *flagp = TRYAGAIN;
7171                             return NULL;
7172                         } else {
7173                             ret = NULL;
7174                             goto parse_rest;
7175                         }
7176                         /*NOTREACHED*/
7177                     default:
7178                         RExC_parse++;
7179                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7180                         /*NOTREACHED*/
7181                     }
7182                     ++RExC_parse;
7183                 }
7184             }} /* one for the default block, one for the switch */
7185         }
7186         else {                  /* (...) */
7187           capturing_parens:
7188             parno = RExC_npar;
7189             RExC_npar++;
7190
7191             ret = reganode(pRExC_state, OPEN, parno);
7192             if (!SIZE_ONLY ){
7193                 if (!RExC_nestroot)
7194                     RExC_nestroot = parno;
7195                 if (RExC_seen & REG_SEEN_RECURSE
7196                     && !RExC_open_parens[parno-1])
7197                 {
7198                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7199                         "Setting open paren #%"IVdf" to %d\n",
7200                         (IV)parno, REG_NODE_NUM(ret)));
7201                     RExC_open_parens[parno-1]= ret;
7202                 }
7203             }
7204             Set_Node_Length(ret, 1); /* MJD */
7205             Set_Node_Offset(ret, RExC_parse); /* MJD */
7206             is_open = 1;
7207         }
7208     }
7209     else                        /* ! paren */
7210         ret = NULL;
7211
7212    parse_rest:
7213     /* Pick up the branches, linking them together. */
7214     parse_start = RExC_parse;   /* MJD */
7215     br = regbranch(pRExC_state, &flags, 1,depth+1);
7216
7217     /*     branch_len = (paren != 0); */
7218
7219     if (br == NULL)
7220         return(NULL);
7221     if (*RExC_parse == '|') {
7222         if (!SIZE_ONLY && RExC_extralen) {
7223             reginsert(pRExC_state, BRANCHJ, br, depth+1);
7224         }
7225         else {                  /* MJD */
7226             reginsert(pRExC_state, BRANCH, br, depth+1);
7227             Set_Node_Length(br, paren != 0);
7228             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
7229         }
7230         have_branch = 1;
7231         if (SIZE_ONLY)
7232             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
7233     }
7234     else if (paren == ':') {
7235         *flagp |= flags&SIMPLE;
7236     }
7237     if (is_open) {                              /* Starts with OPEN. */
7238         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
7239     }
7240     else if (paren != '?')              /* Not Conditional */
7241         ret = br;
7242     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7243     lastbr = br;
7244     while (*RExC_parse == '|') {
7245         if (!SIZE_ONLY && RExC_extralen) {
7246             ender = reganode(pRExC_state, LONGJMP,0);
7247             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
7248         }
7249         if (SIZE_ONLY)
7250             RExC_extralen += 2;         /* Account for LONGJMP. */
7251         nextchar(pRExC_state);
7252         if (freeze_paren) {
7253             if (RExC_npar > after_freeze)
7254                 after_freeze = RExC_npar;
7255             RExC_npar = freeze_paren;
7256         }
7257         br = regbranch(pRExC_state, &flags, 0, depth+1);
7258
7259         if (br == NULL)
7260             return(NULL);
7261         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
7262         lastbr = br;
7263         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7264     }
7265
7266     if (have_branch || paren != ':') {
7267         /* Make a closing node, and hook it on the end. */
7268         switch (paren) {
7269         case ':':
7270             ender = reg_node(pRExC_state, TAIL);
7271             break;
7272         case 1:
7273             ender = reganode(pRExC_state, CLOSE, parno);
7274             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
7275                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7276                         "Setting close paren #%"IVdf" to %d\n",
7277                         (IV)parno, REG_NODE_NUM(ender)));
7278                 RExC_close_parens[parno-1]= ender;
7279                 if (RExC_nestroot == parno)
7280                     RExC_nestroot = 0;
7281             }
7282             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
7283             Set_Node_Length(ender,1); /* MJD */
7284             break;
7285         case '<':
7286         case ',':
7287         case '=':
7288         case '!':
7289             *flagp &= ~HASWIDTH;
7290             /* FALL THROUGH */
7291         case '>':
7292             ender = reg_node(pRExC_state, SUCCEED);
7293             break;
7294         case 0:
7295             ender = reg_node(pRExC_state, END);
7296             if (!SIZE_ONLY) {
7297                 assert(!RExC_opend); /* there can only be one! */
7298                 RExC_opend = ender;
7299             }
7300             break;
7301         }
7302         REGTAIL(pRExC_state, lastbr, ender);
7303
7304         if (have_branch && !SIZE_ONLY) {
7305             if (depth==1)
7306                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
7307
7308             /* Hook the tails of the branches to the closing node. */
7309             for (br = ret; br; br = regnext(br)) {
7310                 const U8 op = PL_regkind[OP(br)];
7311                 if (op == BRANCH) {
7312                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
7313                 }
7314                 else if (op == BRANCHJ) {
7315                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
7316                 }
7317             }
7318         }
7319     }
7320
7321     {
7322         const char *p;
7323         static const char parens[] = "=!<,>";
7324
7325         if (paren && (p = strchr(parens, paren))) {
7326             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
7327             int flag = (p - parens) > 1;
7328
7329             if (paren == '>')
7330                 node = SUSPEND, flag = 0;
7331             reginsert(pRExC_state, node,ret, depth+1);
7332             Set_Node_Cur_Length(ret);
7333             Set_Node_Offset(ret, parse_start + 1);
7334             ret->flags = flag;
7335             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
7336         }
7337     }
7338
7339     /* Check for proper termination. */
7340     if (paren) {
7341         RExC_flags = oregflags;
7342         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
7343             RExC_parse = oregcomp_parse;
7344             vFAIL("Unmatched (");
7345         }
7346     }
7347     else if (!paren && RExC_parse < RExC_end) {
7348         if (*RExC_parse == ')') {
7349             RExC_parse++;
7350             vFAIL("Unmatched )");
7351         }
7352         else
7353             FAIL("Junk on end of regexp");      /* "Can't happen". */
7354         /* NOTREACHED */
7355     }
7356
7357     if (RExC_in_lookbehind) {
7358         RExC_in_lookbehind--;
7359     }
7360     if (after_freeze > RExC_npar)
7361         RExC_npar = after_freeze;
7362     return(ret);
7363 }
7364
7365 /*
7366  - regbranch - one alternative of an | operator
7367  *
7368  * Implements the concatenation operator.
7369  */
7370 STATIC regnode *
7371 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
7372 {
7373     dVAR;
7374     register regnode *ret;
7375     register regnode *chain = NULL;
7376     register regnode *latest;
7377     I32 flags = 0, c = 0;
7378     GET_RE_DEBUG_FLAGS_DECL;
7379
7380     PERL_ARGS_ASSERT_REGBRANCH;
7381
7382     DEBUG_PARSE("brnc");
7383
7384     if (first)
7385         ret = NULL;
7386     else {
7387         if (!SIZE_ONLY && RExC_extralen)
7388             ret = reganode(pRExC_state, BRANCHJ,0);
7389         else {
7390             ret = reg_node(pRExC_state, BRANCH);
7391             Set_Node_Length(ret, 1);
7392         }
7393     }
7394
7395     if (!first && SIZE_ONLY)
7396         RExC_extralen += 1;                     /* BRANCHJ */
7397
7398     *flagp = WORST;                     /* Tentatively. */
7399
7400     RExC_parse--;
7401     nextchar(pRExC_state);
7402     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
7403         flags &= ~TRYAGAIN;
7404         latest = regpiece(pRExC_state, &flags,depth+1);
7405         if (latest == NULL) {
7406             if (flags & TRYAGAIN)
7407                 continue;
7408             return(NULL);
7409         }
7410         else if (ret == NULL)
7411             ret = latest;
7412         *flagp |= flags&(HASWIDTH|POSTPONED);
7413         if (chain == NULL)      /* First piece. */
7414             *flagp |= flags&SPSTART;
7415         else {
7416             RExC_naughty++;
7417             REGTAIL(pRExC_state, chain, latest);
7418         }
7419         chain = latest;
7420         c++;
7421     }
7422     if (chain == NULL) {        /* Loop ran zero times. */
7423         chain = reg_node(pRExC_state, NOTHING);
7424         if (ret == NULL)
7425             ret = chain;
7426     }
7427     if (c == 1) {
7428         *flagp |= flags&SIMPLE;
7429     }
7430
7431     return ret;
7432 }
7433
7434 /*
7435  - regpiece - something followed by possible [*+?]
7436  *
7437  * Note that the branching code sequences used for ? and the general cases
7438  * of * and + are somewhat optimized:  they use the same NOTHING node as
7439  * both the endmarker for their branch list and the body of the last branch.
7440  * It might seem that this node could be dispensed with entirely, but the
7441  * endmarker role is not redundant.
7442  */
7443 STATIC regnode *
7444 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7445 {
7446     dVAR;
7447     register regnode *ret;
7448     register char op;
7449     register char *next;
7450     I32 flags;
7451     const char * const origparse = RExC_parse;
7452     I32 min;
7453     I32 max = REG_INFTY;
7454     char *parse_start;
7455     const char *maxpos = NULL;
7456     GET_RE_DEBUG_FLAGS_DECL;
7457
7458     PERL_ARGS_ASSERT_REGPIECE;
7459
7460     DEBUG_PARSE("piec");
7461
7462     ret = regatom(pRExC_state, &flags,depth+1);
7463     if (ret == NULL) {
7464         if (flags & TRYAGAIN)
7465             *flagp |= TRYAGAIN;
7466         return(NULL);
7467     }
7468
7469     op = *RExC_parse;
7470
7471     if (op == '{' && regcurly(RExC_parse)) {
7472         maxpos = NULL;
7473         parse_start = RExC_parse; /* MJD */
7474         next = RExC_parse + 1;
7475         while (isDIGIT(*next) || *next == ',') {
7476             if (*next == ',') {
7477                 if (maxpos)
7478                     break;
7479                 else
7480                     maxpos = next;
7481             }
7482             next++;
7483         }
7484         if (*next == '}') {             /* got one */
7485             if (!maxpos)
7486                 maxpos = next;
7487             RExC_parse++;
7488             min = atoi(RExC_parse);
7489             if (*maxpos == ',')
7490                 maxpos++;
7491             else
7492                 maxpos = RExC_parse;
7493             max = atoi(maxpos);
7494             if (!max && *maxpos != '0')
7495                 max = REG_INFTY;                /* meaning "infinity" */
7496             else if (max >= REG_INFTY)
7497                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
7498             RExC_parse = next;
7499             nextchar(pRExC_state);
7500
7501         do_curly:
7502             if ((flags&SIMPLE)) {
7503                 RExC_naughty += 2 + RExC_naughty / 2;
7504                 reginsert(pRExC_state, CURLY, ret, depth+1);
7505                 Set_Node_Offset(ret, parse_start+1); /* MJD */
7506                 Set_Node_Cur_Length(ret);
7507             }
7508             else {
7509                 regnode * const w = reg_node(pRExC_state, WHILEM);
7510
7511                 w->flags = 0;
7512                 REGTAIL(pRExC_state, ret, w);
7513                 if (!SIZE_ONLY && RExC_extralen) {
7514                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
7515                     reginsert(pRExC_state, NOTHING,ret, depth+1);
7516                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
7517                 }
7518                 reginsert(pRExC_state, CURLYX,ret, depth+1);
7519                                 /* MJD hk */
7520                 Set_Node_Offset(ret, parse_start+1);
7521                 Set_Node_Length(ret,
7522                                 op == '{' ? (RExC_parse - parse_start) : 1);
7523
7524                 if (!SIZE_ONLY && RExC_extralen)
7525                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
7526                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
7527                 if (SIZE_ONLY)
7528                     RExC_whilem_seen++, RExC_extralen += 3;
7529                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
7530             }
7531             ret->flags = 0;
7532
7533             if (min > 0)
7534                 *flagp = WORST;
7535             if (max > 0)
7536                 *flagp |= HASWIDTH;
7537             if (max < min)
7538                 vFAIL("Can't do {n,m} with n > m");
7539             if (!SIZE_ONLY) {
7540                 ARG1_SET(ret, (U16)min);
7541                 ARG2_SET(ret, (U16)max);
7542             }
7543
7544             goto nest_check;
7545         }
7546     }
7547
7548     if (!ISMULT1(op)) {
7549         *flagp = flags;
7550         return(ret);
7551     }
7552
7553 #if 0                           /* Now runtime fix should be reliable. */
7554
7555     /* if this is reinstated, don't forget to put this back into perldiag:
7556
7557             =item Regexp *+ operand could be empty at {#} in regex m/%s/
7558
7559            (F) The part of the regexp subject to either the * or + quantifier
7560            could match an empty string. The {#} shows in the regular
7561            expression about where the problem was discovered.
7562
7563     */
7564
7565     if (!(flags&HASWIDTH) && op != '?')
7566       vFAIL("Regexp *+ operand could be empty");
7567 #endif
7568
7569     parse_start = RExC_parse;
7570     nextchar(pRExC_state);
7571
7572     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
7573
7574     if (op == '*' && (flags&SIMPLE)) {
7575         reginsert(pRExC_state, STAR, ret, depth+1);
7576         ret->flags = 0;
7577         RExC_naughty += 4;
7578     }
7579     else if (op == '*') {
7580         min = 0;
7581         goto do_curly;
7582     }
7583     else if (op == '+' && (flags&SIMPLE)) {
7584         reginsert(pRExC_state, PLUS, ret, depth+1);
7585         ret->flags = 0;
7586         RExC_naughty += 3;
7587     }
7588     else if (op == '+') {
7589         min = 1;
7590         goto do_curly;
7591     }
7592     else if (op == '?') {
7593         min = 0; max = 1;
7594         goto do_curly;
7595     }
7596   nest_check:
7597     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
7598         ckWARN3reg(RExC_parse,
7599                    "%.*s matches null string many times",
7600                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
7601                    origparse);
7602     }
7603
7604     if (RExC_parse < RExC_end && *RExC_parse == '?') {
7605         nextchar(pRExC_state);
7606         reginsert(pRExC_state, MINMOD, ret, depth+1);
7607         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
7608     }
7609 #ifndef REG_ALLOW_MINMOD_SUSPEND
7610     else
7611 #endif
7612     if (RExC_parse < RExC_end && *RExC_parse == '+') {
7613         regnode *ender;
7614         nextchar(pRExC_state);
7615         ender = reg_node(pRExC_state, SUCCEED);
7616         REGTAIL(pRExC_state, ret, ender);
7617         reginsert(pRExC_state, SUSPEND, ret, depth+1);
7618         ret->flags = 0;
7619         ender = reg_node(pRExC_state, TAIL);
7620         REGTAIL(pRExC_state, ret, ender);
7621         /*ret= ender;*/
7622     }
7623
7624     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
7625         RExC_parse++;
7626         vFAIL("Nested quantifiers");
7627     }
7628
7629     return(ret);
7630 }
7631
7632
7633 /* reg_namedseq(pRExC_state,UVp)
7634
7635    This is expected to be called by a parser routine that has
7636    recognized '\N' and needs to handle the rest. RExC_parse is
7637    expected to point at the first char following the N at the time
7638    of the call.
7639
7640    The \N may be inside (indicated by valuep not being NULL) or outside a
7641    character class.
7642
7643    \N may begin either a named sequence, or if outside a character class, mean
7644    to match a non-newline.  For non single-quoted regexes, the tokenizer has
7645    attempted to decide which, and in the case of a named sequence converted it
7646    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
7647    where c1... are the characters in the sequence.  For single-quoted regexes,
7648    the tokenizer passes the \N sequence through unchanged; this code will not
7649    attempt to determine this nor expand those.  The net effect is that if the
7650    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
7651    signals that this \N occurrence means to match a non-newline.
7652
7653    Only the \N{U+...} form should occur in a character class, for the same
7654    reason that '.' inside a character class means to just match a period: it
7655    just doesn't make sense.
7656
7657    If valuep is non-null then it is assumed that we are parsing inside
7658    of a charclass definition and the first codepoint in the resolved
7659    string is returned via *valuep and the routine will return NULL.
7660    In this mode if a multichar string is returned from the charnames
7661    handler, a warning will be issued, and only the first char in the
7662    sequence will be examined. If the string returned is zero length
7663    then the value of *valuep is undefined and NON-NULL will
7664    be returned to indicate failure. (This will NOT be a valid pointer
7665    to a regnode.)
7666
7667    If valuep is null then it is assumed that we are parsing normal text and a
7668    new EXACT node is inserted into the program containing the resolved string,
7669    and a pointer to the new node is returned.  But if the string is zero length
7670    a NOTHING node is emitted instead.
7671
7672    On success RExC_parse is set to the char following the endbrace.
7673    Parsing failures will generate a fatal error via vFAIL(...)
7674  */
7675 STATIC regnode *
7676 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
7677 {
7678     char * endbrace;    /* '}' following the name */
7679     regnode *ret = NULL;
7680 #ifdef DEBUGGING
7681     char* parse_start = RExC_parse - 2;     /* points to the '\N' */
7682 #endif
7683     char* p;
7684
7685     GET_RE_DEBUG_FLAGS_DECL;
7686
7687     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
7688
7689     GET_RE_DEBUG_FLAGS;
7690
7691     /* The [^\n] meaning of \N ignores spaces and comments under the /x
7692      * modifier.  The other meaning does not */
7693     p = (RExC_flags & RXf_PMf_EXTENDED)
7694         ? regwhite( pRExC_state, RExC_parse )
7695         : RExC_parse;
7696
7697     /* Disambiguate between \N meaning a named character versus \N meaning
7698      * [^\n].  The former is assumed when it can't be the latter. */
7699     if (*p != '{' || regcurly(p)) {
7700         RExC_parse = p;
7701         if (valuep) {
7702             /* no bare \N in a charclass */
7703             vFAIL("\\N in a character class must be a named character: \\N{...}");
7704         }
7705         nextchar(pRExC_state);
7706         ret = reg_node(pRExC_state, REG_ANY);
7707         *flagp |= HASWIDTH|SIMPLE;
7708         RExC_naughty++;
7709         RExC_parse--;
7710         Set_Node_Length(ret, 1); /* MJD */
7711         return ret;
7712     }
7713
7714     /* Here, we have decided it should be a named sequence */
7715
7716     /* The test above made sure that the next real character is a '{', but
7717      * under the /x modifier, it could be separated by space (or a comment and
7718      * \n) and this is not allowed (for consistency with \x{...} and the
7719      * tokenizer handling of \N{NAME}). */
7720     if (*RExC_parse != '{') {
7721         vFAIL("Missing braces on \\N{}");
7722     }
7723
7724     RExC_parse++;       /* Skip past the '{' */
7725
7726     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
7727         || ! (endbrace == RExC_parse            /* nothing between the {} */
7728               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
7729                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
7730     {
7731         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
7732         vFAIL("\\N{NAME} must be resolved by the lexer");
7733     }
7734
7735     if (endbrace == RExC_parse) {   /* empty: \N{} */
7736         if (! valuep) {
7737             RExC_parse = endbrace + 1;
7738             return reg_node(pRExC_state,NOTHING);
7739         }
7740
7741         if (SIZE_ONLY) {
7742             ckWARNreg(RExC_parse,
7743                     "Ignoring zero length \\N{} in character class"
7744             );
7745             RExC_parse = endbrace + 1;
7746         }
7747         *valuep = 0;
7748         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
7749     }
7750
7751     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
7752     RExC_parse += 2;    /* Skip past the 'U+' */
7753
7754     if (valuep) {   /* In a bracketed char class */
7755         /* We only pay attention to the first char of
7756         multichar strings being returned. I kinda wonder
7757         if this makes sense as it does change the behaviour
7758         from earlier versions, OTOH that behaviour was broken
7759         as well. XXX Solution is to recharacterize as
7760         [rest-of-class]|multi1|multi2... */
7761
7762         STRLEN length_of_hex;
7763         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7764             | PERL_SCAN_DISALLOW_PREFIX
7765             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7766
7767         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7768         if (endchar < endbrace) {
7769             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7770         }
7771
7772         length_of_hex = (STRLEN)(endchar - RExC_parse);
7773         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7774
7775         /* The tokenizer should have guaranteed validity, but it's possible to
7776          * bypass it by using single quoting, so check */
7777         if (length_of_hex == 0
7778             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7779         {
7780             RExC_parse += length_of_hex;        /* Includes all the valid */
7781             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
7782                             ? UTF8SKIP(RExC_parse)
7783                             : 1;
7784             /* Guard against malformed utf8 */
7785             if (RExC_parse >= endchar) RExC_parse = endchar;
7786             vFAIL("Invalid hexadecimal number in \\N{U+...}");
7787         }
7788
7789         RExC_parse = endbrace + 1;
7790         if (endchar == endbrace) return NULL;
7791
7792         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
7793     }
7794     else {      /* Not a char class */
7795         char *s;            /* String to put in generated EXACT node */
7796         STRLEN len = 0;     /* Its current byte length */
7797         char *endchar;      /* Points to '.' or '}' ending cur char in the input
7798                                stream */
7799         ret = reg_node(pRExC_state,
7800                            (U8) ((! FOLD) ? EXACT
7801                                           : (LOC)
7802                                              ? EXACTFL
7803                                              : (MORE_ASCII_RESTRICTED)
7804                                                ? EXACTFA
7805                                                : (AT_LEAST_UNI_SEMANTICS)
7806                                                  ? EXACTFU
7807                                                  : EXACTF));
7808         s= STRING(ret);
7809
7810         /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
7811          * the input which is of the form now 'c1.c2.c3...}' until find the
7812          * ending brace or exceed length 255.  The characters that exceed this
7813          * limit are dropped.  The limit could be relaxed should it become
7814          * desirable by reparsing this as (?:\N{NAME}), so could generate
7815          * multiple EXACT nodes, as is done for just regular input.  But this
7816          * is primarily a named character, and not intended to be a huge long
7817          * string, so 255 bytes should be good enough */
7818         while (1) {
7819             STRLEN length_of_hex;
7820             I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7821                             | PERL_SCAN_DISALLOW_PREFIX
7822                             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7823             UV cp;  /* Ord of current character */
7824             bool use_this_char_fold = FOLD;
7825
7826             /* Code points are separated by dots.  If none, there is only one
7827              * code point, and is terminated by the brace */
7828             endchar = RExC_parse + strcspn(RExC_parse, ".}");
7829
7830             /* The values are Unicode even on EBCDIC machines */
7831             length_of_hex = (STRLEN)(endchar - RExC_parse);
7832             cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7833             if ( length_of_hex == 0
7834                 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7835             {
7836                 RExC_parse += length_of_hex;        /* Includes all the valid */
7837                 RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
7838                                 ? UTF8SKIP(RExC_parse)
7839                                 : 1;
7840                 /* Guard against malformed utf8 */
7841                 if (RExC_parse >= endchar) RExC_parse = endchar;
7842                 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7843             }
7844
7845             /* XXX ? Change to ANYOF node
7846             if (FOLD
7847                 && (cp > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
7848                 && is_TRICKYFOLD_cp(cp))
7849             {
7850             }
7851             */
7852
7853             /* Under /aa, we can't mix ASCII with non- in a fold.  If we are
7854              * folding, and the source isn't ASCII, look through all the
7855              * characters it folds to.  If any one of them is ASCII, forbid
7856              * this fold.  (cp is uni, so the 127 below is correct even for
7857              * EBCDIC).  Similarly under locale rules, we don't mix under 256
7858              * with above 255.  XXX It really doesn't make sense to have \N{}
7859              * which means a Unicode rules under locale.  I (khw) think this
7860              * should be warned about, but the counter argument is that people
7861              * who have programmed around Perl's earlier lack of specifying the
7862              * rules and used \N{} to force Unicode things in a local
7863              * environment shouldn't get suddenly a warning */
7864             if (use_this_char_fold) {
7865                 if (LOC && cp < 256) {  /* Fold not known until run-time */
7866                     use_this_char_fold = FALSE;
7867                 }
7868                 else if ((cp > 127 && MORE_ASCII_RESTRICTED)
7869                          || (cp > 255 && LOC))
7870                 {
7871                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
7872                 U8* s = tmpbuf;
7873                 U8* e;
7874                 STRLEN foldlen;
7875
7876                 (void) toFOLD_uni(cp, tmpbuf, &foldlen);
7877                 e = s + foldlen;
7878
7879                 while (s < e) {
7880                     if (isASCII(*s)
7881                         || (LOC && (UTF8_IS_INVARIANT(*s)
7882                                     || UTF8_IS_DOWNGRADEABLE_START(*s))))
7883                     {
7884                         use_this_char_fold = FALSE;
7885                         break;
7886                     }
7887                     s += UTF8SKIP(s);
7888                 }
7889                 }
7890             }
7891
7892             if (! use_this_char_fold) { /* Not folding, just append to the
7893                                            string */
7894                 STRLEN unilen;
7895
7896                 /* Quit before adding this character if would exceed limit */
7897                 if (len + UNISKIP(cp) > U8_MAX) break;
7898
7899                 unilen = reguni(pRExC_state, cp, s);
7900                 if (unilen > 0) {
7901                     s   += unilen;
7902                     len += unilen;
7903                 }
7904             } else {    /* Folding, output the folded equivalent */
7905                 STRLEN foldlen,numlen;
7906                 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7907                 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7908
7909                 /* Quit before exceeding size limit */
7910                 if (len + foldlen > U8_MAX) break;
7911
7912                 for (foldbuf = tmpbuf;
7913                     foldlen;
7914                     foldlen -= numlen)
7915                 {
7916                     cp = utf8_to_uvchr(foldbuf, &numlen);
7917                     if (numlen > 0) {
7918                         const STRLEN unilen = reguni(pRExC_state, cp, s);
7919                         s       += unilen;
7920                         len     += unilen;
7921                         /* In EBCDIC the numlen and unilen can differ. */
7922                         foldbuf += numlen;
7923                         if (numlen >= foldlen)
7924                             break;
7925                     }
7926                     else
7927                         break; /* "Can't happen." */
7928                 }
7929             }
7930
7931             /* Point to the beginning of the next character in the sequence. */
7932             RExC_parse = endchar + 1;
7933
7934             /* Quit if no more characters */
7935             if (RExC_parse >= endbrace) break;
7936         }
7937
7938
7939         if (SIZE_ONLY) {
7940             if (RExC_parse < endbrace) {
7941                 ckWARNreg(RExC_parse - 1,
7942                           "Using just the first characters returned by \\N{}");
7943             }
7944
7945             RExC_size += STR_SZ(len);
7946         } else {
7947             STR_LEN(ret) = len;
7948             RExC_emit += STR_SZ(len);
7949         }
7950
7951         RExC_parse = endbrace + 1;
7952
7953         *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7954                                with malformed in t/re/pat_advanced.t */
7955         RExC_parse --;
7956         Set_Node_Cur_Length(ret); /* MJD */
7957         nextchar(pRExC_state);
7958     }
7959
7960     return ret;
7961 }
7962
7963
7964 /*
7965  * reg_recode
7966  *
7967  * It returns the code point in utf8 for the value in *encp.
7968  *    value: a code value in the source encoding
7969  *    encp:  a pointer to an Encode object
7970  *
7971  * If the result from Encode is not a single character,
7972  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7973  */
7974 STATIC UV
7975 S_reg_recode(pTHX_ const char value, SV **encp)
7976 {
7977     STRLEN numlen = 1;
7978     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7979     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7980     const STRLEN newlen = SvCUR(sv);
7981     UV uv = UNICODE_REPLACEMENT;
7982
7983     PERL_ARGS_ASSERT_REG_RECODE;
7984
7985     if (newlen)
7986         uv = SvUTF8(sv)
7987              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7988              : *(U8*)s;
7989
7990     if (!newlen || numlen != newlen) {
7991         uv = UNICODE_REPLACEMENT;
7992         *encp = NULL;
7993     }
7994     return uv;
7995 }
7996
7997
7998 /*
7999  - regatom - the lowest level
8000
8001    Try to identify anything special at the start of the pattern. If there
8002    is, then handle it as required. This may involve generating a single regop,
8003    such as for an assertion; or it may involve recursing, such as to
8004    handle a () structure.
8005
8006    If the string doesn't start with something special then we gobble up
8007    as much literal text as we can.
8008
8009    Once we have been able to handle whatever type of thing started the
8010    sequence, we return.
8011
8012    Note: we have to be careful with escapes, as they can be both literal
8013    and special, and in the case of \10 and friends can either, depending
8014    on context. Specifically there are two separate switches for handling
8015    escape sequences, with the one for handling literal escapes requiring
8016    a dummy entry for all of the special escapes that are actually handled
8017    by the other.
8018 */
8019
8020 STATIC regnode *
8021 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8022 {
8023     dVAR;
8024     register regnode *ret = NULL;
8025     I32 flags;
8026     char *parse_start = RExC_parse;
8027     U8 op;
8028     GET_RE_DEBUG_FLAGS_DECL;
8029     DEBUG_PARSE("atom");
8030     *flagp = WORST;             /* Tentatively. */
8031
8032     PERL_ARGS_ASSERT_REGATOM;
8033
8034 tryagain:
8035     switch ((U8)*RExC_parse) {
8036     case '^':
8037         RExC_seen_zerolen++;
8038         nextchar(pRExC_state);
8039         if (RExC_flags & RXf_PMf_MULTILINE)
8040             ret = reg_node(pRExC_state, MBOL);
8041         else if (RExC_flags & RXf_PMf_SINGLELINE)
8042             ret = reg_node(pRExC_state, SBOL);
8043         else
8044             ret = reg_node(pRExC_state, BOL);
8045         Set_Node_Length(ret, 1); /* MJD */
8046         break;
8047     case '$':
8048         nextchar(pRExC_state);
8049         if (*RExC_parse)
8050             RExC_seen_zerolen++;
8051         if (RExC_flags & RXf_PMf_MULTILINE)
8052             ret = reg_node(pRExC_state, MEOL);
8053         else if (RExC_flags & RXf_PMf_SINGLELINE)
8054             ret = reg_node(pRExC_state, SEOL);
8055         else
8056             ret = reg_node(pRExC_state, EOL);
8057         Set_Node_Length(ret, 1); /* MJD */
8058         break;
8059     case '.':
8060         nextchar(pRExC_state);
8061         if (RExC_flags & RXf_PMf_SINGLELINE)
8062             ret = reg_node(pRExC_state, SANY);
8063         else
8064             ret = reg_node(pRExC_state, REG_ANY);
8065         *flagp |= HASWIDTH|SIMPLE;
8066         RExC_naughty++;
8067         Set_Node_Length(ret, 1); /* MJD */
8068         break;
8069     case '[':
8070     {
8071         char * const oregcomp_parse = ++RExC_parse;
8072         ret = regclass(pRExC_state,depth+1);
8073         if (*RExC_parse != ']') {
8074             RExC_parse = oregcomp_parse;
8075             vFAIL("Unmatched [");
8076         }
8077         nextchar(pRExC_state);
8078         *flagp |= HASWIDTH|SIMPLE;
8079         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
8080         break;
8081     }
8082     case '(':
8083         nextchar(pRExC_state);
8084         ret = reg(pRExC_state, 1, &flags,depth+1);
8085         if (ret == NULL) {
8086                 if (flags & TRYAGAIN) {
8087                     if (RExC_parse == RExC_end) {
8088                          /* Make parent create an empty node if needed. */
8089                         *flagp |= TRYAGAIN;
8090                         return(NULL);
8091                     }
8092                     goto tryagain;
8093                 }
8094                 return(NULL);
8095         }
8096         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
8097         break;
8098     case '|':
8099     case ')':
8100         if (flags & TRYAGAIN) {
8101             *flagp |= TRYAGAIN;
8102             return NULL;
8103         }
8104         vFAIL("Internal urp");
8105                                 /* Supposed to be caught earlier. */
8106         break;
8107     case '{':
8108         if (!regcurly(RExC_parse)) {
8109             RExC_parse++;
8110             goto defchar;
8111         }
8112         /* FALL THROUGH */
8113     case '?':
8114     case '+':
8115     case '*':
8116         RExC_parse++;
8117         vFAIL("Quantifier follows nothing");
8118         break;
8119     case LATIN_SMALL_LETTER_SHARP_S:
8120     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8121     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8122 #if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
8123 #error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ.  Other instances in this code should have the case statement below.
8124     case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
8125 #endif
8126         do_foldchar:
8127         if (!LOC && FOLD) {
8128             U32 len,cp;
8129             len=0; /* silence a spurious compiler warning */
8130             if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
8131                 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
8132                 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
8133                 ret = reganode(pRExC_state, FOLDCHAR, cp);
8134                 Set_Node_Length(ret, 1); /* MJD */
8135                 nextchar(pRExC_state); /* kill whitespace under /x */
8136                 return ret;
8137             }
8138         }
8139         goto outer_default;
8140     case '\\':
8141         /* Special Escapes
8142
8143            This switch handles escape sequences that resolve to some kind
8144            of special regop and not to literal text. Escape sequnces that
8145            resolve to literal text are handled below in the switch marked
8146            "Literal Escapes".
8147
8148            Every entry in this switch *must* have a corresponding entry
8149            in the literal escape switch. However, the opposite is not
8150            required, as the default for this switch is to jump to the
8151            literal text handling code.
8152         */
8153         switch ((U8)*++RExC_parse) {
8154         case LATIN_SMALL_LETTER_SHARP_S:
8155         case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8156         case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8157                    goto do_foldchar;
8158         /* Special Escapes */
8159         case 'A':
8160             RExC_seen_zerolen++;
8161             ret = reg_node(pRExC_state, SBOL);
8162             *flagp |= SIMPLE;
8163             goto finish_meta_pat;
8164         case 'G':
8165             ret = reg_node(pRExC_state, GPOS);
8166             RExC_seen |= REG_SEEN_GPOS;
8167             *flagp |= SIMPLE;
8168             goto finish_meta_pat;
8169         case 'K':
8170             RExC_seen_zerolen++;
8171             ret = reg_node(pRExC_state, KEEPS);
8172             *flagp |= SIMPLE;
8173             /* XXX:dmq : disabling in-place substitution seems to
8174              * be necessary here to avoid cases of memory corruption, as
8175              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
8176              */
8177             RExC_seen |= REG_SEEN_LOOKBEHIND;
8178             goto finish_meta_pat;
8179         case 'Z':
8180             ret = reg_node(pRExC_state, SEOL);
8181             *flagp |= SIMPLE;
8182             RExC_seen_zerolen++;                /* Do not optimize RE away */
8183             goto finish_meta_pat;
8184         case 'z':
8185             ret = reg_node(pRExC_state, EOS);
8186             *flagp |= SIMPLE;
8187             RExC_seen_zerolen++;                /* Do not optimize RE away */
8188             goto finish_meta_pat;
8189         case 'C':
8190             ret = reg_node(pRExC_state, CANY);
8191             RExC_seen |= REG_SEEN_CANY;
8192             *flagp |= HASWIDTH|SIMPLE;
8193             goto finish_meta_pat;
8194         case 'X':
8195             ret = reg_node(pRExC_state, CLUMP);
8196             *flagp |= HASWIDTH;
8197             goto finish_meta_pat;
8198         case 'w':
8199             switch (get_regex_charset(RExC_flags)) {
8200                 case REGEX_LOCALE_CHARSET:
8201                     op = ALNUML;
8202                     break;
8203                 case REGEX_UNICODE_CHARSET:
8204                     op = ALNUMU;
8205                     break;
8206                 case REGEX_ASCII_RESTRICTED_CHARSET:
8207                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8208                     op = ALNUMA;
8209                     break;
8210                 case REGEX_DEPENDS_CHARSET:
8211                     op = ALNUM;
8212                     break;
8213                 default:
8214                     goto bad_charset;
8215             }
8216             ret = reg_node(pRExC_state, op);
8217             *flagp |= HASWIDTH|SIMPLE;
8218             goto finish_meta_pat;
8219         case 'W':
8220             switch (get_regex_charset(RExC_flags)) {
8221                 case REGEX_LOCALE_CHARSET:
8222                     op = NALNUML;
8223                     break;
8224                 case REGEX_UNICODE_CHARSET:
8225                     op = NALNUMU;
8226                     break;
8227                 case REGEX_ASCII_RESTRICTED_CHARSET:
8228                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8229                     op = NALNUMA;
8230                     break;
8231                 case REGEX_DEPENDS_CHARSET:
8232                     op = NALNUM;
8233                     break;
8234                 default:
8235                     goto bad_charset;
8236             }
8237             ret = reg_node(pRExC_state, op);
8238             *flagp |= HASWIDTH|SIMPLE;
8239             goto finish_meta_pat;
8240         case 'b':
8241             RExC_seen_zerolen++;
8242             RExC_seen |= REG_SEEN_LOOKBEHIND;
8243             switch (get_regex_charset(RExC_flags)) {
8244                 case REGEX_LOCALE_CHARSET:
8245                     op = BOUNDL;
8246                     break;
8247                 case REGEX_UNICODE_CHARSET:
8248                     op = BOUNDU;
8249                     break;
8250                 case REGEX_ASCII_RESTRICTED_CHARSET:
8251                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8252                     op = BOUNDA;
8253                     break;
8254                 case REGEX_DEPENDS_CHARSET:
8255                     op = BOUND;
8256                     break;
8257                 default:
8258                     goto bad_charset;
8259             }
8260             ret = reg_node(pRExC_state, op);
8261             FLAGS(ret) = get_regex_charset(RExC_flags);
8262             *flagp |= SIMPLE;
8263             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8264                 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
8265             }
8266             goto finish_meta_pat;
8267         case 'B':
8268             RExC_seen_zerolen++;
8269             RExC_seen |= REG_SEEN_LOOKBEHIND;
8270             switch (get_regex_charset(RExC_flags)) {
8271                 case REGEX_LOCALE_CHARSET:
8272                     op = NBOUNDL;
8273                     break;
8274                 case REGEX_UNICODE_CHARSET:
8275                     op = NBOUNDU;
8276                     break;
8277                 case REGEX_ASCII_RESTRICTED_CHARSET:
8278                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8279                     op = NBOUNDA;
8280                     break;
8281                 case REGEX_DEPENDS_CHARSET:
8282                     op = NBOUND;
8283                     break;
8284                 default:
8285                     goto bad_charset;
8286             }
8287             ret = reg_node(pRExC_state, op);
8288             FLAGS(ret) = get_regex_charset(RExC_flags);
8289             *flagp |= SIMPLE;
8290             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8291                 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
8292             }
8293             goto finish_meta_pat;
8294         case 's':
8295             switch (get_regex_charset(RExC_flags)) {
8296                 case REGEX_LOCALE_CHARSET:
8297                     op = SPACEL;
8298                     break;
8299                 case REGEX_UNICODE_CHARSET:
8300                     op = SPACEU;
8301                     break;
8302                 case REGEX_ASCII_RESTRICTED_CHARSET:
8303                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8304                     op = SPACEA;
8305                     break;
8306                 case REGEX_DEPENDS_CHARSET:
8307                     op = SPACE;
8308                     break;
8309                 default:
8310                     goto bad_charset;
8311             }
8312             ret = reg_node(pRExC_state, op);
8313             *flagp |= HASWIDTH|SIMPLE;
8314             goto finish_meta_pat;
8315         case 'S':
8316             switch (get_regex_charset(RExC_flags)) {
8317                 case REGEX_LOCALE_CHARSET:
8318                     op = NSPACEL;
8319                     break;
8320                 case REGEX_UNICODE_CHARSET:
8321                     op = NSPACEU;
8322                     break;
8323                 case REGEX_ASCII_RESTRICTED_CHARSET:
8324                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8325                     op = NSPACEA;
8326                     break;
8327                 case REGEX_DEPENDS_CHARSET:
8328                     op = NSPACE;
8329                     break;
8330                 default:
8331                     goto bad_charset;
8332             }
8333             ret = reg_node(pRExC_state, op);
8334             *flagp |= HASWIDTH|SIMPLE;
8335             goto finish_meta_pat;
8336         case 'd':
8337             switch (get_regex_charset(RExC_flags)) {
8338                 case REGEX_LOCALE_CHARSET:
8339                     op = DIGITL;
8340                     break;
8341                 case REGEX_ASCII_RESTRICTED_CHARSET:
8342                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8343                     op = DIGITA;
8344                     break;
8345                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8346                 case REGEX_UNICODE_CHARSET:
8347                     op = DIGIT;
8348                     break;
8349                 default:
8350                     goto bad_charset;
8351             }
8352             ret = reg_node(pRExC_state, op);
8353             *flagp |= HASWIDTH|SIMPLE;
8354             goto finish_meta_pat;
8355         case 'D':
8356             switch (get_regex_charset(RExC_flags)) {
8357                 case REGEX_LOCALE_CHARSET:
8358                     op = NDIGITL;
8359                     break;
8360                 case REGEX_ASCII_RESTRICTED_CHARSET:
8361                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8362                     op = NDIGITA;
8363                     break;
8364                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8365                 case REGEX_UNICODE_CHARSET:
8366                     op = NDIGIT;
8367                     break;
8368                 default:
8369                     goto bad_charset;
8370             }
8371             ret = reg_node(pRExC_state, op);
8372             *flagp |= HASWIDTH|SIMPLE;
8373             goto finish_meta_pat;
8374         case 'R':
8375             ret = reg_node(pRExC_state, LNBREAK);
8376             *flagp |= HASWIDTH|SIMPLE;
8377             goto finish_meta_pat;
8378         case 'h':
8379             ret = reg_node(pRExC_state, HORIZWS);
8380             *flagp |= HASWIDTH|SIMPLE;
8381             goto finish_meta_pat;
8382         case 'H':
8383             ret = reg_node(pRExC_state, NHORIZWS);
8384             *flagp |= HASWIDTH|SIMPLE;
8385             goto finish_meta_pat;
8386         case 'v':
8387             ret = reg_node(pRExC_state, VERTWS);
8388             *flagp |= HASWIDTH|SIMPLE;
8389             goto finish_meta_pat;
8390         case 'V':
8391             ret = reg_node(pRExC_state, NVERTWS);
8392             *flagp |= HASWIDTH|SIMPLE;
8393          finish_meta_pat:
8394             nextchar(pRExC_state);
8395             Set_Node_Length(ret, 2); /* MJD */
8396             break;
8397         case 'p':
8398         case 'P':
8399             {
8400                 char* const oldregxend = RExC_end;
8401 #ifdef DEBUGGING
8402                 char* parse_start = RExC_parse - 2;
8403 #endif
8404
8405                 if (RExC_parse[1] == '{') {
8406                   /* a lovely hack--pretend we saw [\pX] instead */
8407                     RExC_end = strchr(RExC_parse, '}');
8408                     if (!RExC_end) {
8409                         const U8 c = (U8)*RExC_parse;
8410                         RExC_parse += 2;
8411                         RExC_end = oldregxend;
8412                         vFAIL2("Missing right brace on \\%c{}", c);
8413                     }
8414                     RExC_end++;
8415                 }
8416                 else {
8417                     RExC_end = RExC_parse + 2;
8418                     if (RExC_end > oldregxend)
8419                         RExC_end = oldregxend;
8420                 }
8421                 RExC_parse--;
8422
8423                 ret = regclass(pRExC_state,depth+1);
8424
8425                 RExC_end = oldregxend;
8426                 RExC_parse--;
8427
8428                 Set_Node_Offset(ret, parse_start + 2);
8429                 Set_Node_Cur_Length(ret);
8430                 nextchar(pRExC_state);
8431                 *flagp |= HASWIDTH|SIMPLE;
8432             }
8433             break;
8434         case 'N':
8435             /* Handle \N and \N{NAME} here and not below because it can be
8436             multicharacter. join_exact() will join them up later on.
8437             Also this makes sure that things like /\N{BLAH}+/ and
8438             \N{BLAH} being multi char Just Happen. dmq*/
8439             ++RExC_parse;
8440             ret= reg_namedseq(pRExC_state, NULL, flagp);
8441             break;
8442         case 'k':    /* Handle \k<NAME> and \k'NAME' */
8443         parse_named_seq:
8444         {
8445             char ch= RExC_parse[1];
8446             if (ch != '<' && ch != '\'' && ch != '{') {
8447                 RExC_parse++;
8448                 vFAIL2("Sequence %.2s... not terminated",parse_start);
8449             } else {
8450                 /* this pretty much dupes the code for (?P=...) in reg(), if
8451                    you change this make sure you change that */
8452                 char* name_start = (RExC_parse += 2);
8453                 U32 num = 0;
8454                 SV *sv_dat = reg_scan_name(pRExC_state,
8455                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8456                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
8457                 if (RExC_parse == name_start || *RExC_parse != ch)
8458                     vFAIL2("Sequence %.3s... not terminated",parse_start);
8459
8460                 if (!SIZE_ONLY) {
8461                     num = add_data( pRExC_state, 1, "S" );
8462                     RExC_rxi->data->data[num]=(void*)sv_dat;
8463                     SvREFCNT_inc_simple_void(sv_dat);
8464                 }
8465
8466                 RExC_sawback = 1;
8467                 ret = reganode(pRExC_state,
8468                                ((! FOLD)
8469                                  ? NREF
8470                                  : (MORE_ASCII_RESTRICTED)
8471                                    ? NREFFA
8472                                    : (AT_LEAST_UNI_SEMANTICS)
8473                                      ? NREFFU
8474                                      : (LOC)
8475                                        ? NREFFL
8476                                        : NREFF),
8477                                 num);
8478                 *flagp |= HASWIDTH;
8479
8480                 /* override incorrect value set in reganode MJD */
8481                 Set_Node_Offset(ret, parse_start+1);
8482                 Set_Node_Cur_Length(ret); /* MJD */
8483                 nextchar(pRExC_state);
8484
8485             }
8486             break;
8487         }
8488         case 'g':
8489         case '1': case '2': case '3': case '4':
8490         case '5': case '6': case '7': case '8': case '9':
8491             {
8492                 I32 num;
8493                 bool isg = *RExC_parse == 'g';
8494                 bool isrel = 0;
8495                 bool hasbrace = 0;
8496                 if (isg) {
8497                     RExC_parse++;
8498                     if (*RExC_parse == '{') {
8499                         RExC_parse++;
8500                         hasbrace = 1;
8501                     }
8502                     if (*RExC_parse == '-') {
8503                         RExC_parse++;
8504                         isrel = 1;
8505                     }
8506                     if (hasbrace && !isDIGIT(*RExC_parse)) {
8507                         if (isrel) RExC_parse--;
8508                         RExC_parse -= 2;
8509                         goto parse_named_seq;
8510                 }   }
8511                 num = atoi(RExC_parse);
8512                 if (isg && num == 0)
8513                     vFAIL("Reference to invalid group 0");
8514                 if (isrel) {
8515                     num = RExC_npar - num;
8516                     if (num < 1)
8517                         vFAIL("Reference to nonexistent or unclosed group");
8518                 }
8519                 if (!isg && num > 9 && num >= RExC_npar)
8520                     goto defchar;
8521                 else {
8522                     char * const parse_start = RExC_parse - 1; /* MJD */
8523                     while (isDIGIT(*RExC_parse))
8524                         RExC_parse++;
8525                     if (parse_start == RExC_parse - 1)
8526                         vFAIL("Unterminated \\g... pattern");
8527                     if (hasbrace) {
8528                         if (*RExC_parse != '}')
8529                             vFAIL("Unterminated \\g{...} pattern");
8530                         RExC_parse++;
8531                     }
8532                     if (!SIZE_ONLY) {
8533                         if (num > (I32)RExC_rx->nparens)
8534                             vFAIL("Reference to nonexistent group");
8535                     }
8536                     RExC_sawback = 1;
8537                     ret = reganode(pRExC_state,
8538                                    ((! FOLD)
8539                                      ? REF
8540                                      : (MORE_ASCII_RESTRICTED)
8541                                        ? REFFA
8542                                        : (AT_LEAST_UNI_SEMANTICS)
8543                                          ? REFFU
8544                                          : (LOC)
8545                                            ? REFFL
8546                                            : REFF),
8547                                     num);
8548                     *flagp |= HASWIDTH;
8549
8550                     /* override incorrect value set in reganode MJD */
8551                     Set_Node_Offset(ret, parse_start+1);
8552                     Set_Node_Cur_Length(ret); /* MJD */
8553                     RExC_parse--;
8554                     nextchar(pRExC_state);
8555                 }
8556             }
8557             break;
8558         case '\0':
8559             if (RExC_parse >= RExC_end)
8560                 FAIL("Trailing \\");
8561             /* FALL THROUGH */
8562         default:
8563             /* Do not generate "unrecognized" warnings here, we fall
8564                back into the quick-grab loop below */
8565             parse_start--;
8566             goto defchar;
8567         }
8568         break;
8569
8570     case '#':
8571         if (RExC_flags & RXf_PMf_EXTENDED) {
8572             if ( reg_skipcomment( pRExC_state ) )
8573                 goto tryagain;
8574         }
8575         /* FALL THROUGH */
8576
8577     default:
8578         outer_default:{
8579             register STRLEN len;
8580             register UV ender;
8581             register char *p;
8582             char *s;
8583             STRLEN foldlen;
8584             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
8585             regnode * orig_emit;
8586
8587             parse_start = RExC_parse - 1;
8588
8589             RExC_parse++;
8590
8591         defchar:
8592             ender = 0;
8593             orig_emit = RExC_emit; /* Save the original output node position in
8594                                       case we need to output a different node
8595                                       type */
8596             ret = reg_node(pRExC_state,
8597                            (U8) ((! FOLD) ? EXACT
8598                                           : (LOC)
8599                                              ? EXACTFL
8600                                              : (MORE_ASCII_RESTRICTED)
8601                                                ? EXACTFA
8602                                                : (AT_LEAST_UNI_SEMANTICS)
8603                                                  ? EXACTFU
8604                                                  : EXACTF)
8605                     );
8606             s = STRING(ret);
8607             for (len = 0, p = RExC_parse - 1;
8608               len < 127 && p < RExC_end;
8609               len++)
8610             {
8611                 char * const oldp = p;
8612
8613                 if (RExC_flags & RXf_PMf_EXTENDED)
8614                     p = regwhite( pRExC_state, p );
8615                 switch ((U8)*p) {
8616                 case LATIN_SMALL_LETTER_SHARP_S:
8617                 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8618                 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8619                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8620                                 goto normal_default;
8621                 case '^':
8622                 case '$':
8623                 case '.':
8624                 case '[':
8625                 case '(':
8626                 case ')':
8627                 case '|':
8628                     goto loopdone;
8629                 case '\\':
8630                     /* Literal Escapes Switch
8631
8632                        This switch is meant to handle escape sequences that
8633                        resolve to a literal character.
8634
8635                        Every escape sequence that represents something
8636                        else, like an assertion or a char class, is handled
8637                        in the switch marked 'Special Escapes' above in this
8638                        routine, but also has an entry here as anything that
8639                        isn't explicitly mentioned here will be treated as
8640                        an unescaped equivalent literal.
8641                     */
8642
8643                     switch ((U8)*++p) {
8644                     /* These are all the special escapes. */
8645                     case LATIN_SMALL_LETTER_SHARP_S:
8646                     case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8647                     case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8648                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8649                                 goto normal_default;
8650                     case 'A':             /* Start assertion */
8651                     case 'b': case 'B':   /* Word-boundary assertion*/
8652                     case 'C':             /* Single char !DANGEROUS! */
8653                     case 'd': case 'D':   /* digit class */
8654                     case 'g': case 'G':   /* generic-backref, pos assertion */
8655                     case 'h': case 'H':   /* HORIZWS */
8656                     case 'k': case 'K':   /* named backref, keep marker */
8657                     case 'N':             /* named char sequence */
8658                     case 'p': case 'P':   /* Unicode property */
8659                               case 'R':   /* LNBREAK */
8660                     case 's': case 'S':   /* space class */
8661                     case 'v': case 'V':   /* VERTWS */
8662                     case 'w': case 'W':   /* word class */
8663                     case 'X':             /* eXtended Unicode "combining character sequence" */
8664                     case 'z': case 'Z':   /* End of line/string assertion */
8665                         --p;
8666                         goto loopdone;
8667
8668                     /* Anything after here is an escape that resolves to a
8669                        literal. (Except digits, which may or may not)
8670                      */
8671                     case 'n':
8672                         ender = '\n';
8673                         p++;
8674                         break;
8675                     case 'r':
8676                         ender = '\r';
8677                         p++;
8678                         break;
8679                     case 't':
8680                         ender = '\t';
8681                         p++;
8682                         break;
8683                     case 'f':
8684                         ender = '\f';
8685                         p++;
8686                         break;
8687                     case 'e':
8688                           ender = ASCII_TO_NATIVE('\033');
8689                         p++;
8690                         break;
8691                     case 'a':
8692                           ender = ASCII_TO_NATIVE('\007');
8693                         p++;
8694                         break;
8695                     case 'o':
8696                         {
8697                             STRLEN brace_len = len;
8698                             UV result;
8699                             const char* error_msg;
8700
8701                             bool valid = grok_bslash_o(p,
8702                                                        &result,
8703                                                        &brace_len,
8704                                                        &error_msg,
8705                                                        1);
8706                             p += brace_len;
8707                             if (! valid) {
8708                                 RExC_parse = p; /* going to die anyway; point
8709                                                    to exact spot of failure */
8710                                 vFAIL(error_msg);
8711                             }
8712                             else
8713                             {
8714                                 ender = result;
8715                             }
8716                             if (PL_encoding && ender < 0x100) {
8717                                 goto recode_encoding;
8718                             }
8719                             if (ender > 0xff) {
8720                                 REQUIRE_UTF8;
8721                             }
8722                             break;
8723                         }
8724                     case 'x':
8725                         if (*++p == '{') {
8726                             char* const e = strchr(p, '}');
8727
8728                             if (!e) {
8729                                 RExC_parse = p + 1;
8730                                 vFAIL("Missing right brace on \\x{}");
8731                             }
8732                             else {
8733                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8734                                     | PERL_SCAN_DISALLOW_PREFIX;
8735                                 STRLEN numlen = e - p - 1;
8736                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
8737                                 if (ender > 0xff)
8738                                     REQUIRE_UTF8;
8739                                 p = e + 1;
8740                             }
8741                         }
8742                         else {
8743                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8744                             STRLEN numlen = 2;
8745                             ender = grok_hex(p, &numlen, &flags, NULL);
8746                             p += numlen;
8747                         }
8748                         if (PL_encoding && ender < 0x100)
8749                             goto recode_encoding;
8750                         break;
8751                     case 'c':
8752                         p++;
8753                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
8754                         break;
8755                     case '0': case '1': case '2': case '3':case '4':
8756                     case '5': case '6': case '7': case '8':case '9':
8757                         if (*p == '0' ||
8758                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
8759                         {
8760                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8761                             STRLEN numlen = 3;
8762                             ender = grok_oct(p, &numlen, &flags, NULL);
8763                             if (ender > 0xff) {
8764                                 REQUIRE_UTF8;
8765                             }
8766                             p += numlen;
8767                         }
8768                         else {
8769                             --p;
8770                             goto loopdone;
8771                         }
8772                         if (PL_encoding && ender < 0x100)
8773                             goto recode_encoding;
8774                         break;
8775                     recode_encoding:
8776                         {
8777                             SV* enc = PL_encoding;
8778                             ender = reg_recode((const char)(U8)ender, &enc);
8779                             if (!enc && SIZE_ONLY)
8780                                 ckWARNreg(p, "Invalid escape in the specified encoding");
8781                             REQUIRE_UTF8;
8782                         }
8783                         break;
8784                     case '\0':
8785                         if (p >= RExC_end)
8786                             FAIL("Trailing \\");
8787                         /* FALL THROUGH */
8788                     default:
8789                         if (!SIZE_ONLY&& isALPHA(*p)) {
8790                             /* Include any { following the alpha to emphasize
8791                              * that it could be part of an escape at some point
8792                              * in the future */
8793                             int len = (*(p + 1) == '{') ? 2 : 1;
8794                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
8795                         }
8796                         goto normal_default;
8797                     }
8798                     break;
8799                 default:
8800                   normal_default:
8801                     if (UTF8_IS_START(*p) && UTF) {
8802                         STRLEN numlen;
8803                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
8804                                                &numlen, UTF8_ALLOW_DEFAULT);
8805                         p += numlen;
8806                     }
8807                     else
8808                         ender = (U8) *p++;
8809                     break;
8810                 } /* End of switch on the literal */
8811
8812                 /* Certain characters are problematic because their folded
8813                  * length is so different from their original length that it
8814                  * isn't handleable by the optimizer.  They are therefore not
8815                  * placed in an EXACTish node; and are here handled specially.
8816                  * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
8817                  * putting it in a special node keeps regexec from having to
8818                  * deal with a non-utf8 multi-char fold */
8819                 if (FOLD
8820                     && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
8821                     && is_TRICKYFOLD_cp(ender))
8822                 {
8823                     /* If is in middle of outputting characters into an
8824                      * EXACTish node, go output what we have so far, and
8825                      * position the parse so that this will be called again
8826                      * immediately */
8827                     if (len) {
8828                         p  = oldp;
8829                         goto loopdone;
8830                     }
8831                     else {
8832
8833                         /* Here we are ready to output our tricky fold
8834                          * character.  What's done is to pretend it's in a
8835                          * [bracketed] class, and let the code that deals with
8836                          * those handle it, as that code has all the
8837                          * intelligence necessary.  First save the current
8838                          * parse state, get rid of the already allocated EXACT
8839                          * node that the ANYOFV node will replace, and point
8840                          * the parse to a buffer which we fill with the
8841                          * character we want the regclass code to think is
8842                          * being parsed */
8843                         char* const oldregxend = RExC_end;
8844                         char tmpbuf[2];
8845                         RExC_emit = orig_emit;
8846                         RExC_parse = tmpbuf;
8847                         if (UTF) {
8848                             tmpbuf[0] = UTF8_TWO_BYTE_HI(ender);
8849                             tmpbuf[1] = UTF8_TWO_BYTE_LO(ender);
8850                             RExC_end = RExC_parse + 2;
8851                         }
8852                         else {
8853                             tmpbuf[0] = (char) ender;
8854                             RExC_end = RExC_parse + 1;
8855                         }
8856
8857                         ret = regclass(pRExC_state,depth+1);
8858
8859                         /* Here, have parsed the buffer.  Reset the parse to
8860                          * the actual input, and return */
8861                         RExC_end = oldregxend;
8862                         RExC_parse = p - 1;
8863
8864                         Set_Node_Offset(ret, RExC_parse);
8865                         Set_Node_Cur_Length(ret);
8866                         nextchar(pRExC_state);
8867                         *flagp |= HASWIDTH|SIMPLE;
8868                         return ret;
8869                     }
8870                 }
8871
8872                 if ( RExC_flags & RXf_PMf_EXTENDED)
8873                     p = regwhite( pRExC_state, p );
8874                 if (UTF && FOLD) {
8875                     /* Prime the casefolded buffer.  Locale rules, which apply
8876                      * only to code points < 256, aren't known until execution,
8877                      * so for them, just output the original character using
8878                      * utf8 */
8879                     if (LOC && ender < 256) {
8880                         if (UNI_IS_INVARIANT(ender)) {
8881                             *tmpbuf = (U8) ender;
8882                             foldlen = 1;
8883                         } else {
8884                             *tmpbuf = UTF8_TWO_BYTE_HI(ender);
8885                             *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
8886                             foldlen = 2;
8887                         }
8888                     }
8889                     else if (isASCII(ender)) {  /* Note: Here can't also be LOC
8890                                                  */
8891                         ender = toLOWER(ender);
8892                         *tmpbuf = (U8) ender;
8893                         foldlen = 1;
8894                     }
8895                     else if (! MORE_ASCII_RESTRICTED && ! LOC) {
8896
8897                         /* Locale and /aa require more selectivity about the
8898                          * fold, so are handled below.  Otherwise, here, just
8899                          * use the fold */
8900                         ender = toFOLD_uni(ender, tmpbuf, &foldlen);
8901                     }
8902                     else {
8903                         /* Under locale rules or /aa we are not to mix,
8904                          * respectively, ords < 256 or ASCII with non-.  So
8905                          * reject folds that mix them, using only the
8906                          * non-folded code point.  So do the fold to a
8907                          * temporary, and inspect each character in it. */
8908                         U8 trialbuf[UTF8_MAXBYTES_CASE+1];
8909                         U8* s = trialbuf;
8910                         UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
8911                         U8* e = s + foldlen;
8912                         bool fold_ok = TRUE;
8913
8914                         while (s < e) {
8915                             if (isASCII(*s)
8916                                 || (LOC && (UTF8_IS_INVARIANT(*s)
8917                                            || UTF8_IS_DOWNGRADEABLE_START(*s))))
8918                             {
8919                                 fold_ok = FALSE;
8920                                 break;
8921                             }
8922                             s += UTF8SKIP(s);
8923                         }
8924                         if (fold_ok) {
8925                             Copy(trialbuf, tmpbuf, foldlen, U8);
8926                             ender = tmpender;
8927                         }
8928                         else {
8929                             uvuni_to_utf8(tmpbuf, ender);
8930                             foldlen = UNISKIP(ender);
8931                         }
8932                     }
8933                 }
8934                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
8935                     if (len)
8936                         p = oldp;
8937                     else if (UTF) {
8938                          if (FOLD) {
8939                               /* Emit all the Unicode characters. */
8940                               STRLEN numlen;
8941                               for (foldbuf = tmpbuf;
8942                                    foldlen;
8943                                    foldlen -= numlen) {
8944                                    ender = utf8_to_uvchr(foldbuf, &numlen);
8945                                    if (numlen > 0) {
8946                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
8947                                         s       += unilen;
8948                                         len     += unilen;
8949                                         /* In EBCDIC the numlen
8950                                          * and unilen can differ. */
8951                                         foldbuf += numlen;
8952                                         if (numlen >= foldlen)
8953                                              break;
8954                                    }
8955                                    else
8956                                         break; /* "Can't happen." */
8957                               }
8958                          }
8959                          else {
8960                               const STRLEN unilen = reguni(pRExC_state, ender, s);
8961                               if (unilen > 0) {
8962                                    s   += unilen;
8963                                    len += unilen;
8964                               }
8965                          }
8966                     }
8967                     else {
8968                         len++;
8969                         REGC((char)ender, s++);
8970                     }
8971                     break;
8972                 }
8973                 if (UTF) {
8974                      if (FOLD) {
8975                           /* Emit all the Unicode characters. */
8976                           STRLEN numlen;
8977                           for (foldbuf = tmpbuf;
8978                                foldlen;
8979                                foldlen -= numlen) {
8980                                ender = utf8_to_uvchr(foldbuf, &numlen);
8981                                if (numlen > 0) {
8982                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
8983                                     len     += unilen;
8984                                     s       += unilen;
8985                                     /* In EBCDIC the numlen
8986                                      * and unilen can differ. */
8987                                     foldbuf += numlen;
8988                                     if (numlen >= foldlen)
8989                                          break;
8990                                }
8991                                else
8992                                     break;
8993                           }
8994                      }
8995                      else {
8996                           const STRLEN unilen = reguni(pRExC_state, ender, s);
8997                           if (unilen > 0) {
8998                                s   += unilen;
8999                                len += unilen;
9000                           }
9001                      }
9002                      len--;
9003                 }
9004                 else
9005                     REGC((char)ender, s++);
9006             }
9007         loopdone:   /* Jumped to when encounters something that shouldn't be in
9008                        the node */
9009             RExC_parse = p - 1;
9010             Set_Node_Cur_Length(ret); /* MJD */
9011             nextchar(pRExC_state);
9012             {
9013                 /* len is STRLEN which is unsigned, need to copy to signed */
9014                 IV iv = len;
9015                 if (iv < 0)
9016                     vFAIL("Internal disaster");
9017             }
9018             if (len > 0)
9019                 *flagp |= HASWIDTH;
9020             if (len == 1 && UNI_IS_INVARIANT(ender))
9021                 *flagp |= SIMPLE;
9022
9023             if (SIZE_ONLY)
9024                 RExC_size += STR_SZ(len);
9025             else {
9026                 STR_LEN(ret) = len;
9027                 RExC_emit += STR_SZ(len);
9028             }
9029         }
9030         break;
9031     }
9032
9033     return(ret);
9034
9035 /* Jumped to when an unrecognized character set is encountered */
9036 bad_charset:
9037     Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9038     return(NULL);
9039 }
9040
9041 STATIC char *
9042 S_regwhite( RExC_state_t *pRExC_state, char *p )
9043 {
9044     const char *e = RExC_end;
9045
9046     PERL_ARGS_ASSERT_REGWHITE;
9047
9048     while (p < e) {
9049         if (isSPACE(*p))
9050             ++p;
9051         else if (*p == '#') {
9052             bool ended = 0;
9053             do {
9054                 if (*p++ == '\n') {
9055                     ended = 1;
9056                     break;
9057                 }
9058             } while (p < e);
9059             if (!ended)
9060                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9061         }
9062         else
9063             break;
9064     }
9065     return p;
9066 }
9067
9068 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9069    Character classes ([:foo:]) can also be negated ([:^foo:]).
9070    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9071    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9072    but trigger failures because they are currently unimplemented. */
9073
9074 #define POSIXCC_DONE(c)   ((c) == ':')
9075 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9076 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9077
9078 STATIC I32
9079 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9080 {
9081     dVAR;
9082     I32 namedclass = OOB_NAMEDCLASS;
9083
9084     PERL_ARGS_ASSERT_REGPPOSIXCC;
9085
9086     if (value == '[' && RExC_parse + 1 < RExC_end &&
9087         /* I smell either [: or [= or [. -- POSIX has been here, right? */
9088         POSIXCC(UCHARAT(RExC_parse))) {
9089         const char c = UCHARAT(RExC_parse);
9090         char* const s = RExC_parse++;
9091
9092         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9093             RExC_parse++;
9094         if (RExC_parse == RExC_end)
9095             /* Grandfather lone [:, [=, [. */
9096             RExC_parse = s;
9097         else {
9098             const char* const t = RExC_parse++; /* skip over the c */
9099             assert(*t == c);
9100
9101             if (UCHARAT(RExC_parse) == ']') {
9102                 const char *posixcc = s + 1;
9103                 RExC_parse++; /* skip over the ending ] */
9104
9105                 if (*s == ':') {
9106                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9107                     const I32 skip = t - posixcc;
9108
9109                     /* Initially switch on the length of the name.  */
9110                     switch (skip) {
9111                     case 4:
9112                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9113                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9114                         break;
9115                     case 5:
9116                         /* Names all of length 5.  */
9117                         /* alnum alpha ascii blank cntrl digit graph lower
9118                            print punct space upper  */
9119                         /* Offset 4 gives the best switch position.  */
9120                         switch (posixcc[4]) {
9121                         case 'a':
9122                             if (memEQ(posixcc, "alph", 4)) /* alpha */
9123                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
9124                             break;
9125                         case 'e':
9126                             if (memEQ(posixcc, "spac", 4)) /* space */
9127                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
9128                             break;
9129                         case 'h':
9130                             if (memEQ(posixcc, "grap", 4)) /* graph */
9131                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
9132                             break;
9133                         case 'i':
9134                             if (memEQ(posixcc, "asci", 4)) /* ascii */
9135                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
9136                             break;
9137                         case 'k':
9138                             if (memEQ(posixcc, "blan", 4)) /* blank */
9139                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
9140                             break;
9141                         case 'l':
9142                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9143                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
9144                             break;
9145                         case 'm':
9146                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
9147                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
9148                             break;
9149                         case 'r':
9150                             if (memEQ(posixcc, "lowe", 4)) /* lower */
9151                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9152                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
9153                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
9154                             break;
9155                         case 't':
9156                             if (memEQ(posixcc, "digi", 4)) /* digit */
9157                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9158                             else if (memEQ(posixcc, "prin", 4)) /* print */
9159                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9160                             else if (memEQ(posixcc, "punc", 4)) /* punct */
9161                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
9162                             break;
9163                         }
9164                         break;
9165                     case 6:
9166                         if (memEQ(posixcc, "xdigit", 6))
9167                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
9168                         break;
9169                     }
9170
9171                     if (namedclass == OOB_NAMEDCLASS)
9172                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9173                                       t - s - 1, s + 1);
9174                     assert (posixcc[skip] == ':');
9175                     assert (posixcc[skip+1] == ']');
9176                 } else if (!SIZE_ONLY) {
9177                     /* [[=foo=]] and [[.foo.]] are still future. */
9178
9179                     /* adjust RExC_parse so the warning shows after
9180                        the class closes */
9181                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
9182                         RExC_parse++;
9183                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9184                 }
9185             } else {
9186                 /* Maternal grandfather:
9187                  * "[:" ending in ":" but not in ":]" */
9188                 RExC_parse = s;
9189             }
9190         }
9191     }
9192
9193     return namedclass;
9194 }
9195
9196 STATIC void
9197 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
9198 {
9199     dVAR;
9200
9201     PERL_ARGS_ASSERT_CHECKPOSIXCC;
9202
9203     if (POSIXCC(UCHARAT(RExC_parse))) {
9204         const char *s = RExC_parse;
9205         const char  c = *s++;
9206
9207         while (isALNUM(*s))
9208             s++;
9209         if (*s && c == *s && s[1] == ']') {
9210             ckWARN3reg(s+2,
9211                        "POSIX syntax [%c %c] belongs inside character classes",
9212                        c, c);
9213
9214             /* [[=foo=]] and [[.foo.]] are still future. */
9215             if (POSIXCC_NOTYET(c)) {
9216                 /* adjust RExC_parse so the error shows after
9217                    the class closes */
9218                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
9219                     NOOP;
9220                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9221             }
9222         }
9223     }
9224 }
9225
9226 /* No locale test, and always Unicode semantics */
9227 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                                          \
9228 ANYOF_##NAME:                                                                  \
9229         for (value = 0; value < 256; value++)                                  \
9230             if (TEST)                                                          \
9231             stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9232     yesno = '+';                                                               \
9233     what = WORD;                                                               \
9234     break;                                                                     \
9235 case ANYOF_N##NAME:                                                            \
9236         for (value = 0; value < 256; value++)                                  \
9237             if (!TEST)                                                         \
9238             stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9239     yesno = '!';                                                               \
9240     what = WORD;                                                               \
9241     break
9242
9243 /* Like the above, but there are differences if we are in uni-8-bit or not, so
9244  * there are two tests passed in, to use depending on that. There aren't any
9245  * cases where the label is different from the name, so no need for that
9246  * parameter */
9247 #define _C_C_T_(NAME, TEST_8, TEST_7, WORD)                                    \
9248 ANYOF_##NAME:                                                                  \
9249     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);                               \
9250     else if (UNI_SEMANTICS) {                                                  \
9251         for (value = 0; value < 256; value++) {                                \
9252             if (TEST_8(value)) stored +=                                       \
9253                       set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9254         }                                                                      \
9255     }                                                                          \
9256     else {                                                                     \
9257         for (value = 0; value < 128; value++) {                                \
9258             if (TEST_7(UNI_TO_NATIVE(value))) stored +=                        \
9259                 set_regclass_bit(pRExC_state, ret,                     \
9260                                    (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);                 \
9261         }                                                                      \
9262     }                                                                          \
9263     yesno = '+';                                                               \
9264     what = WORD;                                                               \
9265     break;                                                                     \
9266 case ANYOF_N##NAME:                                                            \
9267     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);                              \
9268     else if (UNI_SEMANTICS) {                                                  \
9269         for (value = 0; value < 256; value++) {                                \
9270             if (! TEST_8(value)) stored +=                                     \
9271                     set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);    \
9272         }                                                                      \
9273     }                                                                          \
9274     else {                                                                     \
9275         for (value = 0; value < 128; value++) {                                \
9276             if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit(  \
9277                         pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);    \
9278         }                                                                      \
9279         if (AT_LEAST_ASCII_RESTRICTED) {                                       \
9280             for (value = 128; value < 256; value++) {                          \
9281              stored += set_regclass_bit(                                     \
9282                            pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9283             }                                                                  \
9284             ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;                             \
9285         }                                                                      \
9286         else {                                                                 \
9287             /* For a non-ut8 target string with DEPENDS semantics, all above   \
9288              * ASCII Latin1 code points match the complement of any of the     \
9289              * classes.  But in utf8, they have their Unicode semantics, so    \
9290              * can't just set them in the bitmap, or else regexec.c will think \
9291              * they matched when they shouldn't. */                            \
9292             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;                     \
9293         }                                                                      \
9294     }                                                                          \
9295     yesno = '!';                                                               \
9296     what = WORD;                                                               \
9297     break
9298
9299 STATIC U8
9300 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9301 {
9302
9303     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
9304      * Locale folding is done at run-time, so this function should not be
9305      * called for nodes that are for locales.
9306      *
9307      * This function sets the bit corresponding to the fold of the input
9308      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
9309      * 'F' is 'f'.
9310      *
9311      * It also knows about the characters that are in the bitmap that have
9312      * folds that are matchable only outside it, and sets the appropriate lists
9313      * and flags.
9314      *
9315      * It returns the number of bits that actually changed from 0 to 1 */
9316
9317     U8 stored = 0;
9318     U8 fold;
9319
9320     PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
9321
9322     fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
9323                                     : PL_fold[value];
9324
9325     /* It assumes the bit for 'value' has already been set */
9326     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
9327         ANYOF_BITMAP_SET(node, fold);
9328         stored++;
9329     }
9330     if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
9331         /* Certain Latin1 characters have matches outside the bitmap.  To get
9332          * here, 'value' is one of those characters.   None of these matches is
9333          * valid for ASCII characters under /aa, which have been excluded by
9334          * the 'if' above.  The matches fall into three categories:
9335          * 1) They are singly folded-to or -from an above 255 character, as
9336          *    LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
9337          *    WITH DIAERESIS;
9338          * 2) They are part of a multi-char fold with another character in the
9339          *    bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
9340          * 3) They are part of a multi-char fold with a character not in the
9341          *    bitmap, such as various ligatures.
9342          * We aren't dealing fully with multi-char folds, except we do deal
9343          * with the pattern containing a character that has a multi-char fold
9344          * (not so much the inverse).
9345          * For types 1) and 3), the matches only happen when the target string
9346          * is utf8; that's not true for 2), and we set a flag for it.
9347          *
9348          * The code below adds to the passed in inversion list the single fold
9349          * closures for 'value'.  The values are hard-coded here so that an
9350          * innocent-looking character class, like /[ks]/i won't have to go out
9351          * to disk to find the possible matches.  XXX It would be better to
9352          * generate these via regen, in case a new version of the Unicode
9353          * standard adds new mappings, though that is not really likely. */
9354         switch (value) {
9355             case 'k':
9356             case 'K':
9357                 /* KELVIN SIGN */
9358                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
9359                 break;
9360             case 's':
9361             case 'S':
9362                 /* LATIN SMALL LETTER LONG S */
9363                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
9364                 break;
9365             case MICRO_SIGN:
9366                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9367                                                  GREEK_SMALL_LETTER_MU);
9368                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9369                                                  GREEK_CAPITAL_LETTER_MU);
9370                 break;
9371             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
9372             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
9373                 /* ANGSTROM SIGN */
9374                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
9375                 if (DEPENDS_SEMANTICS) {    /* See DEPENDS comment below */
9376                     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9377                                                      PL_fold_latin1[value]);
9378                 }
9379                 break;
9380             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
9381                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9382                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
9383                 break;
9384             case LATIN_SMALL_LETTER_SHARP_S:
9385                 /* 0x1E9E is LATIN CAPITAL LETTER SHARP S */
9386                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x1E9E);
9387
9388                 /* Under /a, /d, and /u, this can match the two chars "ss" */
9389                 if (! MORE_ASCII_RESTRICTED) {
9390                     add_alternate(alternate_ptr, (U8 *) "ss", 2);
9391
9392                     /* And under /u or /a, it can match even if the target is
9393                      * not utf8 */
9394                     if (AT_LEAST_UNI_SEMANTICS) {
9395                         ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9396                     }
9397                 }
9398                 break;
9399             case 'F': case 'f':
9400             case 'I': case 'i':
9401             case 'L': case 'l':
9402             case 'T': case 't':
9403                 /* These all are targets of multi-character folds, which can
9404                  * occur with only non-Latin1 characters in the fold, so they
9405                  * can match if the target string isn't UTF-8 */
9406                 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9407                 break;
9408             case 'A': case 'a':
9409             case 'H': case 'h':
9410             case 'J': case 'j':
9411             case 'N': case 'n':
9412             case 'W': case 'w':
9413             case 'Y': case 'y':
9414                 /* These all are targets of multi-character folds, which occur
9415                  * only with a non-Latin1 character as part of the fold, so
9416                  * they can't match unless the target string is in UTF-8, so no
9417                  * action here is necessary */
9418                 break;
9419             default:
9420                 /* Use deprecated warning to increase the chances of this
9421                  * being output */
9422                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
9423                 break;
9424         }
9425     }
9426     else if (DEPENDS_SEMANTICS
9427             && ! isASCII(value)
9428             && PL_fold_latin1[value] != value)
9429     {
9430            /* Under DEPENDS rules, non-ASCII Latin1 characters match their
9431             * folds only when the target string is in UTF-8.  We add the fold
9432             * here to the list of things to match outside the bitmap, which
9433             * won't be looked at unless it is UTF8 (or else if something else
9434             * says to look even if not utf8, but those things better not happen
9435             * under DEPENDS semantics. */
9436         *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
9437     }
9438
9439     return stored;
9440 }
9441
9442
9443 PERL_STATIC_INLINE U8
9444 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9445 {
9446     /* This inline function sets a bit in the bitmap if not already set, and if
9447      * appropriate, its fold, returning the number of bits that actually
9448      * changed from 0 to 1 */
9449
9450     U8 stored;
9451
9452     PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
9453
9454     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
9455         return 0;
9456     }
9457
9458     ANYOF_BITMAP_SET(node, value);
9459     stored = 1;
9460
9461     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
9462         stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
9463     }
9464
9465     return stored;
9466 }
9467
9468 STATIC void
9469 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
9470 {
9471     /* Adds input 'string' with length 'len' to the ANYOF node's unicode
9472      * alternate list, pointed to by 'alternate_ptr'.  This is an array of
9473      * the multi-character folds of characters in the node */
9474     SV *sv;
9475
9476     PERL_ARGS_ASSERT_ADD_ALTERNATE;
9477
9478     if (! *alternate_ptr) {
9479         *alternate_ptr = newAV();
9480     }
9481     sv = newSVpvn_utf8((char*)string, len, TRUE);
9482     av_push(*alternate_ptr, sv);
9483     return;
9484 }
9485
9486 /*
9487    parse a class specification and produce either an ANYOF node that
9488    matches the pattern or perhaps will be optimized into an EXACTish node
9489    instead. The node contains a bit map for the first 256 characters, with the
9490    corresponding bit set if that character is in the list.  For characters
9491    above 255, a range list is used */
9492
9493 STATIC regnode *
9494 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
9495 {
9496     dVAR;
9497     register UV nextvalue;
9498     register IV prevvalue = OOB_UNICODE;
9499     register IV range = 0;
9500     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
9501     register regnode *ret;
9502     STRLEN numlen;
9503     IV namedclass;
9504     char *rangebegin = NULL;
9505     bool need_class = 0;
9506     SV *listsv = NULL;
9507     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
9508                                       than just initialized.  */
9509     UV n;
9510
9511     /* code points this node matches that can't be stored in the bitmap */
9512     HV* nonbitmap = NULL;
9513
9514     /* The items that are to match that aren't stored in the bitmap, but are a
9515      * result of things that are stored there.  This is the fold closure of
9516      * such a character, either because it has DEPENDS semantics and shouldn't
9517      * be matched unless the target string is utf8, or is a code point that is
9518      * too large for the bit map, as for example, the fold of the MICRO SIGN is
9519      * above 255.  This all is solely for performance reasons.  By having this
9520      * code know the outside-the-bitmap folds that the bitmapped characters are
9521      * involved with, we don't have to go out to disk to find the list of
9522      * matches, unless the character class includes code points that aren't
9523      * storable in the bit map.  That means that a character class with an 's'
9524      * in it, for example, doesn't need to go out to disk to find everything
9525      * that matches.  A 2nd list is used so that the 'nonbitmap' list is kept
9526      * empty unless there is something whose fold we don't know about, and will
9527      * have to go out to the disk to find. */
9528     HV* l1_fold_invlist = NULL;
9529
9530     /* List of multi-character folds that are matched by this node */
9531     AV* unicode_alternate  = NULL;
9532 #ifdef EBCDIC
9533     UV literal_endpoint = 0;
9534 #endif
9535     UV stored = 0;  /* how many chars stored in the bitmap */
9536
9537     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
9538         case we need to change the emitted regop to an EXACT. */
9539     const char * orig_parse = RExC_parse;
9540     GET_RE_DEBUG_FLAGS_DECL;
9541
9542     PERL_ARGS_ASSERT_REGCLASS;
9543 #ifndef DEBUGGING
9544     PERL_UNUSED_ARG(depth);
9545 #endif
9546
9547     DEBUG_PARSE("clas");
9548
9549     /* Assume we are going to generate an ANYOF node. */
9550     ret = reganode(pRExC_state, ANYOF, 0);
9551
9552
9553     if (!SIZE_ONLY) {
9554         ANYOF_FLAGS(ret) = 0;
9555     }
9556
9557     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
9558         RExC_naughty++;
9559         RExC_parse++;
9560         if (!SIZE_ONLY)
9561             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
9562     }
9563
9564     if (SIZE_ONLY) {
9565         RExC_size += ANYOF_SKIP;
9566         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
9567     }
9568     else {
9569         RExC_emit += ANYOF_SKIP;
9570         if (LOC) {
9571             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
9572         }
9573         ANYOF_BITMAP_ZERO(ret);
9574         listsv = newSVpvs("# comment\n");
9575         initial_listsv_len = SvCUR(listsv);
9576     }
9577
9578     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9579
9580     if (!SIZE_ONLY && POSIXCC(nextvalue))
9581         checkposixcc(pRExC_state);
9582
9583     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
9584     if (UCHARAT(RExC_parse) == ']')
9585         goto charclassloop;
9586
9587 parseit:
9588     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
9589
9590     charclassloop:
9591
9592         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9593
9594         if (!range)
9595             rangebegin = RExC_parse;
9596         if (UTF) {
9597             value = utf8n_to_uvchr((U8*)RExC_parse,
9598                                    RExC_end - RExC_parse,
9599                                    &numlen, UTF8_ALLOW_DEFAULT);
9600             RExC_parse += numlen;
9601         }
9602         else
9603             value = UCHARAT(RExC_parse++);
9604
9605         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9606         if (value == '[' && POSIXCC(nextvalue))
9607             namedclass = regpposixcc(pRExC_state, value);
9608         else if (value == '\\') {
9609             if (UTF) {
9610                 value = utf8n_to_uvchr((U8*)RExC_parse,
9611                                    RExC_end - RExC_parse,
9612                                    &numlen, UTF8_ALLOW_DEFAULT);
9613                 RExC_parse += numlen;
9614             }
9615             else
9616                 value = UCHARAT(RExC_parse++);
9617             /* Some compilers cannot handle switching on 64-bit integer
9618              * values, therefore value cannot be an UV.  Yes, this will
9619              * be a problem later if we want switch on Unicode.
9620              * A similar issue a little bit later when switching on
9621              * namedclass. --jhi */
9622             switch ((I32)value) {
9623             case 'w':   namedclass = ANYOF_ALNUM;       break;
9624             case 'W':   namedclass = ANYOF_NALNUM;      break;
9625             case 's':   namedclass = ANYOF_SPACE;       break;
9626             case 'S':   namedclass = ANYOF_NSPACE;      break;
9627             case 'd':   namedclass = ANYOF_DIGIT;       break;
9628             case 'D':   namedclass = ANYOF_NDIGIT;      break;
9629             case 'v':   namedclass = ANYOF_VERTWS;      break;
9630             case 'V':   namedclass = ANYOF_NVERTWS;     break;
9631             case 'h':   namedclass = ANYOF_HORIZWS;     break;
9632             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
9633             case 'N':  /* Handle \N{NAME} in class */
9634                 {
9635                     /* We only pay attention to the first char of
9636                     multichar strings being returned. I kinda wonder
9637                     if this makes sense as it does change the behaviour
9638                     from earlier versions, OTOH that behaviour was broken
9639                     as well. */
9640                     UV v; /* value is register so we cant & it /grrr */
9641                     if (reg_namedseq(pRExC_state, &v, NULL)) {
9642                         goto parseit;
9643                     }
9644                     value= v;
9645                 }
9646                 break;
9647             case 'p':
9648             case 'P':
9649                 {
9650                 char *e;
9651                 if (RExC_parse >= RExC_end)
9652                     vFAIL2("Empty \\%c{}", (U8)value);
9653                 if (*RExC_parse == '{') {
9654                     const U8 c = (U8)value;
9655                     e = strchr(RExC_parse++, '}');
9656                     if (!e)
9657                         vFAIL2("Missing right brace on \\%c{}", c);
9658                     while (isSPACE(UCHARAT(RExC_parse)))
9659                         RExC_parse++;
9660                     if (e == RExC_parse)
9661                         vFAIL2("Empty \\%c{}", c);
9662                     n = e - RExC_parse;
9663                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
9664                         n--;
9665                 }
9666                 else {
9667                     e = RExC_parse;
9668                     n = 1;
9669                 }
9670                 if (!SIZE_ONLY) {
9671                     if (UCHARAT(RExC_parse) == '^') {
9672                          RExC_parse++;
9673                          n--;
9674                          value = value == 'p' ? 'P' : 'p'; /* toggle */
9675                          while (isSPACE(UCHARAT(RExC_parse))) {
9676                               RExC_parse++;
9677                               n--;
9678                          }
9679                     }
9680
9681                     /* Add the property name to the list.  If /i matching, give
9682                      * a different name which consists of the normal name
9683                      * sandwiched between two underscores and '_i'.  The design
9684                      * is discussed in the commit message for this. */
9685                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
9686                                         (value=='p' ? '+' : '!'),
9687                                         (FOLD) ? "__" : "",
9688                                         (int)n,
9689                                         RExC_parse,
9690                                         (FOLD) ? "_i" : ""
9691                                     );
9692                 }
9693                 RExC_parse = e + 1;
9694
9695                 /* The \p could match something in the Latin1 range, hence
9696                  * something that isn't utf8 */
9697                 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
9698                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
9699
9700                 /* \p means they want Unicode semantics */
9701                 RExC_uni_semantics = 1;
9702                 }
9703                 break;
9704             case 'n':   value = '\n';                   break;
9705             case 'r':   value = '\r';                   break;
9706             case 't':   value = '\t';                   break;
9707             case 'f':   value = '\f';                   break;
9708             case 'b':   value = '\b';                   break;
9709             case 'e':   value = ASCII_TO_NATIVE('\033');break;
9710             case 'a':   value = ASCII_TO_NATIVE('\007');break;
9711             case 'o':
9712                 RExC_parse--;   /* function expects to be pointed at the 'o' */
9713                 {
9714                     const char* error_msg;
9715                     bool valid = grok_bslash_o(RExC_parse,
9716                                                &value,
9717                                                &numlen,
9718                                                &error_msg,
9719                                                SIZE_ONLY);
9720                     RExC_parse += numlen;
9721                     if (! valid) {
9722                         vFAIL(error_msg);
9723                     }
9724                 }
9725                 if (PL_encoding && value < 0x100) {
9726                     goto recode_encoding;
9727                 }
9728                 break;
9729             case 'x':
9730                 if (*RExC_parse == '{') {
9731                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9732                         | PERL_SCAN_DISALLOW_PREFIX;
9733                     char * const e = strchr(RExC_parse++, '}');
9734                     if (!e)
9735                         vFAIL("Missing right brace on \\x{}");
9736
9737                     numlen = e - RExC_parse;
9738                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9739                     RExC_parse = e + 1;
9740                 }
9741                 else {
9742                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9743                     numlen = 2;
9744                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9745                     RExC_parse += numlen;
9746                 }
9747                 if (PL_encoding && value < 0x100)
9748                     goto recode_encoding;
9749                 break;
9750             case 'c':
9751                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
9752                 break;
9753             case '0': case '1': case '2': case '3': case '4':
9754             case '5': case '6': case '7':
9755                 {
9756                     /* Take 1-3 octal digits */
9757                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9758                     numlen = 3;
9759                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
9760                     RExC_parse += numlen;
9761                     if (PL_encoding && value < 0x100)
9762                         goto recode_encoding;
9763                     break;
9764                 }
9765             recode_encoding:
9766                 {
9767                     SV* enc = PL_encoding;
9768                     value = reg_recode((const char)(U8)value, &enc);
9769                     if (!enc && SIZE_ONLY)
9770                         ckWARNreg(RExC_parse,
9771                                   "Invalid escape in the specified encoding");
9772                     break;
9773                 }
9774             default:
9775                 /* Allow \_ to not give an error */
9776                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
9777                     ckWARN2reg(RExC_parse,
9778                                "Unrecognized escape \\%c in character class passed through",
9779                                (int)value);
9780                 }
9781                 break;
9782             }
9783         } /* end of \blah */
9784 #ifdef EBCDIC
9785         else
9786             literal_endpoint++;
9787 #endif
9788
9789         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
9790
9791             /* What matches in a locale is not known until runtime, so need to
9792              * (one time per class) allocate extra space to pass to regexec.
9793              * The space will contain a bit for each named class that is to be
9794              * matched against.  This isn't needed for \p{} and pseudo-classes,
9795              * as they are not affected by locale, and hence are dealt with
9796              * separately */
9797             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
9798                 need_class = 1;
9799                 if (SIZE_ONLY) {
9800                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
9801                 }
9802                 else {
9803                     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
9804                     ANYOF_CLASS_ZERO(ret);
9805                 }
9806                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
9807             }
9808
9809             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
9810              * literal, as is the character that began the false range, i.e.
9811              * the 'a' in the examples */
9812             if (range) {
9813                 if (!SIZE_ONLY) {
9814                     const int w =
9815                         RExC_parse >= rangebegin ?
9816                         RExC_parse - rangebegin : 0;
9817                     ckWARN4reg(RExC_parse,
9818                                "False [] range \"%*.*s\"",
9819                                w, w, rangebegin);
9820
9821                     stored +=
9822                          set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9823                     if (prevvalue < 256) {
9824                         stored +=
9825                          set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
9826                     }
9827                     else {
9828                         nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
9829                     }
9830                 }
9831
9832                 range = 0; /* this was not a true range */
9833             }
9834
9835
9836
9837             if (!SIZE_ONLY) {
9838                 const char *what = NULL;
9839                 char yesno = 0;
9840
9841                 /* Possible truncation here but in some 64-bit environments
9842                  * the compiler gets heartburn about switch on 64-bit values.
9843                  * A similar issue a little earlier when switching on value.
9844                  * --jhi */
9845                 switch ((I32)namedclass) {
9846
9847                 case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
9848                 case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
9849                 case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
9850                 case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
9851                 case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
9852                 case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
9853                 case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
9854                 case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
9855                 case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
9856                 case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
9857                 /* \s, \w match all unicode if utf8. */
9858                 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
9859                 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
9860                 case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
9861                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
9862                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
9863                 case ANYOF_ASCII:
9864                     if (LOC)
9865                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
9866                     else {
9867                         for (value = 0; value < 128; value++)
9868                             stored +=
9869                               set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9870                     }
9871                     yesno = '+';
9872                     what = NULL;        /* Doesn't match outside ascii, so
9873                                            don't want to add +utf8:: */
9874                     break;
9875                 case ANYOF_NASCII:
9876                     if (LOC)
9877                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
9878                     else {
9879                         for (value = 128; value < 256; value++)
9880                             stored +=
9881                               set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9882                     }
9883                     ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9884                     yesno = '!';
9885                     what = "ASCII";
9886                     break;
9887                 case ANYOF_DIGIT:
9888                     if (LOC)
9889                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
9890                     else {
9891                         /* consecutive digits assumed */
9892                         for (value = '0'; value <= '9'; value++)
9893                             stored +=
9894                               set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9895                     }
9896                     yesno = '+';
9897                     what = "Digit";
9898                     break;
9899                 case ANYOF_NDIGIT:
9900                     if (LOC)
9901                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
9902                     else {
9903                         /* consecutive digits assumed */
9904                         for (value = 0; value < '0'; value++)
9905                             stored +=
9906                               set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9907                         for (value = '9' + 1; value < 256; value++)
9908                             stored +=
9909                               set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9910                     }
9911                     yesno = '!';
9912                     what = "Digit";
9913                     if (AT_LEAST_ASCII_RESTRICTED ) {
9914                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9915                     }
9916                     break;
9917                 case ANYOF_MAX:
9918                     /* this is to handle \p and \P */
9919                     break;
9920                 default:
9921                     vFAIL("Invalid [::] class");
9922                     break;
9923                 }
9924                 if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
9925                     /* Strings such as "+utf8::isWord\n" */
9926                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
9927                 }
9928
9929                 continue;
9930             }
9931         } /* end of namedclass \blah */
9932
9933         if (range) {
9934             if (prevvalue > (IV)value) /* b-a */ {
9935                 const int w = RExC_parse - rangebegin;
9936                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
9937                 range = 0; /* not a valid range */
9938             }
9939         }
9940         else {
9941             prevvalue = value; /* save the beginning of the range */
9942             if (RExC_parse+1 < RExC_end
9943                 && *RExC_parse == '-'
9944                 && RExC_parse[1] != ']')
9945             {
9946                 RExC_parse++;
9947
9948                 /* a bad range like \w-, [:word:]- ? */
9949                 if (namedclass > OOB_NAMEDCLASS) {
9950                     if (ckWARN(WARN_REGEXP)) {
9951                         const int w =
9952                             RExC_parse >= rangebegin ?
9953                             RExC_parse - rangebegin : 0;
9954                         vWARN4(RExC_parse,
9955                                "False [] range \"%*.*s\"",
9956                                w, w, rangebegin);
9957                     }
9958                     if (!SIZE_ONLY)
9959                         stored +=
9960                             set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9961                 } else
9962                     range = 1;  /* yeah, it's a range! */
9963                 continue;       /* but do it the next time */
9964             }
9965         }
9966
9967         /* non-Latin1 code point implies unicode semantics.  Must be set in
9968          * pass1 so is there for the whole of pass 2 */
9969         if (value > 255) {
9970             RExC_uni_semantics = 1;
9971         }
9972
9973         /* now is the next time */
9974         if (!SIZE_ONLY) {
9975             if (prevvalue < 256) {
9976                 const IV ceilvalue = value < 256 ? value : 255;
9977                 IV i;
9978 #ifdef EBCDIC
9979                 /* In EBCDIC [\x89-\x91] should include
9980                  * the \x8e but [i-j] should not. */
9981                 if (literal_endpoint == 2 &&
9982                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
9983                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
9984                 {
9985                     if (isLOWER(prevvalue)) {
9986                         for (i = prevvalue; i <= ceilvalue; i++)
9987                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9988                                 stored +=
9989                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9990                             }
9991                     } else {
9992                         for (i = prevvalue; i <= ceilvalue; i++)
9993                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9994                                 stored +=
9995                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9996                             }
9997                     }
9998                 }
9999                 else
10000 #endif
10001                       for (i = prevvalue; i <= ceilvalue; i++) {
10002                         stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
10003                       }
10004           }
10005           if (value > 255) {
10006             const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
10007             const UV natvalue      = NATIVE_TO_UNI(value);
10008             nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
10009         }
10010 #ifdef EBCDIC
10011             literal_endpoint = 0;
10012 #endif
10013         }
10014
10015         range = 0; /* this range (if it was one) is done now */
10016     }
10017
10018
10019
10020     if (SIZE_ONLY)
10021         return ret;
10022     /****** !SIZE_ONLY AFTER HERE *********/
10023
10024     /* If folding and there are code points above 255, we calculate all
10025      * characters that could fold to or from the ones already on the list */
10026     if (FOLD && nonbitmap) {
10027         UV i;
10028
10029         HV* fold_intersection;
10030         UV* fold_list;
10031
10032         /* This is a list of all the characters that participate in folds
10033             * (except marks, etc in multi-char folds */
10034         if (! PL_utf8_foldable) {
10035             SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
10036             PL_utf8_foldable = _swash_to_invlist(swash);
10037         }
10038
10039         /* This is a hash that for a particular fold gives all characters
10040             * that are involved in it */
10041         if (! PL_utf8_foldclosures) {
10042
10043             /* If we were unable to find any folds, then we likely won't be
10044              * able to find the closures.  So just create an empty list.
10045              * Folding will effectively be restricted to the non-Unicode rules
10046              * hard-coded into Perl.  (This case happens legitimately during
10047              * compilation of Perl itself before the Unicode tables are
10048              * generated) */
10049             if (invlist_len(PL_utf8_foldable) == 0) {
10050                 PL_utf8_foldclosures = _new_invlist(0);
10051             } else {
10052                 /* If the folds haven't been read in, call a fold function
10053                     * to force that */
10054                 if (! PL_utf8_tofold) {
10055                     U8 dummy[UTF8_MAXBYTES+1];
10056                     STRLEN dummy_len;
10057                     to_utf8_fold((U8*) "A", dummy, &dummy_len);
10058                 }
10059                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
10060             }
10061         }
10062
10063         /* Only the characters in this class that participate in folds need
10064             * be checked.  Get the intersection of this class and all the
10065             * possible characters that are foldable.  This can quickly narrow
10066             * down a large class */
10067         fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
10068
10069         /* Now look at the foldable characters in this class individually */
10070         fold_list = invlist_array(fold_intersection);
10071         for (i = 0; i < invlist_len(fold_intersection); i++) {
10072             UV j;
10073
10074             /* The next entry is the beginning of the range that is in the
10075              * class */
10076             UV start = fold_list[i++];
10077
10078
10079             /* The next entry is the beginning of the next range, which
10080                 * isn't in the class, so the end of the current range is one
10081                 * less than that */
10082             UV end = fold_list[i] - 1;
10083
10084             /* Look at every character in the range */
10085             for (j = start; j <= end; j++) {
10086
10087                 /* Get its fold */
10088                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10089                 STRLEN foldlen;
10090                 const UV f = to_uni_fold(j, foldbuf, &foldlen);
10091
10092                 if (foldlen > (STRLEN)UNISKIP(f)) {
10093
10094                     /* Any multicharacter foldings (disallowed in
10095                         * lookbehind patterns) require the following
10096                         * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
10097                         * E folds into "pq" and F folds into "rst", all other
10098                         * characters fold to single characters.  We save away
10099                         * these multicharacter foldings, to be later saved as
10100                         * part of the additional "s" data. */
10101                     if (! RExC_in_lookbehind) {
10102                         U8* loc = foldbuf;
10103                         U8* e = foldbuf + foldlen;
10104
10105                         /* If any of the folded characters of this are in
10106                             * the Latin1 range, tell the regex engine that
10107                             * this can match a non-utf8 target string.  The
10108                             * only multi-byte fold whose source is in the
10109                             * Latin1 range (U+00DF) applies only when the
10110                             * target string is utf8, or under unicode rules */
10111                         if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
10112                             while (loc < e) {
10113
10114                                 /* Can't mix ascii with non- under /aa */
10115                                 if (MORE_ASCII_RESTRICTED
10116                                     && (isASCII(*loc) != isASCII(j)))
10117                                 {
10118                                     goto end_multi_fold;
10119                                 }
10120                                 if (UTF8_IS_INVARIANT(*loc)
10121                                     || UTF8_IS_DOWNGRADEABLE_START(*loc))
10122                                 {
10123                                     /* Can't mix above and below 256 under
10124                                         * LOC */
10125                                     if (LOC) {
10126                                         goto end_multi_fold;
10127                                     }
10128                                     ANYOF_FLAGS(ret)
10129                                             |= ANYOF_NONBITMAP_NON_UTF8;
10130                                     break;
10131                                 }
10132                                 loc += UTF8SKIP(loc);
10133                             }
10134                         }
10135
10136                         add_alternate(&unicode_alternate, foldbuf, foldlen);
10137                     end_multi_fold: ;
10138                     }
10139                 }
10140                 else {
10141                     /* Single character fold.  Add everything in its fold
10142                         * closure to the list that this node should match */
10143                     SV** listp;
10144
10145                     /* The fold closures data structure is a hash with the
10146                         * keys being every character that is folded to, like
10147                         * 'k', and the values each an array of everything that
10148                         * folds to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
10149                     if ((listp = hv_fetch(PL_utf8_foldclosures,
10150                                     (char *) foldbuf, foldlen, FALSE)))
10151                     {
10152                         AV* list = (AV*) *listp;
10153                         IV k;
10154                         for (k = 0; k <= av_len(list); k++) {
10155                             SV** c_p = av_fetch(list, k, FALSE);
10156                             UV c;
10157                             if (c_p == NULL) {
10158                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
10159                             }
10160                             c = SvUV(*c_p);
10161
10162                             /* /aa doesn't allow folds between ASCII and
10163                                 * non-; /l doesn't allow them between above
10164                                 * and below 256 */
10165                             if ((MORE_ASCII_RESTRICTED
10166                                  && (isASCII(c) != isASCII(j)))
10167                                     || (LOC && ((c < 256) != (j < 256))))
10168                             {
10169                                 continue;
10170                             }
10171
10172                             if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
10173                                 stored += set_regclass_bit(pRExC_state,
10174                                         ret,
10175                                         (U8) c,
10176                                         &l1_fold_invlist, &unicode_alternate);
10177                             }
10178                                 /* It may be that the code point is already
10179                                     * in this range or already in the bitmap,
10180                                     * in which case we need do nothing */
10181                             else if ((c < start || c > end)
10182                                         && (c > 255
10183                                             || ! ANYOF_BITMAP_TEST(ret, c)))
10184                             {
10185                                 nonbitmap = add_cp_to_invlist(nonbitmap, c);
10186                             }
10187                         }
10188                     }
10189                 }
10190             }
10191         }
10192         invlist_destroy(fold_intersection);
10193     }
10194
10195     /* Combine the two lists into one. */
10196     if (l1_fold_invlist) {
10197         if (nonbitmap) {
10198             nonbitmap = invlist_union(nonbitmap, l1_fold_invlist);
10199         }
10200         else {
10201             nonbitmap = l1_fold_invlist;
10202         }
10203     }
10204
10205     /* Here, we have calculated what code points should be in the character
10206      * class.   Now we can see about various optimizations.  Fold calculation
10207      * needs to take place before inversion.  Otherwise /[^k]/i would invert to
10208      * include K, which under /i would match k. */
10209
10210     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
10211      * set the FOLD flag yet, so this this does optimize those.  It doesn't
10212      * optimize locale.  Doing so perhaps could be done as long as there is
10213      * nothing like \w in it; some thought also would have to be given to the
10214      * interaction with above 0x100 chars */
10215     if (! LOC
10216         && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT
10217         && ! unicode_alternate
10218         && ! nonbitmap
10219         && SvCUR(listsv) == initial_listsv_len)
10220     {
10221         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
10222             ANYOF_BITMAP(ret)[value] ^= 0xFF;
10223         stored = 256 - stored;
10224
10225         /* The inversion means that everything above 255 is matched; and at the
10226          * same time we clear the invert flag */
10227         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
10228     }
10229
10230     /* Folding in the bitmap is taken care of above, but not for locale (for
10231      * which we have to wait to see what folding is in effect at runtime), and
10232      * for things not in the bitmap.  Set run-time fold flag for these */
10233     if (FOLD && (LOC || nonbitmap || unicode_alternate)) {
10234         ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
10235     }
10236
10237     /* A single character class can be "optimized" into an EXACTish node.
10238      * Note that since we don't currently count how many characters there are
10239      * outside the bitmap, we are XXX missing optimization possibilities for
10240      * them.  This optimization can't happen unless this is a truly single
10241      * character class, which means that it can't be an inversion into a
10242      * many-character class, and there must be no possibility of there being
10243      * things outside the bitmap.  'stored' (only) for locales doesn't include
10244      * \w, etc, so have to make a special test that they aren't present
10245      *
10246      * Similarly A 2-character class of the very special form like [bB] can be
10247      * optimized into an EXACTFish node, but only for non-locales, and for
10248      * characters which only have the two folds; so things like 'fF' and 'Ii'
10249      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
10250      * FI'. */
10251     if (! nonbitmap
10252         && ! unicode_alternate
10253         && SvCUR(listsv) == initial_listsv_len
10254         && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
10255         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10256                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
10257             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10258                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
10259                                  /* If the latest code point has a fold whose
10260                                   * bit is set, it must be the only other one */
10261                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
10262                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
10263     {
10264         /* Note that the information needed to decide to do this optimization
10265          * is not currently available until the 2nd pass, and that the actually
10266          * used EXACTish node takes less space than the calculated ANYOF node,
10267          * and hence the amount of space calculated in the first pass is larger
10268          * than actually used, so this optimization doesn't gain us any space.
10269          * But an EXACT node is faster than an ANYOF node, and can be combined
10270          * with any adjacent EXACT nodes later by the optimizer for further
10271          * gains.  The speed of executing an EXACTF is similar to an ANYOF
10272          * node, so the optimization advantage comes from the ability to join
10273          * it to adjacent EXACT nodes */
10274
10275         const char * cur_parse= RExC_parse;
10276         U8 op;
10277         RExC_emit = (regnode *)orig_emit;
10278         RExC_parse = (char *)orig_parse;
10279
10280         if (stored == 1) {
10281
10282             /* A locale node with one point can be folded; all the other cases
10283              * with folding will have two points, since we calculate them above
10284              */
10285             if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
10286                  op = EXACTFL;
10287             }
10288             else {
10289                 op = EXACT;
10290             }
10291         }   /* else 2 chars in the bit map: the folds of each other */
10292         else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
10293
10294             /* To join adjacent nodes, they must be the exact EXACTish type.
10295              * Try to use the most likely type, by using EXACTFU if the regex
10296              * calls for them, or is required because the character is
10297              * non-ASCII */
10298             op = EXACTFU;
10299         }
10300         else {    /* Otherwise, more likely to be EXACTF type */
10301             op = EXACTF;
10302         }
10303
10304         ret = reg_node(pRExC_state, op);
10305         RExC_parse = (char *)cur_parse;
10306         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
10307             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
10308             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
10309             STR_LEN(ret)= 2;
10310             RExC_emit += STR_SZ(2);
10311         }
10312         else {
10313             *STRING(ret)= (char)value;
10314             STR_LEN(ret)= 1;
10315             RExC_emit += STR_SZ(1);
10316         }
10317         SvREFCNT_dec(listsv);
10318         return ret;
10319     }
10320
10321     if (nonbitmap) {
10322         UV* nonbitmap_array = invlist_array(nonbitmap);
10323         UV nonbitmap_len = invlist_len(nonbitmap);
10324         UV i;
10325
10326         /*  Here have the full list of items to match that aren't in the
10327          *  bitmap.  Convert to the structure that the rest of the code is
10328          *  expecting.   XXX That rest of the code should convert to this
10329          *  structure */
10330         for (i = 0; i < nonbitmap_len; i++) {
10331
10332             /* The next entry is the beginning of the range that is in the
10333              * class */
10334             UV start = nonbitmap_array[i++];
10335             UV end;
10336
10337             /* The next entry is the beginning of the next range, which isn't
10338              * in the class, so the end of the current range is one less than
10339              * that.  But if there is no next range, it means that the range
10340              * begun by 'start' extends to infinity, which for this platform
10341              * ends at UV_MAX */
10342             if (i == nonbitmap_len) {
10343                 end = UV_MAX;
10344             }
10345             else {
10346                 end = nonbitmap_array[i] - 1;
10347             }
10348
10349             if (start == end) {
10350                 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
10351             }
10352             else {
10353                 /* The \t sets the whole range */
10354                 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
10355                         /* XXX EBCDIC */
10356                                    start, end);
10357             }
10358         }
10359         invlist_destroy(nonbitmap);
10360     }
10361
10362     if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) {
10363         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
10364         SvREFCNT_dec(listsv);
10365         SvREFCNT_dec(unicode_alternate);
10366     }
10367     else {
10368
10369         AV * const av = newAV();
10370         SV *rv;
10371         /* The 0th element stores the character class description
10372          * in its textual form: used later (regexec.c:Perl_regclass_swash())
10373          * to initialize the appropriate swash (which gets stored in
10374          * the 1st element), and also useful for dumping the regnode.
10375          * The 2nd element stores the multicharacter foldings,
10376          * used later (regexec.c:S_reginclass()). */
10377         av_store(av, 0, listsv);
10378         av_store(av, 1, NULL);
10379         av_store(av, 2, MUTABLE_SV(unicode_alternate));
10380         if (unicode_alternate) { /* This node is variable length */
10381             OP(ret) = ANYOFV;
10382         }
10383         rv = newRV_noinc(MUTABLE_SV(av));
10384         n = add_data(pRExC_state, 1, "s");
10385         RExC_rxi->data->data[n] = (void*)rv;
10386         ARG_SET(ret, n);
10387     }
10388     return ret;
10389 }
10390 #undef _C_C_T_
10391
10392
10393 /* reg_skipcomment()
10394
10395    Absorbs an /x style # comments from the input stream.
10396    Returns true if there is more text remaining in the stream.
10397    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
10398    terminates the pattern without including a newline.
10399
10400    Note its the callers responsibility to ensure that we are
10401    actually in /x mode
10402
10403 */
10404
10405 STATIC bool
10406 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
10407 {
10408     bool ended = 0;
10409
10410     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
10411
10412     while (RExC_parse < RExC_end)
10413         if (*RExC_parse++ == '\n') {
10414             ended = 1;
10415             break;
10416         }
10417     if (!ended) {
10418         /* we ran off the end of the pattern without ending
10419            the comment, so we have to add an \n when wrapping */
10420         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10421         return 0;
10422     } else
10423         return 1;
10424 }
10425
10426 /* nextchar()
10427
10428    Advances the parse position, and optionally absorbs
10429    "whitespace" from the inputstream.
10430
10431    Without /x "whitespace" means (?#...) style comments only,
10432    with /x this means (?#...) and # comments and whitespace proper.
10433
10434    Returns the RExC_parse point from BEFORE the scan occurs.
10435
10436    This is the /x friendly way of saying RExC_parse++.
10437 */
10438
10439 STATIC char*
10440 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
10441 {
10442     char* const retval = RExC_parse++;
10443
10444     PERL_ARGS_ASSERT_NEXTCHAR;
10445
10446     for (;;) {
10447         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
10448                 RExC_parse[2] == '#') {
10449             while (*RExC_parse != ')') {
10450                 if (RExC_parse == RExC_end)
10451                     FAIL("Sequence (?#... not terminated");
10452                 RExC_parse++;
10453             }
10454             RExC_parse++;
10455             continue;
10456         }
10457         if (RExC_flags & RXf_PMf_EXTENDED) {
10458             if (isSPACE(*RExC_parse)) {
10459                 RExC_parse++;
10460                 continue;
10461             }
10462             else if (*RExC_parse == '#') {
10463                 if ( reg_skipcomment( pRExC_state ) )
10464                     continue;
10465             }
10466         }
10467         return retval;
10468     }
10469 }
10470
10471 /*
10472 - reg_node - emit a node
10473 */
10474 STATIC regnode *                        /* Location. */
10475 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
10476 {
10477     dVAR;
10478     register regnode *ptr;
10479     regnode * const ret = RExC_emit;
10480     GET_RE_DEBUG_FLAGS_DECL;
10481
10482     PERL_ARGS_ASSERT_REG_NODE;
10483
10484     if (SIZE_ONLY) {
10485         SIZE_ALIGN(RExC_size);
10486         RExC_size += 1;
10487         return(ret);
10488     }
10489     if (RExC_emit >= RExC_emit_bound)
10490         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10491
10492     NODE_ALIGN_FILL(ret);
10493     ptr = ret;
10494     FILL_ADVANCE_NODE(ptr, op);
10495 #ifdef RE_TRACK_PATTERN_OFFSETS
10496     if (RExC_offsets) {         /* MJD */
10497         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
10498               "reg_node", __LINE__,
10499               PL_reg_name[op],
10500               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
10501                 ? "Overwriting end of array!\n" : "OK",
10502               (UV)(RExC_emit - RExC_emit_start),
10503               (UV)(RExC_parse - RExC_start),
10504               (UV)RExC_offsets[0]));
10505         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
10506     }
10507 #endif
10508     RExC_emit = ptr;
10509     return(ret);
10510 }
10511
10512 /*
10513 - reganode - emit a node with an argument
10514 */
10515 STATIC regnode *                        /* Location. */
10516 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
10517 {
10518     dVAR;
10519     register regnode *ptr;
10520     regnode * const ret = RExC_emit;
10521     GET_RE_DEBUG_FLAGS_DECL;
10522
10523     PERL_ARGS_ASSERT_REGANODE;
10524
10525     if (SIZE_ONLY) {
10526         SIZE_ALIGN(RExC_size);
10527         RExC_size += 2;
10528         /*
10529            We can't do this:
10530
10531            assert(2==regarglen[op]+1);
10532
10533            Anything larger than this has to allocate the extra amount.
10534            If we changed this to be:
10535
10536            RExC_size += (1 + regarglen[op]);
10537
10538            then it wouldn't matter. Its not clear what side effect
10539            might come from that so its not done so far.
10540            -- dmq
10541         */
10542         return(ret);
10543     }
10544     if (RExC_emit >= RExC_emit_bound)
10545         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10546
10547     NODE_ALIGN_FILL(ret);
10548     ptr = ret;
10549     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
10550 #ifdef RE_TRACK_PATTERN_OFFSETS
10551     if (RExC_offsets) {         /* MJD */
10552         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10553               "reganode",
10554               __LINE__,
10555               PL_reg_name[op],
10556               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
10557               "Overwriting end of array!\n" : "OK",
10558               (UV)(RExC_emit - RExC_emit_start),
10559               (UV)(RExC_parse - RExC_start),
10560               (UV)RExC_offsets[0]));
10561         Set_Cur_Node_Offset;
10562     }
10563 #endif
10564     RExC_emit = ptr;
10565     return(ret);
10566 }
10567
10568 /*
10569 - reguni - emit (if appropriate) a Unicode character
10570 */
10571 STATIC STRLEN
10572 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
10573 {
10574     dVAR;
10575
10576     PERL_ARGS_ASSERT_REGUNI;
10577
10578     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
10579 }
10580
10581 /*
10582 - reginsert - insert an operator in front of already-emitted operand
10583 *
10584 * Means relocating the operand.
10585 */
10586 STATIC void
10587 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
10588 {
10589     dVAR;
10590     register regnode *src;
10591     register regnode *dst;
10592     register regnode *place;
10593     const int offset = regarglen[(U8)op];
10594     const int size = NODE_STEP_REGNODE + offset;
10595     GET_RE_DEBUG_FLAGS_DECL;
10596
10597     PERL_ARGS_ASSERT_REGINSERT;
10598     PERL_UNUSED_ARG(depth);
10599 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
10600     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
10601     if (SIZE_ONLY) {
10602         RExC_size += size;
10603         return;
10604     }
10605
10606     src = RExC_emit;
10607     RExC_emit += size;
10608     dst = RExC_emit;
10609     if (RExC_open_parens) {
10610         int paren;
10611         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
10612         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
10613             if ( RExC_open_parens[paren] >= opnd ) {
10614                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
10615                 RExC_open_parens[paren] += size;
10616             } else {
10617                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
10618             }
10619             if ( RExC_close_parens[paren] >= opnd ) {
10620                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
10621                 RExC_close_parens[paren] += size;
10622             } else {
10623                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
10624             }
10625         }
10626     }
10627
10628     while (src > opnd) {
10629         StructCopy(--src, --dst, regnode);
10630 #ifdef RE_TRACK_PATTERN_OFFSETS
10631         if (RExC_offsets) {     /* MJD 20010112 */
10632             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
10633                   "reg_insert",
10634                   __LINE__,
10635                   PL_reg_name[op],
10636                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
10637                     ? "Overwriting end of array!\n" : "OK",
10638                   (UV)(src - RExC_emit_start),
10639                   (UV)(dst - RExC_emit_start),
10640                   (UV)RExC_offsets[0]));
10641             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
10642             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
10643         }
10644 #endif
10645     }
10646
10647
10648     place = opnd;               /* Op node, where operand used to be. */
10649 #ifdef RE_TRACK_PATTERN_OFFSETS
10650     if (RExC_offsets) {         /* MJD */
10651         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10652               "reginsert",
10653               __LINE__,
10654               PL_reg_name[op],
10655               (UV)(place - RExC_emit_start) > RExC_offsets[0]
10656               ? "Overwriting end of array!\n" : "OK",
10657               (UV)(place - RExC_emit_start),
10658               (UV)(RExC_parse - RExC_start),
10659               (UV)RExC_offsets[0]));
10660         Set_Node_Offset(place, RExC_parse);
10661         Set_Node_Length(place, 1);
10662     }
10663 #endif
10664     src = NEXTOPER(place);
10665     FILL_ADVANCE_NODE(place, op);
10666     Zero(src, offset, regnode);
10667 }
10668
10669 /*
10670 - regtail - set the next-pointer at the end of a node chain of p to val.
10671 - SEE ALSO: regtail_study
10672 */
10673 /* TODO: All three parms should be const */
10674 STATIC void
10675 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10676 {
10677     dVAR;
10678     register regnode *scan;
10679     GET_RE_DEBUG_FLAGS_DECL;
10680
10681     PERL_ARGS_ASSERT_REGTAIL;
10682 #ifndef DEBUGGING
10683     PERL_UNUSED_ARG(depth);
10684 #endif
10685
10686     if (SIZE_ONLY)
10687         return;
10688
10689     /* Find last node. */
10690     scan = p;
10691     for (;;) {
10692         regnode * const temp = regnext(scan);
10693         DEBUG_PARSE_r({
10694             SV * const mysv=sv_newmortal();
10695             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
10696             regprop(RExC_rx, mysv, scan);
10697             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
10698                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
10699                     (temp == NULL ? "->" : ""),
10700                     (temp == NULL ? PL_reg_name[OP(val)] : "")
10701             );
10702         });
10703         if (temp == NULL)
10704             break;
10705         scan = temp;
10706     }
10707
10708     if (reg_off_by_arg[OP(scan)]) {
10709         ARG_SET(scan, val - scan);
10710     }
10711     else {
10712         NEXT_OFF(scan) = val - scan;
10713     }
10714 }
10715
10716 #ifdef DEBUGGING
10717 /*
10718 - regtail_study - set the next-pointer at the end of a node chain of p to val.
10719 - Look for optimizable sequences at the same time.
10720 - currently only looks for EXACT chains.
10721
10722 This is experimental code. The idea is to use this routine to perform
10723 in place optimizations on branches and groups as they are constructed,
10724 with the long term intention of removing optimization from study_chunk so
10725 that it is purely analytical.
10726
10727 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
10728 to control which is which.
10729
10730 */
10731 /* TODO: All four parms should be const */
10732
10733 STATIC U8
10734 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10735 {
10736     dVAR;
10737     register regnode *scan;
10738     U8 exact = PSEUDO;
10739 #ifdef EXPERIMENTAL_INPLACESCAN
10740     I32 min = 0;
10741 #endif
10742     GET_RE_DEBUG_FLAGS_DECL;
10743
10744     PERL_ARGS_ASSERT_REGTAIL_STUDY;
10745
10746
10747     if (SIZE_ONLY)
10748         return exact;
10749
10750     /* Find last node. */
10751
10752     scan = p;
10753     for (;;) {
10754         regnode * const temp = regnext(scan);
10755 #ifdef EXPERIMENTAL_INPLACESCAN
10756         if (PL_regkind[OP(scan)] == EXACT)
10757             if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
10758                 return EXACT;
10759 #endif
10760         if ( exact ) {
10761             switch (OP(scan)) {
10762                 case EXACT:
10763                 case EXACTF:
10764                 case EXACTFA:
10765                 case EXACTFU:
10766                 case EXACTFL:
10767                         if( exact == PSEUDO )
10768                             exact= OP(scan);
10769                         else if ( exact != OP(scan) )
10770                             exact= 0;
10771                 case NOTHING:
10772                     break;
10773                 default:
10774                     exact= 0;
10775             }
10776         }
10777         DEBUG_PARSE_r({
10778             SV * const mysv=sv_newmortal();
10779             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
10780             regprop(RExC_rx, mysv, scan);
10781             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
10782                 SvPV_nolen_const(mysv),
10783                 REG_NODE_NUM(scan),
10784                 PL_reg_name[exact]);
10785         });
10786         if (temp == NULL)
10787             break;
10788         scan = temp;
10789     }
10790     DEBUG_PARSE_r({
10791         SV * const mysv_val=sv_newmortal();
10792         DEBUG_PARSE_MSG("");
10793         regprop(RExC_rx, mysv_val, val);
10794         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
10795                       SvPV_nolen_const(mysv_val),
10796                       (IV)REG_NODE_NUM(val),
10797                       (IV)(val - scan)
10798         );
10799     });
10800     if (reg_off_by_arg[OP(scan)]) {
10801         ARG_SET(scan, val - scan);
10802     }
10803     else {
10804         NEXT_OFF(scan) = val - scan;
10805     }
10806
10807     return exact;
10808 }
10809 #endif
10810
10811 /*
10812  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
10813  */
10814 #ifdef DEBUGGING
10815 static void
10816 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
10817 {
10818     int bit;
10819     int set=0;
10820     regex_charset cs;
10821
10822     for (bit=0; bit<32; bit++) {
10823         if (flags & (1<<bit)) {
10824             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
10825                 continue;
10826             }
10827             if (!set++ && lead)
10828                 PerlIO_printf(Perl_debug_log, "%s",lead);
10829             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
10830         }
10831     }
10832     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
10833             if (!set++ && lead) {
10834                 PerlIO_printf(Perl_debug_log, "%s",lead);
10835             }
10836             switch (cs) {
10837                 case REGEX_UNICODE_CHARSET:
10838                     PerlIO_printf(Perl_debug_log, "UNICODE");
10839                     break;
10840                 case REGEX_LOCALE_CHARSET:
10841                     PerlIO_printf(Perl_debug_log, "LOCALE");
10842                     break;
10843                 case REGEX_ASCII_RESTRICTED_CHARSET:
10844                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
10845                     break;
10846                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10847                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
10848                     break;
10849                 default:
10850                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
10851                     break;
10852             }
10853     }
10854     if (lead)  {
10855         if (set)
10856             PerlIO_printf(Perl_debug_log, "\n");
10857         else
10858             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
10859     }
10860 }
10861 #endif
10862
10863 void
10864 Perl_regdump(pTHX_ const regexp *r)
10865 {
10866 #ifdef DEBUGGING
10867     dVAR;
10868     SV * const sv = sv_newmortal();
10869     SV *dsv= sv_newmortal();
10870     RXi_GET_DECL(r,ri);
10871     GET_RE_DEBUG_FLAGS_DECL;
10872
10873     PERL_ARGS_ASSERT_REGDUMP;
10874
10875     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
10876
10877     /* Header fields of interest. */
10878     if (r->anchored_substr) {
10879         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
10880             RE_SV_DUMPLEN(r->anchored_substr), 30);
10881         PerlIO_printf(Perl_debug_log,
10882                       "anchored %s%s at %"IVdf" ",
10883                       s, RE_SV_TAIL(r->anchored_substr),
10884                       (IV)r->anchored_offset);
10885     } else if (r->anchored_utf8) {
10886         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
10887             RE_SV_DUMPLEN(r->anchored_utf8), 30);
10888         PerlIO_printf(Perl_debug_log,
10889                       "anchored utf8 %s%s at %"IVdf" ",
10890                       s, RE_SV_TAIL(r->anchored_utf8),
10891                       (IV)r->anchored_offset);
10892     }
10893     if (r->float_substr) {
10894         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
10895             RE_SV_DUMPLEN(r->float_substr), 30);
10896         PerlIO_printf(Perl_debug_log,
10897                       "floating %s%s at %"IVdf"..%"UVuf" ",
10898                       s, RE_SV_TAIL(r->float_substr),
10899                       (IV)r->float_min_offset, (UV)r->float_max_offset);
10900     } else if (r->float_utf8) {
10901         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
10902             RE_SV_DUMPLEN(r->float_utf8), 30);
10903         PerlIO_printf(Perl_debug_log,
10904                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
10905                       s, RE_SV_TAIL(r->float_utf8),
10906                       (IV)r->float_min_offset, (UV)r->float_max_offset);
10907     }
10908     if (r->check_substr || r->check_utf8)
10909         PerlIO_printf(Perl_debug_log,
10910                       (const char *)
10911                       (r->check_substr == r->float_substr
10912                        && r->check_utf8 == r->float_utf8
10913                        ? "(checking floating" : "(checking anchored"));
10914     if (r->extflags & RXf_NOSCAN)
10915         PerlIO_printf(Perl_debug_log, " noscan");
10916     if (r->extflags & RXf_CHECK_ALL)
10917         PerlIO_printf(Perl_debug_log, " isall");
10918     if (r->check_substr || r->check_utf8)
10919         PerlIO_printf(Perl_debug_log, ") ");
10920
10921     if (ri->regstclass) {
10922         regprop(r, sv, ri->regstclass);
10923         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
10924     }
10925     if (r->extflags & RXf_ANCH) {
10926         PerlIO_printf(Perl_debug_log, "anchored");
10927         if (r->extflags & RXf_ANCH_BOL)
10928             PerlIO_printf(Perl_debug_log, "(BOL)");
10929         if (r->extflags & RXf_ANCH_MBOL)
10930             PerlIO_printf(Perl_debug_log, "(MBOL)");
10931         if (r->extflags & RXf_ANCH_SBOL)
10932             PerlIO_printf(Perl_debug_log, "(SBOL)");
10933         if (r->extflags & RXf_ANCH_GPOS)
10934             PerlIO_printf(Perl_debug_log, "(GPOS)");
10935         PerlIO_putc(Perl_debug_log, ' ');
10936     }
10937     if (r->extflags & RXf_GPOS_SEEN)
10938         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
10939     if (r->intflags & PREGf_SKIP)
10940         PerlIO_printf(Perl_debug_log, "plus ");
10941     if (r->intflags & PREGf_IMPLICIT)
10942         PerlIO_printf(Perl_debug_log, "implicit ");
10943     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
10944     if (r->extflags & RXf_EVAL_SEEN)
10945         PerlIO_printf(Perl_debug_log, "with eval ");
10946     PerlIO_printf(Perl_debug_log, "\n");
10947     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
10948 #else
10949     PERL_ARGS_ASSERT_REGDUMP;
10950     PERL_UNUSED_CONTEXT;
10951     PERL_UNUSED_ARG(r);
10952 #endif  /* DEBUGGING */
10953 }
10954
10955 /*
10956 - regprop - printable representation of opcode
10957 */
10958 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
10959 STMT_START { \
10960         if (do_sep) {                           \
10961             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
10962             if (flags & ANYOF_INVERT)           \
10963                 /*make sure the invert info is in each */ \
10964                 sv_catpvs(sv, "^");             \
10965             do_sep = 0;                         \
10966         }                                       \
10967 } STMT_END
10968
10969 void
10970 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
10971 {
10972 #ifdef DEBUGGING
10973     dVAR;
10974     register int k;
10975     RXi_GET_DECL(prog,progi);
10976     GET_RE_DEBUG_FLAGS_DECL;
10977
10978     PERL_ARGS_ASSERT_REGPROP;
10979
10980     sv_setpvs(sv, "");
10981
10982     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
10983         /* It would be nice to FAIL() here, but this may be called from
10984            regexec.c, and it would be hard to supply pRExC_state. */
10985         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
10986     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
10987
10988     k = PL_regkind[OP(o)];
10989
10990     if (k == EXACT) {
10991         sv_catpvs(sv, " ");
10992         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
10993          * is a crude hack but it may be the best for now since
10994          * we have no flag "this EXACTish node was UTF-8"
10995          * --jhi */
10996         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
10997                   PERL_PV_ESCAPE_UNI_DETECT |
10998                   PERL_PV_ESCAPE_NONASCII   |
10999                   PERL_PV_PRETTY_ELLIPSES   |
11000                   PERL_PV_PRETTY_LTGT       |
11001                   PERL_PV_PRETTY_NOCLEAR
11002                   );
11003     } else if (k == TRIE) {
11004         /* print the details of the trie in dumpuntil instead, as
11005          * progi->data isn't available here */
11006         const char op = OP(o);
11007         const U32 n = ARG(o);
11008         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
11009                (reg_ac_data *)progi->data->data[n] :
11010                NULL;
11011         const reg_trie_data * const trie
11012             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
11013
11014         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
11015         DEBUG_TRIE_COMPILE_r(
11016             Perl_sv_catpvf(aTHX_ sv,
11017                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
11018                 (UV)trie->startstate,
11019                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
11020                 (UV)trie->wordcount,
11021                 (UV)trie->minlen,
11022                 (UV)trie->maxlen,
11023                 (UV)TRIE_CHARCOUNT(trie),
11024                 (UV)trie->uniquecharcount
11025             )
11026         );
11027         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
11028             int i;
11029             int rangestart = -1;
11030             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
11031             sv_catpvs(sv, "[");
11032             for (i = 0; i <= 256; i++) {
11033                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
11034                     if (rangestart == -1)
11035                         rangestart = i;
11036                 } else if (rangestart != -1) {
11037                     if (i <= rangestart + 3)
11038                         for (; rangestart < i; rangestart++)
11039                             put_byte(sv, rangestart);
11040                     else {
11041                         put_byte(sv, rangestart);
11042                         sv_catpvs(sv, "-");
11043                         put_byte(sv, i - 1);
11044                     }
11045                     rangestart = -1;
11046                 }
11047             }
11048             sv_catpvs(sv, "]");
11049         }
11050
11051     } else if (k == CURLY) {
11052         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
11053             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
11054         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
11055     }
11056     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
11057         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
11058     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
11059         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
11060         if ( RXp_PAREN_NAMES(prog) ) {
11061             if ( k != REF || (OP(o) < NREF)) {
11062                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
11063                 SV **name= av_fetch(list, ARG(o), 0 );
11064                 if (name)
11065                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11066             }
11067             else {
11068                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
11069                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
11070                 I32 *nums=(I32*)SvPVX(sv_dat);
11071                 SV **name= av_fetch(list, nums[0], 0 );
11072                 I32 n;
11073                 if (name) {
11074                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
11075                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
11076                                     (n ? "," : ""), (IV)nums[n]);
11077                     }
11078                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11079                 }
11080             }
11081         }
11082     } else if (k == GOSUB)
11083         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
11084     else if (k == VERB) {
11085         if (!o->flags)
11086             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
11087                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
11088     } else if (k == LOGICAL)
11089         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
11090     else if (k == FOLDCHAR)
11091         Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
11092     else if (k == ANYOF) {
11093         int i, rangestart = -1;
11094         const U8 flags = ANYOF_FLAGS(o);
11095         int do_sep = 0;
11096
11097         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
11098         static const char * const anyofs[] = {
11099             "\\w",
11100             "\\W",
11101             "\\s",
11102             "\\S",
11103             "\\d",
11104             "\\D",
11105             "[:alnum:]",
11106             "[:^alnum:]",
11107             "[:alpha:]",
11108             "[:^alpha:]",
11109             "[:ascii:]",
11110             "[:^ascii:]",
11111             "[:cntrl:]",
11112             "[:^cntrl:]",
11113             "[:graph:]",
11114             "[:^graph:]",
11115             "[:lower:]",
11116             "[:^lower:]",
11117             "[:print:]",
11118             "[:^print:]",
11119             "[:punct:]",
11120             "[:^punct:]",
11121             "[:upper:]",
11122             "[:^upper:]",
11123             "[:xdigit:]",
11124             "[:^xdigit:]",
11125             "[:space:]",
11126             "[:^space:]",
11127             "[:blank:]",
11128             "[:^blank:]"
11129         };
11130
11131         if (flags & ANYOF_LOCALE)
11132             sv_catpvs(sv, "{loc}");
11133         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
11134             sv_catpvs(sv, "{i}");
11135         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
11136         if (flags & ANYOF_INVERT)
11137             sv_catpvs(sv, "^");
11138
11139         /* output what the standard cp 0-255 bitmap matches */
11140         for (i = 0; i <= 256; i++) {
11141             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
11142                 if (rangestart == -1)
11143                     rangestart = i;
11144             } else if (rangestart != -1) {
11145                 if (i <= rangestart + 3)
11146                     for (; rangestart < i; rangestart++)
11147                         put_byte(sv, rangestart);
11148                 else {
11149                     put_byte(sv, rangestart);
11150                     sv_catpvs(sv, "-");
11151                     put_byte(sv, i - 1);
11152                 }
11153                 do_sep = 1;
11154                 rangestart = -1;
11155             }
11156         }
11157
11158         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11159         /* output any special charclass tests (used entirely under use locale) */
11160         if (ANYOF_CLASS_TEST_ANY_SET(o))
11161             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
11162                 if (ANYOF_CLASS_TEST(o,i)) {
11163                     sv_catpv(sv, anyofs[i]);
11164                     do_sep = 1;
11165                 }
11166
11167         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11168
11169         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
11170             sv_catpvs(sv, "{non-utf8-latin1-all}");
11171         }
11172
11173         /* output information about the unicode matching */
11174         if (flags & ANYOF_UNICODE_ALL)
11175             sv_catpvs(sv, "{unicode_all}");
11176         else if (ANYOF_NONBITMAP(o))
11177             sv_catpvs(sv, "{unicode}");
11178         if (flags & ANYOF_NONBITMAP_NON_UTF8)
11179             sv_catpvs(sv, "{outside bitmap}");
11180
11181         if (ANYOF_NONBITMAP(o)) {
11182             SV *lv;
11183             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
11184
11185             if (lv) {
11186                 if (sw) {
11187                     U8 s[UTF8_MAXBYTES_CASE+1];
11188
11189                     for (i = 0; i <= 256; i++) { /* just the first 256 */
11190                         uvchr_to_utf8(s, i);
11191
11192                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
11193                             if (rangestart == -1)
11194                                 rangestart = i;
11195                         } else if (rangestart != -1) {
11196                             if (i <= rangestart + 3)
11197                                 for (; rangestart < i; rangestart++) {
11198                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
11199                                     U8 *p;
11200                                     for(p = s; p < e; p++)
11201                                         put_byte(sv, *p);
11202                                 }
11203                             else {
11204                                 const U8 *e = uvchr_to_utf8(s,rangestart);
11205                                 U8 *p;
11206                                 for (p = s; p < e; p++)
11207                                     put_byte(sv, *p);
11208                                 sv_catpvs(sv, "-");
11209                                 e = uvchr_to_utf8(s, i-1);
11210                                 for (p = s; p < e; p++)
11211                                     put_byte(sv, *p);
11212                                 }
11213                                 rangestart = -1;
11214                             }
11215                         }
11216
11217                     sv_catpvs(sv, "..."); /* et cetera */
11218                 }
11219
11220                 {
11221                     char *s = savesvpv(lv);
11222                     char * const origs = s;
11223
11224                     while (*s && *s != '\n')
11225                         s++;
11226
11227                     if (*s == '\n') {
11228                         const char * const t = ++s;
11229
11230                         while (*s) {
11231                             if (*s == '\n')
11232                                 *s = ' ';
11233                             s++;
11234                         }
11235                         if (s[-1] == ' ')
11236                             s[-1] = 0;
11237
11238                         sv_catpv(sv, t);
11239                     }
11240
11241                     Safefree(origs);
11242                 }
11243             }
11244         }
11245
11246         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
11247     }
11248     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
11249         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
11250 #else
11251     PERL_UNUSED_CONTEXT;
11252     PERL_UNUSED_ARG(sv);
11253     PERL_UNUSED_ARG(o);
11254     PERL_UNUSED_ARG(prog);
11255 #endif  /* DEBUGGING */
11256 }
11257
11258 SV *
11259 Perl_re_intuit_string(pTHX_ REGEXP * const r)
11260 {                               /* Assume that RE_INTUIT is set */
11261     dVAR;
11262     struct regexp *const prog = (struct regexp *)SvANY(r);
11263     GET_RE_DEBUG_FLAGS_DECL;
11264
11265     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
11266     PERL_UNUSED_CONTEXT;
11267
11268     DEBUG_COMPILE_r(
11269         {
11270             const char * const s = SvPV_nolen_const(prog->check_substr
11271                       ? prog->check_substr : prog->check_utf8);
11272
11273             if (!PL_colorset) reginitcolors();
11274             PerlIO_printf(Perl_debug_log,
11275                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
11276                       PL_colors[4],
11277                       prog->check_substr ? "" : "utf8 ",
11278                       PL_colors[5],PL_colors[0],
11279                       s,
11280                       PL_colors[1],
11281                       (strlen(s) > 60 ? "..." : ""));
11282         } );
11283
11284     return prog->check_substr ? prog->check_substr : prog->check_utf8;
11285 }
11286
11287 /*
11288    pregfree()
11289
11290    handles refcounting and freeing the perl core regexp structure. When
11291    it is necessary to actually free the structure the first thing it
11292    does is call the 'free' method of the regexp_engine associated to
11293    the regexp, allowing the handling of the void *pprivate; member
11294    first. (This routine is not overridable by extensions, which is why
11295    the extensions free is called first.)
11296
11297    See regdupe and regdupe_internal if you change anything here.
11298 */
11299 #ifndef PERL_IN_XSUB_RE
11300 void
11301 Perl_pregfree(pTHX_ REGEXP *r)
11302 {
11303     SvREFCNT_dec(r);
11304 }
11305
11306 void
11307 Perl_pregfree2(pTHX_ REGEXP *rx)
11308 {
11309     dVAR;
11310     struct regexp *const r = (struct regexp *)SvANY(rx);
11311     GET_RE_DEBUG_FLAGS_DECL;
11312
11313     PERL_ARGS_ASSERT_PREGFREE2;
11314
11315     if (r->mother_re) {
11316         ReREFCNT_dec(r->mother_re);
11317     } else {
11318         CALLREGFREE_PVT(rx); /* free the private data */
11319         SvREFCNT_dec(RXp_PAREN_NAMES(r));
11320     }
11321     if (r->substrs) {
11322         SvREFCNT_dec(r->anchored_substr);
11323         SvREFCNT_dec(r->anchored_utf8);
11324         SvREFCNT_dec(r->float_substr);
11325         SvREFCNT_dec(r->float_utf8);
11326         Safefree(r->substrs);
11327     }
11328     RX_MATCH_COPY_FREE(rx);
11329 #ifdef PERL_OLD_COPY_ON_WRITE
11330     SvREFCNT_dec(r->saved_copy);
11331 #endif
11332     Safefree(r->offs);
11333 }
11334
11335 /*  reg_temp_copy()
11336
11337     This is a hacky workaround to the structural issue of match results
11338     being stored in the regexp structure which is in turn stored in
11339     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
11340     could be PL_curpm in multiple contexts, and could require multiple
11341     result sets being associated with the pattern simultaneously, such
11342     as when doing a recursive match with (??{$qr})
11343
11344     The solution is to make a lightweight copy of the regexp structure
11345     when a qr// is returned from the code executed by (??{$qr}) this
11346     lightweight copy doesn't actually own any of its data except for
11347     the starp/end and the actual regexp structure itself.
11348
11349 */
11350
11351
11352 REGEXP *
11353 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
11354 {
11355     struct regexp *ret;
11356     struct regexp *const r = (struct regexp *)SvANY(rx);
11357     register const I32 npar = r->nparens+1;
11358
11359     PERL_ARGS_ASSERT_REG_TEMP_COPY;
11360
11361     if (!ret_x)
11362         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
11363     ret = (struct regexp *)SvANY(ret_x);
11364
11365     (void)ReREFCNT_inc(rx);
11366     /* We can take advantage of the existing "copied buffer" mechanism in SVs
11367        by pointing directly at the buffer, but flagging that the allocated
11368        space in the copy is zero. As we've just done a struct copy, it's now
11369        a case of zero-ing that, rather than copying the current length.  */
11370     SvPV_set(ret_x, RX_WRAPPED(rx));
11371     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
11372     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
11373            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
11374     SvLEN_set(ret_x, 0);
11375     SvSTASH_set(ret_x, NULL);
11376     SvMAGIC_set(ret_x, NULL);
11377     Newx(ret->offs, npar, regexp_paren_pair);
11378     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11379     if (r->substrs) {
11380         Newx(ret->substrs, 1, struct reg_substr_data);
11381         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11382
11383         SvREFCNT_inc_void(ret->anchored_substr);
11384         SvREFCNT_inc_void(ret->anchored_utf8);
11385         SvREFCNT_inc_void(ret->float_substr);
11386         SvREFCNT_inc_void(ret->float_utf8);
11387
11388         /* check_substr and check_utf8, if non-NULL, point to either their
11389            anchored or float namesakes, and don't hold a second reference.  */
11390     }
11391     RX_MATCH_COPIED_off(ret_x);
11392 #ifdef PERL_OLD_COPY_ON_WRITE
11393     ret->saved_copy = NULL;
11394 #endif
11395     ret->mother_re = rx;
11396
11397     return ret_x;
11398 }
11399 #endif
11400
11401 /* regfree_internal()
11402
11403    Free the private data in a regexp. This is overloadable by
11404    extensions. Perl takes care of the regexp structure in pregfree(),
11405    this covers the *pprivate pointer which technically perl doesn't
11406    know about, however of course we have to handle the
11407    regexp_internal structure when no extension is in use.
11408
11409    Note this is called before freeing anything in the regexp
11410    structure.
11411  */
11412
11413 void
11414 Perl_regfree_internal(pTHX_ REGEXP * const rx)
11415 {
11416     dVAR;
11417     struct regexp *const r = (struct regexp *)SvANY(rx);
11418     RXi_GET_DECL(r,ri);
11419     GET_RE_DEBUG_FLAGS_DECL;
11420
11421     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
11422
11423     DEBUG_COMPILE_r({
11424         if (!PL_colorset)
11425             reginitcolors();
11426         {
11427             SV *dsv= sv_newmortal();
11428             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
11429                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
11430             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
11431                 PL_colors[4],PL_colors[5],s);
11432         }
11433     });
11434 #ifdef RE_TRACK_PATTERN_OFFSETS
11435     if (ri->u.offsets)
11436         Safefree(ri->u.offsets);             /* 20010421 MJD */
11437 #endif
11438     if (ri->data) {
11439         int n = ri->data->count;
11440         PAD* new_comppad = NULL;
11441         PAD* old_comppad;
11442         PADOFFSET refcnt;
11443
11444         while (--n >= 0) {
11445           /* If you add a ->what type here, update the comment in regcomp.h */
11446             switch (ri->data->what[n]) {
11447             case 'a':
11448             case 's':
11449             case 'S':
11450             case 'u':
11451                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
11452                 break;
11453             case 'f':
11454                 Safefree(ri->data->data[n]);
11455                 break;
11456             case 'p':
11457                 new_comppad = MUTABLE_AV(ri->data->data[n]);
11458                 break;
11459             case 'o':
11460                 if (new_comppad == NULL)
11461                     Perl_croak(aTHX_ "panic: pregfree comppad");
11462                 PAD_SAVE_LOCAL(old_comppad,
11463                     /* Watch out for global destruction's random ordering. */
11464                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
11465                 );
11466                 OP_REFCNT_LOCK;
11467                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
11468                 OP_REFCNT_UNLOCK;
11469                 if (!refcnt)
11470                     op_free((OP_4tree*)ri->data->data[n]);
11471
11472                 PAD_RESTORE_LOCAL(old_comppad);
11473                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
11474                 new_comppad = NULL;
11475                 break;
11476             case 'n':
11477                 break;
11478             case 'T':
11479                 { /* Aho Corasick add-on structure for a trie node.
11480                      Used in stclass optimization only */
11481                     U32 refcount;
11482                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
11483                     OP_REFCNT_LOCK;
11484                     refcount = --aho->refcount;
11485                     OP_REFCNT_UNLOCK;
11486                     if ( !refcount ) {
11487                         PerlMemShared_free(aho->states);
11488                         PerlMemShared_free(aho->fail);
11489                          /* do this last!!!! */
11490                         PerlMemShared_free(ri->data->data[n]);
11491                         PerlMemShared_free(ri->regstclass);
11492                     }
11493                 }
11494                 break;
11495             case 't':
11496                 {
11497                     /* trie structure. */
11498                     U32 refcount;
11499                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
11500                     OP_REFCNT_LOCK;
11501                     refcount = --trie->refcount;
11502                     OP_REFCNT_UNLOCK;
11503                     if ( !refcount ) {
11504                         PerlMemShared_free(trie->charmap);
11505                         PerlMemShared_free(trie->states);
11506                         PerlMemShared_free(trie->trans);
11507                         if (trie->bitmap)
11508                             PerlMemShared_free(trie->bitmap);
11509                         if (trie->jump)
11510                             PerlMemShared_free(trie->jump);
11511                         PerlMemShared_free(trie->wordinfo);
11512                         /* do this last!!!! */
11513                         PerlMemShared_free(ri->data->data[n]);
11514                     }
11515                 }
11516                 break;
11517             default:
11518                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
11519             }
11520         }
11521         Safefree(ri->data->what);
11522         Safefree(ri->data);
11523     }
11524
11525     Safefree(ri);
11526 }
11527
11528 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
11529 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
11530 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
11531
11532 /*
11533    re_dup - duplicate a regexp.
11534
11535    This routine is expected to clone a given regexp structure. It is only
11536    compiled under USE_ITHREADS.
11537
11538    After all of the core data stored in struct regexp is duplicated
11539    the regexp_engine.dupe method is used to copy any private data
11540    stored in the *pprivate pointer. This allows extensions to handle
11541    any duplication it needs to do.
11542
11543    See pregfree() and regfree_internal() if you change anything here.
11544 */
11545 #if defined(USE_ITHREADS)
11546 #ifndef PERL_IN_XSUB_RE
11547 void
11548 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
11549 {
11550     dVAR;
11551     I32 npar;
11552     const struct regexp *r = (const struct regexp *)SvANY(sstr);
11553     struct regexp *ret = (struct regexp *)SvANY(dstr);
11554
11555     PERL_ARGS_ASSERT_RE_DUP_GUTS;
11556
11557     npar = r->nparens+1;
11558     Newx(ret->offs, npar, regexp_paren_pair);
11559     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11560     if(ret->swap) {
11561         /* no need to copy these */
11562         Newx(ret->swap, npar, regexp_paren_pair);
11563     }
11564
11565     if (ret->substrs) {
11566         /* Do it this way to avoid reading from *r after the StructCopy().
11567            That way, if any of the sv_dup_inc()s dislodge *r from the L1
11568            cache, it doesn't matter.  */
11569         const bool anchored = r->check_substr
11570             ? r->check_substr == r->anchored_substr
11571             : r->check_utf8 == r->anchored_utf8;
11572         Newx(ret->substrs, 1, struct reg_substr_data);
11573         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11574
11575         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
11576         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
11577         ret->float_substr = sv_dup_inc(ret->float_substr, param);
11578         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
11579
11580         /* check_substr and check_utf8, if non-NULL, point to either their
11581            anchored or float namesakes, and don't hold a second reference.  */
11582
11583         if (ret->check_substr) {
11584             if (anchored) {
11585                 assert(r->check_utf8 == r->anchored_utf8);
11586                 ret->check_substr = ret->anchored_substr;
11587                 ret->check_utf8 = ret->anchored_utf8;
11588             } else {
11589                 assert(r->check_substr == r->float_substr);
11590                 assert(r->check_utf8 == r->float_utf8);
11591                 ret->check_substr = ret->float_substr;
11592                 ret->check_utf8 = ret->float_utf8;
11593             }
11594         } else if (ret->check_utf8) {
11595             if (anchored) {
11596                 ret->check_utf8 = ret->anchored_utf8;
11597             } else {
11598                 ret->check_utf8 = ret->float_utf8;
11599             }
11600         }
11601     }
11602
11603     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
11604
11605     if (ret->pprivate)
11606         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
11607
11608     if (RX_MATCH_COPIED(dstr))
11609         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
11610     else
11611         ret->subbeg = NULL;
11612 #ifdef PERL_OLD_COPY_ON_WRITE
11613     ret->saved_copy = NULL;
11614 #endif
11615
11616     if (ret->mother_re) {
11617         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
11618             /* Our storage points directly to our mother regexp, but that's
11619                1: a buffer in a different thread
11620                2: something we no longer hold a reference on
11621                so we need to copy it locally.  */
11622             /* Note we need to sue SvCUR() on our mother_re, because it, in
11623                turn, may well be pointing to its own mother_re.  */
11624             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
11625                                    SvCUR(ret->mother_re)+1));
11626             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
11627         }
11628         ret->mother_re      = NULL;
11629     }
11630     ret->gofs = 0;
11631 }
11632 #endif /* PERL_IN_XSUB_RE */
11633
11634 /*
11635    regdupe_internal()
11636
11637    This is the internal complement to regdupe() which is used to copy
11638    the structure pointed to by the *pprivate pointer in the regexp.
11639    This is the core version of the extension overridable cloning hook.
11640    The regexp structure being duplicated will be copied by perl prior
11641    to this and will be provided as the regexp *r argument, however
11642    with the /old/ structures pprivate pointer value. Thus this routine
11643    may override any copying normally done by perl.
11644
11645    It returns a pointer to the new regexp_internal structure.
11646 */
11647
11648 void *
11649 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
11650 {
11651     dVAR;
11652     struct regexp *const r = (struct regexp *)SvANY(rx);
11653     regexp_internal *reti;
11654     int len, npar;
11655     RXi_GET_DECL(r,ri);
11656
11657     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
11658
11659     npar = r->nparens+1;
11660     len = ProgLen(ri);
11661
11662     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
11663     Copy(ri->program, reti->program, len+1, regnode);
11664
11665
11666     reti->regstclass = NULL;
11667
11668     if (ri->data) {
11669         struct reg_data *d;
11670         const int count = ri->data->count;
11671         int i;
11672
11673         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
11674                 char, struct reg_data);
11675         Newx(d->what, count, U8);
11676
11677         d->count = count;
11678         for (i = 0; i < count; i++) {
11679             d->what[i] = ri->data->what[i];
11680             switch (d->what[i]) {
11681                 /* legal options are one of: sSfpontTua
11682                    see also regcomp.h and pregfree() */
11683             case 'a': /* actually an AV, but the dup function is identical.  */
11684             case 's':
11685             case 'S':
11686             case 'p': /* actually an AV, but the dup function is identical.  */
11687             case 'u': /* actually an HV, but the dup function is identical.  */
11688                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
11689                 break;
11690             case 'f':
11691                 /* This is cheating. */
11692                 Newx(d->data[i], 1, struct regnode_charclass_class);
11693                 StructCopy(ri->data->data[i], d->data[i],
11694                             struct regnode_charclass_class);
11695                 reti->regstclass = (regnode*)d->data[i];
11696                 break;
11697             case 'o':
11698                 /* Compiled op trees are readonly and in shared memory,
11699                    and can thus be shared without duplication. */
11700                 OP_REFCNT_LOCK;
11701                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
11702                 OP_REFCNT_UNLOCK;
11703                 break;
11704             case 'T':
11705                 /* Trie stclasses are readonly and can thus be shared
11706                  * without duplication. We free the stclass in pregfree
11707                  * when the corresponding reg_ac_data struct is freed.
11708                  */
11709                 reti->regstclass= ri->regstclass;
11710                 /* Fall through */
11711             case 't':
11712                 OP_REFCNT_LOCK;
11713                 ((reg_trie_data*)ri->data->data[i])->refcount++;
11714                 OP_REFCNT_UNLOCK;
11715                 /* Fall through */
11716             case 'n':
11717                 d->data[i] = ri->data->data[i];
11718                 break;
11719             default:
11720                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
11721             }
11722         }
11723
11724         reti->data = d;
11725     }
11726     else
11727         reti->data = NULL;
11728
11729     reti->name_list_idx = ri->name_list_idx;
11730
11731 #ifdef RE_TRACK_PATTERN_OFFSETS
11732     if (ri->u.offsets) {
11733         Newx(reti->u.offsets, 2*len+1, U32);
11734         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
11735     }
11736 #else
11737     SetProgLen(reti,len);
11738 #endif
11739
11740     return (void*)reti;
11741 }
11742
11743 #endif    /* USE_ITHREADS */
11744
11745 #ifndef PERL_IN_XSUB_RE
11746
11747 /*
11748  - regnext - dig the "next" pointer out of a node
11749  */
11750 regnode *
11751 Perl_regnext(pTHX_ register regnode *p)
11752 {
11753     dVAR;
11754     register I32 offset;
11755
11756     if (!p)
11757         return(NULL);
11758
11759     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
11760         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
11761     }
11762
11763     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
11764     if (offset == 0)
11765         return(NULL);
11766
11767     return(p+offset);
11768 }
11769 #endif
11770
11771 STATIC void
11772 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
11773 {
11774     va_list args;
11775     STRLEN l1 = strlen(pat1);
11776     STRLEN l2 = strlen(pat2);
11777     char buf[512];
11778     SV *msv;
11779     const char *message;
11780
11781     PERL_ARGS_ASSERT_RE_CROAK2;
11782
11783     if (l1 > 510)
11784         l1 = 510;
11785     if (l1 + l2 > 510)
11786         l2 = 510 - l1;
11787     Copy(pat1, buf, l1 , char);
11788     Copy(pat2, buf + l1, l2 , char);
11789     buf[l1 + l2] = '\n';
11790     buf[l1 + l2 + 1] = '\0';
11791 #ifdef I_STDARG
11792     /* ANSI variant takes additional second argument */
11793     va_start(args, pat2);
11794 #else
11795     va_start(args);
11796 #endif
11797     msv = vmess(buf, &args);
11798     va_end(args);
11799     message = SvPV_const(msv,l1);
11800     if (l1 > 512)
11801         l1 = 512;
11802     Copy(message, buf, l1 , char);
11803     buf[l1-1] = '\0';                   /* Overwrite \n */
11804     Perl_croak(aTHX_ "%s", buf);
11805 }
11806
11807 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
11808
11809 #ifndef PERL_IN_XSUB_RE
11810 void
11811 Perl_save_re_context(pTHX)
11812 {
11813     dVAR;
11814
11815     struct re_save_state *state;
11816
11817     SAVEVPTR(PL_curcop);
11818     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
11819
11820     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
11821     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
11822     SSPUSHUV(SAVEt_RE_STATE);
11823
11824     Copy(&PL_reg_state, state, 1, struct re_save_state);
11825
11826     PL_reg_start_tmp = 0;
11827     PL_reg_start_tmpl = 0;
11828     PL_reg_oldsaved = NULL;
11829     PL_reg_oldsavedlen = 0;
11830     PL_reg_maxiter = 0;
11831     PL_reg_leftiter = 0;
11832     PL_reg_poscache = NULL;
11833     PL_reg_poscache_size = 0;
11834 #ifdef PERL_OLD_COPY_ON_WRITE
11835     PL_nrs = NULL;
11836 #endif
11837
11838     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
11839     if (PL_curpm) {
11840         const REGEXP * const rx = PM_GETRE(PL_curpm);
11841         if (rx) {
11842             U32 i;
11843             for (i = 1; i <= RX_NPARENS(rx); i++) {
11844                 char digits[TYPE_CHARS(long)];
11845                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
11846                 GV *const *const gvp
11847                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
11848
11849                 if (gvp) {
11850                     GV * const gv = *gvp;
11851                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
11852                         save_scalar(gv);
11853                 }
11854             }
11855         }
11856     }
11857 }
11858 #endif
11859
11860 static void
11861 clear_re(pTHX_ void *r)
11862 {
11863     dVAR;
11864     ReREFCNT_dec((REGEXP *)r);
11865 }
11866
11867 #ifdef DEBUGGING
11868
11869 STATIC void
11870 S_put_byte(pTHX_ SV *sv, int c)
11871 {
11872     PERL_ARGS_ASSERT_PUT_BYTE;
11873
11874     /* Our definition of isPRINT() ignores locales, so only bytes that are
11875        not part of UTF-8 are considered printable. I assume that the same
11876        holds for UTF-EBCDIC.
11877        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
11878        which Wikipedia says:
11879
11880        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
11881        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
11882        identical, to the ASCII delete (DEL) or rubout control character.
11883        ) So the old condition can be simplified to !isPRINT(c)  */
11884     if (!isPRINT(c)) {
11885         if (c < 256) {
11886             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
11887         }
11888         else {
11889             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
11890         }
11891     }
11892     else {
11893         const char string = c;
11894         if (c == '-' || c == ']' || c == '\\' || c == '^')
11895             sv_catpvs(sv, "\\");
11896         sv_catpvn(sv, &string, 1);
11897     }
11898 }
11899
11900
11901 #define CLEAR_OPTSTART \
11902     if (optstart) STMT_START { \
11903             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
11904             optstart=NULL; \
11905     } STMT_END
11906
11907 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
11908
11909 STATIC const regnode *
11910 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
11911             const regnode *last, const regnode *plast,
11912             SV* sv, I32 indent, U32 depth)
11913 {
11914     dVAR;
11915     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
11916     register const regnode *next;
11917     const regnode *optstart= NULL;
11918
11919     RXi_GET_DECL(r,ri);
11920     GET_RE_DEBUG_FLAGS_DECL;
11921
11922     PERL_ARGS_ASSERT_DUMPUNTIL;
11923
11924 #ifdef DEBUG_DUMPUNTIL
11925     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
11926         last ? last-start : 0,plast ? plast-start : 0);
11927 #endif
11928
11929     if (plast && plast < last)
11930         last= plast;
11931
11932     while (PL_regkind[op] != END && (!last || node < last)) {
11933         /* While that wasn't END last time... */
11934         NODE_ALIGN(node);
11935         op = OP(node);
11936         if (op == CLOSE || op == WHILEM)
11937             indent--;
11938         next = regnext((regnode *)node);
11939
11940         /* Where, what. */
11941         if (OP(node) == OPTIMIZED) {
11942             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
11943                 optstart = node;
11944             else
11945                 goto after_print;
11946         } else
11947             CLEAR_OPTSTART;
11948
11949         regprop(r, sv, node);
11950         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
11951                       (int)(2*indent + 1), "", SvPVX_const(sv));
11952
11953         if (OP(node) != OPTIMIZED) {
11954             if (next == NULL)           /* Next ptr. */
11955                 PerlIO_printf(Perl_debug_log, " (0)");
11956             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
11957                 PerlIO_printf(Perl_debug_log, " (FAIL)");
11958             else
11959                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
11960             (void)PerlIO_putc(Perl_debug_log, '\n');
11961         }
11962
11963       after_print:
11964         if (PL_regkind[(U8)op] == BRANCHJ) {
11965             assert(next);
11966             {
11967                 register const regnode *nnode = (OP(next) == LONGJMP
11968                                              ? regnext((regnode *)next)
11969                                              : next);
11970                 if (last && nnode > last)
11971                     nnode = last;
11972                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
11973             }
11974         }
11975         else if (PL_regkind[(U8)op] == BRANCH) {
11976             assert(next);
11977             DUMPUNTIL(NEXTOPER(node), next);
11978         }
11979         else if ( PL_regkind[(U8)op]  == TRIE ) {
11980             const regnode *this_trie = node;
11981             const char op = OP(node);
11982             const U32 n = ARG(node);
11983             const reg_ac_data * const ac = op>=AHOCORASICK ?
11984                (reg_ac_data *)ri->data->data[n] :
11985                NULL;
11986             const reg_trie_data * const trie =
11987                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
11988 #ifdef DEBUGGING
11989             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
11990 #endif
11991             const regnode *nextbranch= NULL;
11992             I32 word_idx;
11993             sv_setpvs(sv, "");
11994             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
11995                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
11996
11997                 PerlIO_printf(Perl_debug_log, "%*s%s ",
11998                    (int)(2*(indent+3)), "",
11999                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
12000                             PL_colors[0], PL_colors[1],
12001                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
12002                             PERL_PV_PRETTY_ELLIPSES    |
12003                             PERL_PV_PRETTY_LTGT
12004                             )
12005                             : "???"
12006                 );
12007                 if (trie->jump) {
12008                     U16 dist= trie->jump[word_idx+1];
12009                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
12010                                   (UV)((dist ? this_trie + dist : next) - start));
12011                     if (dist) {
12012                         if (!nextbranch)
12013                             nextbranch= this_trie + trie->jump[0];
12014                         DUMPUNTIL(this_trie + dist, nextbranch);
12015                     }
12016                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
12017                         nextbranch= regnext((regnode *)nextbranch);
12018                 } else {
12019                     PerlIO_printf(Perl_debug_log, "\n");
12020                 }
12021             }
12022             if (last && next > last)
12023                 node= last;
12024             else
12025                 node= next;
12026         }
12027         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
12028             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
12029                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
12030         }
12031         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
12032             assert(next);
12033             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
12034         }
12035         else if ( op == PLUS || op == STAR) {
12036             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
12037         }
12038         else if (PL_regkind[(U8)op] == ANYOF) {
12039             /* arglen 1 + class block */
12040             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
12041                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
12042             node = NEXTOPER(node);
12043         }
12044         else if (PL_regkind[(U8)op] == EXACT) {
12045             /* Literal string, where present. */
12046             node += NODE_SZ_STR(node) - 1;
12047             node = NEXTOPER(node);
12048         }
12049         else {
12050             node = NEXTOPER(node);
12051             node += regarglen[(U8)op];
12052         }
12053         if (op == CURLYX || op == OPEN)
12054             indent++;
12055     }
12056     CLEAR_OPTSTART;
12057 #ifdef DEBUG_DUMPUNTIL
12058     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
12059 #endif
12060     return node;
12061 }
12062
12063 #endif  /* DEBUGGING */
12064
12065 /*
12066  * Local variables:
12067  * c-indentation-style: bsd
12068  * c-basic-offset: 4
12069  * indent-tabs-mode: t
12070  * End:
12071  *
12072  * ex: set ts=8 sts=4 sw=4 noet:
12073  */