src/5015007/orig/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  *      Copyright (c) 1986 by University of Toronto.
  42  *      Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  *      Permission is granted to anyone to use this software for any
  45  *      purpose on any computer system, and to redistribute it freely,
  46  *      subject to the following restrictions:
  47  *
  48  *      1. The author is not responsible for the consequences of use of
  49  *              this software, no matter how awful, even if they arise
  50  *              from defects in it.
  51  *
  52  *      2. The origin of this software must not be misrepresented, either
  53  *              by explicit claim or by omission.
  54  *
  55  *      3. Altered versions must be plainly marked as such, and must not
  56  *              be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #  include "INTERN.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 #else
  85 #  include "regcomp.h"
  86 #endif
  87
  88 #include "dquote_static.c"
  89
  90 #ifdef op
  91 #undef op
  92 #endif /* op */
  93
  94 #ifdef MSDOS
  95 #  if defined(BUGGY_MSC6)
  96  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
  97 #    pragma optimize("a",off)
  98  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
  99 #    pragma optimize("w",on )
 100 #  endif /* BUGGY_MSC6 */
 101 #endif /* MSDOS */
 102
 103 #ifndef STATIC
 104 #define STATIC  static
 105 #endif
 106
 107 typedef struct RExC_state_t {
 108     U32         flags;                  /* are we folding, multilining? */
 109     char        *precomp;               /* uncompiled string. */
 110     REGEXP      *rx_sv;                 /* The SV that is the regexp. */
 111     regexp      *rx;                    /* perl core regexp structure */
 112     regexp_internal     *rxi;           /* internal data for regexp object pprivate field */
 113     char        *start;                 /* Start of input for compile */
 114     char        *end;                   /* End of input for compile */
 115     char        *parse;                 /* Input-scan pointer. */
 116     I32         whilem_seen;            /* number of WHILEM in this expr */
 117     regnode     *emit_start;            /* Start of emitted-code area */
 118     regnode     *emit_bound;            /* First regnode outside of the allocated space */
 119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
 120     I32         naughty;                /* How bad is this pattern? */
 121     I32         sawback;                /* Did we see \1, ...? */
 122     U32         seen;
 123     I32         size;                   /* Code size. */
 124     I32         npar;                   /* Capture buffer count, (OPEN). */
 125     I32         cpar;                   /* Capture buffer count, (CLOSE). */
 126     I32         nestroot;               /* root parens we are in - used by accept */
 127     I32         extralen;
 128     I32         seen_zerolen;
 129     I32         seen_evals;
 130     regnode     **open_parens;          /* pointers to open parens */
 131     regnode     **close_parens;         /* pointers to close parens */
 132     regnode     *opend;                 /* END node in program */
 133     I32         utf8;           /* whether the pattern is utf8 or not */
 134     I32         orig_utf8;      /* whether the pattern was originally in utf8 */
 135                                 /* XXX use this for future optimisation of case
 136                                  * where pattern must be upgraded to utf8. */
 137     I32         uni_semantics;  /* If a d charset modifier should use unicode
 138                                    rules, even if the pattern is not in
 139                                    utf8 */
 140     HV          *paren_names;           /* Paren names */
 141
 142     regnode     **recurse;              /* Recurse regops */
 143     I32         recurse_count;          /* Number of recurse regops */
 144     I32         in_lookbehind;
 145     I32         contains_locale;
 146     I32         override_recoding;
 147 #if ADD_TO_REGEXEC
 148     char        *starttry;              /* -Dr: where regtry was called. */
 149 #define RExC_starttry   (pRExC_state->starttry)
 150 #endif
 151 #ifdef DEBUGGING
 152     const char  *lastparse;
 153     I32         lastnum;
 154     AV          *paren_name_list;       /* idx -> name */
 155 #define RExC_lastparse  (pRExC_state->lastparse)
 156 #define RExC_lastnum    (pRExC_state->lastnum)
 157 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 158 #endif
 159 } RExC_state_t;
 160
 161 #define RExC_flags      (pRExC_state->flags)
 162 #define RExC_precomp    (pRExC_state->precomp)
 163 #define RExC_rx_sv      (pRExC_state->rx_sv)
 164 #define RExC_rx         (pRExC_state->rx)
 165 #define RExC_rxi        (pRExC_state->rxi)
 166 #define RExC_start      (pRExC_state->start)
 167 #define RExC_end        (pRExC_state->end)
 168 #define RExC_parse      (pRExC_state->parse)
 169 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
 170 #ifdef RE_TRACK_PATTERN_OFFSETS
 171 #define RExC_offsets    (pRExC_state->rxi->u.offsets) /* I am not like the others */
 172 #endif
 173 #define RExC_emit       (pRExC_state->emit)
 174 #define RExC_emit_start (pRExC_state->emit_start)
 175 #define RExC_emit_bound (pRExC_state->emit_bound)
 176 #define RExC_naughty    (pRExC_state->naughty)
 177 #define RExC_sawback    (pRExC_state->sawback)
 178 #define RExC_seen       (pRExC_state->seen)
 179 #define RExC_size       (pRExC_state->size)
 180 #define RExC_npar       (pRExC_state->npar)
 181 #define RExC_nestroot   (pRExC_state->nestroot)
 182 #define RExC_extralen   (pRExC_state->extralen)
 183 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
 184 #define RExC_seen_evals (pRExC_state->seen_evals)
 185 #define RExC_utf8       (pRExC_state->utf8)
 186 #define RExC_uni_semantics      (pRExC_state->uni_semantics)
 187 #define RExC_orig_utf8  (pRExC_state->orig_utf8)
 188 #define RExC_open_parens        (pRExC_state->open_parens)
 189 #define RExC_close_parens       (pRExC_state->close_parens)
 190 #define RExC_opend      (pRExC_state->opend)
 191 #define RExC_paren_names        (pRExC_state->paren_names)
 192 #define RExC_recurse    (pRExC_state->recurse)
 193 #define RExC_recurse_count      (pRExC_state->recurse_count)
 194 #define RExC_in_lookbehind      (pRExC_state->in_lookbehind)
 195 #define RExC_contains_locale    (pRExC_state->contains_locale)
 196 #define RExC_override_recoding  (pRExC_state->override_recoding)
 197
 198
 199 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
 200 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 201         ((*s) == '{' && regcurly(s)))
 202
 203 #ifdef SPSTART
 204 #undef SPSTART          /* dratted cpp namespace... */
 205 #endif
 206 /*
 207  * Flags to be passed up and down.
 208  */
 209 #define WORST           0       /* Worst case. */
 210 #define HASWIDTH        0x01    /* Known to match non-null strings. */
 211
 212 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
 213  * character, and if utf8, must be invariant.  Note that this is not the same thing as REGNODE_SIMPLE */
 214 #define SIMPLE          0x02
 215 #define SPSTART         0x04    /* Starts with * or +. */
 216 #define TRYAGAIN        0x08    /* Weeded out a declaration. */
 217 #define POSTPONED       0x10    /* (?1),(?&name), (??{...}) or similar */
 218
 219 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 220
 221 /* whether trie related optimizations are enabled */
 222 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 223 #define TRIE_STUDY_OPT
 224 #define FULL_TRIE_STUDY
 225 #define TRIE_STCLASS
 226 #endif
 227
 228
 229
 230 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 231 #define PBITVAL(paren) (1 << ((paren) & 7))
 232 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 233 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 234 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 235
 236 /* If not already in utf8, do a longjmp back to the beginning */
 237 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
 238 #define REQUIRE_UTF8    STMT_START {                                       \
 239                                      if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
 240                         } STMT_END
 241
 242 /* About scan_data_t.
 243
 244   During optimisation we recurse through the regexp program performing
 245   various inplace (keyhole style) optimisations. In addition study_chunk
 246   and scan_commit populate this data structure with information about
 247   what strings MUST appear in the pattern. We look for the longest
 248   string that must appear at a fixed location, and we look for the
 249   longest string that may appear at a floating location. So for instance
 250   in the pattern:
 251
 252     /FOO[xX]A.*B[xX]BAR/
 253
 254   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 255   strings (because they follow a .* construct). study_chunk will identify
 256   both FOO and BAR as being the longest fixed and floating strings respectively.
 257
 258   The strings can be composites, for instance
 259
 260      /(f)(o)(o)/
 261
 262   will result in a composite fixed substring 'foo'.
 263
 264   For each string some basic information is maintained:
 265
 266   - offset or min_offset
 267     This is the position the string must appear at, or not before.
 268     It also implicitly (when combined with minlenp) tells us how many
 269     characters must match before the string we are searching for.
 270     Likewise when combined with minlenp and the length of the string it
 271     tells us how many characters must appear after the string we have
 272     found.
 273
 274   - max_offset
 275     Only used for floating strings. This is the rightmost point that
 276     the string can appear at. If set to I32 max it indicates that the
 277     string can occur infinitely far to the right.
 278
 279   - minlenp
 280     A pointer to the minimum length of the pattern that the string
 281     was found inside. This is important as in the case of positive
 282     lookahead or positive lookbehind we can have multiple patterns
 283     involved. Consider
 284
 285     /(?=FOO).*F/
 286
 287     The minimum length of the pattern overall is 3, the minimum length
 288     of the lookahead part is 3, but the minimum length of the part that
 289     will actually match is 1. So 'FOO's minimum length is 3, but the
 290     minimum length for the F is 1. This is important as the minimum length
 291     is used to determine offsets in front of and behind the string being
 292     looked for.  Since strings can be composites this is the length of the
 293     pattern at the time it was committed with a scan_commit. Note that
 294     the length is calculated by study_chunk, so that the minimum lengths
 295     are not known until the full pattern has been compiled, thus the
 296     pointer to the value.
 297
 298   - lookbehind
 299
 300     In the case of lookbehind the string being searched for can be
 301     offset past the start point of the final matching string.
 302     If this value was just blithely removed from the min_offset it would
 303     invalidate some of the calculations for how many chars must match
 304     before or after (as they are derived from min_offset and minlen and
 305     the length of the string being searched for).
 306     When the final pattern is compiled and the data is moved from the
 307     scan_data_t structure into the regexp structure the information
 308     about lookbehind is factored in, with the information that would
 309     have been lost precalculated in the end_shift field for the
 310     associated string.
 311
 312   The fields pos_min and pos_delta are used to store the minimum offset
 313   and the delta to the maximum offset at the current point in the pattern.
 314
 315 */
 316
 317 typedef struct scan_data_t {
 318     /*I32 len_min;      unused */
 319     /*I32 len_delta;    unused */
 320     I32 pos_min;
 321     I32 pos_delta;
 322     SV *last_found;
 323     I32 last_end;           /* min value, <0 unless valid. */
 324     I32 last_start_min;
 325     I32 last_start_max;
 326     SV **longest;           /* Either &l_fixed, or &l_float. */
 327     SV *longest_fixed;      /* longest fixed string found in pattern */
 328     I32 offset_fixed;       /* offset where it starts */
 329     I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 330     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 331     SV *longest_float;      /* longest floating string found in pattern */
 332     I32 offset_float_min;   /* earliest point in string it can appear */
 333     I32 offset_float_max;   /* latest point in string it can appear */
 334     I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 335     I32 lookbehind_float;   /* is the position of the string modified by LB */
 336     I32 flags;
 337     I32 whilem_c;
 338     I32 *last_closep;
 339     struct regnode_charclass_class *start_class;
 340 } scan_data_t;
 341
 342 /*
 343  * Forward declarations for pregcomp()'s friends.
 344  */
 345
 346 static const scan_data_t zero_scan_data =
 347   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 348
 349 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 350 #define SF_BEFORE_SEOL          0x0001
 351 #define SF_BEFORE_MEOL          0x0002
 352 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 353 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 354
 355 #ifdef NO_UNARY_PLUS
 356 #  define SF_FIX_SHIFT_EOL      (0+2)
 357 #  define SF_FL_SHIFT_EOL               (0+4)
 358 #else
 359 #  define SF_FIX_SHIFT_EOL      (+2)
 360 #  define SF_FL_SHIFT_EOL               (+4)
 361 #endif
 362
 363 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 364 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 365
 366 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 367 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 368 #define SF_IS_INF               0x0040
 369 #define SF_HAS_PAR              0x0080
 370 #define SF_IN_PAR               0x0100
 371 #define SF_HAS_EVAL             0x0200
 372 #define SCF_DO_SUBSTR           0x0400
 373 #define SCF_DO_STCLASS_AND      0x0800
 374 #define SCF_DO_STCLASS_OR       0x1000
 375 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 376 #define SCF_WHILEM_VISITED_POS  0x2000
 377
 378 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 379 #define SCF_SEEN_ACCEPT         0x8000
 380
 381 #define UTF cBOOL(RExC_utf8)
 382 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 383 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 384 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 385 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 386 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 387 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 388 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 389
 390 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 391
 392 #define OOB_UNICODE             12345678
 393 #define OOB_NAMEDCLASS          -1
 394
 395 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 396 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 397
 398
 399 /* length of regex to show in messages that don't mark a position within */
 400 #define RegexLengthToShowInErrorMessages 127
 401
 402 /*
 403  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 404  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 405  * op/pragma/warn/regcomp.
 406  */
 407 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 408 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 409
 410 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 411
 412 /*
 413  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 414  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 415  * "...".
 416  */
 417 #define _FAIL(code) STMT_START {                                        \
 418     const char *ellipses = "";                                          \
 419     IV len = RExC_end - RExC_precomp;                                   \
 420                                                                         \
 421     if (!SIZE_ONLY)                                                     \
 422         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);                   \
 423     if (len > RegexLengthToShowInErrorMessages) {                       \
 424         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
 425         len = RegexLengthToShowInErrorMessages - 10;                    \
 426         ellipses = "...";                                               \
 427     }                                                                   \
 428     code;                                                               \
 429 } STMT_END
 430
 431 #define FAIL(msg) _FAIL(                            \
 432     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",       \
 433             msg, (int)len, RExC_precomp, ellipses))
 434
 435 #define FAIL2(msg,arg) _FAIL(                       \
 436     Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 437             arg, (int)len, RExC_precomp, ellipses))
 438
 439 /*
 440  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 441  */
 442 #define Simple_vFAIL(m) STMT_START {                                    \
 443     const IV offset = RExC_parse - RExC_precomp;                        \
 444     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
 445             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 446 } STMT_END
 447
 448 /*
 449  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 450  */
 451 #define vFAIL(m) STMT_START {                           \
 452     if (!SIZE_ONLY)                                     \
 453         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 454     Simple_vFAIL(m);                                    \
 455 } STMT_END
 456
 457 /*
 458  * Like Simple_vFAIL(), but accepts two arguments.
 459  */
 460 #define Simple_vFAIL2(m,a1) STMT_START {                        \
 461     const IV offset = RExC_parse - RExC_precomp;                        \
 462     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
 463             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 464 } STMT_END
 465
 466 /*
 467  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 468  */
 469 #define vFAIL2(m,a1) STMT_START {                       \
 470     if (!SIZE_ONLY)                                     \
 471         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 472     Simple_vFAIL2(m, a1);                               \
 473 } STMT_END
 474
 475
 476 /*
 477  * Like Simple_vFAIL(), but accepts three arguments.
 478  */
 479 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
 480     const IV offset = RExC_parse - RExC_precomp;                \
 481     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
 482             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 483 } STMT_END
 484
 485 /*
 486  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 487  */
 488 #define vFAIL3(m,a1,a2) STMT_START {                    \
 489     if (!SIZE_ONLY)                                     \
 490         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv);   \
 491     Simple_vFAIL3(m, a1, a2);                           \
 492 } STMT_END
 493
 494 /*
 495  * Like Simple_vFAIL(), but accepts four arguments.
 496  */
 497 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
 498     const IV offset = RExC_parse - RExC_precomp;                \
 499     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
 500             (int)offset, RExC_precomp, RExC_precomp + offset);  \
 501 } STMT_END
 502
 503 #define ckWARNreg(loc,m) STMT_START {                                   \
 504     const IV offset = loc - RExC_precomp;                               \
 505     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 506             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 507 } STMT_END
 508
 509 #define ckWARNregdep(loc,m) STMT_START {                                \
 510     const IV offset = loc - RExC_precomp;                               \
 511     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 512             m REPORT_LOCATION,                                          \
 513             (int)offset, RExC_precomp, RExC_precomp + offset);          \
 514 } STMT_END
 515
 516 #define ckWARN2regdep(loc,m, a1) STMT_START {                           \
 517     const IV offset = loc - RExC_precomp;                               \
 518     Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),     \
 519             m REPORT_LOCATION,                                          \
 520             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 521 } STMT_END
 522
 523 #define ckWARN2reg(loc, m, a1) STMT_START {                             \
 524     const IV offset = loc - RExC_precomp;                               \
 525     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 526             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
 527 } STMT_END
 528
 529 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
 530     const IV offset = loc - RExC_precomp;                               \
 531     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 532             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 533 } STMT_END
 534
 535 #define ckWARN3reg(loc, m, a1, a2) STMT_START {                         \
 536     const IV offset = loc - RExC_precomp;                               \
 537     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 538             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
 539 } STMT_END
 540
 541 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
 542     const IV offset = loc - RExC_precomp;                               \
 543     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 544             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 545 } STMT_END
 546
 547 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {                     \
 548     const IV offset = loc - RExC_precomp;                               \
 549     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
 550             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 551 } STMT_END
 552
 553 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
 554     const IV offset = loc - RExC_precomp;                               \
 555     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
 556             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 557 } STMT_END
 558
 559
 560 /* Allow for side effects in s */
 561 #define REGC(c,s) STMT_START {                  \
 562     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 563 } STMT_END
 564
 565 /* Macros for recording node offsets.   20001227 mjd@plover.com
 566  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 567  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 568  * Element 0 holds the number n.
 569  * Position is 1 indexed.
 570  */
 571 #ifndef RE_TRACK_PATTERN_OFFSETS
 572 #define Set_Node_Offset_To_R(node,byte)
 573 #define Set_Node_Offset(node,byte)
 574 #define Set_Cur_Node_Offset
 575 #define Set_Node_Length_To_R(node,len)
 576 #define Set_Node_Length(node,len)
 577 #define Set_Node_Cur_Length(node)
 578 #define Node_Offset(n)
 579 #define Node_Length(n)
 580 #define Set_Node_Offset_Length(node,offset,len)
 581 #define ProgLen(ri) ri->u.proglen
 582 #define SetProgLen(ri,x) ri->u.proglen = x
 583 #else
 584 #define ProgLen(ri) ri->u.offsets[0]
 585 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 586 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
 587     if (! SIZE_ONLY) {                                                  \
 588         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
 589                     __LINE__, (int)(node), (int)(byte)));               \
 590         if((node) < 0) {                                                \
 591             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 592         } else {                                                        \
 593             RExC_offsets[2*(node)-1] = (byte);                          \
 594         }                                                               \
 595     }                                                                   \
 596 } STMT_END
 597
 598 #define Set_Node_Offset(node,byte) \
 599     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 600 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 601
 602 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
 603     if (! SIZE_ONLY) {                                                  \
 604         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
 605                 __LINE__, (int)(node), (int)(len)));                    \
 606         if((node) < 0) {                                                \
 607             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 608         } else {                                                        \
 609             RExC_offsets[2*(node)] = (len);                             \
 610         }                                                               \
 611     }                                                                   \
 612 } STMT_END
 613
 614 #define Set_Node_Length(node,len) \
 615     Set_Node_Length_To_R((node)-RExC_emit_start, len)
 616 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 617 #define Set_Node_Cur_Length(node) \
 618     Set_Node_Length(node, RExC_parse - parse_start)
 619
 620 /* Get offsets and lengths */
 621 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 622 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 623
 624 #define Set_Node_Offset_Length(node,offset,len) STMT_START {    \
 625     Set_Node_Offset_To_R((node)-RExC_emit_start, (offset));     \
 626     Set_Node_Length_To_R((node)-RExC_emit_start, (len));        \
 627 } STMT_END
 628 #endif
 629
 630 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 631 #define EXPERIMENTAL_INPLACESCAN
 632 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 633
 634 #define DEBUG_STUDYDATA(str,data,depth)                              \
 635 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 636     PerlIO_printf(Perl_debug_log,                                    \
 637         "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 638         " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 639         (int)(depth)*2, "",                                          \
 640         (IV)((data)->pos_min),                                       \
 641         (IV)((data)->pos_delta),                                     \
 642         (UV)((data)->flags),                                         \
 643         (IV)((data)->whilem_c),                                      \
 644         (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 645         is_inf ? "INF " : ""                                         \
 646     );                                                               \
 647     if ((data)->last_found)                                          \
 648         PerlIO_printf(Perl_debug_log,                                \
 649             "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 650             " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 651             SvPVX_const((data)->last_found),                         \
 652             (IV)((data)->last_end),                                  \
 653             (IV)((data)->last_start_min),                            \
 654             (IV)((data)->last_start_max),                            \
 655             ((data)->longest &&                                      \
 656              (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 657             SvPVX_const((data)->longest_fixed),                      \
 658             (IV)((data)->offset_fixed),                              \
 659             ((data)->longest &&                                      \
 660              (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 661             SvPVX_const((data)->longest_float),                      \
 662             (IV)((data)->offset_float_min),                          \
 663             (IV)((data)->offset_float_max)                           \
 664         );                                                           \
 665     PerlIO_printf(Perl_debug_log,"\n");                              \
 666 });
 667
 668 static void clear_re(pTHX_ void *r);
 669
 670 /* Mark that we cannot extend a found fixed substring at this point.
 671    Update the longest found anchored substring and the longest found
 672    floating substrings if needed. */
 673
 674 STATIC void
 675 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 676 {
 677     const STRLEN l = CHR_SVLEN(data->last_found);
 678     const STRLEN old_l = CHR_SVLEN(*data->longest);
 679     GET_RE_DEBUG_FLAGS_DECL;
 680
 681     PERL_ARGS_ASSERT_SCAN_COMMIT;
 682
 683     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 684         SvSetMagicSV(*data->longest, data->last_found);
 685         if (*data->longest == data->longest_fixed) {
 686             data->offset_fixed = l ? data->last_start_min : data->pos_min;
 687             if (data->flags & SF_BEFORE_EOL)
 688                 data->flags
 689                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 690             else
 691                 data->flags &= ~SF_FIX_BEFORE_EOL;
 692             data->minlen_fixed=minlenp;
 693             data->lookbehind_fixed=0;
 694         }
 695         else { /* *data->longest == data->longest_float */
 696             data->offset_float_min = l ? data->last_start_min : data->pos_min;
 697             data->offset_float_max = (l
 698                                       ? data->last_start_max
 699                                       : data->pos_min + data->pos_delta);
 700             if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 701                 data->offset_float_max = I32_MAX;
 702             if (data->flags & SF_BEFORE_EOL)
 703                 data->flags
 704                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 705             else
 706                 data->flags &= ~SF_FL_BEFORE_EOL;
 707             data->minlen_float=minlenp;
 708             data->lookbehind_float=0;
 709         }
 710     }
 711     SvCUR_set(data->last_found, 0);
 712     {
 713         SV * const sv = data->last_found;
 714         if (SvUTF8(sv) && SvMAGICAL(sv)) {
 715             MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 716             if (mg)
 717                 mg->mg_len = 0;
 718         }
 719     }
 720     data->last_end = -1;
 721     data->flags &= ~SF_BEFORE_EOL;
 722     DEBUG_STUDYDATA("commit: ",data,0);
 723 }
 724
 725 /* Can match anything (initialization) */
 726 STATIC void
 727 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 728 {
 729     PERL_ARGS_ASSERT_CL_ANYTHING;
 730
 731     ANYOF_BITMAP_SETALL(cl);
 732     cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
 733                 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
 734
 735     /* If any portion of the regex is to operate under locale rules,
 736      * initialization includes it.  The reason this isn't done for all regexes
 737      * is that the optimizer was written under the assumption that locale was
 738      * all-or-nothing.  Given the complexity and lack of documentation in the
 739      * optimizer, and that there are inadequate test cases for locale, so many
 740      * parts of it may not work properly, it is safest to avoid locale unless
 741      * necessary. */
 742     if (RExC_contains_locale) {
 743         ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 744         cl->flags |= ANYOF_LOCALE;
 745     }
 746     else {
 747         ANYOF_CLASS_ZERO(cl);       /* Only /l uses class now */
 748     }
 749 }
 750
 751 /* Can match anything (initialization) */
 752 STATIC int
 753 S_cl_is_anything(const struct regnode_charclass_class *cl)
 754 {
 755     int value;
 756
 757     PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 758
 759     for (value = 0; value <= ANYOF_MAX; value += 2)
 760         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 761             return 1;
 762     if (!(cl->flags & ANYOF_UNICODE_ALL))
 763         return 0;
 764     if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 765         return 0;
 766     return 1;
 767 }
 768
 769 /* Can match anything (initialization) */
 770 STATIC void
 771 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 772 {
 773     PERL_ARGS_ASSERT_CL_INIT;
 774
 775     Zero(cl, 1, struct regnode_charclass_class);
 776     cl->type = ANYOF;
 777     cl_anything(pRExC_state, cl);
 778     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 779 }
 780
 781 /* These two functions currently do the exact same thing */
 782 #define cl_init_zero            S_cl_init
 783
 784 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 785  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 786  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 787 STATIC void
 788 S_cl_and(struct regnode_charclass_class *cl,
 789         const struct regnode_charclass_class *and_with)
 790 {
 791     PERL_ARGS_ASSERT_CL_AND;
 792
 793     assert(and_with->type == ANYOF);
 794
 795     /* I (khw) am not sure all these restrictions are necessary XXX */
 796     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 797         && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 798         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 799         && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 800         && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
 801         int i;
 802
 803         if (and_with->flags & ANYOF_INVERT)
 804             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 805                 cl->bitmap[i] &= ~and_with->bitmap[i];
 806         else
 807             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 808                 cl->bitmap[i] &= and_with->bitmap[i];
 809     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 810
 811     if (and_with->flags & ANYOF_INVERT) {
 812
 813         /* Here, the and'ed node is inverted.  Get the AND of the flags that
 814          * aren't affected by the inversion.  Those that are affected are
 815          * handled individually below */
 816         U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 817         cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 818         cl->flags |= affected_flags;
 819
 820         /* We currently don't know how to deal with things that aren't in the
 821          * bitmap, but we know that the intersection is no greater than what
 822          * is already in cl, so let there be false positives that get sorted
 823          * out after the synthetic start class succeeds, and the node is
 824          * matched for real. */
 825
 826         /* The inversion of these two flags indicate that the resulting
 827          * intersection doesn't have them */
 828         if (and_with->flags & ANYOF_UNICODE_ALL) {
 829             cl->flags &= ~ANYOF_UNICODE_ALL;
 830         }
 831         if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 832             cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 833         }
 834     }
 835     else {   /* and'd node is not inverted */
 836         U8 outside_bitmap_but_not_utf8; /* Temp variable */
 837
 838         if (! ANYOF_NONBITMAP(and_with)) {
 839
 840             /* Here 'and_with' doesn't match anything outside the bitmap
 841              * (except possibly ANYOF_UNICODE_ALL), which means the
 842              * intersection can't either, except for ANYOF_UNICODE_ALL, in
 843              * which case we don't know what the intersection is, but it's no
 844              * greater than what cl already has, so can just leave it alone,
 845              * with possible false positives */
 846             if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 847                 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 848                 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 849             }
 850         }
 851         else if (! ANYOF_NONBITMAP(cl)) {
 852
 853             /* Here, 'and_with' does match something outside the bitmap, and cl
 854              * doesn't have a list of things to match outside the bitmap.  If
 855              * cl can match all code points above 255, the intersection will
 856              * be those above-255 code points that 'and_with' matches.  If cl
 857              * can't match all Unicode code points, it means that it can't
 858              * match anything outside the bitmap (since the 'if' that got us
 859              * into this block tested for that), so we leave the bitmap empty.
 860              */
 861             if (cl->flags & ANYOF_UNICODE_ALL) {
 862                 ARG_SET(cl, ARG(and_with));
 863
 864                 /* and_with's ARG may match things that don't require UTF8.
 865                  * And now cl's will too, in spite of this being an 'and'.  See
 866                  * the comments below about the kludge */
 867                 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 868             }
 869         }
 870         else {
 871             /* Here, both 'and_with' and cl match something outside the
 872              * bitmap.  Currently we do not do the intersection, so just match
 873              * whatever cl had at the beginning.  */
 874         }
 875
 876
 877         /* Take the intersection of the two sets of flags.  However, the
 878          * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 879          * kludge around the fact that this flag is not treated like the others
 880          * which are initialized in cl_anything().  The way the optimizer works
 881          * is that the synthetic start class (SSC) is initialized to match
 882          * anything, and then the first time a real node is encountered, its
 883          * values are AND'd with the SSC's with the result being the values of
 884          * the real node.  However, there are paths through the optimizer where
 885          * the AND never gets called, so those initialized bits are set
 886          * inappropriately, which is not usually a big deal, as they just cause
 887          * false positives in the SSC, which will just mean a probably
 888          * imperceptible slow down in execution.  However this bit has a
 889          * higher false positive consequence in that it can cause utf8.pm,
 890          * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 891          * bigger slowdown and also causes significant extra memory to be used.
 892          * In order to prevent this, the code now takes a different tack.  The
 893          * bit isn't set unless some part of the regular expression needs it,
 894          * but once set it won't get cleared.  This means that these extra
 895          * modules won't get loaded unless there was some path through the
 896          * pattern that would have required them anyway, and  so any false
 897          * positives that occur by not ANDing them out when they could be
 898          * aren't as severe as they would be if we treated this bit like all
 899          * the others */
 900         outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 901                                       & ANYOF_NONBITMAP_NON_UTF8;
 902         cl->flags &= and_with->flags;
 903         cl->flags |= outside_bitmap_but_not_utf8;
 904     }
 905 }
 906
 907 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 908  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 909  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 910 STATIC void
 911 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 912 {
 913     PERL_ARGS_ASSERT_CL_OR;
 914
 915     if (or_with->flags & ANYOF_INVERT) {
 916
 917         /* Here, the or'd node is to be inverted.  This means we take the
 918          * complement of everything not in the bitmap, but currently we don't
 919          * know what that is, so give up and match anything */
 920         if (ANYOF_NONBITMAP(or_with)) {
 921             cl_anything(pRExC_state, cl);
 922         }
 923         /* We do not use
 924          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 925          *   <= (B1 | !B2) | (CL1 | !CL2)
 926          * which is wasteful if CL2 is small, but we ignore CL2:
 927          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 928          * XXXX Can we handle case-fold?  Unclear:
 929          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 930          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 931          */
 932         else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 933              && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 934              && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
 935             int i;
 936
 937             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 938                 cl->bitmap[i] |= ~or_with->bitmap[i];
 939         } /* XXXX: logic is complicated otherwise */
 940         else {
 941             cl_anything(pRExC_state, cl);
 942         }
 943
 944         /* And, we can just take the union of the flags that aren't affected
 945          * by the inversion */
 946         cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
 947
 948         /* For the remaining flags:
 949             ANYOF_UNICODE_ALL and inverted means to not match anything above
 950                     255, which means that the union with cl should just be
 951                     what cl has in it, so can ignore this flag
 952             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
 953                     is 127-255 to match them, but then invert that, so the
 954                     union with cl should just be what cl has in it, so can
 955                     ignore this flag
 956          */
 957     } else {    /* 'or_with' is not inverted */
 958         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
 959         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 960              && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
 961                  || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
 962             int i;
 963
 964             /* OR char bitmap and class bitmap separately */
 965             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 966                 cl->bitmap[i] |= or_with->bitmap[i];
 967             if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
 968                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
 969                     cl->classflags[i] |= or_with->classflags[i];
 970                 cl->flags |= ANYOF_CLASS;
 971             }
 972         }
 973         else { /* XXXX: logic is complicated, leave it along for a moment. */
 974             cl_anything(pRExC_state, cl);
 975         }
 976
 977         if (ANYOF_NONBITMAP(or_with)) {
 978
 979             /* Use the added node's outside-the-bit-map match if there isn't a
 980              * conflict.  If there is a conflict (both nodes match something
 981              * outside the bitmap, but what they match outside is not the same
 982              * pointer, and hence not easily compared until XXX we extend
 983              * inversion lists this far), give up and allow the start class to
 984              * match everything outside the bitmap.  If that stuff is all above
 985              * 255, can just set UNICODE_ALL, otherwise caould be anything. */
 986             if (! ANYOF_NONBITMAP(cl)) {
 987                 ARG_SET(cl, ARG(or_with));
 988             }
 989             else if (ARG(cl) != ARG(or_with)) {
 990
 991                 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
 992                     cl_anything(pRExC_state, cl);
 993                 }
 994                 else {
 995                     cl->flags |= ANYOF_UNICODE_ALL;
 996                 }
 997             }
 998         }
 999
1000         /* Take the union */
1001         cl->flags |= or_with->flags;
1002     }
1003 }
1004
1005 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1006 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1007 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1008 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1009
1010
1011 #ifdef DEBUGGING
1012 /*
1013    dump_trie(trie,widecharmap,revcharmap)
1014    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1015    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1016
1017    These routines dump out a trie in a somewhat readable format.
1018    The _interim_ variants are used for debugging the interim
1019    tables that are used to generate the final compressed
1020    representation which is what dump_trie expects.
1021
1022    Part of the reason for their existence is to provide a form
1023    of documentation as to how the different representations function.
1024
1025 */
1026
1027 /*
1028   Dumps the final compressed table form of the trie to Perl_debug_log.
1029   Used for debugging make_trie().
1030 */
1031
1032 STATIC void
1033 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1034             AV *revcharmap, U32 depth)
1035 {
1036     U32 state;
1037     SV *sv=sv_newmortal();
1038     int colwidth= widecharmap ? 6 : 4;
1039     U16 word;
1040     GET_RE_DEBUG_FLAGS_DECL;
1041
1042     PERL_ARGS_ASSERT_DUMP_TRIE;
1043
1044     PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1045         (int)depth * 2 + 2,"",
1046         "Match","Base","Ofs" );
1047
1048     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1049         SV ** const tmp = av_fetch( revcharmap, state, 0);
1050         if ( tmp ) {
1051             PerlIO_printf( Perl_debug_log, "%*s",
1052                 colwidth,
1053                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1054                             PL_colors[0], PL_colors[1],
1055                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1056                             PERL_PV_ESCAPE_FIRSTCHAR
1057                 )
1058             );
1059         }
1060     }
1061     PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1062         (int)depth * 2 + 2,"");
1063
1064     for( state = 0 ; state < trie->uniquecharcount ; state++ )
1065         PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1066     PerlIO_printf( Perl_debug_log, "\n");
1067
1068     for( state = 1 ; state < trie->statecount ; state++ ) {
1069         const U32 base = trie->states[ state ].trans.base;
1070
1071         PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1072
1073         if ( trie->states[ state ].wordnum ) {
1074             PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1075         } else {
1076             PerlIO_printf( Perl_debug_log, "%6s", "" );
1077         }
1078
1079         PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1080
1081         if ( base ) {
1082             U32 ofs = 0;
1083
1084             while( ( base + ofs  < trie->uniquecharcount ) ||
1085                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
1086                      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1087                     ofs++;
1088
1089             PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1090
1091             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1092                 if ( ( base + ofs >= trie->uniquecharcount ) &&
1093                      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1094                      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1095                 {
1096                    PerlIO_printf( Perl_debug_log, "%*"UVXf,
1097                     colwidth,
1098                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1099                 } else {
1100                     PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1101                 }
1102             }
1103
1104             PerlIO_printf( Perl_debug_log, "]");
1105
1106         }
1107         PerlIO_printf( Perl_debug_log, "\n" );
1108     }
1109     PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1110     for (word=1; word <= trie->wordcount; word++) {
1111         PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1112             (int)word, (int)(trie->wordinfo[word].prev),
1113             (int)(trie->wordinfo[word].len));
1114     }
1115     PerlIO_printf(Perl_debug_log, "\n" );
1116 }
1117 /*
1118   Dumps a fully constructed but uncompressed trie in list form.
1119   List tries normally only are used for construction when the number of
1120   possible chars (trie->uniquecharcount) is very high.
1121   Used for debugging make_trie().
1122 */
1123 STATIC void
1124 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1125                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
1126                          U32 depth)
1127 {
1128     U32 state;
1129     SV *sv=sv_newmortal();
1130     int colwidth= widecharmap ? 6 : 4;
1131     GET_RE_DEBUG_FLAGS_DECL;
1132
1133     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1134
1135     /* print out the table precompression.  */
1136     PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1137         (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1138         "------:-----+-----------------\n" );
1139
1140     for( state=1 ; state < next_alloc ; state ++ ) {
1141         U16 charid;
1142
1143         PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1144             (int)depth * 2 + 2,"", (UV)state  );
1145         if ( ! trie->states[ state ].wordnum ) {
1146             PerlIO_printf( Perl_debug_log, "%5s| ","");
1147         } else {
1148             PerlIO_printf( Perl_debug_log, "W%4x| ",
1149                 trie->states[ state ].wordnum
1150             );
1151         }
1152         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1153             SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1154             if ( tmp ) {
1155                 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1156                     colwidth,
1157                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1158                             PL_colors[0], PL_colors[1],
1159                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1160                             PERL_PV_ESCAPE_FIRSTCHAR
1161                     ) ,
1162                     TRIE_LIST_ITEM(state,charid).forid,
1163                     (UV)TRIE_LIST_ITEM(state,charid).newstate
1164                 );
1165                 if (!(charid % 10))
1166                     PerlIO_printf(Perl_debug_log, "\n%*s| ",
1167                         (int)((depth * 2) + 14), "");
1168             }
1169         }
1170         PerlIO_printf( Perl_debug_log, "\n");
1171     }
1172 }
1173
1174 /*
1175   Dumps a fully constructed but uncompressed trie in table form.
1176   This is the normal DFA style state transition table, with a few
1177   twists to facilitate compression later.
1178   Used for debugging make_trie().
1179 */
1180 STATIC void
1181 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1182                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
1183                           U32 depth)
1184 {
1185     U32 state;
1186     U16 charid;
1187     SV *sv=sv_newmortal();
1188     int colwidth= widecharmap ? 6 : 4;
1189     GET_RE_DEBUG_FLAGS_DECL;
1190
1191     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1192
1193     /*
1194        print out the table precompression so that we can do a visual check
1195        that they are identical.
1196      */
1197
1198     PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1199
1200     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1201         SV ** const tmp = av_fetch( revcharmap, charid, 0);
1202         if ( tmp ) {
1203             PerlIO_printf( Perl_debug_log, "%*s",
1204                 colwidth,
1205                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1206                             PL_colors[0], PL_colors[1],
1207                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1208                             PERL_PV_ESCAPE_FIRSTCHAR
1209                 )
1210             );
1211         }
1212     }
1213
1214     PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1215
1216     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1217         PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1218     }
1219
1220     PerlIO_printf( Perl_debug_log, "\n" );
1221
1222     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1223
1224         PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1225             (int)depth * 2 + 2,"",
1226             (UV)TRIE_NODENUM( state ) );
1227
1228         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1229             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1230             if (v)
1231                 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1232             else
1233                 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1234         }
1235         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1236             PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1237         } else {
1238             PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1239             trie->states[ TRIE_NODENUM( state ) ].wordnum );
1240         }
1241     }
1242 }
1243
1244 #endif
1245
1246
1247 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1248   startbranch: the first branch in the whole branch sequence
1249   first      : start branch of sequence of branch-exact nodes.
1250                May be the same as startbranch
1251   last       : Thing following the last branch.
1252                May be the same as tail.
1253   tail       : item following the branch sequence
1254   count      : words in the sequence
1255   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1256   depth      : indent depth
1257
1258 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1259
1260 A trie is an N'ary tree where the branches are determined by digital
1261 decomposition of the key. IE, at the root node you look up the 1st character and
1262 follow that branch repeat until you find the end of the branches. Nodes can be
1263 marked as "accepting" meaning they represent a complete word. Eg:
1264
1265   /he|she|his|hers/
1266
1267 would convert into the following structure. Numbers represent states, letters
1268 following numbers represent valid transitions on the letter from that state, if
1269 the number is in square brackets it represents an accepting state, otherwise it
1270 will be in parenthesis.
1271
1272       +-h->+-e->[3]-+-r->(8)-+-s->[9]
1273       |    |
1274       |   (2)
1275       |    |
1276      (1)   +-i->(6)-+-s->[7]
1277       |
1278       +-s->(3)-+-h->(4)-+-e->[5]
1279
1280       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1281
1282 This shows that when matching against the string 'hers' we will begin at state 1
1283 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1284 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1285 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1286 single traverse. We store a mapping from accepting to state to which word was
1287 matched, and then when we have multiple possibilities we try to complete the
1288 rest of the regex in the order in which they occured in the alternation.
1289
1290 The only prior NFA like behaviour that would be changed by the TRIE support is
1291 the silent ignoring of duplicate alternations which are of the form:
1292
1293  / (DUPE|DUPE) X? (?{ ... }) Y /x
1294
1295 Thus EVAL blocks following a trie may be called a different number of times with
1296 and without the optimisation. With the optimisations dupes will be silently
1297 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1298 the following demonstrates:
1299
1300  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1301
1302 which prints out 'word' three times, but
1303
1304  'words'=~/(word|word|word)(?{ print $1 })S/
1305
1306 which doesnt print it out at all. This is due to other optimisations kicking in.
1307
1308 Example of what happens on a structural level:
1309
1310 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1311
1312    1: CURLYM[1] {1,32767}(18)
1313    5:   BRANCH(8)
1314    6:     EXACT <ac>(16)
1315    8:   BRANCH(11)
1316    9:     EXACT <ad>(16)
1317   11:   BRANCH(14)
1318   12:     EXACT <ab>(16)
1319   16:   SUCCEED(0)
1320   17:   NOTHING(18)
1321   18: END(0)
1322
1323 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1324 and should turn into:
1325
1326    1: CURLYM[1] {1,32767}(18)
1327    5:   TRIE(16)
1328         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1329           <ac>
1330           <ad>
1331           <ab>
1332   16:   SUCCEED(0)
1333   17:   NOTHING(18)
1334   18: END(0)
1335
1336 Cases where tail != last would be like /(?foo|bar)baz/:
1337
1338    1: BRANCH(4)
1339    2:   EXACT <foo>(8)
1340    4: BRANCH(7)
1341    5:   EXACT <bar>(8)
1342    7: TAIL(8)
1343    8: EXACT <baz>(10)
1344   10: END(0)
1345
1346 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1347 and would end up looking like:
1348
1349     1: TRIE(8)
1350       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1351         <foo>
1352         <bar>
1353    7: TAIL(8)
1354    8: EXACT <baz>(10)
1355   10: END(0)
1356
1357     d = uvuni_to_utf8_flags(d, uv, 0);
1358
1359 is the recommended Unicode-aware way of saying
1360
1361     *(d++) = uv;
1362 */
1363
1364 #define TRIE_STORE_REVCHAR                                                 \
1365     STMT_START {                                                           \
1366         if (UTF) {                                                         \
1367             SV *zlopp = newSV(2);                                          \
1368             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
1369             unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1370             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
1371             SvPOK_on(zlopp);                                               \
1372             SvUTF8_on(zlopp);                                              \
1373             av_push(revcharmap, zlopp);                                    \
1374         } else {                                                           \
1375             char ooooff = (char)uvc;                                               \
1376             av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
1377         }                                                                  \
1378         } STMT_END
1379
1380 #define TRIE_READ_CHAR STMT_START {                                           \
1381     wordlen++;                                                                \
1382     if ( UTF ) {                                                              \
1383         if ( folder ) {                                                       \
1384             if ( foldlen > 0 ) {                                              \
1385                uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
1386                foldlen -= len;                                                \
1387                scan += len;                                                   \
1388                len = 0;                                                       \
1389             } else {                                                          \
1390                 len = UTF8SKIP(uc);\
1391                 uvc = to_utf8_fold( uc, foldbuf, &foldlen);                   \
1392                 foldlen -= UNISKIP( uvc );                                    \
1393                 scan = foldbuf + UNISKIP( uvc );                              \
1394             }                                                                 \
1395         } else {                                                              \
1396             uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1397         }                                                                     \
1398     } else {                                                                  \
1399         uvc = (U32)*uc;                                                       \
1400         len = 1;                                                              \
1401     }                                                                         \
1402 } STMT_END
1403
1404
1405
1406 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1407     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1408         U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1409         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1410     }                                                           \
1411     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1412     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1413     TRIE_LIST_CUR( state )++;                                   \
1414 } STMT_END
1415
1416 #define TRIE_LIST_NEW(state) STMT_START {                       \
1417     Newxz( trie->states[ state ].trans.list,               \
1418         4, reg_trie_trans_le );                                 \
1419      TRIE_LIST_CUR( state ) = 1;                                \
1420      TRIE_LIST_LEN( state ) = 4;                                \
1421 } STMT_END
1422
1423 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1424     U16 dupe= trie->states[ state ].wordnum;                    \
1425     regnode * const noper_next = regnext( noper );              \
1426                                                                 \
1427     DEBUG_r({                                                   \
1428         /* store the word for dumping */                        \
1429         SV* tmp;                                                \
1430         if (OP(noper) != NOTHING)                               \
1431             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
1432         else                                                    \
1433             tmp = newSVpvn_utf8( "", 0, UTF );                  \
1434         av_push( trie_words, tmp );                             \
1435     });                                                         \
1436                                                                 \
1437     curword++;                                                  \
1438     trie->wordinfo[curword].prev   = 0;                         \
1439     trie->wordinfo[curword].len    = wordlen;                   \
1440     trie->wordinfo[curword].accept = state;                     \
1441                                                                 \
1442     if ( noper_next < tail ) {                                  \
1443         if (!trie->jump)                                        \
1444             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1445         trie->jump[curword] = (U16)(noper_next - convert);      \
1446         if (!jumper)                                            \
1447             jumper = noper_next;                                \
1448         if (!nextbranch)                                        \
1449             nextbranch= regnext(cur);                           \
1450     }                                                           \
1451                                                                 \
1452     if ( dupe ) {                                               \
1453         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1454         /* chain, so that when the bits of chain are later    */\
1455         /* linked together, the dups appear in the chain      */\
1456         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1457         trie->wordinfo[dupe].prev = curword;                    \
1458     } else {                                                    \
1459         /* we haven't inserted this word yet.                */ \
1460         trie->states[ state ].wordnum = curword;                \
1461     }                                                           \
1462 } STMT_END
1463
1464
1465 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
1466      ( ( base + charid >=  ucharcount                                   \
1467          && base + charid < ubound                                      \
1468          && state == trie->trans[ base - ucharcount + charid ].check    \
1469          && trie->trans[ base - ucharcount + charid ].next )            \
1470            ? trie->trans[ base - ucharcount + charid ].next             \
1471            : ( state==1 ? special : 0 )                                 \
1472       )
1473
1474 #define MADE_TRIE       1
1475 #define MADE_JUMP_TRIE  2
1476 #define MADE_EXACT_TRIE 4
1477
1478 STATIC I32
1479 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1480 {
1481     dVAR;
1482     /* first pass, loop through and scan words */
1483     reg_trie_data *trie;
1484     HV *widecharmap = NULL;
1485     AV *revcharmap = newAV();
1486     regnode *cur;
1487     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1488     STRLEN len = 0;
1489     UV uvc = 0;
1490     U16 curword = 0;
1491     U32 next_alloc = 0;
1492     regnode *jumper = NULL;
1493     regnode *nextbranch = NULL;
1494     regnode *convert = NULL;
1495     U32 *prev_states; /* temp array mapping each state to previous one */
1496     /* we just use folder as a flag in utf8 */
1497     const U8 * folder = NULL;
1498
1499 #ifdef DEBUGGING
1500     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1501     AV *trie_words = NULL;
1502     /* along with revcharmap, this only used during construction but both are
1503      * useful during debugging so we store them in the struct when debugging.
1504      */
1505 #else
1506     const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1507     STRLEN trie_charcount=0;
1508 #endif
1509     SV *re_trie_maxbuff;
1510     GET_RE_DEBUG_FLAGS_DECL;
1511
1512     PERL_ARGS_ASSERT_MAKE_TRIE;
1513 #ifndef DEBUGGING
1514     PERL_UNUSED_ARG(depth);
1515 #endif
1516
1517     switch (flags) {
1518         case EXACT: break;
1519         case EXACTFA:
1520         case EXACTFU: folder = PL_fold_latin1; break;
1521         case EXACTF:  folder = PL_fold; break;
1522         case EXACTFL: folder = PL_fold_locale; break;
1523         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags );
1524     }
1525
1526     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1527     trie->refcount = 1;
1528     trie->startstate = 1;
1529     trie->wordcount = word_count;
1530     RExC_rxi->data->data[ data_slot ] = (void*)trie;
1531     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1532     if (!(UTF && folder))
1533         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1534     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1535                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
1536
1537     DEBUG_r({
1538         trie_words = newAV();
1539     });
1540
1541     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1542     if (!SvIOK(re_trie_maxbuff)) {
1543         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1544     }
1545     DEBUG_OPTIMISE_r({
1546                 PerlIO_printf( Perl_debug_log,
1547                   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1548                   (int)depth * 2 + 2, "",
1549                   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1550                   REG_NODE_NUM(last), REG_NODE_NUM(tail),
1551                   (int)depth);
1552     });
1553
1554    /* Find the node we are going to overwrite */
1555     if ( first == startbranch && OP( last ) != BRANCH ) {
1556         /* whole branch chain */
1557         convert = first;
1558     } else {
1559         /* branch sub-chain */
1560         convert = NEXTOPER( first );
1561     }
1562
1563     /*  -- First loop and Setup --
1564
1565        We first traverse the branches and scan each word to determine if it
1566        contains widechars, and how many unique chars there are, this is
1567        important as we have to build a table with at least as many columns as we
1568        have unique chars.
1569
1570        We use an array of integers to represent the character codes 0..255
1571        (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1572        native representation of the character value as the key and IV's for the
1573        coded index.
1574
1575        *TODO* If we keep track of how many times each character is used we can
1576        remap the columns so that the table compression later on is more
1577        efficient in terms of memory by ensuring the most common value is in the
1578        middle and the least common are on the outside.  IMO this would be better
1579        than a most to least common mapping as theres a decent chance the most
1580        common letter will share a node with the least common, meaning the node
1581        will not be compressible. With a middle is most common approach the worst
1582        case is when we have the least common nodes twice.
1583
1584      */
1585
1586     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1587         regnode * const noper = NEXTOPER( cur );
1588         const U8 *uc = (U8*)STRING( noper );
1589         const U8 * const e  = uc + STR_LEN( noper );
1590         STRLEN foldlen = 0;
1591         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1592         const U8 *scan = (U8*)NULL;
1593         U32 wordlen      = 0;         /* required init */
1594         STRLEN chars = 0;
1595         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1596
1597         if (OP(noper) == NOTHING) {
1598             trie->minlen= 0;
1599             continue;
1600         }
1601         if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1602             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1603                                           regardless of encoding */
1604
1605         for ( ; uc < e ; uc += len ) {
1606             TRIE_CHARCOUNT(trie)++;
1607             TRIE_READ_CHAR;
1608             chars++;
1609             if ( uvc < 256 ) {
1610                 if ( !trie->charmap[ uvc ] ) {
1611                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1612                     if ( folder )
1613                         trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1614                     TRIE_STORE_REVCHAR;
1615                 }
1616                 if ( set_bit ) {
1617                     /* store the codepoint in the bitmap, and its folded
1618                      * equivalent. */
1619                     TRIE_BITMAP_SET(trie,uvc);
1620
1621                     /* store the folded codepoint */
1622                     if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1623
1624                     if ( !UTF ) {
1625                         /* store first byte of utf8 representation of
1626                            variant codepoints */
1627                         if (! UNI_IS_INVARIANT(uvc)) {
1628                             TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1629                         }
1630                     }
1631                     set_bit = 0; /* We've done our bit :-) */
1632                 }
1633             } else {
1634                 SV** svpp;
1635                 if ( !widecharmap )
1636                     widecharmap = newHV();
1637
1638                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1639
1640                 if ( !svpp )
1641                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1642
1643                 if ( !SvTRUE( *svpp ) ) {
1644                     sv_setiv( *svpp, ++trie->uniquecharcount );
1645                     TRIE_STORE_REVCHAR;
1646                 }
1647             }
1648         }
1649         if( cur == first ) {
1650             trie->minlen=chars;
1651             trie->maxlen=chars;
1652         } else if (chars < trie->minlen) {
1653             trie->minlen=chars;
1654         } else if (chars > trie->maxlen) {
1655             trie->maxlen=chars;
1656         }
1657
1658     } /* end first pass */
1659     DEBUG_TRIE_COMPILE_r(
1660         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1661                 (int)depth * 2 + 2,"",
1662                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1663                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1664                 (int)trie->minlen, (int)trie->maxlen )
1665     );
1666
1667     /*
1668         We now know what we are dealing with in terms of unique chars and
1669         string sizes so we can calculate how much memory a naive
1670         representation using a flat table  will take. If it's over a reasonable
1671         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1672         conservative but potentially much slower representation using an array
1673         of lists.
1674
1675         At the end we convert both representations into the same compressed
1676         form that will be used in regexec.c for matching with. The latter
1677         is a form that cannot be used to construct with but has memory
1678         properties similar to the list form and access properties similar
1679         to the table form making it both suitable for fast searches and
1680         small enough that its feasable to store for the duration of a program.
1681
1682         See the comment in the code where the compressed table is produced
1683         inplace from the flat tabe representation for an explanation of how
1684         the compression works.
1685
1686     */
1687
1688
1689     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1690     prev_states[1] = 0;
1691
1692     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1693         /*
1694             Second Pass -- Array Of Lists Representation
1695
1696             Each state will be represented by a list of charid:state records
1697             (reg_trie_trans_le) the first such element holds the CUR and LEN
1698             points of the allocated array. (See defines above).
1699
1700             We build the initial structure using the lists, and then convert
1701             it into the compressed table form which allows faster lookups
1702             (but cant be modified once converted).
1703         */
1704
1705         STRLEN transcount = 1;
1706
1707         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1708             "%*sCompiling trie using list compiler\n",
1709             (int)depth * 2 + 2, ""));
1710
1711         trie->states = (reg_trie_state *)
1712             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1713                                   sizeof(reg_trie_state) );
1714         TRIE_LIST_NEW(1);
1715         next_alloc = 2;
1716
1717         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1718
1719             regnode * const noper = NEXTOPER( cur );
1720             U8 *uc           = (U8*)STRING( noper );
1721             const U8 * const e = uc + STR_LEN( noper );
1722             U32 state        = 1;         /* required init */
1723             U16 charid       = 0;         /* sanity init */
1724             U8 *scan         = (U8*)NULL; /* sanity init */
1725             STRLEN foldlen   = 0;         /* required init */
1726             U32 wordlen      = 0;         /* required init */
1727             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1728
1729             if (OP(noper) != NOTHING) {
1730                 for ( ; uc < e ; uc += len ) {
1731
1732                     TRIE_READ_CHAR;
1733
1734                     if ( uvc < 256 ) {
1735                         charid = trie->charmap[ uvc ];
1736                     } else {
1737                         SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1738                         if ( !svpp ) {
1739                             charid = 0;
1740                         } else {
1741                             charid=(U16)SvIV( *svpp );
1742                         }
1743                     }
1744                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1745                     if ( charid ) {
1746
1747                         U16 check;
1748                         U32 newstate = 0;
1749
1750                         charid--;
1751                         if ( !trie->states[ state ].trans.list ) {
1752                             TRIE_LIST_NEW( state );
1753                         }
1754                         for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1755                             if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1756                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1757                                 break;
1758                             }
1759                         }
1760                         if ( ! newstate ) {
1761                             newstate = next_alloc++;
1762                             prev_states[newstate] = state;
1763                             TRIE_LIST_PUSH( state, charid, newstate );
1764                             transcount++;
1765                         }
1766                         state = newstate;
1767                     } else {
1768                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1769                     }
1770                 }
1771             }
1772             TRIE_HANDLE_WORD(state);
1773
1774         } /* end second pass */
1775
1776         /* next alloc is the NEXT state to be allocated */
1777         trie->statecount = next_alloc;
1778         trie->states = (reg_trie_state *)
1779             PerlMemShared_realloc( trie->states,
1780                                    next_alloc
1781                                    * sizeof(reg_trie_state) );
1782
1783         /* and now dump it out before we compress it */
1784         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1785                                                          revcharmap, next_alloc,
1786                                                          depth+1)
1787         );
1788
1789         trie->trans = (reg_trie_trans *)
1790             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1791         {
1792             U32 state;
1793             U32 tp = 0;
1794             U32 zp = 0;
1795
1796
1797             for( state=1 ; state < next_alloc ; state ++ ) {
1798                 U32 base=0;
1799
1800                 /*
1801                 DEBUG_TRIE_COMPILE_MORE_r(
1802                     PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1803                 );
1804                 */
1805
1806                 if (trie->states[state].trans.list) {
1807                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1808                     U16 maxid=minid;
1809                     U16 idx;
1810
1811                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1812                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1813                         if ( forid < minid ) {
1814                             minid=forid;
1815                         } else if ( forid > maxid ) {
1816                             maxid=forid;
1817                         }
1818                     }
1819                     if ( transcount < tp + maxid - minid + 1) {
1820                         transcount *= 2;
1821                         trie->trans = (reg_trie_trans *)
1822                             PerlMemShared_realloc( trie->trans,
1823                                                      transcount
1824                                                      * sizeof(reg_trie_trans) );
1825                         Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1826                     }
1827                     base = trie->uniquecharcount + tp - minid;
1828                     if ( maxid == minid ) {
1829                         U32 set = 0;
1830                         for ( ; zp < tp ; zp++ ) {
1831                             if ( ! trie->trans[ zp ].next ) {
1832                                 base = trie->uniquecharcount + zp - minid;
1833                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1834                                 trie->trans[ zp ].check = state;
1835                                 set = 1;
1836                                 break;
1837                             }
1838                         }
1839                         if ( !set ) {
1840                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1841                             trie->trans[ tp ].check = state;
1842                             tp++;
1843                             zp = tp;
1844                         }
1845                     } else {
1846                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1847                             const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1848                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1849                             trie->trans[ tid ].check = state;
1850                         }
1851                         tp += ( maxid - minid + 1 );
1852                     }
1853                     Safefree(trie->states[ state ].trans.list);
1854                 }
1855                 /*
1856                 DEBUG_TRIE_COMPILE_MORE_r(
1857                     PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1858                 );
1859                 */
1860                 trie->states[ state ].trans.base=base;
1861             }
1862             trie->lasttrans = tp + 1;
1863         }
1864     } else {
1865         /*
1866            Second Pass -- Flat Table Representation.
1867
1868            we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1869            We know that we will need Charcount+1 trans at most to store the data
1870            (one row per char at worst case) So we preallocate both structures
1871            assuming worst case.
1872
1873            We then construct the trie using only the .next slots of the entry
1874            structs.
1875
1876            We use the .check field of the first entry of the node temporarily to
1877            make compression both faster and easier by keeping track of how many non
1878            zero fields are in the node.
1879
1880            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1881            transition.
1882
1883            There are two terms at use here: state as a TRIE_NODEIDX() which is a
1884            number representing the first entry of the node, and state as a
1885            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1886            TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1887            are 2 entrys per node. eg:
1888
1889              A B       A B
1890           1. 2 4    1. 3 7
1891           2. 0 3    3. 0 5
1892           3. 0 0    5. 0 0
1893           4. 0 0    7. 0 0
1894
1895            The table is internally in the right hand, idx form. However as we also
1896            have to deal with the states array which is indexed by nodenum we have to
1897            use TRIE_NODENUM() to convert.
1898
1899         */
1900         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1901             "%*sCompiling trie using table compiler\n",
1902             (int)depth * 2 + 2, ""));
1903
1904         trie->trans = (reg_trie_trans *)
1905             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1906                                   * trie->uniquecharcount + 1,
1907                                   sizeof(reg_trie_trans) );
1908         trie->states = (reg_trie_state *)
1909             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1910                                   sizeof(reg_trie_state) );
1911         next_alloc = trie->uniquecharcount + 1;
1912
1913
1914         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1915
1916             regnode * const noper   = NEXTOPER( cur );
1917             const U8 *uc     = (U8*)STRING( noper );
1918             const U8 * const e = uc + STR_LEN( noper );
1919
1920             U32 state        = 1;         /* required init */
1921
1922             U16 charid       = 0;         /* sanity init */
1923             U32 accept_state = 0;         /* sanity init */
1924             U8 *scan         = (U8*)NULL; /* sanity init */
1925
1926             STRLEN foldlen   = 0;         /* required init */
1927             U32 wordlen      = 0;         /* required init */
1928             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1929
1930             if ( OP(noper) != NOTHING ) {
1931                 for ( ; uc < e ; uc += len ) {
1932
1933                     TRIE_READ_CHAR;
1934
1935                     if ( uvc < 256 ) {
1936                         charid = trie->charmap[ uvc ];
1937                     } else {
1938                         SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1939                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1940                     }
1941                     if ( charid ) {
1942                         charid--;
1943                         if ( !trie->trans[ state + charid ].next ) {
1944                             trie->trans[ state + charid ].next = next_alloc;
1945                             trie->trans[ state ].check++;
1946                             prev_states[TRIE_NODENUM(next_alloc)]
1947                                     = TRIE_NODENUM(state);
1948                             next_alloc += trie->uniquecharcount;
1949                         }
1950                         state = trie->trans[ state + charid ].next;
1951                     } else {
1952                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1953                     }
1954                     /* charid is now 0 if we dont know the char read, or nonzero if we do */
1955                 }
1956             }
1957             accept_state = TRIE_NODENUM( state );
1958             TRIE_HANDLE_WORD(accept_state);
1959
1960         } /* end second pass */
1961
1962         /* and now dump it out before we compress it */
1963         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1964                                                           revcharmap,
1965                                                           next_alloc, depth+1));
1966
1967         {
1968         /*
1969            * Inplace compress the table.*
1970
1971            For sparse data sets the table constructed by the trie algorithm will
1972            be mostly 0/FAIL transitions or to put it another way mostly empty.
1973            (Note that leaf nodes will not contain any transitions.)
1974
1975            This algorithm compresses the tables by eliminating most such
1976            transitions, at the cost of a modest bit of extra work during lookup:
1977
1978            - Each states[] entry contains a .base field which indicates the
1979            index in the state[] array wheres its transition data is stored.
1980
1981            - If .base is 0 there are no valid transitions from that node.
1982
1983            - If .base is nonzero then charid is added to it to find an entry in
1984            the trans array.
1985
1986            -If trans[states[state].base+charid].check!=state then the
1987            transition is taken to be a 0/Fail transition. Thus if there are fail
1988            transitions at the front of the node then the .base offset will point
1989            somewhere inside the previous nodes data (or maybe even into a node
1990            even earlier), but the .check field determines if the transition is
1991            valid.
1992
1993            XXX - wrong maybe?
1994            The following process inplace converts the table to the compressed
1995            table: We first do not compress the root node 1,and mark all its
1996            .check pointers as 1 and set its .base pointer as 1 as well. This
1997            allows us to do a DFA construction from the compressed table later,
1998            and ensures that any .base pointers we calculate later are greater
1999            than 0.
2000
2001            - We set 'pos' to indicate the first entry of the second node.
2002
2003            - We then iterate over the columns of the node, finding the first and
2004            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2005            and set the .check pointers accordingly, and advance pos
2006            appropriately and repreat for the next node. Note that when we copy
2007            the next pointers we have to convert them from the original
2008            NODEIDX form to NODENUM form as the former is not valid post
2009            compression.
2010
2011            - If a node has no transitions used we mark its base as 0 and do not
2012            advance the pos pointer.
2013
2014            - If a node only has one transition we use a second pointer into the
2015            structure to fill in allocated fail transitions from other states.
2016            This pointer is independent of the main pointer and scans forward
2017            looking for null transitions that are allocated to a state. When it
2018            finds one it writes the single transition into the "hole".  If the
2019            pointer doesnt find one the single transition is appended as normal.
2020
2021            - Once compressed we can Renew/realloc the structures to release the
2022            excess space.
2023
2024            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2025            specifically Fig 3.47 and the associated pseudocode.
2026
2027            demq
2028         */
2029         const U32 laststate = TRIE_NODENUM( next_alloc );
2030         U32 state, charid;
2031         U32 pos = 0, zp=0;
2032         trie->statecount = laststate;
2033
2034         for ( state = 1 ; state < laststate ; state++ ) {
2035             U8 flag = 0;
2036             const U32 stateidx = TRIE_NODEIDX( state );
2037             const U32 o_used = trie->trans[ stateidx ].check;
2038             U32 used = trie->trans[ stateidx ].check;
2039             trie->trans[ stateidx ].check = 0;
2040
2041             for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2042                 if ( flag || trie->trans[ stateidx + charid ].next ) {
2043                     if ( trie->trans[ stateidx + charid ].next ) {
2044                         if (o_used == 1) {
2045                             for ( ; zp < pos ; zp++ ) {
2046                                 if ( ! trie->trans[ zp ].next ) {
2047                                     break;
2048                                 }
2049                             }
2050                             trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2051                             trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2052                             trie->trans[ zp ].check = state;
2053                             if ( ++zp > pos ) pos = zp;
2054                             break;
2055                         }
2056                         used--;
2057                     }
2058                     if ( !flag ) {
2059                         flag = 1;
2060                         trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2061                     }
2062                     trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2063                     trie->trans[ pos ].check = state;
2064                     pos++;
2065                 }
2066             }
2067         }
2068         trie->lasttrans = pos + 1;
2069         trie->states = (reg_trie_state *)
2070             PerlMemShared_realloc( trie->states, laststate
2071                                    * sizeof(reg_trie_state) );
2072         DEBUG_TRIE_COMPILE_MORE_r(
2073                 PerlIO_printf( Perl_debug_log,
2074                     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2075                     (int)depth * 2 + 2,"",
2076                     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2077                     (IV)next_alloc,
2078                     (IV)pos,
2079                     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2080             );
2081
2082         } /* end table compress */
2083     }
2084     DEBUG_TRIE_COMPILE_MORE_r(
2085             PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2086                 (int)depth * 2 + 2, "",
2087                 (UV)trie->statecount,
2088                 (UV)trie->lasttrans)
2089     );
2090     /* resize the trans array to remove unused space */
2091     trie->trans = (reg_trie_trans *)
2092         PerlMemShared_realloc( trie->trans, trie->lasttrans
2093                                * sizeof(reg_trie_trans) );
2094
2095     {   /* Modify the program and insert the new TRIE node */
2096         U8 nodetype =(U8)(flags & 0xFF);
2097         char *str=NULL;
2098
2099 #ifdef DEBUGGING
2100         regnode *optimize = NULL;
2101 #ifdef RE_TRACK_PATTERN_OFFSETS
2102
2103         U32 mjd_offset = 0;
2104         U32 mjd_nodelen = 0;
2105 #endif /* RE_TRACK_PATTERN_OFFSETS */
2106 #endif /* DEBUGGING */
2107         /*
2108            This means we convert either the first branch or the first Exact,
2109            depending on whether the thing following (in 'last') is a branch
2110            or not and whther first is the startbranch (ie is it a sub part of
2111            the alternation or is it the whole thing.)
2112            Assuming its a sub part we convert the EXACT otherwise we convert
2113            the whole branch sequence, including the first.
2114          */
2115         /* Find the node we are going to overwrite */
2116         if ( first != startbranch || OP( last ) == BRANCH ) {
2117             /* branch sub-chain */
2118             NEXT_OFF( first ) = (U16)(last - first);
2119 #ifdef RE_TRACK_PATTERN_OFFSETS
2120             DEBUG_r({
2121                 mjd_offset= Node_Offset((convert));
2122                 mjd_nodelen= Node_Length((convert));
2123             });
2124 #endif
2125             /* whole branch chain */
2126         }
2127 #ifdef RE_TRACK_PATTERN_OFFSETS
2128         else {
2129             DEBUG_r({
2130                 const  regnode *nop = NEXTOPER( convert );
2131                 mjd_offset= Node_Offset((nop));
2132                 mjd_nodelen= Node_Length((nop));
2133             });
2134         }
2135         DEBUG_OPTIMISE_r(
2136             PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2137                 (int)depth * 2 + 2, "",
2138                 (UV)mjd_offset, (UV)mjd_nodelen)
2139         );
2140 #endif
2141         /* But first we check to see if there is a common prefix we can
2142            split out as an EXACT and put in front of the TRIE node.  */
2143         trie->startstate= 1;
2144         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2145             U32 state;
2146             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2147                 U32 ofs = 0;
2148                 I32 idx = -1;
2149                 U32 count = 0;
2150                 const U32 base = trie->states[ state ].trans.base;
2151
2152                 if ( trie->states[state].wordnum )
2153                         count = 1;
2154
2155                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2156                     if ( ( base + ofs >= trie->uniquecharcount ) &&
2157                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2158                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2159                     {
2160                         if ( ++count > 1 ) {
2161                             SV **tmp = av_fetch( revcharmap, ofs, 0);
2162                             const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2163                             if ( state == 1 ) break;
2164                             if ( count == 2 ) {
2165                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2166                                 DEBUG_OPTIMISE_r(
2167                                     PerlIO_printf(Perl_debug_log,
2168                                         "%*sNew Start State=%"UVuf" Class: [",
2169                                         (int)depth * 2 + 2, "",
2170                                         (UV)state));
2171                                 if (idx >= 0) {
2172                                     SV ** const tmp = av_fetch( revcharmap, idx, 0);
2173                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2174
2175                                     TRIE_BITMAP_SET(trie,*ch);
2176                                     if ( folder )
2177                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
2178                                     DEBUG_OPTIMISE_r(
2179                                         PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2180                                     );
2181                                 }
2182                             }
2183                             TRIE_BITMAP_SET(trie,*ch);
2184                             if ( folder )
2185                                 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2186                             DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2187                         }
2188                         idx = ofs;
2189                     }
2190                 }
2191                 if ( count == 1 ) {
2192                     SV **tmp = av_fetch( revcharmap, idx, 0);
2193                     STRLEN len;
2194                     char *ch = SvPV( *tmp, len );
2195                     DEBUG_OPTIMISE_r({
2196                         SV *sv=sv_newmortal();
2197                         PerlIO_printf( Perl_debug_log,
2198                             "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2199                             (int)depth * 2 + 2, "",
2200                             (UV)state, (UV)idx,
2201                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2202                                 PL_colors[0], PL_colors[1],
2203                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2204                                 PERL_PV_ESCAPE_FIRSTCHAR
2205                             )
2206                         );
2207                     });
2208                     if ( state==1 ) {
2209                         OP( convert ) = nodetype;
2210                         str=STRING(convert);
2211                         STR_LEN(convert)=0;
2212                     }
2213                     STR_LEN(convert) += len;
2214                     while (len--)
2215                         *str++ = *ch++;
2216                 } else {
2217 #ifdef DEBUGGING
2218                     if (state>1)
2219                         DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2220 #endif
2221                     break;
2222                 }
2223             }
2224             trie->prefixlen = (state-1);
2225             if (str) {
2226                 regnode *n = convert+NODE_SZ_STR(convert);
2227                 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2228                 trie->startstate = state;
2229                 trie->minlen -= (state - 1);
2230                 trie->maxlen -= (state - 1);
2231 #ifdef DEBUGGING
2232                /* At least the UNICOS C compiler choked on this
2233                 * being argument to DEBUG_r(), so let's just have
2234                 * it right here. */
2235                if (
2236 #ifdef PERL_EXT_RE_BUILD
2237                    1
2238 #else
2239                    DEBUG_r_TEST
2240 #endif
2241                    ) {
2242                    regnode *fix = convert;
2243                    U32 word = trie->wordcount;
2244                    mjd_nodelen++;
2245                    Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2246                    while( ++fix < n ) {
2247                        Set_Node_Offset_Length(fix, 0, 0);
2248                    }
2249                    while (word--) {
2250                        SV ** const tmp = av_fetch( trie_words, word, 0 );
2251                        if (tmp) {
2252                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
2253                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2254                            else
2255                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2256                        }
2257                    }
2258                }
2259 #endif
2260                 if (trie->maxlen) {
2261                     convert = n;
2262                 } else {
2263                     NEXT_OFF(convert) = (U16)(tail - convert);
2264                     DEBUG_r(optimize= n);
2265                 }
2266             }
2267         }
2268         if (!jumper)
2269             jumper = last;
2270         if ( trie->maxlen ) {
2271             NEXT_OFF( convert ) = (U16)(tail - convert);
2272             ARG_SET( convert, data_slot );
2273             /* Store the offset to the first unabsorbed branch in
2274                jump[0], which is otherwise unused by the jump logic.
2275                We use this when dumping a trie and during optimisation. */
2276             if (trie->jump)
2277                 trie->jump[0] = (U16)(nextbranch - convert);
2278
2279             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2280              *   and there is a bitmap
2281              *   and the first "jump target" node we found leaves enough room
2282              * then convert the TRIE node into a TRIEC node, with the bitmap
2283              * embedded inline in the opcode - this is hypothetically faster.
2284              */
2285             if ( !trie->states[trie->startstate].wordnum
2286                  && trie->bitmap
2287                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2288             {
2289                 OP( convert ) = TRIEC;
2290                 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2291                 PerlMemShared_free(trie->bitmap);
2292                 trie->bitmap= NULL;
2293             } else
2294                 OP( convert ) = TRIE;
2295
2296             /* store the type in the flags */
2297             convert->flags = nodetype;
2298             DEBUG_r({
2299             optimize = convert
2300                       + NODE_STEP_REGNODE
2301                       + regarglen[ OP( convert ) ];
2302             });
2303             /* XXX We really should free up the resource in trie now,
2304                    as we won't use them - (which resources?) dmq */
2305         }
2306         /* needed for dumping*/
2307         DEBUG_r(if (optimize) {
2308             regnode *opt = convert;
2309
2310             while ( ++opt < optimize) {
2311                 Set_Node_Offset_Length(opt,0,0);
2312             }
2313             /*
2314                 Try to clean up some of the debris left after the
2315                 optimisation.
2316              */
2317             while( optimize < jumper ) {
2318                 mjd_nodelen += Node_Length((optimize));
2319                 OP( optimize ) = OPTIMIZED;
2320                 Set_Node_Offset_Length(optimize,0,0);
2321                 optimize++;
2322             }
2323             Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2324         });
2325     } /* end node insert */
2326
2327     /*  Finish populating the prev field of the wordinfo array.  Walk back
2328      *  from each accept state until we find another accept state, and if
2329      *  so, point the first word's .prev field at the second word. If the
2330      *  second already has a .prev field set, stop now. This will be the
2331      *  case either if we've already processed that word's accept state,
2332      *  or that state had multiple words, and the overspill words were
2333      *  already linked up earlier.
2334      */
2335     {
2336         U16 word;
2337         U32 state;
2338         U16 prev;
2339
2340         for (word=1; word <= trie->wordcount; word++) {
2341             prev = 0;
2342             if (trie->wordinfo[word].prev)
2343                 continue;
2344             state = trie->wordinfo[word].accept;
2345             while (state) {
2346                 state = prev_states[state];
2347                 if (!state)
2348                     break;
2349                 prev = trie->states[state].wordnum;
2350                 if (prev)
2351                     break;
2352             }
2353             trie->wordinfo[word].prev = prev;
2354         }
2355         Safefree(prev_states);
2356     }
2357
2358
2359     /* and now dump out the compressed format */
2360     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2361
2362     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2363 #ifdef DEBUGGING
2364     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2365     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2366 #else
2367     SvREFCNT_dec(revcharmap);
2368 #endif
2369     return trie->jump
2370            ? MADE_JUMP_TRIE
2371            : trie->startstate>1
2372              ? MADE_EXACT_TRIE
2373              : MADE_TRIE;
2374 }
2375
2376 STATIC void
2377 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2378 {
2379 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2380
2381    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2382    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2383    ISBN 0-201-10088-6
2384
2385    We find the fail state for each state in the trie, this state is the longest proper
2386    suffix of the current state's 'word' that is also a proper prefix of another word in our
2387    trie. State 1 represents the word '' and is thus the default fail state. This allows
2388    the DFA not to have to restart after its tried and failed a word at a given point, it
2389    simply continues as though it had been matching the other word in the first place.
2390    Consider
2391       'abcdgu'=~/abcdefg|cdgu/
2392    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2393    fail, which would bring us to the state representing 'd' in the second word where we would
2394    try 'g' and succeed, proceeding to match 'cdgu'.
2395  */
2396  /* add a fail transition */
2397     const U32 trie_offset = ARG(source);
2398     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2399     U32 *q;
2400     const U32 ucharcount = trie->uniquecharcount;
2401     const U32 numstates = trie->statecount;
2402     const U32 ubound = trie->lasttrans + ucharcount;
2403     U32 q_read = 0;
2404     U32 q_write = 0;
2405     U32 charid;
2406     U32 base = trie->states[ 1 ].trans.base;
2407     U32 *fail;
2408     reg_ac_data *aho;
2409     const U32 data_slot = add_data( pRExC_state, 1, "T" );
2410     GET_RE_DEBUG_FLAGS_DECL;
2411
2412     PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2413 #ifndef DEBUGGING
2414     PERL_UNUSED_ARG(depth);
2415 #endif
2416
2417
2418     ARG_SET( stclass, data_slot );
2419     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2420     RExC_rxi->data->data[ data_slot ] = (void*)aho;
2421     aho->trie=trie_offset;
2422     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2423     Copy( trie->states, aho->states, numstates, reg_trie_state );
2424     Newxz( q, numstates, U32);
2425     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2426     aho->refcount = 1;
2427     fail = aho->fail;
2428     /* initialize fail[0..1] to be 1 so that we always have
2429        a valid final fail state */
2430     fail[ 0 ] = fail[ 1 ] = 1;
2431
2432     for ( charid = 0; charid < ucharcount ; charid++ ) {
2433         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2434         if ( newstate ) {
2435             q[ q_write ] = newstate;
2436             /* set to point at the root */
2437             fail[ q[ q_write++ ] ]=1;
2438         }
2439     }
2440     while ( q_read < q_write) {
2441         const U32 cur = q[ q_read++ % numstates ];
2442         base = trie->states[ cur ].trans.base;
2443
2444         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2445             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2446             if (ch_state) {
2447                 U32 fail_state = cur;
2448                 U32 fail_base;
2449                 do {
2450                     fail_state = fail[ fail_state ];
2451                     fail_base = aho->states[ fail_state ].trans.base;
2452                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2453
2454                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2455                 fail[ ch_state ] = fail_state;
2456                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2457                 {
2458                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2459                 }
2460                 q[ q_write++ % numstates] = ch_state;
2461             }
2462         }
2463     }
2464     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2465        when we fail in state 1, this allows us to use the
2466        charclass scan to find a valid start char. This is based on the principle
2467        that theres a good chance the string being searched contains lots of stuff
2468        that cant be a start char.
2469      */
2470     fail[ 0 ] = fail[ 1 ] = 0;
2471     DEBUG_TRIE_COMPILE_r({
2472         PerlIO_printf(Perl_debug_log,
2473                       "%*sStclass Failtable (%"UVuf" states): 0",
2474                       (int)(depth * 2), "", (UV)numstates
2475         );
2476         for( q_read=1; q_read<numstates; q_read++ ) {
2477             PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2478         }
2479         PerlIO_printf(Perl_debug_log, "\n");
2480     });
2481     Safefree(q);
2482     /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2483 }
2484
2485
2486 /*
2487  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2488  * These need to be revisited when a newer toolchain becomes available.
2489  */
2490 #if defined(__sparc64__) && defined(__GNUC__)
2491 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2492 #       undef  SPARC64_GCC_WORKAROUND
2493 #       define SPARC64_GCC_WORKAROUND 1
2494 #   endif
2495 #endif
2496
2497 #define DEBUG_PEEP(str,scan,depth) \
2498     DEBUG_OPTIMISE_r({if (scan){ \
2499        SV * const mysv=sv_newmortal(); \
2500        regnode *Next = regnext(scan); \
2501        regprop(RExC_rx, mysv, scan); \
2502        PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2503        (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2504        Next ? (REG_NODE_NUM(Next)) : 0 ); \
2505    }});
2506
2507
2508 /* The below joins as many adjacent EXACTish nodes as possible into a single
2509  * one, and looks for problematic sequences of characters whose folds vs.
2510  * non-folds have sufficiently different lengths, that the optimizer would be
2511  * fooled into rejecting legitimate matches of them, and the trie construction
2512  * code can't cope with them.  The joining is only done if:
2513  * 1) there is room in the current conglomerated node to entirely contain the
2514  *    next one.
2515  * 2) they are the exact same node type
2516  *
2517  * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2518  * these get optimized out
2519  *
2520  * If there are problematic code sequences, *min_subtract is set to the delta
2521  * that the minimum size of the node can be less than its actual size.  And,
2522  * the node type of the result is changed to reflect that it contains these
2523  * sequences.
2524  *
2525  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2526  * and contains LATIN SMALL LETTER SHARP S
2527  *
2528  * This is as good a place as any to discuss the design of handling these
2529  * problematic sequences.  It's been wrong in Perl for a very long time.  There
2530  * are three code points in Unicode whose folded lengths differ so much from
2531  * the un-folded lengths that it causes problems for the optimizer and trie
2532  * construction.  Why only these are problematic, and not others where lengths
2533  * also differ is something I (khw) do not understand.  New versions of Unicode
2534  * might add more such code points.  Hopefully the logic in fold_grind.t that
2535  * figures out what to test (in part by verifying that each size-combination
2536  * gets tested) will catch any that do come along, so they can be added to the
2537  * special handling below.  The chances of new ones are actually rather small,
2538  * as most, if not all, of the world's scripts that have casefolding have
2539  * already been encoded by Unicode.  Also, a number of Unicode's decisions were
2540  * made to allow compatibility with pre-existing standards, and almost all of
2541  * those have already been dealt with.  These would otherwise be the most
2542  * likely candidates for generating further tricky sequences.  In other words,
2543  * Unicode by itself is unlikely to add new ones unless it is for compatibility
2544  * with pre-existing standards, and there aren't many of those left.
2545  *
2546  * The previous designs for dealing with these involved assigning a special
2547  * node for them.  This approach doesn't work, as evidenced by this example:
2548  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2549  * Both these fold to "sss", but if the pattern is parsed to create a node of
2550  * that would match just the \xDF, it won't be able to handle the case where a
2551  * successful match would have to cross the node's boundary.  The new approach
2552  * that hopefully generally solves the problem generates an EXACTFU_SS node
2553  * that is "sss".
2554  *
2555  * There are a number of components to the approach (a lot of work for just
2556  * three code points!):
2557  * 1)   This routine examines each EXACTFish node that could contain the
2558  *      problematic sequences.  It returns in *min_subtract how much to
2559  *      subtract from the the actual length of the string to get a real minimum
2560  *      for one that could match it.  This number is usually 0 except for the
2561  *      problematic sequences.  This delta is used by the caller to adjust the
2562  *      min length of the match, and the delta between min and max, so that the
2563  *      optimizer doesn't reject these possibilities based on size constraints.
2564  * 2)   These sequences are not currently correctly handled by the trie code
2565  *      either, so it changes the joined node type to ops that are not handled
2566  *      by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE.
2567  * 3)   This is sufficient for the two Greek sequences (described below), but
2568  *      the one involving the Sharp s (\xDF) needs more.  The node type
2569  *      EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2570  *      sequence in it.  For non-UTF-8 patterns and strings, this is the only
2571  *      case where there is a possible fold length change.  That means that a
2572  *      regular EXACTFU node without UTF-8 involvement doesn't have to concern
2573  *      itself with length changes, and so can be processed faster.  regexec.c
2574  *      takes advantage of this.  Generally, an EXACTFish node that is in UTF-8
2575  *      is pre-folded by regcomp.c.  This saves effort in regex matching.
2576  *      However, probably mostly for historical reasons, the pre-folding isn't
2577  *      done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2578  *      nodes, as what they fold to isn't known until runtime.)  The fold
2579  *      possibilities for the non-UTF8 patterns are quite simple, except for
2580  *      the sharp s.  All the ones that don't involve a UTF-8 target string
2581  *      are members of a fold-pair, and arrays are set up for all of them
2582  *      that quickly find the other member of the pair.  It might actually
2583  *      be faster to pre-fold these, but it isn't currently done, except for
2584  *      the sharp s.  Code elsewhere in this file makes sure that it gets
2585  *      folded to 'ss', even if the pattern isn't UTF-8.  This avoids the
2586  *      issues described in the next item.
2587  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2588  *      'ss' or not is not knowable at compile time.  It will match iff the
2589  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2590  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2591  *      it can't be folded to "ss" at compile time, unlike EXACTFU does as
2592  *      described in item 3).  An assumption that the optimizer part of
2593  *      regexec.c (probably unwittingly) makes is that a character in the
2594  *      pattern corresponds to at most a single character in the target string.
2595  *      (And I do mean character, and not byte here, unlike other parts of the
2596  *      documentation that have never been updated to account for multibyte
2597  *      Unicode.)  This assumption is wrong only in this case, as all other
2598  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2599  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2600  *      reluctant to try to change this assumption, so instead the code punts.
2601  *      This routine examines EXACTF nodes for the sharp s, and returns a
2602  *      boolean indicating whether or not the node is an EXACTF node that
2603  *      contains a sharp s.  When it is true, the caller sets a flag that later
2604  *      causes the optimizer in this file to not set values for the floating
2605  *      and fixed string lengths, and thus avoids the optimizer code in
2606  *      regexec.c that makes the invalid assumption.  Thus, there is no
2607  *      optimization based on string lengths for EXACTF nodes that contain the
2608  *      sharp s.  This only happens for /id rules (which means the pattern
2609  *      isn't in UTF-8).
2610  */
2611
2612 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2613     if (PL_regkind[OP(scan)] == EXACT) \
2614         join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2615
2616 STATIC U32
2617 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2618     /* Merge several consecutive EXACTish nodes into one. */
2619     regnode *n = regnext(scan);
2620     U32 stringok = 1;
2621     regnode *next = scan + NODE_SZ_STR(scan);
2622     U32 merged = 0;
2623     U32 stopnow = 0;
2624 #ifdef DEBUGGING
2625     regnode *stop = scan;
2626     GET_RE_DEBUG_FLAGS_DECL;
2627 #else
2628     PERL_UNUSED_ARG(depth);
2629 #endif
2630
2631     PERL_ARGS_ASSERT_JOIN_EXACT;
2632 #ifndef EXPERIMENTAL_INPLACESCAN
2633     PERL_UNUSED_ARG(flags);
2634     PERL_UNUSED_ARG(val);
2635 #endif
2636     DEBUG_PEEP("join",scan,depth);
2637
2638     /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2639      * EXACT ones that are mergeable to the current one. */
2640     while (n
2641            && (PL_regkind[OP(n)] == NOTHING
2642                || (stringok && OP(n) == OP(scan)))
2643            && NEXT_OFF(n)
2644            && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2645     {
2646
2647         if (OP(n) == TAIL || n > next)
2648             stringok = 0;
2649         if (PL_regkind[OP(n)] == NOTHING) {
2650             DEBUG_PEEP("skip:",n,depth);
2651             NEXT_OFF(scan) += NEXT_OFF(n);
2652             next = n + NODE_STEP_REGNODE;
2653 #ifdef DEBUGGING
2654             if (stringok)
2655                 stop = n;
2656 #endif
2657             n = regnext(n);
2658         }
2659         else if (stringok) {
2660             const unsigned int oldl = STR_LEN(scan);
2661             regnode * const nnext = regnext(n);
2662
2663             if (oldl + STR_LEN(n) > U8_MAX)
2664                 break;
2665
2666             DEBUG_PEEP("merg",n,depth);
2667             merged++;
2668
2669             NEXT_OFF(scan) += NEXT_OFF(n);
2670             STR_LEN(scan) += STR_LEN(n);
2671             next = n + NODE_SZ_STR(n);
2672             /* Now we can overwrite *n : */
2673             Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2674 #ifdef DEBUGGING
2675             stop = next - 1;
2676 #endif
2677             n = nnext;
2678             if (stopnow) break;
2679         }
2680
2681 #ifdef EXPERIMENTAL_INPLACESCAN
2682         if (flags && !NEXT_OFF(n)) {
2683             DEBUG_PEEP("atch", val, depth);
2684             if (reg_off_by_arg[OP(n)]) {
2685                 ARG_SET(n, val - n);
2686             }
2687             else {
2688                 NEXT_OFF(n) = val - n;
2689             }
2690             stopnow = 1;
2691         }
2692 #endif
2693     }
2694
2695     *min_subtract = 0;
2696     *has_exactf_sharp_s = FALSE;
2697
2698     /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2699      * can now analyze for sequences of problematic code points.  (Prior to
2700      * this final joining, sequences could have been split over boundaries, and
2701      * hence missed).  The sequences only happen in folding, hence for any
2702      * non-EXACT EXACTish node */
2703     if (OP(scan) != EXACT) {
2704         U8 *s;
2705         U8 * s0 = (U8*) STRING(scan);
2706         U8 * const s_end = s0 + STR_LEN(scan);
2707
2708         /* The below is perhaps overboard, but this allows us to save a test
2709          * each time through the loop at the expense of a mask.  This is
2710          * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2711          * single bit.  On ASCII they are 32 apart; on EBCDIC, they are 64.
2712          * This uses an exclusive 'or' to find that bit and then inverts it to
2713          * form a mask, with just a single 0, in the bit position where 'S' and
2714          * 's' differ. */
2715         const U8 S_or_s_mask = ~ ('S' ^ 's');
2716         const U8 s_masked = 's' & S_or_s_mask;
2717
2718         /* One pass is made over the node's string looking for all the
2719          * possibilities.  to avoid some tests in the loop, there are two main
2720          * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2721          * non-UTF-8 */
2722         if (UTF) {
2723
2724             /* There are two problematic Greek code points in Unicode
2725              * casefolding
2726              *
2727              * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2728              * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2729              *
2730              * which casefold to
2731              *
2732              * Unicode                      UTF-8
2733              *
2734              * U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
2735              * U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
2736              *
2737              * This means that in case-insensitive matching (or "loose
2738              * matching", as Unicode calls it), an EXACTF of length six (the
2739              * UTF-8 encoded byte length of the above casefolded versions) can
2740              * match a target string of length two (the byte length of UTF-8
2741              * encoded U+0390 or U+03B0).  This would rather mess up the
2742              * minimum length computation.  (there are other code points that
2743              * also fold to these two sequences, but the delta is smaller)
2744              *
2745              * If these sequences are found, the minimum length is decreased by
2746              * four (six minus two).
2747              *
2748              * Similarly, 'ss' may match the single char and byte LATIN SMALL
2749              * LETTER SHARP S.  We decrease the min length by 1 for each
2750              * occurrence of 'ss' found */
2751
2752 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2753 #           define U390_first_byte 0xb4
2754             const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2755 #           define U3B0_first_byte 0xb5
2756             const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2757 #else
2758 #           define U390_first_byte 0xce
2759             const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2760 #           define U3B0_first_byte 0xcf
2761             const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2762 #endif
2763             const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2764                                                  yields a net of 0 */
2765             /* Examine the string for one of the problematic sequences */
2766             for (s = s0;
2767                  s < s_end - 1; /* Can stop 1 before the end, as minimum length
2768                                  * sequence we are looking for is 2 */
2769                  s += UTF8SKIP(s))
2770             {
2771
2772                 /* Look for the first byte in each problematic sequence */
2773                 switch (*s) {
2774                     /* We don't have to worry about other things that fold to
2775                      * 's' (such as the long s, U+017F), as all above-latin1
2776                      * code points have been pre-folded */
2777                     case 's':
2778                     case 'S':
2779
2780                         /* Current character is an 's' or 'S'.  If next one is
2781                          * as well, we have the dreaded sequence */
2782                         if (((*(s+1) & S_or_s_mask) == s_masked)
2783                             /* These two node types don't have special handling
2784                              * for 'ss' */
2785                             && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2786                         {
2787                             *min_subtract += 1;
2788                             OP(scan) = EXACTFU_SS;
2789                             s++;    /* No need to look at this character again */
2790                         }
2791                         break;
2792
2793                     case U390_first_byte:
2794                         if (s_end - s >= len
2795
2796                             /* The 1's are because are skipping comparing the
2797                              * first byte */
2798                             && memEQ(s + 1, U390_tail, len - 1))
2799                         {
2800                             goto greek_sequence;
2801                         }
2802                         break;
2803
2804                     case U3B0_first_byte:
2805                         if (! (s_end - s >= len
2806                                && memEQ(s + 1, U3B0_tail, len - 1)))
2807                         {
2808                             break;
2809                         }
2810                       greek_sequence:
2811                         *min_subtract += 4;
2812
2813                         /* This can't currently be handled by trie's, so change
2814                          * the node type to indicate this.  If EXACTFA and
2815                          * EXACTFL were ever to be handled by trie's, this
2816                          * would have to be changed.  If this node has already
2817                          * been changed to EXACTFU_SS in this loop, leave it as
2818                          * is.  (I (khw) think it doesn't matter in regexec.c
2819                          * for UTF patterns, but no need to change it */
2820                         if (OP(scan) == EXACTFU) {
2821                             OP(scan) = EXACTFU_NO_TRIE;
2822                         }
2823                         s += 6; /* We already know what this sequence is.  Skip
2824                                    the rest of it */
2825                         break;
2826                 }
2827             }
2828         }
2829         else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2830
2831             /* Here, the pattern is not UTF-8.  We need to look only for the
2832              * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2833              * in the final position.  Otherwise we can stop looking 1 byte
2834              * earlier because have to find both the first and second 's' */
2835             const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2836
2837             for (s = s0; s < upper; s++) {
2838                 switch (*s) {
2839                     case 'S':
2840                     case 's':
2841                         if (s_end - s > 1
2842                             && ((*(s+1) & S_or_s_mask) == s_masked))
2843                         {
2844                             *min_subtract += 1;
2845
2846                             /* EXACTF nodes need to know that the minimum
2847                              * length changed so that a sharp s in the string
2848                              * can match this ss in the pattern, but they
2849                              * remain EXACTF nodes, as they are not trie'able,
2850                              * so don't have to invent a new node type to
2851                              * exclude them from the trie code */
2852                             if (OP(scan) != EXACTF) {
2853                                 OP(scan) = EXACTFU_SS;
2854                             }
2855                             s++;
2856                         }
2857                         break;
2858                     case LATIN_SMALL_LETTER_SHARP_S:
2859                         if (OP(scan) == EXACTF) {
2860                             *has_exactf_sharp_s = TRUE;
2861                         }
2862                         break;
2863                 }
2864             }
2865         }
2866     }
2867
2868 #ifdef DEBUGGING
2869     /* Allow dumping but overwriting the collection of skipped
2870      * ops and/or strings with fake optimized ops */
2871     n = scan + NODE_SZ_STR(scan);
2872     while (n <= stop) {
2873         OP(n) = OPTIMIZED;
2874         FLAGS(n) = 0;
2875         NEXT_OFF(n) = 0;
2876         n++;
2877     }
2878 #endif
2879     DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2880     return stopnow;
2881 }
2882
2883 /* REx optimizer.  Converts nodes into quicker variants "in place".
2884    Finds fixed substrings.  */
2885
2886 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2887    to the position after last scanned or to NULL. */
2888
2889 #define INIT_AND_WITHP \
2890     assert(!and_withp); \
2891     Newx(and_withp,1,struct regnode_charclass_class); \
2892     SAVEFREEPV(and_withp)
2893
2894 /* this is a chain of data about sub patterns we are processing that
2895    need to be handled separately/specially in study_chunk. Its so
2896    we can simulate recursion without losing state.  */
2897 struct scan_frame;
2898 typedef struct scan_frame {
2899     regnode *last;  /* last node to process in this frame */
2900     regnode *next;  /* next node to process when last is reached */
2901     struct scan_frame *prev; /*previous frame*/
2902     I32 stop; /* what stopparen do we use */
2903 } scan_frame;
2904
2905
2906 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2907
2908 #define CASE_SYNST_FNC(nAmE)                                       \
2909 case nAmE:                                                         \
2910     if (flags & SCF_DO_STCLASS_AND) {                              \
2911             for (value = 0; value < 256; value++)                  \
2912                 if (!is_ ## nAmE ## _cp(value))                       \
2913                     ANYOF_BITMAP_CLEAR(data->start_class, value);  \
2914     }                                                              \
2915     else {                                                         \
2916             for (value = 0; value < 256; value++)                  \
2917                 if (is_ ## nAmE ## _cp(value))                        \
2918                     ANYOF_BITMAP_SET(data->start_class, value);    \
2919     }                                                              \
2920     break;                                                         \
2921 case N ## nAmE:                                                    \
2922     if (flags & SCF_DO_STCLASS_AND) {                              \
2923             for (value = 0; value < 256; value++)                   \
2924                 if (is_ ## nAmE ## _cp(value))                         \
2925                     ANYOF_BITMAP_CLEAR(data->start_class, value);   \
2926     }                                                               \
2927     else {                                                          \
2928             for (value = 0; value < 256; value++)                   \
2929                 if (!is_ ## nAmE ## _cp(value))                        \
2930                     ANYOF_BITMAP_SET(data->start_class, value);     \
2931     }                                                               \
2932     break
2933
2934
2935
2936 STATIC I32
2937 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2938                         I32 *minlenp, I32 *deltap,
2939                         regnode *last,
2940                         scan_data_t *data,
2941                         I32 stopparen,
2942                         U8* recursed,
2943                         struct regnode_charclass_class *and_withp,
2944                         U32 flags, U32 depth)
2945                         /* scanp: Start here (read-write). */
2946                         /* deltap: Write maxlen-minlen here. */
2947                         /* last: Stop before this one. */
2948                         /* data: string data about the pattern */
2949                         /* stopparen: treat close N as END */
2950                         /* recursed: which subroutines have we recursed into */
2951                         /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2952 {
2953     dVAR;
2954     I32 min = 0, pars = 0, code;
2955     regnode *scan = *scanp, *next;
2956     I32 delta = 0;
2957     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2958     int is_inf_internal = 0;            /* The studied chunk is infinite */
2959     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2960     scan_data_t data_fake;
2961     SV *re_trie_maxbuff = NULL;
2962     regnode *first_non_open = scan;
2963     I32 stopmin = I32_MAX;
2964     scan_frame *frame = NULL;
2965     GET_RE_DEBUG_FLAGS_DECL;
2966
2967     PERL_ARGS_ASSERT_STUDY_CHUNK;
2968
2969 #ifdef DEBUGGING
2970     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2971 #endif
2972
2973     if ( depth == 0 ) {
2974         while (first_non_open && OP(first_non_open) == OPEN)
2975             first_non_open=regnext(first_non_open);
2976     }
2977
2978
2979   fake_study_recurse:
2980     while ( scan && OP(scan) != END && scan < last ){
2981         UV min_subtract = 0;    /* How much to subtract from the minimum node
2982                                    length to get a real minimum (because the
2983                                    folded version may be shorter) */
2984         bool has_exactf_sharp_s = FALSE;
2985         /* Peephole optimizer: */
2986         DEBUG_STUDYDATA("Peep:", data,depth);
2987         DEBUG_PEEP("Peep",scan,depth);
2988
2989         /* Its not clear to khw or hv why this is done here, and not in the
2990          * clauses that deal with EXACT nodes.  khw's guess is that it's
2991          * because of a previous design */
2992         JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
2993
2994         /* Follow the next-chain of the current node and optimize
2995            away all the NOTHINGs from it.  */
2996         if (OP(scan) != CURLYX) {
2997             const int max = (reg_off_by_arg[OP(scan)]
2998                        ? I32_MAX
2999                        /* I32 may be smaller than U16 on CRAYs! */
3000                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3001             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3002             int noff;
3003             regnode *n = scan;
3004
3005             /* Skip NOTHING and LONGJMP. */
3006             while ((n = regnext(n))
3007                    && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3008                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3009                    && off + noff < max)
3010                 off += noff;
3011             if (reg_off_by_arg[OP(scan)])
3012                 ARG(scan) = off;
3013             else
3014                 NEXT_OFF(scan) = off;
3015         }
3016
3017
3018
3019         /* The principal pseudo-switch.  Cannot be a switch, since we
3020            look into several different things.  */
3021         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3022                    || OP(scan) == IFTHEN) {
3023             next = regnext(scan);
3024             code = OP(scan);
3025             /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3026
3027             if (OP(next) == code || code == IFTHEN) {
3028                 /* NOTE - There is similar code to this block below for handling
3029                    TRIE nodes on a re-study.  If you change stuff here check there
3030                    too. */
3031                 I32 max1 = 0, min1 = I32_MAX, num = 0;
3032                 struct regnode_charclass_class accum;
3033                 regnode * const startbranch=scan;
3034
3035                 if (flags & SCF_DO_SUBSTR)
3036                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3037                 if (flags & SCF_DO_STCLASS)
3038                     cl_init_zero(pRExC_state, &accum);
3039
3040                 while (OP(scan) == code) {
3041                     I32 deltanext, minnext, f = 0, fake;
3042                     struct regnode_charclass_class this_class;
3043
3044                     num++;
3045                     data_fake.flags = 0;
3046                     if (data) {
3047                         data_fake.whilem_c = data->whilem_c;
3048                         data_fake.last_closep = data->last_closep;
3049                     }
3050                     else
3051                         data_fake.last_closep = &fake;
3052
3053                     data_fake.pos_delta = delta;
3054                     next = regnext(scan);
3055                     scan = NEXTOPER(scan);
3056                     if (code != BRANCH)
3057                         scan = NEXTOPER(scan);
3058                     if (flags & SCF_DO_STCLASS) {
3059                         cl_init(pRExC_state, &this_class);
3060                         data_fake.start_class = &this_class;
3061                         f = SCF_DO_STCLASS_AND;
3062                     }
3063                     if (flags & SCF_WHILEM_VISITED_POS)
3064                         f |= SCF_WHILEM_VISITED_POS;
3065
3066                     /* we suppose the run is continuous, last=next...*/
3067                     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3068                                           next, &data_fake,
3069                                           stopparen, recursed, NULL, f,depth+1);
3070                     if (min1 > minnext)
3071                         min1 = minnext;
3072                     if (max1 < minnext + deltanext)
3073                         max1 = minnext + deltanext;
3074                     if (deltanext == I32_MAX)
3075                         is_inf = is_inf_internal = 1;
3076                     scan = next;
3077                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3078                         pars++;
3079                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
3080                         if ( stopmin > minnext)
3081                             stopmin = min + min1;
3082                         flags &= ~SCF_DO_SUBSTR;
3083                         if (data)
3084                             data->flags |= SCF_SEEN_ACCEPT;
3085                     }
3086                     if (data) {
3087                         if (data_fake.flags & SF_HAS_EVAL)
3088                             data->flags |= SF_HAS_EVAL;
3089                         data->whilem_c = data_fake.whilem_c;
3090                     }
3091                     if (flags & SCF_DO_STCLASS)
3092                         cl_or(pRExC_state, &accum, &this_class);
3093                 }
3094                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3095                     min1 = 0;
3096                 if (flags & SCF_DO_SUBSTR) {
3097                     data->pos_min += min1;
3098                     data->pos_delta += max1 - min1;
3099                     if (max1 != min1 || is_inf)
3100                         data->longest = &(data->longest_float);
3101                 }
3102                 min += min1;
3103                 delta += max1 - min1;
3104                 if (flags & SCF_DO_STCLASS_OR) {
3105                     cl_or(pRExC_state, data->start_class, &accum);
3106                     if (min1) {
3107                         cl_and(data->start_class, and_withp);
3108                         flags &= ~SCF_DO_STCLASS;
3109                     }
3110                 }
3111                 else if (flags & SCF_DO_STCLASS_AND) {
3112                     if (min1) {
3113                         cl_and(data->start_class, &accum);
3114                         flags &= ~SCF_DO_STCLASS;
3115                     }
3116                     else {
3117                         /* Switch to OR mode: cache the old value of
3118                          * data->start_class */
3119                         INIT_AND_WITHP;
3120                         StructCopy(data->start_class, and_withp,
3121                                    struct regnode_charclass_class);
3122                         flags &= ~SCF_DO_STCLASS_AND;
3123                         StructCopy(&accum, data->start_class,
3124                                    struct regnode_charclass_class);
3125                         flags |= SCF_DO_STCLASS_OR;
3126                         data->start_class->flags |= ANYOF_EOS;
3127                     }
3128                 }
3129
3130                 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3131                 /* demq.
3132
3133                    Assuming this was/is a branch we are dealing with: 'scan' now
3134                    points at the item that follows the branch sequence, whatever
3135                    it is. We now start at the beginning of the sequence and look
3136                    for subsequences of
3137
3138                    BRANCH->EXACT=>x1
3139                    BRANCH->EXACT=>x2
3140                    tail
3141
3142                    which would be constructed from a pattern like /A|LIST|OF|WORDS/
3143
3144                    If we can find such a subsequence we need to turn the first
3145                    element into a trie and then add the subsequent branch exact
3146                    strings to the trie.
3147
3148                    We have two cases
3149
3150                      1. patterns where the whole set of branches can be converted.
3151
3152                      2. patterns where only a subset can be converted.
3153
3154                    In case 1 we can replace the whole set with a single regop
3155                    for the trie. In case 2 we need to keep the start and end
3156                    branches so
3157
3158                      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3159                      becomes BRANCH TRIE; BRANCH X;
3160
3161                   There is an additional case, that being where there is a
3162                   common prefix, which gets split out into an EXACT like node
3163                   preceding the TRIE node.
3164
3165                   If x(1..n)==tail then we can do a simple trie, if not we make
3166                   a "jump" trie, such that when we match the appropriate word
3167                   we "jump" to the appropriate tail node. Essentially we turn
3168                   a nested if into a case structure of sorts.
3169
3170                 */
3171
3172                     int made=0;
3173                     if (!re_trie_maxbuff) {
3174                         re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3175                         if (!SvIOK(re_trie_maxbuff))
3176                             sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3177                     }
3178                     if ( SvIV(re_trie_maxbuff)>=0  ) {
3179                         regnode *cur;
3180                         regnode *first = (regnode *)NULL;
3181                         regnode *last = (regnode *)NULL;
3182                         regnode *tail = scan;
3183                         U8 optype = 0;
3184                         U32 count=0;
3185
3186 #ifdef DEBUGGING
3187                         SV * const mysv = sv_newmortal();       /* for dumping */
3188 #endif
3189                         /* var tail is used because there may be a TAIL
3190                            regop in the way. Ie, the exacts will point to the
3191                            thing following the TAIL, but the last branch will
3192                            point at the TAIL. So we advance tail. If we
3193                            have nested (?:) we may have to move through several
3194                            tails.
3195                          */
3196
3197                         while ( OP( tail ) == TAIL ) {
3198                             /* this is the TAIL generated by (?:) */
3199                             tail = regnext( tail );
3200                         }
3201
3202
3203                         DEBUG_OPTIMISE_r({
3204                             regprop(RExC_rx, mysv, tail );
3205                             PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3206                                 (int)depth * 2 + 2, "",
3207                                 "Looking for TRIE'able sequences. Tail node is: ",
3208                                 SvPV_nolen_const( mysv )
3209                             );
3210                         });
3211
3212                         /*
3213
3214                            step through the branches, cur represents each
3215                            branch, noper is the first thing to be matched
3216                            as part of that branch and noper_next is the
3217                            regnext() of that node. if noper is an EXACT
3218                            and noper_next is the same as scan (our current
3219                            position in the regex) then the EXACT branch is
3220                            a possible optimization target. Once we have
3221                            two or more consecutive such branches we can
3222                            create a trie of the EXACT's contents and stich
3223                            it in place. If the sequence represents all of
3224                            the branches we eliminate the whole thing and
3225                            replace it with a single TRIE. If it is a
3226                            subsequence then we need to stitch it in. This
3227                            means the first branch has to remain, and needs
3228                            to be repointed at the item on the branch chain
3229                            following the last branch optimized. This could
3230                            be either a BRANCH, in which case the
3231                            subsequence is internal, or it could be the
3232                            item following the branch sequence in which
3233                            case the subsequence is at the end.
3234
3235                         */
3236
3237                         /* dont use tail as the end marker for this traverse */
3238                         for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3239                             regnode * const noper = NEXTOPER( cur );
3240 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3241                             regnode * const noper_next = regnext( noper );
3242 #endif
3243
3244                             DEBUG_OPTIMISE_r({
3245                                 regprop(RExC_rx, mysv, cur);
3246                                 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3247                                    (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3248
3249                                 regprop(RExC_rx, mysv, noper);
3250                                 PerlIO_printf( Perl_debug_log, " -> %s",
3251                                     SvPV_nolen_const(mysv));
3252
3253                                 if ( noper_next ) {
3254                                   regprop(RExC_rx, mysv, noper_next );
3255                                   PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3256                                     SvPV_nolen_const(mysv));
3257                                 }
3258                                 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3259                                    REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3260                             });
3261                             if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
3262                                          : PL_regkind[ OP( noper ) ] == EXACT )
3263                                   || OP(noper) == NOTHING )
3264 #ifdef NOJUMPTRIE
3265                                   && noper_next == tail
3266 #endif
3267                                   && count < U16_MAX)
3268                             {
3269                                 count++;
3270                                 if ( !first || optype == NOTHING ) {
3271                                     if (!first) first = cur;
3272                                     optype = OP( noper );
3273                                 } else {
3274                                     last = cur;
3275                                 }
3276                             } else {
3277 /*
3278     Currently the trie logic handles case insensitive matching properly only
3279     when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
3280     semantics).
3281
3282     If/when this is fixed the following define can be swapped
3283     in below to fully enable trie logic.
3284
3285 #define TRIE_TYPE_IS_SAFE 1
3286
3287 Note that join_exact() assumes that the other types of EXACTFish nodes are not
3288 used in tries, so that would have to be updated if this changed
3289
3290 */
3291 #define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
3292
3293                                 if ( last && TRIE_TYPE_IS_SAFE ) {
3294                                     make_trie( pRExC_state,
3295                                             startbranch, first, cur, tail, count,
3296                                             optype, depth+1 );
3297                                 }
3298                                 if ( PL_regkind[ OP( noper ) ] == EXACT
3299 #ifdef NOJUMPTRIE
3300                                      && noper_next == tail
3301 #endif
3302                                 ){
3303                                     count = 1;
3304                                     first = cur;
3305                                     optype = OP( noper );
3306                                 } else {
3307                                     count = 0;
3308                                     first = NULL;
3309                                     optype = 0;
3310                                 }
3311                                 last = NULL;
3312                             }
3313                         }
3314                         DEBUG_OPTIMISE_r({
3315                             regprop(RExC_rx, mysv, cur);
3316                             PerlIO_printf( Perl_debug_log,
3317                               "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3318                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3319
3320                         });
3321
3322                         if ( last && TRIE_TYPE_IS_SAFE ) {
3323                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3324 #ifdef TRIE_STUDY_OPT
3325                             if ( ((made == MADE_EXACT_TRIE &&
3326                                  startbranch == first)
3327                                  || ( first_non_open == first )) &&
3328                                  depth==0 ) {
3329                                 flags |= SCF_TRIE_RESTUDY;
3330                                 if ( startbranch == first
3331                                      && scan == tail )
3332                                 {
3333                                     RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3334                                 }
3335                             }
3336 #endif
3337                         }
3338                     }
3339
3340                 } /* do trie */
3341
3342             }
3343             else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3344                 scan = NEXTOPER(NEXTOPER(scan));
3345             } else                      /* single branch is optimized. */
3346                 scan = NEXTOPER(scan);
3347             continue;
3348         } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3349             scan_frame *newframe = NULL;
3350             I32 paren;
3351             regnode *start;
3352             regnode *end;
3353
3354             if (OP(scan) != SUSPEND) {
3355             /* set the pointer */
3356                 if (OP(scan) == GOSUB) {
3357                     paren = ARG(scan);
3358                     RExC_recurse[ARG2L(scan)] = scan;
3359                     start = RExC_open_parens[paren-1];
3360                     end   = RExC_close_parens[paren-1];
3361                 } else {
3362                     paren = 0;
3363                     start = RExC_rxi->program + 1;
3364                     end   = RExC_opend;
3365                 }
3366                 if (!recursed) {
3367                     Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3368                     SAVEFREEPV(recursed);
3369                 }
3370                 if (!PAREN_TEST(recursed,paren+1)) {
3371                     PAREN_SET(recursed,paren+1);
3372                     Newx(newframe,1,scan_frame);
3373                 } else {
3374                     if (flags & SCF_DO_SUBSTR) {
3375                         SCAN_COMMIT(pRExC_state,data,minlenp);
3376                         data->longest = &(data->longest_float);
3377                     }
3378                     is_inf = is_inf_internal = 1;
3379                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3380                         cl_anything(pRExC_state, data->start_class);
3381                     flags &= ~SCF_DO_STCLASS;
3382                 }
3383             } else {
3384                 Newx(newframe,1,scan_frame);
3385                 paren = stopparen;
3386                 start = scan+2;
3387                 end = regnext(scan);
3388             }
3389             if (newframe) {
3390                 assert(start);
3391                 assert(end);
3392                 SAVEFREEPV(newframe);
3393                 newframe->next = regnext(scan);
3394                 newframe->last = last;
3395                 newframe->stop = stopparen;
3396                 newframe->prev = frame;
3397
3398                 frame = newframe;
3399                 scan =  start;
3400                 stopparen = paren;
3401                 last = end;
3402
3403                 continue;
3404             }
3405         }
3406         else if (OP(scan) == EXACT) {
3407             I32 l = STR_LEN(scan);
3408             UV uc;
3409             if (UTF) {
3410                 const U8 * const s = (U8*)STRING(scan);
3411                 l = utf8_length(s, s + l);
3412                 uc = utf8_to_uvchr(s, NULL);
3413             } else {
3414                 uc = *((U8*)STRING(scan));
3415             }
3416             min += l;
3417             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3418                 /* The code below prefers earlier match for fixed
3419                    offset, later match for variable offset.  */
3420                 if (data->last_end == -1) { /* Update the start info. */
3421                     data->last_start_min = data->pos_min;
3422                     data->last_start_max = is_inf
3423                         ? I32_MAX : data->pos_min + data->pos_delta;
3424                 }
3425                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3426                 if (UTF)
3427                     SvUTF8_on(data->last_found);
3428                 {
3429                     SV * const sv = data->last_found;
3430                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3431                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3432                     if (mg && mg->mg_len >= 0)
3433                         mg->mg_len += utf8_length((U8*)STRING(scan),
3434                                                   (U8*)STRING(scan)+STR_LEN(scan));
3435                 }
3436                 data->last_end = data->pos_min + l;
3437                 data->pos_min += l; /* As in the first entry. */
3438                 data->flags &= ~SF_BEFORE_EOL;
3439             }
3440             if (flags & SCF_DO_STCLASS_AND) {
3441                 /* Check whether it is compatible with what we know already! */
3442                 int compat = 1;
3443
3444
3445                 /* If compatible, we or it in below.  It is compatible if is
3446                  * in the bitmp and either 1) its bit or its fold is set, or 2)
3447                  * it's for a locale.  Even if there isn't unicode semantics
3448                  * here, at runtime there may be because of matching against a
3449                  * utf8 string, so accept a possible false positive for
3450                  * latin1-range folds */
3451                 if (uc >= 0x100 ||
3452                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3453                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3454                     && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3455                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3456                     )
3457                 {
3458                     compat = 0;
3459                 }
3460                 ANYOF_CLASS_ZERO(data->start_class);
3461                 ANYOF_BITMAP_ZERO(data->start_class);
3462                 if (compat)
3463                     ANYOF_BITMAP_SET(data->start_class, uc);
3464                 else if (uc >= 0x100) {
3465                     int i;
3466
3467                     /* Some Unicode code points fold to the Latin1 range; as
3468                      * XXX temporary code, instead of figuring out if this is
3469                      * one, just assume it is and set all the start class bits
3470                      * that could be some such above 255 code point's fold
3471                      * which will generate fals positives.  As the code
3472                      * elsewhere that does compute the fold settles down, it
3473                      * can be extracted out and re-used here */
3474                     for (i = 0; i < 256; i++){
3475                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3476                             ANYOF_BITMAP_SET(data->start_class, i);
3477                         }
3478                     }
3479                 }
3480                 data->start_class->flags &= ~ANYOF_EOS;
3481                 if (uc < 0x100)
3482                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3483             }
3484             else if (flags & SCF_DO_STCLASS_OR) {
3485                 /* false positive possible if the class is case-folded */
3486                 if (uc < 0x100)
3487                     ANYOF_BITMAP_SET(data->start_class, uc);
3488                 else
3489                     data->start_class->flags |= ANYOF_UNICODE_ALL;
3490                 data->start_class->flags &= ~ANYOF_EOS;
3491                 cl_and(data->start_class, and_withp);
3492             }
3493             flags &= ~SCF_DO_STCLASS;
3494         }
3495         else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3496             I32 l = STR_LEN(scan);
3497             UV uc = *((U8*)STRING(scan));
3498
3499             /* Search for fixed substrings supports EXACT only. */
3500             if (flags & SCF_DO_SUBSTR) {
3501                 assert(data);
3502                 SCAN_COMMIT(pRExC_state, data, minlenp);
3503             }
3504             if (UTF) {
3505                 const U8 * const s = (U8 *)STRING(scan);
3506                 l = utf8_length(s, s + l);
3507                 uc = utf8_to_uvchr(s, NULL);
3508             }
3509             else if (has_exactf_sharp_s) {
3510                 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3511             }
3512             min += l - min_subtract;
3513             if (min < 0) {
3514                 min = 0;
3515             }
3516             delta += min_subtract;
3517             if (flags & SCF_DO_SUBSTR) {
3518                 data->pos_min += l - min_subtract;
3519                 if (data->pos_min < 0) {
3520                     data->pos_min = 0;
3521                 }
3522                 data->pos_delta += min_subtract;
3523                 if (min_subtract) {
3524                     data->longest = &(data->longest_float);
3525                 }
3526             }
3527             if (flags & SCF_DO_STCLASS_AND) {
3528                 /* Check whether it is compatible with what we know already! */
3529                 int compat = 1;
3530                 if (uc >= 0x100 ||
3531                  (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3532                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
3533                   && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3534                 {
3535                     compat = 0;
3536                 }
3537                 ANYOF_CLASS_ZERO(data->start_class);
3538                 ANYOF_BITMAP_ZERO(data->start_class);
3539                 if (compat) {
3540                     ANYOF_BITMAP_SET(data->start_class, uc);
3541                     data->start_class->flags &= ~ANYOF_EOS;
3542                     data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3543                     if (OP(scan) == EXACTFL) {
3544                         /* XXX This set is probably no longer necessary, and
3545                          * probably wrong as LOCALE now is on in the initial
3546                          * state */
3547                         data->start_class->flags |= ANYOF_LOCALE;
3548                     }
3549                     else {
3550
3551                         /* Also set the other member of the fold pair.  In case
3552                          * that unicode semantics is called for at runtime, use
3553                          * the full latin1 fold.  (Can't do this for locale,
3554                          * because not known until runtime) */
3555                         ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3556
3557                         /* All other (EXACTFL handled above) folds except under
3558                          * /iaa that include s, S, and sharp_s also may include
3559                          * the others */
3560                         if (OP(scan) != EXACTFA) {
3561                             if (uc == 's' || uc == 'S') {
3562                                 ANYOF_BITMAP_SET(data->start_class,
3563                                                  LATIN_SMALL_LETTER_SHARP_S);
3564                             }
3565                             else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3566                                 ANYOF_BITMAP_SET(data->start_class, 's');
3567                                 ANYOF_BITMAP_SET(data->start_class, 'S');
3568                             }
3569                         }
3570                     }
3571                 }
3572                 else if (uc >= 0x100) {
3573                     int i;
3574                     for (i = 0; i < 256; i++){
3575                         if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3576                             ANYOF_BITMAP_SET(data->start_class, i);
3577                         }
3578                     }
3579                 }
3580             }
3581             else if (flags & SCF_DO_STCLASS_OR) {
3582                 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3583                     /* false positive possible if the class is case-folded.
3584                        Assume that the locale settings are the same... */
3585                     if (uc < 0x100) {
3586                         ANYOF_BITMAP_SET(data->start_class, uc);
3587                         if (OP(scan) != EXACTFL) {
3588
3589                             /* And set the other member of the fold pair, but
3590                              * can't do that in locale because not known until
3591                              * run-time */
3592                             ANYOF_BITMAP_SET(data->start_class,
3593                                              PL_fold_latin1[uc]);
3594
3595                             /* All folds except under /iaa that include s, S,
3596                              * and sharp_s also may include the others */
3597                             if (OP(scan) != EXACTFA) {
3598                                 if (uc == 's' || uc == 'S') {
3599                                     ANYOF_BITMAP_SET(data->start_class,
3600                                                    LATIN_SMALL_LETTER_SHARP_S);
3601                                 }
3602                                 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3603                                     ANYOF_BITMAP_SET(data->start_class, 's');
3604                                     ANYOF_BITMAP_SET(data->start_class, 'S');
3605                                 }
3606                             }
3607                         }
3608                     }
3609                     data->start_class->flags &= ~ANYOF_EOS;
3610                 }
3611                 cl_and(data->start_class, and_withp);
3612             }
3613             flags &= ~SCF_DO_STCLASS;
3614         }
3615         else if (REGNODE_VARIES(OP(scan))) {
3616             I32 mincount, maxcount, minnext, deltanext, fl = 0;
3617             I32 f = flags, pos_before = 0;
3618             regnode * const oscan = scan;
3619             struct regnode_charclass_class this_class;
3620             struct regnode_charclass_class *oclass = NULL;
3621             I32 next_is_eval = 0;
3622
3623             switch (PL_regkind[OP(scan)]) {
3624             case WHILEM:                /* End of (?:...)* . */
3625                 scan = NEXTOPER(scan);
3626                 goto finish;
3627             case PLUS:
3628                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3629                     next = NEXTOPER(scan);
3630                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3631                         mincount = 1;
3632                         maxcount = REG_INFTY;
3633                         next = regnext(scan);
3634                         scan = NEXTOPER(scan);
3635                         goto do_curly;
3636                     }
3637                 }
3638                 if (flags & SCF_DO_SUBSTR)
3639                     data->pos_min++;
3640                 min++;
3641                 /* Fall through. */
3642             case STAR:
3643                 if (flags & SCF_DO_STCLASS) {
3644                     mincount = 0;
3645                     maxcount = REG_INFTY;
3646                     next = regnext(scan);
3647                     scan = NEXTOPER(scan);
3648                     goto do_curly;
3649                 }
3650                 is_inf = is_inf_internal = 1;
3651                 scan = regnext(scan);
3652                 if (flags & SCF_DO_SUBSTR) {
3653                     SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3654                     data->longest = &(data->longest_float);
3655                 }
3656                 goto optimize_curly_tail;
3657             case CURLY:
3658                 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3659                     && (scan->flags == stopparen))
3660                 {
3661                     mincount = 1;
3662                     maxcount = 1;
3663                 } else {
3664                     mincount = ARG1(scan);
3665                     maxcount = ARG2(scan);
3666                 }
3667                 next = regnext(scan);
3668                 if (OP(scan) == CURLYX) {
3669                     I32 lp = (data ? *(data->last_closep) : 0);
3670                     scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3671                 }
3672                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3673                 next_is_eval = (OP(scan) == EVAL);
3674               do_curly:
3675                 if (flags & SCF_DO_SUBSTR) {
3676                     if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3677                     pos_before = data->pos_min;
3678                 }
3679                 if (data) {
3680                     fl = data->flags;
3681                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3682                     if (is_inf)
3683                         data->flags |= SF_IS_INF;
3684                 }
3685                 if (flags & SCF_DO_STCLASS) {
3686                     cl_init(pRExC_state, &this_class);
3687                     oclass = data->start_class;
3688                     data->start_class = &this_class;
3689                     f |= SCF_DO_STCLASS_AND;
3690                     f &= ~SCF_DO_STCLASS_OR;
3691                 }
3692                 /* Exclude from super-linear cache processing any {n,m}
3693                    regops for which the combination of input pos and regex
3694                    pos is not enough information to determine if a match
3695                    will be possible.
3696
3697                    For example, in the regex /foo(bar\s*){4,8}baz/ with the
3698                    regex pos at the \s*, the prospects for a match depend not
3699                    only on the input position but also on how many (bar\s*)
3700                    repeats into the {4,8} we are. */
3701                if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3702                     f &= ~SCF_WHILEM_VISITED_POS;
3703
3704                 /* This will finish on WHILEM, setting scan, or on NULL: */
3705                 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3706                                       last, data, stopparen, recursed, NULL,
3707                                       (mincount == 0
3708                                         ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3709
3710                 if (flags & SCF_DO_STCLASS)
3711                     data->start_class = oclass;
3712                 if (mincount == 0 || minnext == 0) {
3713                     if (flags & SCF_DO_STCLASS_OR) {
3714                         cl_or(pRExC_state, data->start_class, &this_class);
3715                     }
3716                     else if (flags & SCF_DO_STCLASS_AND) {
3717                         /* Switch to OR mode: cache the old value of
3718                          * data->start_class */
3719                         INIT_AND_WITHP;
3720                         StructCopy(data->start_class, and_withp,
3721                                    struct regnode_charclass_class);
3722                         flags &= ~SCF_DO_STCLASS_AND;
3723                         StructCopy(&this_class, data->start_class,
3724                                    struct regnode_charclass_class);
3725                         flags |= SCF_DO_STCLASS_OR;
3726                         data->start_class->flags |= ANYOF_EOS;
3727                     }
3728                 } else {                /* Non-zero len */
3729                     if (flags & SCF_DO_STCLASS_OR) {
3730                         cl_or(pRExC_state, data->start_class, &this_class);
3731                         cl_and(data->start_class, and_withp);
3732                     }
3733                     else if (flags & SCF_DO_STCLASS_AND)
3734                         cl_and(data->start_class, &this_class);
3735                     flags &= ~SCF_DO_STCLASS;
3736                 }
3737                 if (!scan)              /* It was not CURLYX, but CURLY. */
3738                     scan = next;
3739                 if ( /* ? quantifier ok, except for (?{ ... }) */
3740                     (next_is_eval || !(mincount == 0 && maxcount == 1))
3741                     && (minnext == 0) && (deltanext == 0)
3742                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3743                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
3744                 {
3745                     ckWARNreg(RExC_parse,
3746                               "Quantifier unexpected on zero-length expression");
3747                 }
3748
3749                 min += minnext * mincount;
3750                 is_inf_internal |= ((maxcount == REG_INFTY
3751                                      && (minnext + deltanext) > 0)
3752                                     || deltanext == I32_MAX);
3753                 is_inf |= is_inf_internal;
3754                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3755
3756                 /* Try powerful optimization CURLYX => CURLYN. */
3757                 if (  OP(oscan) == CURLYX && data
3758                       && data->flags & SF_IN_PAR
3759                       && !(data->flags & SF_HAS_EVAL)
3760                       && !deltanext && minnext == 1 ) {
3761                     /* Try to optimize to CURLYN.  */
3762                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3763                     regnode * const nxt1 = nxt;
3764 #ifdef DEBUGGING
3765                     regnode *nxt2;
3766 #endif
3767
3768                     /* Skip open. */
3769                     nxt = regnext(nxt);
3770                     if (!REGNODE_SIMPLE(OP(nxt))
3771                         && !(PL_regkind[OP(nxt)] == EXACT
3772                              && STR_LEN(nxt) == 1))
3773                         goto nogo;
3774 #ifdef DEBUGGING
3775                     nxt2 = nxt;
3776 #endif
3777                     nxt = regnext(nxt);
3778                     if (OP(nxt) != CLOSE)
3779                         goto nogo;
3780                     if (RExC_open_parens) {
3781                         RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3782                         RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3783                     }
3784                     /* Now we know that nxt2 is the only contents: */
3785                     oscan->flags = (U8)ARG(nxt);
3786                     OP(oscan) = CURLYN;
3787                     OP(nxt1) = NOTHING; /* was OPEN. */
3788
3789 #ifdef DEBUGGING
3790                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3791                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3792                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3793                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
3794                     OP(nxt + 1) = OPTIMIZED; /* was count. */
3795                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3796 #endif
3797                 }
3798               nogo:
3799
3800                 /* Try optimization CURLYX => CURLYM. */
3801                 if (  OP(oscan) == CURLYX && data
3802                       && !(data->flags & SF_HAS_PAR)
3803                       && !(data->flags & SF_HAS_EVAL)
3804                       && !deltanext     /* atom is fixed width */
3805                       && minnext != 0   /* CURLYM can't handle zero width */
3806                 ) {
3807                     /* XXXX How to optimize if data == 0? */
3808                     /* Optimize to a simpler form.  */
3809                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3810                     regnode *nxt2;
3811
3812                     OP(oscan) = CURLYM;
3813                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3814                             && (OP(nxt2) != WHILEM))
3815                         nxt = nxt2;
3816                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3817                     /* Need to optimize away parenths. */
3818                     if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3819                         /* Set the parenth number.  */
3820                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3821
3822                         oscan->flags = (U8)ARG(nxt);
3823                         if (RExC_open_parens) {
3824                             RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3825                             RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3826                         }
3827                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
3828                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
3829
3830 #ifdef DEBUGGING
3831                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3832                         OP(nxt + 1) = OPTIMIZED; /* was count. */
3833                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3834                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3835 #endif
3836 #if 0
3837                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
3838                             regnode *nnxt = regnext(nxt1);
3839                             if (nnxt == nxt) {
3840                                 if (reg_off_by_arg[OP(nxt1)])
3841                                     ARG_SET(nxt1, nxt2 - nxt1);
3842                                 else if (nxt2 - nxt1 < U16_MAX)
3843                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
3844                                 else
3845                                     OP(nxt) = NOTHING;  /* Cannot beautify */
3846                             }
3847                             nxt1 = nnxt;
3848                         }
3849 #endif
3850                         /* Optimize again: */
3851                         study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3852                                     NULL, stopparen, recursed, NULL, 0,depth+1);
3853                     }
3854                     else
3855                         oscan->flags = 0;
3856                 }
3857                 else if ((OP(oscan) == CURLYX)
3858                          && (flags & SCF_WHILEM_VISITED_POS)
3859                          /* See the comment on a similar expression above.
3860                             However, this time it's not a subexpression
3861                             we care about, but the expression itself. */
3862                          && (maxcount == REG_INFTY)
3863                          && data && ++data->whilem_c < 16) {
3864                     /* This stays as CURLYX, we can put the count/of pair. */
3865                     /* Find WHILEM (as in regexec.c) */
3866                     regnode *nxt = oscan + NEXT_OFF(oscan);
3867
3868                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3869                         nxt += ARG(nxt);
3870                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
3871                         | (RExC_whilem_seen << 4)); /* On WHILEM */
3872                 }
3873                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3874                     pars++;
3875                 if (flags & SCF_DO_SUBSTR) {
3876                     SV *last_str = NULL;
3877                     int counted = mincount != 0;
3878
3879                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3880 #if defined(SPARC64_GCC_WORKAROUND)
3881                         I32 b = 0;
3882                         STRLEN l = 0;
3883                         const char *s = NULL;
3884                         I32 old = 0;
3885
3886                         if (pos_before >= data->last_start_min)
3887                             b = pos_before;
3888                         else
3889                             b = data->last_start_min;
3890
3891                         l = 0;
3892                         s = SvPV_const(data->last_found, l);
3893                         old = b - data->last_start_min;
3894
3895 #else
3896                         I32 b = pos_before >= data->last_start_min
3897                             ? pos_before : data->last_start_min;
3898                         STRLEN l;
3899                         const char * const s = SvPV_const(data->last_found, l);
3900                         I32 old = b - data->last_start_min;
3901 #endif
3902
3903                         if (UTF)
3904                             old = utf8_hop((U8*)s, old) - (U8*)s;
3905                         l -= old;
3906                         /* Get the added string: */
3907                         last_str = newSVpvn_utf8(s  + old, l, UTF);
3908                         if (deltanext == 0 && pos_before == b) {
3909                             /* What was added is a constant string */
3910                             if (mincount > 1) {
3911                                 SvGROW(last_str, (mincount * l) + 1);
3912                                 repeatcpy(SvPVX(last_str) + l,
3913                                           SvPVX_const(last_str), l, mincount - 1);
3914                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3915                                 /* Add additional parts. */
3916                                 SvCUR_set(data->last_found,
3917                                           SvCUR(data->last_found) - l);
3918                                 sv_catsv(data->last_found, last_str);
3919                                 {
3920                                     SV * sv = data->last_found;
3921                                     MAGIC *mg =
3922                                         SvUTF8(sv) && SvMAGICAL(sv) ?
3923                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
3924                                     if (mg && mg->mg_len >= 0)
3925                                         mg->mg_len += CHR_SVLEN(last_str) - l;
3926                                 }
3927                                 data->last_end += l * (mincount - 1);
3928                             }
3929                         } else {
3930                             /* start offset must point into the last copy */
3931                             data->last_start_min += minnext * (mincount - 1);
3932                             data->last_start_max += is_inf ? I32_MAX
3933                                 : (maxcount - 1) * (minnext + data->pos_delta);
3934                         }
3935                     }
3936                     /* It is counted once already... */
3937                     data->pos_min += minnext * (mincount - counted);
3938                     data->pos_delta += - counted * deltanext +
3939                         (minnext + deltanext) * maxcount - minnext * mincount;
3940                     if (mincount != maxcount) {
3941                          /* Cannot extend fixed substrings found inside
3942                             the group.  */
3943                         SCAN_COMMIT(pRExC_state,data,minlenp);
3944                         if (mincount && last_str) {
3945                             SV * const sv = data->last_found;
3946                             MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3947                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3948
3949                             if (mg)
3950                                 mg->mg_len = -1;
3951                             sv_setsv(sv, last_str);
3952                             data->last_end = data->pos_min;
3953                             data->last_start_min =
3954                                 data->pos_min - CHR_SVLEN(last_str);
3955                             data->last_start_max = is_inf
3956                                 ? I32_MAX
3957                                 : data->pos_min + data->pos_delta
3958                                 - CHR_SVLEN(last_str);
3959                         }
3960                         data->longest = &(data->longest_float);
3961                     }
3962                     SvREFCNT_dec(last_str);
3963                 }
3964                 if (data && (fl & SF_HAS_EVAL))
3965                     data->flags |= SF_HAS_EVAL;
3966               optimize_curly_tail:
3967                 if (OP(oscan) != CURLYX) {
3968                     while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3969                            && NEXT_OFF(next))
3970                         NEXT_OFF(oscan) += NEXT_OFF(next);
3971                 }
3972                 continue;
3973             default:                    /* REF, ANYOFV, and CLUMP only? */
3974                 if (flags & SCF_DO_SUBSTR) {
3975                     SCAN_COMMIT(pRExC_state,data,minlenp);      /* Cannot expect anything... */
3976                     data->longest = &(data->longest_float);
3977                 }
3978                 is_inf = is_inf_internal = 1;
3979                 if (flags & SCF_DO_STCLASS_OR)
3980                     cl_anything(pRExC_state, data->start_class);
3981                 flags &= ~SCF_DO_STCLASS;
3982                 break;
3983             }
3984         }
3985         else if (OP(scan) == LNBREAK) {
3986             if (flags & SCF_DO_STCLASS) {
3987                 int value = 0;
3988                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3989                 if (flags & SCF_DO_STCLASS_AND) {
3990                     for (value = 0; value < 256; value++)
3991                         if (!is_VERTWS_cp(value))
3992                             ANYOF_BITMAP_CLEAR(data->start_class, value);
3993                 }
3994                 else {
3995                     for (value = 0; value < 256; value++)
3996                         if (is_VERTWS_cp(value))
3997                             ANYOF_BITMAP_SET(data->start_class, value);
3998                 }
3999                 if (flags & SCF_DO_STCLASS_OR)
4000                     cl_and(data->start_class, and_withp);
4001                 flags &= ~SCF_DO_STCLASS;
4002             }
4003             min += 1;
4004             delta += 1;
4005             if (flags & SCF_DO_SUBSTR) {
4006                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4007                 data->pos_min += 1;
4008                 data->pos_delta += 1;
4009                 data->longest = &(data->longest_float);
4010             }
4011         }
4012         else if (REGNODE_SIMPLE(OP(scan))) {
4013             int value = 0;
4014
4015             if (flags & SCF_DO_SUBSTR) {
4016                 SCAN_COMMIT(pRExC_state,data,minlenp);
4017                 data->pos_min++;
4018             }
4019             min++;
4020             if (flags & SCF_DO_STCLASS) {
4021                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4022
4023                 /* Some of the logic below assumes that switching
4024                    locale on will only add false positives. */
4025                 switch (PL_regkind[OP(scan)]) {
4026                 case SANY:
4027                 default:
4028                   do_default:
4029                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4030                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4031                         cl_anything(pRExC_state, data->start_class);
4032                     break;
4033                 case REG_ANY:
4034                     if (OP(scan) == SANY)
4035                         goto do_default;
4036                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4037                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4038                                  || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4039                         cl_anything(pRExC_state, data->start_class);
4040                     }
4041                     if (flags & SCF_DO_STCLASS_AND || !value)
4042                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4043                     break;
4044                 case ANYOF:
4045                     if (flags & SCF_DO_STCLASS_AND)
4046                         cl_and(data->start_class,
4047                                (struct regnode_charclass_class*)scan);
4048                     else
4049                         cl_or(pRExC_state, data->start_class,
4050                               (struct regnode_charclass_class*)scan);
4051                     break;
4052                 case ALNUM:
4053                     if (flags & SCF_DO_STCLASS_AND) {
4054                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4055                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4056                             if (OP(scan) == ALNUMU) {
4057                                 for (value = 0; value < 256; value++) {
4058                                     if (!isWORDCHAR_L1(value)) {
4059                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4060                                     }
4061                                 }
4062                             } else {
4063                                 for (value = 0; value < 256; value++) {
4064                                     if (!isALNUM(value)) {
4065                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4066                                     }
4067                                 }
4068                             }
4069                         }
4070                     }
4071                     else {
4072                         if (data->start_class->flags & ANYOF_LOCALE)
4073                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4074
4075                         /* Even if under locale, set the bits for non-locale
4076                          * in case it isn't a true locale-node.  This will
4077                          * create false positives if it truly is locale */
4078                         if (OP(scan) == ALNUMU) {
4079                             for (value = 0; value < 256; value++) {
4080                                 if (isWORDCHAR_L1(value)) {
4081                                     ANYOF_BITMAP_SET(data->start_class, value);
4082                                 }
4083                             }
4084                         } else {
4085                             for (value = 0; value < 256; value++) {
4086                                 if (isALNUM(value)) {
4087                                     ANYOF_BITMAP_SET(data->start_class, value);
4088                                 }
4089                             }
4090                         }
4091                     }
4092                     break;
4093                 case NALNUM:
4094                     if (flags & SCF_DO_STCLASS_AND) {
4095                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4096                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4097                             if (OP(scan) == NALNUMU) {
4098                                 for (value = 0; value < 256; value++) {
4099                                     if (isWORDCHAR_L1(value)) {
4100                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4101                                     }
4102                                 }
4103                             } else {
4104                                 for (value = 0; value < 256; value++) {
4105                                     if (isALNUM(value)) {
4106                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4107                                     }
4108                                 }
4109                             }
4110                         }
4111                     }
4112                     else {
4113                         if (data->start_class->flags & ANYOF_LOCALE)
4114                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4115
4116                         /* Even if under locale, set the bits for non-locale in
4117                          * case it isn't a true locale-node.  This will create
4118                          * false positives if it truly is locale */
4119                         if (OP(scan) == NALNUMU) {
4120                             for (value = 0; value < 256; value++) {
4121                                 if (! isWORDCHAR_L1(value)) {
4122                                     ANYOF_BITMAP_SET(data->start_class, value);
4123                                 }
4124                             }
4125                         } else {
4126                             for (value = 0; value < 256; value++) {
4127                                 if (! isALNUM(value)) {
4128                                     ANYOF_BITMAP_SET(data->start_class, value);
4129                                 }
4130                             }
4131                         }
4132                     }
4133                     break;
4134                 case SPACE:
4135                     if (flags & SCF_DO_STCLASS_AND) {
4136                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4137                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4138                             if (OP(scan) == SPACEU) {
4139                                 for (value = 0; value < 256; value++) {
4140                                     if (!isSPACE_L1(value)) {
4141                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4142                                     }
4143                                 }
4144                             } else {
4145                                 for (value = 0; value < 256; value++) {
4146                                     if (!isSPACE(value)) {
4147                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4148                                     }
4149                                 }
4150                             }
4151                         }
4152                     }
4153                     else {
4154                         if (data->start_class->flags & ANYOF_LOCALE) {
4155                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4156                         }
4157                         if (OP(scan) == SPACEU) {
4158                             for (value = 0; value < 256; value++) {
4159                                 if (isSPACE_L1(value)) {
4160                                     ANYOF_BITMAP_SET(data->start_class, value);
4161                                 }
4162                             }
4163                         } else {
4164                             for (value = 0; value < 256; value++) {
4165                                 if (isSPACE(value)) {
4166                                     ANYOF_BITMAP_SET(data->start_class, value);
4167                                 }
4168                             }
4169                         }
4170                     }
4171                     break;
4172                 case NSPACE:
4173                     if (flags & SCF_DO_STCLASS_AND) {
4174                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4175                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4176                             if (OP(scan) == NSPACEU) {
4177                                 for (value = 0; value < 256; value++) {
4178                                     if (isSPACE_L1(value)) {
4179                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4180                                     }
4181                                 }
4182                             } else {
4183                                 for (value = 0; value < 256; value++) {
4184                                     if (isSPACE(value)) {
4185                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
4186                                     }
4187                                 }
4188                             }
4189                         }
4190                     }
4191                     else {
4192                         if (data->start_class->flags & ANYOF_LOCALE)
4193                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4194                         if (OP(scan) == NSPACEU) {
4195                             for (value = 0; value < 256; value++) {
4196                                 if (!isSPACE_L1(value)) {
4197                                     ANYOF_BITMAP_SET(data->start_class, value);
4198                                 }
4199                             }
4200                         }
4201                         else {
4202                             for (value = 0; value < 256; value++) {
4203                                 if (!isSPACE(value)) {
4204                                     ANYOF_BITMAP_SET(data->start_class, value);
4205                                 }
4206                             }
4207                         }
4208                     }
4209                     break;
4210                 case DIGIT:
4211                     if (flags & SCF_DO_STCLASS_AND) {
4212                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
4213                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4214                             for (value = 0; value < 256; value++)
4215                                 if (!isDIGIT(value))
4216                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
4217                         }
4218                     }
4219                     else {
4220                         if (data->start_class->flags & ANYOF_LOCALE)
4221                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4222                         for (value = 0; value < 256; value++)
4223                             if (isDIGIT(value))
4224                                 ANYOF_BITMAP_SET(data->start_class, value);
4225                     }
4226                     break;
4227                 case NDIGIT:
4228                     if (flags & SCF_DO_STCLASS_AND) {
4229                         if (!(data->start_class->flags & ANYOF_LOCALE))
4230                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4231                         for (value = 0; value < 256; value++)
4232                             if (isDIGIT(value))
4233                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
4234                     }
4235                     else {
4236                         if (data->start_class->flags & ANYOF_LOCALE)
4237                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4238                         for (value = 0; value < 256; value++)
4239                             if (!isDIGIT(value))
4240                                 ANYOF_BITMAP_SET(data->start_class, value);
4241                     }
4242                     break;
4243                 CASE_SYNST_FNC(VERTWS);
4244                 CASE_SYNST_FNC(HORIZWS);
4245
4246                 }
4247                 if (flags & SCF_DO_STCLASS_OR)
4248                     cl_and(data->start_class, and_withp);
4249                 flags &= ~SCF_DO_STCLASS;
4250             }
4251         }
4252         else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4253             data->flags |= (OP(scan) == MEOL
4254                             ? SF_BEFORE_MEOL
4255                             : SF_BEFORE_SEOL);
4256         }
4257         else if (  PL_regkind[OP(scan)] == BRANCHJ
4258                  /* Lookbehind, or need to calculate parens/evals/stclass: */
4259                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
4260                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4261             if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4262                 || OP(scan) == UNLESSM )
4263             {
4264                 /* Negative Lookahead/lookbehind
4265                    In this case we can't do fixed string optimisation.
4266                 */
4267
4268                 I32 deltanext, minnext, fake = 0;
4269                 regnode *nscan;
4270                 struct regnode_charclass_class intrnl;
4271                 int f = 0;
4272
4273                 data_fake.flags = 0;
4274                 if (data) {
4275                     data_fake.whilem_c = data->whilem_c;
4276                     data_fake.last_closep = data->last_closep;
4277                 }
4278                 else
4279                     data_fake.last_closep = &fake;
4280                 data_fake.pos_delta = delta;
4281                 if ( flags & SCF_DO_STCLASS && !scan->flags
4282                      && OP(scan) == IFMATCH ) { /* Lookahead */
4283                     cl_init(pRExC_state, &intrnl);
4284                     data_fake.start_class = &intrnl;
4285                     f |= SCF_DO_STCLASS_AND;
4286                 }
4287                 if (flags & SCF_WHILEM_VISITED_POS)
4288                     f |= SCF_WHILEM_VISITED_POS;
4289                 next = regnext(scan);
4290                 nscan = NEXTOPER(NEXTOPER(scan));
4291                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4292                     last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4293                 if (scan->flags) {
4294                     if (deltanext) {
4295                         FAIL("Variable length lookbehind not implemented");
4296                     }
4297                     else if (minnext > (I32)U8_MAX) {
4298                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4299                     }
4300                     scan->flags = (U8)minnext;
4301                 }
4302                 if (data) {
4303                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4304                         pars++;
4305                     if (data_fake.flags & SF_HAS_EVAL)
4306                         data->flags |= SF_HAS_EVAL;
4307                     data->whilem_c = data_fake.whilem_c;
4308                 }
4309                 if (f & SCF_DO_STCLASS_AND) {
4310                     if (flags & SCF_DO_STCLASS_OR) {
4311                         /* OR before, AND after: ideally we would recurse with
4312                          * data_fake to get the AND applied by study of the
4313                          * remainder of the pattern, and then derecurse;
4314                          * *** HACK *** for now just treat as "no information".
4315                          * See [perl #56690].
4316                          */
4317                         cl_init(pRExC_state, data->start_class);
4318                     }  else {
4319                         /* AND before and after: combine and continue */
4320                         const int was = (data->start_class->flags & ANYOF_EOS);
4321
4322                         cl_and(data->start_class, &intrnl);
4323                         if (was)
4324                             data->start_class->flags |= ANYOF_EOS;
4325                     }
4326                 }
4327             }
4328 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4329             else {
4330                 /* Positive Lookahead/lookbehind
4331                    In this case we can do fixed string optimisation,
4332                    but we must be careful about it. Note in the case of
4333                    lookbehind the positions will be offset by the minimum
4334                    length of the pattern, something we won't know about
4335                    until after the recurse.
4336                 */
4337                 I32 deltanext, fake = 0;
4338                 regnode *nscan;
4339                 struct regnode_charclass_class intrnl;
4340                 int f = 0;
4341                 /* We use SAVEFREEPV so that when the full compile
4342                     is finished perl will clean up the allocated
4343                     minlens when it's all done. This way we don't
4344                     have to worry about freeing them when we know
4345                     they wont be used, which would be a pain.
4346                  */
4347                 I32 *minnextp;
4348                 Newx( minnextp, 1, I32 );
4349                 SAVEFREEPV(minnextp);
4350
4351                 if (data) {
4352                     StructCopy(data, &data_fake, scan_data_t);
4353                     if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4354                         f |= SCF_DO_SUBSTR;
4355                         if (scan->flags)
4356                             SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4357                         data_fake.last_found=newSVsv(data->last_found);
4358                     }
4359                 }
4360                 else
4361                     data_fake.last_closep = &fake;
4362                 data_fake.flags = 0;
4363                 data_fake.pos_delta = delta;
4364                 if (is_inf)
4365                     data_fake.flags |= SF_IS_INF;
4366                 if ( flags & SCF_DO_STCLASS && !scan->flags
4367                      && OP(scan) == IFMATCH ) { /* Lookahead */
4368                     cl_init(pRExC_state, &intrnl);
4369                     data_fake.start_class = &intrnl;
4370                     f |= SCF_DO_STCLASS_AND;
4371                 }
4372                 if (flags & SCF_WHILEM_VISITED_POS)
4373                     f |= SCF_WHILEM_VISITED_POS;
4374                 next = regnext(scan);
4375                 nscan = NEXTOPER(NEXTOPER(scan));
4376
4377                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4378                     last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4379                 if (scan->flags) {
4380                     if (deltanext) {
4381                         FAIL("Variable length lookbehind not implemented");
4382                     }
4383                     else if (*minnextp > (I32)U8_MAX) {
4384                         FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4385                     }
4386                     scan->flags = (U8)*minnextp;
4387                 }
4388
4389                 *minnextp += min;
4390
4391                 if (f & SCF_DO_STCLASS_AND) {
4392                     const int was = (data->start_class->flags & ANYOF_EOS);
4393
4394                     cl_and(data->start_class, &intrnl);
4395                     if (was)
4396                         data->start_class->flags |= ANYOF_EOS;
4397                 }
4398                 if (data) {
4399                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4400                         pars++;
4401                     if (data_fake.flags & SF_HAS_EVAL)
4402                         data->flags |= SF_HAS_EVAL;
4403                     data->whilem_c = data_fake.whilem_c;
4404                     if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4405                         if (RExC_rx->minlen<*minnextp)
4406                             RExC_rx->minlen=*minnextp;
4407                         SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4408                         SvREFCNT_dec(data_fake.last_found);
4409
4410                         if ( data_fake.minlen_fixed != minlenp )
4411                         {
4412                             data->offset_fixed= data_fake.offset_fixed;
4413                             data->minlen_fixed= data_fake.minlen_fixed;
4414                             data->lookbehind_fixed+= scan->flags;
4415                         }
4416                         if ( data_fake.minlen_float != minlenp )
4417                         {
4418                             data->minlen_float= data_fake.minlen_float;
4419                             data->offset_float_min=data_fake.offset_float_min;
4420                             data->offset_float_max=data_fake.offset_float_max;
4421                             data->lookbehind_float+= scan->flags;
4422                         }
4423                     }
4424                 }
4425
4426
4427             }
4428 #endif
4429         }
4430         else if (OP(scan) == OPEN) {
4431             if (stopparen != (I32)ARG(scan))
4432                 pars++;
4433         }
4434         else if (OP(scan) == CLOSE) {
4435             if (stopparen == (I32)ARG(scan)) {
4436                 break;
4437             }
4438             if ((I32)ARG(scan) == is_par) {
4439                 next = regnext(scan);
4440
4441                 if ( next && (OP(next) != WHILEM) && next < last)
4442                     is_par = 0;         /* Disable optimization */
4443             }
4444             if (data)
4445                 *(data->last_closep) = ARG(scan);
4446         }
4447         else if (OP(scan) == EVAL) {
4448                 if (data)
4449                     data->flags |= SF_HAS_EVAL;
4450         }
4451         else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4452             if (flags & SCF_DO_SUBSTR) {
4453                 SCAN_COMMIT(pRExC_state,data,minlenp);
4454                 flags &= ~SCF_DO_SUBSTR;
4455             }
4456             if (data && OP(scan)==ACCEPT) {
4457                 data->flags |= SCF_SEEN_ACCEPT;
4458                 if (stopmin > min)
4459                     stopmin = min;
4460             }
4461         }
4462         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4463         {
4464                 if (flags & SCF_DO_SUBSTR) {
4465                     SCAN_COMMIT(pRExC_state,data,minlenp);
4466                     data->longest = &(data->longest_float);
4467                 }
4468                 is_inf = is_inf_internal = 1;
4469                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4470                     cl_anything(pRExC_state, data->start_class);
4471                 flags &= ~SCF_DO_STCLASS;
4472         }
4473         else if (OP(scan) == GPOS) {
4474             if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4475                 !(delta || is_inf || (data && data->pos_delta)))
4476             {
4477                 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4478                     RExC_rx->extflags |= RXf_ANCH_GPOS;
4479                 if (RExC_rx->gofs < (U32)min)
4480                     RExC_rx->gofs = min;
4481             } else {
4482                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4483                 RExC_rx->gofs = 0;
4484             }
4485         }
4486 #ifdef TRIE_STUDY_OPT
4487 #ifdef FULL_TRIE_STUDY
4488         else if (PL_regkind[OP(scan)] == TRIE) {
4489             /* NOTE - There is similar code to this block above for handling
4490                BRANCH nodes on the initial study.  If you change stuff here
4491                check there too. */
4492             regnode *trie_node= scan;
4493             regnode *tail= regnext(scan);
4494             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4495             I32 max1 = 0, min1 = I32_MAX;
4496             struct regnode_charclass_class accum;
4497
4498             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4499                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4500             if (flags & SCF_DO_STCLASS)
4501                 cl_init_zero(pRExC_state, &accum);
4502
4503             if (!trie->jump) {
4504                 min1= trie->minlen;
4505                 max1= trie->maxlen;
4506             } else {
4507                 const regnode *nextbranch= NULL;
4508                 U32 word;
4509
4510                 for ( word=1 ; word <= trie->wordcount ; word++)
4511                 {
4512                     I32 deltanext=0, minnext=0, f = 0, fake;
4513                     struct regnode_charclass_class this_class;
4514
4515                     data_fake.flags = 0;
4516                     if (data) {
4517                         data_fake.whilem_c = data->whilem_c;
4518                         data_fake.last_closep = data->last_closep;
4519                     }
4520                     else
4521                         data_fake.last_closep = &fake;
4522                     data_fake.pos_delta = delta;
4523                     if (flags & SCF_DO_STCLASS) {
4524                         cl_init(pRExC_state, &this_class);
4525                         data_fake.start_class = &this_class;
4526                         f = SCF_DO_STCLASS_AND;
4527                     }
4528                     if (flags & SCF_WHILEM_VISITED_POS)
4529                         f |= SCF_WHILEM_VISITED_POS;
4530
4531                     if (trie->jump[word]) {
4532                         if (!nextbranch)
4533                             nextbranch = trie_node + trie->jump[0];
4534                         scan= trie_node + trie->jump[word];
4535                         /* We go from the jump point to the branch that follows
4536                            it. Note this means we need the vestigal unused branches
4537                            even though they arent otherwise used.
4538                          */
4539                         minnext = study_chunk(pRExC_state, &scan, minlenp,
4540                             &deltanext, (regnode *)nextbranch, &data_fake,
4541                             stopparen, recursed, NULL, f,depth+1);
4542                     }
4543                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4544                         nextbranch= regnext((regnode*)nextbranch);
4545
4546                     if (min1 > (I32)(minnext + trie->minlen))
4547                         min1 = minnext + trie->minlen;
4548                     if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4549                         max1 = minnext + deltanext + trie->maxlen;
4550                     if (deltanext == I32_MAX)
4551                         is_inf = is_inf_internal = 1;
4552
4553                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4554                         pars++;
4555                     if (data_fake.flags & SCF_SEEN_ACCEPT) {
4556                         if ( stopmin > min + min1)
4557                             stopmin = min + min1;
4558                         flags &= ~SCF_DO_SUBSTR;
4559                         if (data)
4560                             data->flags |= SCF_SEEN_ACCEPT;
4561                     }
4562                     if (data) {
4563                         if (data_fake.flags & SF_HAS_EVAL)
4564                             data->flags |= SF_HAS_EVAL;
4565                         data->whilem_c = data_fake.whilem_c;
4566                     }
4567                     if (flags & SCF_DO_STCLASS)
4568                         cl_or(pRExC_state, &accum, &this_class);
4569                 }
4570             }
4571             if (flags & SCF_DO_SUBSTR) {
4572                 data->pos_min += min1;
4573                 data->pos_delta += max1 - min1;
4574                 if (max1 != min1 || is_inf)
4575                     data->longest = &(data->longest_float);
4576             }
4577             min += min1;
4578             delta += max1 - min1;
4579             if (flags & SCF_DO_STCLASS_OR) {
4580                 cl_or(pRExC_state, data->start_class, &accum);
4581                 if (min1) {
4582                     cl_and(data->start_class, and_withp);
4583                     flags &= ~SCF_DO_STCLASS;
4584                 }
4585             }
4586             else if (flags & SCF_DO_STCLASS_AND) {
4587                 if (min1) {
4588                     cl_and(data->start_class, &accum);
4589                     flags &= ~SCF_DO_STCLASS;
4590                 }
4591                 else {
4592                     /* Switch to OR mode: cache the old value of
4593                      * data->start_class */
4594                     INIT_AND_WITHP;
4595                     StructCopy(data->start_class, and_withp,
4596                                struct regnode_charclass_class);
4597                     flags &= ~SCF_DO_STCLASS_AND;
4598                     StructCopy(&accum, data->start_class,
4599                                struct regnode_charclass_class);
4600                     flags |= SCF_DO_STCLASS_OR;
4601                     data->start_class->flags |= ANYOF_EOS;
4602                 }
4603             }
4604             scan= tail;
4605             continue;
4606         }
4607 #else
4608         else if (PL_regkind[OP(scan)] == TRIE) {
4609             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4610             U8*bang=NULL;
4611
4612             min += trie->minlen;
4613             delta += (trie->maxlen - trie->minlen);
4614             flags &= ~SCF_DO_STCLASS; /* xxx */
4615             if (flags & SCF_DO_SUBSTR) {
4616                 SCAN_COMMIT(pRExC_state,data,minlenp);  /* Cannot expect anything... */
4617                 data->pos_min += trie->minlen;
4618                 data->pos_delta += (trie->maxlen - trie->minlen);
4619                 if (trie->maxlen != trie->minlen)
4620                     data->longest = &(data->longest_float);
4621             }
4622             if (trie->jump) /* no more substrings -- for now /grr*/
4623                 flags &= ~SCF_DO_SUBSTR;
4624         }
4625 #endif /* old or new */
4626 #endif /* TRIE_STUDY_OPT */
4627
4628         /* Else: zero-length, ignore. */
4629         scan = regnext(scan);
4630     }
4631     if (frame) {
4632         last = frame->last;
4633         scan = frame->next;
4634         stopparen = frame->stop;
4635         frame = frame->prev;
4636         goto fake_study_recurse;
4637     }
4638
4639   finish:
4640     assert(!frame);
4641     DEBUG_STUDYDATA("pre-fin:",data,depth);
4642
4643     *scanp = scan;
4644     *deltap = is_inf_internal ? I32_MAX : delta;
4645     if (flags & SCF_DO_SUBSTR && is_inf)
4646         data->pos_delta = I32_MAX - data->pos_min;
4647     if (is_par > (I32)U8_MAX)
4648         is_par = 0;
4649     if (is_par && pars==1 && data) {
4650         data->flags |= SF_IN_PAR;
4651         data->flags &= ~SF_HAS_PAR;
4652     }
4653     else if (pars && data) {
4654         data->flags |= SF_HAS_PAR;
4655         data->flags &= ~SF_IN_PAR;
4656     }
4657     if (flags & SCF_DO_STCLASS_OR)
4658         cl_and(data->start_class, and_withp);
4659     if (flags & SCF_TRIE_RESTUDY)
4660         data->flags |=  SCF_TRIE_RESTUDY;
4661
4662     DEBUG_STUDYDATA("post-fin:",data,depth);
4663
4664     return min < stopmin ? min : stopmin;
4665 }
4666
4667 STATIC U32
4668 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4669 {
4670     U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4671
4672     PERL_ARGS_ASSERT_ADD_DATA;
4673
4674     Renewc(RExC_rxi->data,
4675            sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4676            char, struct reg_data);
4677     if(count)
4678         Renew(RExC_rxi->data->what, count + n, U8);
4679     else
4680         Newx(RExC_rxi->data->what, n, U8);
4681     RExC_rxi->data->count = count + n;
4682     Copy(s, RExC_rxi->data->what + count, n, U8);
4683     return count;
4684 }
4685
4686 /*XXX: todo make this not included in a non debugging perl */
4687 #ifndef PERL_IN_XSUB_RE
4688 void
4689 Perl_reginitcolors(pTHX)
4690 {
4691     dVAR;
4692     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4693     if (s) {
4694         char *t = savepv(s);
4695         int i = 0;
4696         PL_colors[0] = t;
4697         while (++i < 6) {
4698             t = strchr(t, '\t');
4699             if (t) {
4700                 *t = '\0';
4701                 PL_colors[i] = ++t;
4702             }
4703             else
4704                 PL_colors[i] = t = (char *)"";
4705         }
4706     } else {
4707         int i = 0;
4708         while (i < 6)
4709             PL_colors[i++] = (char *)"";
4710     }
4711     PL_colorset = 1;
4712 }
4713 #endif
4714
4715
4716 #ifdef TRIE_STUDY_OPT
4717 #define CHECK_RESTUDY_GOTO                                  \
4718         if (                                                \
4719               (data.flags & SCF_TRIE_RESTUDY)               \
4720               && ! restudied++                              \
4721         )     goto reStudy
4722 #else
4723 #define CHECK_RESTUDY_GOTO
4724 #endif
4725
4726 /*
4727  - pregcomp - compile a regular expression into internal code
4728  *
4729  * We can't allocate space until we know how big the compiled form will be,
4730  * but we can't compile it (and thus know how big it is) until we've got a
4731  * place to put the code.  So we cheat:  we compile it twice, once with code
4732  * generation turned off and size counting turned on, and once "for real".
4733  * This also means that we don't allocate space until we are sure that the
4734  * thing really will compile successfully, and we never have to move the
4735  * code and thus invalidate pointers into it.  (Note that it has to be in
4736  * one piece because free() must be able to free it all.) [NB: not true in perl]
4737  *
4738  * Beware that the optimization-preparation code in here knows about some
4739  * of the structure of the compiled regexp.  [I'll say.]
4740  */
4741
4742
4743
4744 #ifndef PERL_IN_XSUB_RE
4745 #define RE_ENGINE_PTR &PL_core_reg_engine
4746 #else
4747 extern const struct regexp_engine my_reg_engine;
4748 #define RE_ENGINE_PTR &my_reg_engine
4749 #endif
4750
4751 #ifndef PERL_IN_XSUB_RE
4752 REGEXP *
4753 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4754 {
4755     dVAR;
4756     HV * const table = GvHV(PL_hintgv);
4757
4758     PERL_ARGS_ASSERT_PREGCOMP;
4759
4760     /* Dispatch a request to compile a regexp to correct
4761        regexp engine. */
4762     if (table) {
4763         SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4764         GET_RE_DEBUG_FLAGS_DECL;
4765         if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4766             const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4767             DEBUG_COMPILE_r({
4768                 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4769                     SvIV(*ptr));
4770             });
4771             return CALLREGCOMP_ENG(eng, pattern, flags);
4772         }
4773     }
4774     return Perl_re_compile(aTHX_ pattern, flags);
4775 }
4776 #endif
4777
4778 REGEXP *
4779 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4780 {
4781     dVAR;
4782     REGEXP *rx;
4783     struct regexp *r;
4784     register regexp_internal *ri;
4785     STRLEN plen;
4786     char* VOL exp;
4787     char* xend;
4788     regnode *scan;
4789     I32 flags;
4790     I32 minlen = 0;
4791     U32 pm_flags;
4792
4793     /* these are all flags - maybe they should be turned
4794      * into a single int with different bit masks */
4795     I32 sawlookahead = 0;
4796     I32 sawplus = 0;
4797     I32 sawopen = 0;
4798     bool used_setjump = FALSE;
4799     regex_charset initial_charset = get_regex_charset(orig_pm_flags);
4800
4801     U8 jump_ret = 0;
4802     dJMPENV;
4803     scan_data_t data;
4804     RExC_state_t RExC_state;
4805     RExC_state_t * const pRExC_state = &RExC_state;
4806 #ifdef TRIE_STUDY_OPT
4807     int restudied;
4808     RExC_state_t copyRExC_state;
4809 #endif
4810     GET_RE_DEBUG_FLAGS_DECL;
4811
4812     PERL_ARGS_ASSERT_RE_COMPILE;
4813
4814     DEBUG_r(if (!PL_colorset) reginitcolors());
4815
4816     exp = SvPV(pattern, plen);
4817
4818     if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
4819         RExC_utf8 = RExC_orig_utf8 = 0;
4820     }
4821     else {
4822         RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4823     }
4824     RExC_uni_semantics = 0;
4825     RExC_contains_locale = 0;
4826
4827     /****************** LONG JUMP TARGET HERE***********************/
4828     /* Longjmp back to here if have to switch in midstream to utf8 */
4829     if (! RExC_orig_utf8) {
4830         JMPENV_PUSH(jump_ret);
4831         used_setjump = TRUE;
4832     }
4833
4834     if (jump_ret == 0) {    /* First time through */
4835         xend = exp + plen;
4836
4837         DEBUG_COMPILE_r({
4838             SV *dsv= sv_newmortal();
4839             RE_PV_QUOTED_DECL(s, RExC_utf8,
4840                 dsv, exp, plen, 60);
4841             PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4842                            PL_colors[4],PL_colors[5],s);
4843         });
4844     }
4845     else {  /* longjumped back */
4846         STRLEN len = plen;
4847
4848         /* If the cause for the longjmp was other than changing to utf8, pop
4849          * our own setjmp, and longjmp to the correct handler */
4850         if (jump_ret != UTF8_LONGJMP) {
4851             JMPENV_POP;
4852             JMPENV_JUMP(jump_ret);
4853         }
4854
4855         GET_RE_DEBUG_FLAGS;
4856
4857         /* It's possible to write a regexp in ascii that represents Unicode
4858         codepoints outside of the byte range, such as via \x{100}. If we
4859         detect such a sequence we have to convert the entire pattern to utf8
4860         and then recompile, as our sizing calculation will have been based
4861         on 1 byte == 1 character, but we will need to use utf8 to encode
4862         at least some part of the pattern, and therefore must convert the whole
4863         thing.
4864         -- dmq */
4865         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4866             "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4867         exp = (char*)Perl_bytes_to_utf8(aTHX_
4868                                         (U8*)SvPV_nomg(pattern, plen),
4869                                         &len);
4870         xend = exp + len;
4871         RExC_orig_utf8 = RExC_utf8 = 1;
4872         SAVEFREEPV(exp);
4873     }
4874
4875 #ifdef TRIE_STUDY_OPT
4876     restudied = 0;
4877 #endif
4878
4879     pm_flags = orig_pm_flags;
4880
4881     if (initial_charset == REGEX_LOCALE_CHARSET) {
4882         RExC_contains_locale = 1;
4883     }
4884     else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
4885
4886         /* Set to use unicode semantics if the pattern is in utf8 and has the
4887          * 'depends' charset specified, as it means unicode when utf8  */
4888         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4889     }
4890
4891     RExC_precomp = exp;
4892     RExC_flags = pm_flags;
4893     RExC_sawback = 0;
4894
4895     RExC_seen = 0;
4896     RExC_in_lookbehind = 0;
4897     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4898     RExC_seen_evals = 0;
4899     RExC_extralen = 0;
4900     RExC_override_recoding = 0;
4901
4902     /* First pass: determine size, legality. */
4903     RExC_parse = exp;
4904     RExC_start = exp;
4905     RExC_end = xend;
4906     RExC_naughty = 0;
4907     RExC_npar = 1;
4908     RExC_nestroot = 0;
4909     RExC_size = 0L;
4910     RExC_emit = &PL_regdummy;
4911     RExC_whilem_seen = 0;
4912     RExC_open_parens = NULL;
4913     RExC_close_parens = NULL;
4914     RExC_opend = NULL;
4915     RExC_paren_names = NULL;
4916 #ifdef DEBUGGING
4917     RExC_paren_name_list = NULL;
4918 #endif
4919     RExC_recurse = NULL;
4920     RExC_recurse_count = 0;
4921
4922 #if 0 /* REGC() is (currently) a NOP at the first pass.
4923        * Clever compilers notice this and complain. --jhi */
4924     REGC((U8)REG_MAGIC, (char*)RExC_emit);
4925 #endif
4926     DEBUG_PARSE_r(
4927         PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
4928         RExC_lastnum=0;
4929         RExC_lastparse=NULL;
4930     );
4931     if (reg(pRExC_state, 0, &flags,1) == NULL) {
4932         RExC_precomp = NULL;
4933         return(NULL);
4934     }
4935
4936     /* Here, finished first pass.  Get rid of any added setjmp */
4937     if (used_setjump) {
4938         JMPENV_POP;
4939     }
4940
4941     DEBUG_PARSE_r({
4942         PerlIO_printf(Perl_debug_log,
4943             "Required size %"IVdf" nodes\n"
4944             "Starting second pass (creation)\n",
4945             (IV)RExC_size);
4946         RExC_lastnum=0;
4947         RExC_lastparse=NULL;
4948     });
4949
4950     /* The first pass could have found things that force Unicode semantics */
4951     if ((RExC_utf8 || RExC_uni_semantics)
4952          && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
4953     {
4954         set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4955     }
4956
4957     /* Small enough for pointer-storage convention?
4958        If extralen==0, this means that we will not need long jumps. */
4959     if (RExC_size >= 0x10000L && RExC_extralen)
4960         RExC_size += RExC_extralen;
4961     else
4962         RExC_extralen = 0;
4963     if (RExC_whilem_seen > 15)
4964         RExC_whilem_seen = 15;
4965
4966     /* Allocate space and zero-initialize. Note, the two step process
4967        of zeroing when in debug mode, thus anything assigned has to
4968        happen after that */
4969     rx = (REGEXP*) newSV_type(SVt_REGEXP);
4970     r = (struct regexp*)SvANY(rx);
4971     Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4972          char, regexp_internal);
4973     if ( r == NULL || ri == NULL )
4974         FAIL("Regexp out of space");
4975 #ifdef DEBUGGING
4976     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4977     Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4978 #else
4979     /* bulk initialize base fields with 0. */
4980     Zero(ri, sizeof(regexp_internal), char);
4981 #endif
4982
4983     /* non-zero initialization begins here */
4984     RXi_SET( r, ri );
4985     r->engine= RE_ENGINE_PTR;
4986     r->extflags = pm_flags;
4987     {
4988         bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4989         bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
4990
4991         /* The caret is output if there are any defaults: if not all the STD
4992          * flags are set, or if no character set specifier is needed */
4993         bool has_default =
4994                     (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4995                     || ! has_charset);
4996         bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4997         U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4998                             >> RXf_PMf_STD_PMMOD_SHIFT);
4999         const char *fptr = STD_PAT_MODS;        /*"msix"*/
5000         char *p;
5001         /* Allocate for the worst case, which is all the std flags are turned
5002          * on.  If more precision is desired, we could do a population count of
5003          * the flags set.  This could be done with a small lookup table, or by
5004          * shifting, masking and adding, or even, when available, assembly
5005          * language for a machine-language population count.
5006          * We never output a minus, as all those are defaults, so are
5007          * covered by the caret */
5008         const STRLEN wraplen = plen + has_p + has_runon
5009             + has_default       /* If needs a caret */
5010
5011                 /* If needs a character set specifier */
5012             + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5013             + (sizeof(STD_PAT_MODS) - 1)
5014             + (sizeof("(?:)") - 1);
5015
5016         p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5017         SvPOK_on(rx);
5018         SvFLAGS(rx) |= SvUTF8(pattern);
5019         *p++='('; *p++='?';
5020
5021         /* If a default, cover it using the caret */
5022         if (has_default) {
5023             *p++= DEFAULT_PAT_MOD;
5024         }
5025         if (has_charset) {
5026             STRLEN len;
5027             const char* const name = get_regex_charset_name(r->extflags, &len);
5028             Copy(name, p, len, char);
5029             p += len;
5030         }
5031         if (has_p)
5032             *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5033         {
5034             char ch;
5035             while((ch = *fptr++)) {
5036                 if(reganch & 1)
5037                     *p++ = ch;
5038                 reganch >>= 1;
5039             }
5040         }
5041
5042         *p++ = ':';
5043         Copy(RExC_precomp, p, plen, char);
5044         assert ((RX_WRAPPED(rx) - p) < 16);
5045         r->pre_prefix = p - RX_WRAPPED(rx);
5046         p += plen;
5047         if (has_runon)
5048             *p++ = '\n';
5049         *p++ = ')';
5050         *p = 0;
5051         SvCUR_set(rx, p - SvPVX_const(rx));
5052     }
5053
5054     r->intflags = 0;
5055     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5056
5057     if (RExC_seen & REG_SEEN_RECURSE) {
5058         Newxz(RExC_open_parens, RExC_npar,regnode *);
5059         SAVEFREEPV(RExC_open_parens);
5060         Newxz(RExC_close_parens,RExC_npar,regnode *);
5061         SAVEFREEPV(RExC_close_parens);
5062     }
5063
5064     /* Useful during FAIL. */
5065 #ifdef RE_TRACK_PATTERN_OFFSETS
5066     Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5067     DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5068                           "%s %"UVuf" bytes for offset annotations.\n",
5069                           ri->u.offsets ? "Got" : "Couldn't get",
5070                           (UV)((2*RExC_size+1) * sizeof(U32))));
5071 #endif
5072     SetProgLen(ri,RExC_size);
5073     RExC_rx_sv = rx;
5074     RExC_rx = r;
5075     RExC_rxi = ri;
5076
5077     /* Second pass: emit code. */
5078     RExC_flags = pm_flags;      /* don't let top level (?i) bleed */
5079     RExC_parse = exp;
5080     RExC_end = xend;
5081     RExC_naughty = 0;
5082     RExC_npar = 1;
5083     RExC_emit_start = ri->program;
5084     RExC_emit = ri->program;
5085     RExC_emit_bound = ri->program + RExC_size + 1;
5086
5087     /* Store the count of eval-groups for security checks: */
5088     RExC_rx->seen_evals = RExC_seen_evals;
5089     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5090     if (reg(pRExC_state, 0, &flags,1) == NULL) {
5091         ReREFCNT_dec(rx);
5092         return(NULL);
5093     }
5094     /* XXXX To minimize changes to RE engine we always allocate
5095        3-units-long substrs field. */
5096     Newx(r->substrs, 1, struct reg_substr_data);
5097     if (RExC_recurse_count) {
5098         Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5099         SAVEFREEPV(RExC_recurse);
5100     }
5101
5102 reStudy:
5103     r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5104     Zero(r->substrs, 1, struct reg_substr_data);
5105
5106 #ifdef TRIE_STUDY_OPT
5107     if (!restudied) {
5108         StructCopy(&zero_scan_data, &data, scan_data_t);
5109         copyRExC_state = RExC_state;
5110     } else {
5111         U32 seen=RExC_seen;
5112         DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5113
5114         RExC_state = copyRExC_state;
5115         if (seen & REG_TOP_LEVEL_BRANCHES)
5116             RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5117         else
5118             RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5119         if (data.last_found) {
5120             SvREFCNT_dec(data.longest_fixed);
5121             SvREFCNT_dec(data.longest_float);
5122             SvREFCNT_dec(data.last_found);
5123         }
5124         StructCopy(&zero_scan_data, &data, scan_data_t);
5125     }
5126 #else
5127     StructCopy(&zero_scan_data, &data, scan_data_t);
5128 #endif
5129
5130     /* Dig out information for optimizations. */
5131     r->extflags = RExC_flags; /* was pm_op */
5132     /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5133
5134     if (UTF)
5135         SvUTF8_on(rx);  /* Unicode in it? */
5136     ri->regstclass = NULL;
5137     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
5138         r->intflags |= PREGf_NAUGHTY;
5139     scan = ri->program + 1;             /* First BRANCH. */
5140
5141     /* testing for BRANCH here tells us whether there is "must appear"
5142        data in the pattern. If there is then we can use it for optimisations */
5143     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
5144         I32 fake;
5145         STRLEN longest_float_length, longest_fixed_length;
5146         struct regnode_charclass_class ch_class; /* pointed to by data */
5147         int stclass_flag;
5148         I32 last_close = 0; /* pointed to by data */
5149         regnode *first= scan;
5150         regnode *first_next= regnext(first);
5151         /*
5152          * Skip introductions and multiplicators >= 1
5153          * so that we can extract the 'meat' of the pattern that must
5154          * match in the large if() sequence following.
5155          * NOTE that EXACT is NOT covered here, as it is normally
5156          * picked up by the optimiser separately.
5157          *
5158          * This is unfortunate as the optimiser isnt handling lookahead
5159          * properly currently.
5160          *
5161          */
5162         while ((OP(first) == OPEN && (sawopen = 1)) ||
5163                /* An OR of *one* alternative - should not happen now. */
5164             (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5165             /* for now we can't handle lookbehind IFMATCH*/
5166             (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5167             (OP(first) == PLUS) ||
5168             (OP(first) == MINMOD) ||
5169                /* An {n,m} with n>0 */
5170             (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5171             (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5172         {
5173                 /*
5174                  * the only op that could be a regnode is PLUS, all the rest
5175                  * will be regnode_1 or regnode_2.
5176                  *
5177                  */
5178                 if (OP(first) == PLUS)
5179                     sawplus = 1;
5180                 else
5181                     first += regarglen[OP(first)];
5182
5183                 first = NEXTOPER(first);
5184                 first_next= regnext(first);
5185         }
5186
5187         /* Starting-point info. */
5188       again:
5189         DEBUG_PEEP("first:",first,0);
5190         /* Ignore EXACT as we deal with it later. */
5191         if (PL_regkind[OP(first)] == EXACT) {
5192             if (OP(first) == EXACT)
5193                 NOOP;   /* Empty, get anchored substr later. */
5194             else
5195                 ri->regstclass = first;
5196         }
5197 #ifdef TRIE_STCLASS
5198         else if (PL_regkind[OP(first)] == TRIE &&
5199                 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
5200         {
5201             regnode *trie_op;
5202             /* this can happen only on restudy */
5203             if ( OP(first) == TRIE ) {
5204                 struct regnode_1 *trieop = (struct regnode_1 *)
5205                     PerlMemShared_calloc(1, sizeof(struct regnode_1));
5206                 StructCopy(first,trieop,struct regnode_1);
5207                 trie_op=(regnode *)trieop;
5208             } else {
5209                 struct regnode_charclass *trieop = (struct regnode_charclass *)
5210                     PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
5211                 StructCopy(first,trieop,struct regnode_charclass);
5212                 trie_op=(regnode *)trieop;
5213             }
5214             OP(trie_op)+=2;
5215             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
5216             ri->regstclass = trie_op;
5217         }
5218 #endif
5219         else if (REGNODE_SIMPLE(OP(first)))
5220             ri->regstclass = first;
5221         else if (PL_regkind[OP(first)] == BOUND ||
5222                  PL_regkind[OP(first)] == NBOUND)
5223             ri->regstclass = first;
5224         else if (PL_regkind[OP(first)] == BOL) {
5225             r->extflags |= (OP(first) == MBOL
5226                            ? RXf_ANCH_MBOL
5227                            : (OP(first) == SBOL
5228                               ? RXf_ANCH_SBOL
5229                               : RXf_ANCH_BOL));
5230             first = NEXTOPER(first);
5231             goto again;
5232         }
5233         else if (OP(first) == GPOS) {
5234             r->extflags |= RXf_ANCH_GPOS;
5235             first = NEXTOPER(first);
5236             goto again;
5237         }
5238         else if ((!sawopen || !RExC_sawback) &&
5239             (OP(first) == STAR &&
5240             PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
5241             !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
5242         {
5243             /* turn .* into ^.* with an implied $*=1 */
5244             const int type =
5245                 (OP(NEXTOPER(first)) == REG_ANY)
5246                     ? RXf_ANCH_MBOL
5247                     : RXf_ANCH_SBOL;
5248             r->extflags |= type;
5249             r->intflags |= PREGf_IMPLICIT;
5250             first = NEXTOPER(first);
5251             goto again;
5252         }
5253         if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
5254             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
5255             /* x+ must match at the 1st pos of run of x's */
5256             r->intflags |= PREGf_SKIP;
5257
5258         /* Scan is after the zeroth branch, first is atomic matcher. */
5259 #ifdef TRIE_STUDY_OPT
5260         DEBUG_PARSE_r(
5261             if (!restudied)
5262                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5263                               (IV)(first - scan + 1))
5264         );
5265 #else
5266         DEBUG_PARSE_r(
5267             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5268                 (IV)(first - scan + 1))
5269         );
5270 #endif
5271
5272
5273         /*
5274         * If there's something expensive in the r.e., find the
5275         * longest literal string that must appear and make it the
5276         * regmust.  Resolve ties in favor of later strings, since
5277         * the regstart check works with the beginning of the r.e.
5278         * and avoiding duplication strengthens checking.  Not a
5279         * strong reason, but sufficient in the absence of others.
5280         * [Now we resolve ties in favor of the earlier string if
5281         * it happens that c_offset_min has been invalidated, since the
5282         * earlier string may buy us something the later one won't.]
5283         */
5284
5285         data.longest_fixed = newSVpvs("");
5286         data.longest_float = newSVpvs("");
5287         data.last_found = newSVpvs("");
5288         data.longest = &(data.longest_fixed);
5289         first = scan;
5290         if (!ri->regstclass) {
5291             cl_init(pRExC_state, &ch_class);
5292             data.start_class = &ch_class;
5293             stclass_flag = SCF_DO_STCLASS_AND;
5294         } else                          /* XXXX Check for BOUND? */
5295             stclass_flag = 0;
5296         data.last_closep = &last_close;
5297
5298         minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5299             &data, -1, NULL, NULL,
5300             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5301
5302
5303         CHECK_RESTUDY_GOTO;
5304
5305
5306         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5307              && data.last_start_min == 0 && data.last_end > 0
5308              && !RExC_seen_zerolen
5309              && !(RExC_seen & REG_SEEN_VERBARG)
5310              && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5311             r->extflags |= RXf_CHECK_ALL;
5312         scan_commit(pRExC_state, &data,&minlen,0);
5313         SvREFCNT_dec(data.last_found);
5314
5315         /* Note that code very similar to this but for anchored string
5316            follows immediately below, changes may need to be made to both.
5317            Be careful.
5318          */
5319         longest_float_length = CHR_SVLEN(data.longest_float);
5320         if (longest_float_length
5321             || (data.flags & SF_FL_BEFORE_EOL
5322                 && (!(data.flags & SF_FL_BEFORE_MEOL)
5323                     || (RExC_flags & RXf_PMf_MULTILINE))))
5324         {
5325             I32 t,ml;
5326
5327             /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5328             if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5329                 || (SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
5330                     && data.offset_fixed == data.offset_float_min
5331                     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
5332                     goto remove_float;          /* As in (a)+. */
5333
5334             /* copy the information about the longest float from the reg_scan_data
5335                over to the program. */
5336             if (SvUTF8(data.longest_float)) {
5337                 r->float_utf8 = data.longest_float;
5338                 r->float_substr = NULL;
5339             } else {
5340                 r->float_substr = data.longest_float;
5341                 r->float_utf8 = NULL;
5342             }
5343             /* float_end_shift is how many chars that must be matched that
5344                follow this item. We calculate it ahead of time as once the
5345                lookbehind offset is added in we lose the ability to correctly
5346                calculate it.*/
5347             ml = data.minlen_float ? *(data.minlen_float)
5348                                    : (I32)longest_float_length;
5349             r->float_end_shift = ml - data.offset_float_min
5350                 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5351                 + data.lookbehind_float;
5352             r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5353             r->float_max_offset = data.offset_float_max;
5354             if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5355                 r->float_max_offset -= data.lookbehind_float;
5356
5357             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5358                        && (!(data.flags & SF_FL_BEFORE_MEOL)
5359                            || (RExC_flags & RXf_PMf_MULTILINE)));
5360             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5361         }
5362         else {
5363           remove_float:
5364             r->float_substr = r->float_utf8 = NULL;
5365             SvREFCNT_dec(data.longest_float);
5366             longest_float_length = 0;
5367         }
5368
5369         /* Note that code very similar to this but for floating string
5370            is immediately above, changes may need to be made to both.
5371            Be careful.
5372          */
5373         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5374
5375         /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5376         if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5377             && (longest_fixed_length
5378                 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5379                     && (!(data.flags & SF_FIX_BEFORE_MEOL)
5380                         || (RExC_flags & RXf_PMf_MULTILINE)))) )
5381         {
5382             I32 t,ml;
5383
5384             /* copy the information about the longest fixed
5385                from the reg_scan_data over to the program. */
5386             if (SvUTF8(data.longest_fixed)) {
5387                 r->anchored_utf8 = data.longest_fixed;
5388                 r->anchored_substr = NULL;
5389             } else {
5390                 r->anchored_substr = data.longest_fixed;
5391                 r->anchored_utf8 = NULL;
5392             }
5393             /* fixed_end_shift is how many chars that must be matched that
5394                follow this item. We calculate it ahead of time as once the
5395                lookbehind offset is added in we lose the ability to correctly
5396                calculate it.*/
5397             ml = data.minlen_fixed ? *(data.minlen_fixed)
5398                                    : (I32)longest_fixed_length;
5399             r->anchored_end_shift = ml - data.offset_fixed
5400                 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5401                 + data.lookbehind_fixed;
5402             r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5403
5404             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5405                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
5406                      || (RExC_flags & RXf_PMf_MULTILINE)));
5407             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5408         }
5409         else {
5410             r->anchored_substr = r->anchored_utf8 = NULL;
5411             SvREFCNT_dec(data.longest_fixed);
5412             longest_fixed_length = 0;
5413         }
5414         if (ri->regstclass
5415             && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5416             ri->regstclass = NULL;
5417
5418         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5419             && stclass_flag
5420             && !(data.start_class->flags & ANYOF_EOS)
5421             && !cl_is_anything(data.start_class))
5422         {
5423             const U32 n = add_data(pRExC_state, 1, "f");
5424             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5425
5426             Newx(RExC_rxi->data->data[n], 1,
5427                 struct regnode_charclass_class);
5428             StructCopy(data.start_class,
5429                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5430                        struct regnode_charclass_class);
5431             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5432             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5433             DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5434                       regprop(r, sv, (regnode*)data.start_class);
5435                       PerlIO_printf(Perl_debug_log,
5436                                     "synthetic stclass \"%s\".\n",
5437                                     SvPVX_const(sv));});
5438         }
5439
5440         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5441         if (longest_fixed_length > longest_float_length) {
5442             r->check_end_shift = r->anchored_end_shift;
5443             r->check_substr = r->anchored_substr;
5444             r->check_utf8 = r->anchored_utf8;
5445             r->check_offset_min = r->check_offset_max = r->anchored_offset;
5446             if (r->extflags & RXf_ANCH_SINGLE)
5447                 r->extflags |= RXf_NOSCAN;
5448         }
5449         else {
5450             r->check_end_shift = r->float_end_shift;
5451             r->check_substr = r->float_substr;
5452             r->check_utf8 = r->float_utf8;
5453             r->check_offset_min = r->float_min_offset;
5454             r->check_offset_max = r->float_max_offset;
5455         }
5456         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5457            This should be changed ASAP!  */
5458         if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5459             r->extflags |= RXf_USE_INTUIT;
5460             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5461                 r->extflags |= RXf_INTUIT_TAIL;
5462         }
5463         /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5464         if ( (STRLEN)minlen < longest_float_length )
5465             minlen= longest_float_length;
5466         if ( (STRLEN)minlen < longest_fixed_length )
5467             minlen= longest_fixed_length;
5468         */
5469     }
5470     else {
5471         /* Several toplevels. Best we can is to set minlen. */
5472         I32 fake;
5473         struct regnode_charclass_class ch_class;
5474         I32 last_close = 0;
5475
5476         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5477
5478         scan = ri->program + 1;
5479         cl_init(pRExC_state, &ch_class);
5480         data.start_class = &ch_class;
5481         data.last_closep = &last_close;
5482
5483
5484         minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5485             &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5486
5487         CHECK_RESTUDY_GOTO;
5488
5489         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5490                 = r->float_substr = r->float_utf8 = NULL;
5491
5492         if (!(data.start_class->flags & ANYOF_EOS)
5493             && !cl_is_anything(data.start_class))
5494         {
5495             const U32 n = add_data(pRExC_state, 1, "f");
5496             data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5497
5498             Newx(RExC_rxi->data->data[n], 1,
5499                 struct regnode_charclass_class);
5500             StructCopy(data.start_class,
5501                        (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5502                        struct regnode_charclass_class);
5503             ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5504             r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5505             DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5506                       regprop(r, sv, (regnode*)data.start_class);
5507                       PerlIO_printf(Perl_debug_log,
5508                                     "synthetic stclass \"%s\".\n",
5509                                     SvPVX_const(sv));});
5510         }
5511     }
5512
5513     /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5514        the "real" pattern. */
5515     DEBUG_OPTIMISE_r({
5516         PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5517                       (IV)minlen, (IV)r->minlen);
5518     });
5519     r->minlenret = minlen;
5520     if (r->minlen < minlen)
5521         r->minlen = minlen;
5522
5523     if (RExC_seen & REG_SEEN_GPOS)
5524         r->extflags |= RXf_GPOS_SEEN;
5525     if (RExC_seen & REG_SEEN_LOOKBEHIND)
5526         r->extflags |= RXf_LOOKBEHIND_SEEN;
5527     if (RExC_seen & REG_SEEN_EVAL)
5528         r->extflags |= RXf_EVAL_SEEN;
5529     if (RExC_seen & REG_SEEN_CANY)
5530         r->extflags |= RXf_CANY_SEEN;
5531     if (RExC_seen & REG_SEEN_VERBARG)
5532         r->intflags |= PREGf_VERBARG_SEEN;
5533     if (RExC_seen & REG_SEEN_CUTGROUP)
5534         r->intflags |= PREGf_CUTGROUP_SEEN;
5535     if (RExC_paren_names)
5536         RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5537     else
5538         RXp_PAREN_NAMES(r) = NULL;
5539
5540 #ifdef STUPID_PATTERN_CHECKS
5541     if (RX_PRELEN(rx) == 0)
5542         r->extflags |= RXf_NULL;
5543     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5544         /* XXX: this should happen BEFORE we compile */
5545         r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5546     else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5547         r->extflags |= RXf_WHITE;
5548     else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5549         r->extflags |= RXf_START_ONLY;
5550 #else
5551     if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5552             /* XXX: this should happen BEFORE we compile */
5553             r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5554     else {
5555         regnode *first = ri->program + 1;
5556         U8 fop = OP(first);
5557
5558         if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5559             r->extflags |= RXf_NULL;
5560         else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5561             r->extflags |= RXf_START_ONLY;
5562         else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5563                              && OP(regnext(first)) == END)
5564             r->extflags |= RXf_WHITE;
5565     }
5566 #endif
5567 #ifdef DEBUGGING
5568     if (RExC_paren_names) {
5569         ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5570         ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5571     } else
5572 #endif
5573         ri->name_list_idx = 0;
5574
5575     if (RExC_recurse_count) {
5576         for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5577             const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5578             ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5579         }
5580     }
5581     Newxz(r->offs, RExC_npar, regexp_paren_pair);
5582     /* assume we don't need to swap parens around before we match */
5583
5584     DEBUG_DUMP_r({
5585         PerlIO_printf(Perl_debug_log,"Final program:\n");
5586         regdump(r);
5587     });
5588 #ifdef RE_TRACK_PATTERN_OFFSETS
5589     DEBUG_OFFSETS_r(if (ri->u.offsets) {
5590         const U32 len = ri->u.offsets[0];
5591         U32 i;
5592         GET_RE_DEBUG_FLAGS_DECL;
5593         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5594         for (i = 1; i <= len; i++) {
5595             if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5596                 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5597                 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5598             }
5599         PerlIO_printf(Perl_debug_log, "\n");
5600     });
5601 #endif
5602     return rx;
5603 }
5604
5605 #undef RE_ENGINE_PTR
5606
5607
5608 SV*
5609 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5610                     const U32 flags)
5611 {
5612     PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5613
5614     PERL_UNUSED_ARG(value);
5615
5616     if (flags & RXapif_FETCH) {
5617         return reg_named_buff_fetch(rx, key, flags);
5618     } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5619         Perl_croak_no_modify(aTHX);
5620         return NULL;
5621     } else if (flags & RXapif_EXISTS) {
5622         return reg_named_buff_exists(rx, key, flags)
5623             ? &PL_sv_yes
5624             : &PL_sv_no;
5625     } else if (flags & RXapif_REGNAMES) {
5626         return reg_named_buff_all(rx, flags);
5627     } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5628         return reg_named_buff_scalar(rx, flags);
5629     } else {
5630         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5631         return NULL;
5632     }
5633 }
5634
5635 SV*
5636 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5637                          const U32 flags)
5638 {
5639     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5640     PERL_UNUSED_ARG(lastkey);
5641
5642     if (flags & RXapif_FIRSTKEY)
5643         return reg_named_buff_firstkey(rx, flags);
5644     else if (flags & RXapif_NEXTKEY)
5645         return reg_named_buff_nextkey(rx, flags);
5646     else {
5647         Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5648         return NULL;
5649     }
5650 }
5651
5652 SV*
5653 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5654                           const U32 flags)
5655 {
5656     AV *retarray = NULL;
5657     SV *ret;
5658     struct regexp *const rx = (struct regexp *)SvANY(r);
5659
5660     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5661
5662     if (flags & RXapif_ALL)
5663         retarray=newAV();
5664
5665     if (rx && RXp_PAREN_NAMES(rx)) {
5666         HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5667         if (he_str) {
5668             IV i;
5669             SV* sv_dat=HeVAL(he_str);
5670             I32 *nums=(I32*)SvPVX(sv_dat);
5671             for ( i=0; i<SvIVX(sv_dat); i++ ) {
5672                 if ((I32)(rx->nparens) >= nums[i]
5673                     && rx->offs[nums[i]].start != -1
5674                     && rx->offs[nums[i]].end != -1)
5675                 {
5676                     ret = newSVpvs("");
5677                     CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5678                     if (!retarray)
5679                         return ret;
5680                 } else {
5681                     if (retarray)
5682                         ret = newSVsv(&PL_sv_undef);
5683                 }
5684                 if (retarray)
5685                     av_push(retarray, ret);
5686             }
5687             if (retarray)
5688                 return newRV_noinc(MUTABLE_SV(retarray));
5689         }
5690     }
5691     return NULL;
5692 }
5693
5694 bool
5695 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5696                            const U32 flags)
5697 {
5698     struct regexp *const rx = (struct regexp *)SvANY(r);
5699
5700     PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5701
5702     if (rx && RXp_PAREN_NAMES(rx)) {
5703         if (flags & RXapif_ALL) {
5704             return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5705         } else {
5706             SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5707             if (sv) {
5708                 SvREFCNT_dec(sv);
5709                 return TRUE;
5710             } else {
5711                 return FALSE;
5712             }
5713         }
5714     } else {
5715         return FALSE;
5716     }
5717 }
5718
5719 SV*
5720 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5721 {
5722     struct regexp *const rx = (struct regexp *)SvANY(r);
5723
5724     PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5725
5726     if ( rx && RXp_PAREN_NAMES(rx) ) {
5727         (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5728
5729         return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5730     } else {
5731         return FALSE;
5732     }
5733 }
5734
5735 SV*
5736 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5737 {
5738     struct regexp *const rx = (struct regexp *)SvANY(r);
5739     GET_RE_DEBUG_FLAGS_DECL;
5740
5741     PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5742
5743     if (rx && RXp_PAREN_NAMES(rx)) {
5744         HV *hv = RXp_PAREN_NAMES(rx);
5745         HE *temphe;
5746         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5747             IV i;
5748             IV parno = 0;
5749             SV* sv_dat = HeVAL(temphe);
5750             I32 *nums = (I32*)SvPVX(sv_dat);
5751             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5752                 if ((I32)(rx->lastparen) >= nums[i] &&
5753                     rx->offs[nums[i]].start != -1 &&
5754                     rx->offs[nums[i]].end != -1)
5755                 {
5756                     parno = nums[i];
5757                     break;
5758                 }
5759             }
5760             if (parno || flags & RXapif_ALL) {
5761                 return newSVhek(HeKEY_hek(temphe));
5762             }
5763         }
5764     }
5765     return NULL;
5766 }
5767
5768 SV*
5769 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5770 {
5771     SV *ret;
5772     AV *av;
5773     I32 length;
5774     struct regexp *const rx = (struct regexp *)SvANY(r);
5775
5776     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5777
5778     if (rx && RXp_PAREN_NAMES(rx)) {
5779         if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5780             return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5781         } else if (flags & RXapif_ONE) {
5782             ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5783             av = MUTABLE_AV(SvRV(ret));
5784             length = av_len(av);
5785             SvREFCNT_dec(ret);
5786             return newSViv(length + 1);
5787         } else {
5788             Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5789             return NULL;
5790         }
5791     }
5792     return &PL_sv_undef;
5793 }
5794
5795 SV*
5796 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5797 {
5798     struct regexp *const rx = (struct regexp *)SvANY(r);
5799     AV *av = newAV();
5800
5801     PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5802
5803     if (rx && RXp_PAREN_NAMES(rx)) {
5804         HV *hv= RXp_PAREN_NAMES(rx);
5805         HE *temphe;
5806         (void)hv_iterinit(hv);
5807         while ( (temphe = hv_iternext_flags(hv,0)) ) {
5808             IV i;
5809             IV parno = 0;
5810             SV* sv_dat = HeVAL(temphe);
5811             I32 *nums = (I32*)SvPVX(sv_dat);
5812             for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5813                 if ((I32)(rx->lastparen) >= nums[i] &&
5814                     rx->offs[nums[i]].start != -1 &&
5815                     rx->offs[nums[i]].end != -1)
5816                 {
5817                     parno = nums[i];
5818                     break;
5819                 }
5820             }
5821             if (parno || flags & RXapif_ALL) {
5822                 av_push(av, newSVhek(HeKEY_hek(temphe)));
5823             }
5824         }
5825     }
5826
5827     return newRV_noinc(MUTABLE_SV(av));
5828 }
5829
5830 void
5831 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5832                              SV * const sv)
5833 {
5834     struct regexp *const rx = (struct regexp *)SvANY(r);
5835     char *s = NULL;
5836     I32 i = 0;
5837     I32 s1, t1;
5838
5839     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5840
5841     if (!rx->subbeg) {
5842         sv_setsv(sv,&PL_sv_undef);
5843         return;
5844     }
5845     else
5846     if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5847         /* $` */
5848         i = rx->offs[0].start;
5849         s = rx->subbeg;
5850     }
5851     else
5852     if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5853         /* $' */
5854         s = rx->subbeg + rx->offs[0].end;
5855         i = rx->sublen - rx->offs[0].end;
5856     }
5857     else
5858     if ( 0 <= paren && paren <= (I32)rx->nparens &&
5859         (s1 = rx->offs[paren].start) != -1 &&
5860         (t1 = rx->offs[paren].end) != -1)
5861     {
5862         /* $& $1 ... */
5863         i = t1 - s1;
5864         s = rx->subbeg + s1;
5865     } else {
5866         sv_setsv(sv,&PL_sv_undef);
5867         return;
5868     }
5869     assert(rx->sublen >= (s - rx->subbeg) + i );
5870     if (i >= 0) {
5871         const int oldtainted = PL_tainted;
5872         TAINT_NOT;
5873         sv_setpvn(sv, s, i);
5874         PL_tainted = oldtainted;
5875         if ( (rx->extflags & RXf_CANY_SEEN)
5876             ? (RXp_MATCH_UTF8(rx)
5877                         && (!i || is_utf8_string((U8*)s, i)))
5878             : (RXp_MATCH_UTF8(rx)) )
5879         {
5880             SvUTF8_on(sv);
5881         }
5882         else
5883             SvUTF8_off(sv);
5884         if (PL_tainting) {
5885             if (RXp_MATCH_TAINTED(rx)) {
5886                 if (SvTYPE(sv) >= SVt_PVMG) {
5887                     MAGIC* const mg = SvMAGIC(sv);
5888                     MAGIC* mgt;
5889                     PL_tainted = 1;
5890                     SvMAGIC_set(sv, mg->mg_moremagic);
5891                     SvTAINT(sv);
5892                     if ((mgt = SvMAGIC(sv))) {
5893                         mg->mg_moremagic = mgt;
5894                         SvMAGIC_set(sv, mg);
5895                     }
5896                 } else {
5897                     PL_tainted = 1;
5898                     SvTAINT(sv);
5899                 }
5900             } else
5901                 SvTAINTED_off(sv);
5902         }
5903     } else {
5904         sv_setsv(sv,&PL_sv_undef);
5905         return;
5906     }
5907 }
5908
5909 void
5910 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5911                                                          SV const * const value)
5912 {
5913     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5914
5915     PERL_UNUSED_ARG(rx);
5916     PERL_UNUSED_ARG(paren);
5917     PERL_UNUSED_ARG(value);
5918
5919     if (!PL_localizing)
5920         Perl_croak_no_modify(aTHX);
5921 }
5922
5923 I32
5924 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5925                               const I32 paren)
5926 {
5927     struct regexp *const rx = (struct regexp *)SvANY(r);
5928     I32 i;
5929     I32 s1, t1;
5930
5931     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5932
5933     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5934         switch (paren) {
5935       /* $` / ${^PREMATCH} */
5936       case RX_BUFF_IDX_PREMATCH:
5937         if (rx->offs[0].start != -1) {
5938                         i = rx->offs[0].start;
5939                         if (i > 0) {
5940                                 s1 = 0;
5941                                 t1 = i;
5942                                 goto getlen;
5943                         }
5944             }
5945         return 0;
5946       /* $' / ${^POSTMATCH} */
5947       case RX_BUFF_IDX_POSTMATCH:
5948             if (rx->offs[0].end != -1) {
5949                         i = rx->sublen - rx->offs[0].end;
5950                         if (i > 0) {
5951                                 s1 = rx->offs[0].end;
5952                                 t1 = rx->sublen;
5953                                 goto getlen;
5954                         }
5955             }
5956         return 0;
5957       /* $& / ${^MATCH}, $1, $2, ... */
5958       default:
5959             if (paren <= (I32)rx->nparens &&
5960             (s1 = rx->offs[paren].start) != -1 &&
5961             (t1 = rx->offs[paren].end) != -1)
5962             {
5963             i = t1 - s1;
5964             goto getlen;
5965         } else {
5966             if (ckWARN(WARN_UNINITIALIZED))
5967                 report_uninit((const SV *)sv);
5968             return 0;
5969         }
5970     }
5971   getlen:
5972     if (i > 0 && RXp_MATCH_UTF8(rx)) {
5973         const char * const s = rx->subbeg + s1;
5974         const U8 *ep;
5975         STRLEN el;
5976
5977         i = t1 - s1;
5978         if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5979                         i = el;
5980     }
5981     return i;
5982 }
5983
5984 SV*
5985 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5986 {
5987     PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5988         PERL_UNUSED_ARG(rx);
5989         if (0)
5990             return NULL;
5991         else
5992             return newSVpvs("Regexp");
5993 }
5994
5995 /* Scans the name of a named buffer from the pattern.
5996  * If flags is REG_RSN_RETURN_NULL returns null.
5997  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5998  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5999  * to the parsed name as looked up in the RExC_paren_names hash.
6000  * If there is an error throws a vFAIL().. type exception.
6001  */
6002
6003 #define REG_RSN_RETURN_NULL    0
6004 #define REG_RSN_RETURN_NAME    1
6005 #define REG_RSN_RETURN_DATA    2
6006
6007 STATIC SV*
6008 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6009 {
6010     char *name_start = RExC_parse;
6011
6012     PERL_ARGS_ASSERT_REG_SCAN_NAME;
6013
6014     if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6015          /* skip IDFIRST by using do...while */
6016         if (UTF)
6017             do {
6018                 RExC_parse += UTF8SKIP(RExC_parse);
6019             } while (isALNUM_utf8((U8*)RExC_parse));
6020         else
6021             do {
6022                 RExC_parse++;
6023             } while (isALNUM(*RExC_parse));
6024     }
6025
6026     if ( flags ) {
6027         SV* sv_name
6028             = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6029                              SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6030         if ( flags == REG_RSN_RETURN_NAME)
6031             return sv_name;
6032         else if (flags==REG_RSN_RETURN_DATA) {
6033             HE *he_str = NULL;
6034             SV *sv_dat = NULL;
6035             if ( ! sv_name )      /* should not happen*/
6036                 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6037             if (RExC_paren_names)
6038                 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6039             if ( he_str )
6040                 sv_dat = HeVAL(he_str);
6041             if ( ! sv_dat )
6042                 vFAIL("Reference to nonexistent named group");
6043             return sv_dat;
6044         }
6045         else {
6046             Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6047                        (unsigned long) flags);
6048         }
6049         /* NOT REACHED */
6050     }
6051     return NULL;
6052 }
6053
6054 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6055     int rem=(int)(RExC_end - RExC_parse);                       \
6056     int cut;                                                    \
6057     int num;                                                    \
6058     int iscut=0;                                                \
6059     if (rem>10) {                                               \
6060         rem=10;                                                 \
6061         iscut=1;                                                \
6062     }                                                           \
6063     cut=10-rem;                                                 \
6064     if (RExC_lastparse!=RExC_parse)                             \
6065         PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6066             rem, RExC_parse,                                    \
6067             cut + 4,                                            \
6068             iscut ? "..." : "<"                                 \
6069         );                                                      \
6070     else                                                        \
6071         PerlIO_printf(Perl_debug_log,"%16s","");                \
6072                                                                 \
6073     if (SIZE_ONLY)                                              \
6074        num = RExC_size + 1;                                     \
6075     else                                                        \
6076        num=REG_NODE_NUM(RExC_emit);                             \
6077     if (RExC_lastnum!=num)                                      \
6078        PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6079     else                                                        \
6080        PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6081     PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6082         (int)((depth*2)), "",                                   \
6083         (funcname)                                              \
6084     );                                                          \
6085     RExC_lastnum=num;                                           \
6086     RExC_lastparse=RExC_parse;                                  \
6087 })
6088
6089
6090
6091 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6092     DEBUG_PARSE_MSG((funcname));                            \
6093     PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6094 })
6095 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6096     DEBUG_PARSE_MSG((funcname));                            \
6097     PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6098 })
6099
6100 /* This section of code defines the inversion list object and its methods.  The
6101  * interfaces are highly subject to change, so as much as possible is static to
6102  * this file.  An inversion list is here implemented as a malloc'd C UV array
6103  * with some added info that is placed as UVs at the beginning in a header
6104  * portion.  An inversion list for Unicode is an array of code points, sorted
6105  * by ordinal number.  The zeroth element is the first code point in the list.
6106  * The 1th element is the first element beyond that not in the list.  In other
6107  * words, the first range is
6108  *  invlist[0]..(invlist[1]-1)
6109  * The other ranges follow.  Thus every element whose index is divisible by two
6110  * marks the beginning of a range that is in the list, and every element not
6111  * divisible by two marks the beginning of a range not in the list.  A single
6112  * element inversion list that contains the single code point N generally
6113  * consists of two elements
6114  *  invlist[0] == N
6115  *  invlist[1] == N+1
6116  * (The exception is when N is the highest representable value on the
6117  * machine, in which case the list containing just it would be a single
6118  * element, itself.  By extension, if the last range in the list extends to
6119  * infinity, then the first element of that range will be in the inversion list
6120  * at a position that is divisible by two, and is the final element in the
6121  * list.)
6122  * Taking the complement (inverting) an inversion list is quite simple, if the
6123  * first element is 0, remove it; otherwise add a 0 element at the beginning.
6124  * This implementation reserves an element at the beginning of each inversion list
6125  * to contain 0 when the list contains 0, and contains 1 otherwise.  The actual
6126  * beginning of the list is either that element if 0, or the next one if 1.
6127  *
6128  * More about inversion lists can be found in "Unicode Demystified"
6129  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6130  * More will be coming when functionality is added later.
6131  *
6132  * The inversion list data structure is currently implemented as an SV pointing
6133  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
6134  * array of UV whose memory management is automatically handled by the existing
6135  * facilities for SV's.
6136  *
6137  * Some of the methods should always be private to the implementation, and some
6138  * should eventually be made public */
6139
6140 #define INVLIST_LEN_OFFSET 0    /* Number of elements in the inversion list */
6141 #define INVLIST_ITER_OFFSET 1   /* Current iteration position */
6142
6143 #define INVLIST_ZERO_OFFSET 2   /* 0 or 1; must be last element in header */
6144 /* The UV at position ZERO contains either 0 or 1.  If 0, the inversion list
6145  * contains the code point U+00000, and begins here.  If 1, the inversion list
6146  * doesn't contain U+0000, and it begins at the next UV in the array.
6147  * Inverting an inversion list consists of adding or removing the 0 at the
6148  * beginning of it.  By reserving a space for that 0, inversion can be made
6149  * very fast */
6150
6151 #define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
6152
6153 /* Internally things are UVs */
6154 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6155 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6156
6157 #define INVLIST_INITIAL_LEN 10
6158
6159 PERL_STATIC_INLINE UV*
6160 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6161 {
6162     /* Returns a pointer to the first element in the inversion list's array.
6163      * This is called upon initialization of an inversion list.  Where the
6164      * array begins depends on whether the list has the code point U+0000
6165      * in it or not.  The other parameter tells it whether the code that
6166      * follows this call is about to put a 0 in the inversion list or not.
6167      * The first element is either the element with 0, if 0, or the next one,
6168      * if 1 */
6169
6170     UV* zero = get_invlist_zero_addr(invlist);
6171
6172     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6173
6174     /* Must be empty */
6175     assert(! *get_invlist_len_addr(invlist));
6176
6177     /* 1^1 = 0; 1^0 = 1 */
6178     *zero = 1 ^ will_have_0;
6179     return zero + *zero;
6180 }
6181
6182 PERL_STATIC_INLINE UV*
6183 S_invlist_array(pTHX_ SV* const invlist)
6184 {
6185     /* Returns the pointer to the inversion list's array.  Every time the
6186      * length changes, this needs to be called in case malloc or realloc moved
6187      * it */
6188
6189     PERL_ARGS_ASSERT_INVLIST_ARRAY;
6190
6191     /* Must not be empty.  If these fail, you probably didn't check for <len>
6192      * being non-zero before trying to get the array */
6193     assert(*get_invlist_len_addr(invlist));
6194     assert(*get_invlist_zero_addr(invlist) == 0
6195            || *get_invlist_zero_addr(invlist) == 1);
6196
6197     /* The array begins either at the element reserved for zero if the
6198      * list contains 0 (that element will be set to 0), or otherwise the next
6199      * element (in which case the reserved element will be set to 1). */
6200     return (UV *) (get_invlist_zero_addr(invlist)
6201                    + *get_invlist_zero_addr(invlist));
6202 }
6203
6204 PERL_STATIC_INLINE UV*
6205 S_get_invlist_len_addr(pTHX_ SV* invlist)
6206 {
6207     /* Return the address of the UV that contains the current number
6208      * of used elements in the inversion list */
6209
6210     PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
6211
6212     return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
6213 }
6214
6215 PERL_STATIC_INLINE UV
6216 S_invlist_len(pTHX_ SV* const invlist)
6217 {
6218     /* Returns the current number of elements stored in the inversion list's
6219      * array */
6220
6221     PERL_ARGS_ASSERT_INVLIST_LEN;
6222
6223     return *get_invlist_len_addr(invlist);
6224 }
6225
6226 PERL_STATIC_INLINE void
6227 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
6228 {
6229     /* Sets the current number of elements stored in the inversion list */
6230
6231     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
6232
6233     *get_invlist_len_addr(invlist) = len;
6234
6235     assert(len <= SvLEN(invlist));
6236
6237     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
6238     /* If the list contains U+0000, that element is part of the header,
6239      * and should not be counted as part of the array.  It will contain
6240      * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
6241      * subtract:
6242      *  SvCUR_set(invlist,
6243      *            TO_INTERNAL_SIZE(len
6244      *                             - (*get_invlist_zero_addr(inv_list) ^ 1)));
6245      * But, this is only valid if len is not 0.  The consequences of not doing
6246      * this is that the memory allocation code may think that 1 more UV is
6247      * being used than actually is, and so might do an unnecessary grow.  That
6248      * seems worth not bothering to make this the precise amount.
6249      *
6250      * Note that when inverting, SvCUR shouldn't change */
6251 }
6252
6253 PERL_STATIC_INLINE UV
6254 S_invlist_max(pTHX_ SV* const invlist)
6255 {
6256     /* Returns the maximum number of elements storable in the inversion list's
6257      * array, without having to realloc() */
6258
6259     PERL_ARGS_ASSERT_INVLIST_MAX;
6260
6261     return FROM_INTERNAL_SIZE(SvLEN(invlist));
6262 }
6263
6264 PERL_STATIC_INLINE UV*
6265 S_get_invlist_zero_addr(pTHX_ SV* invlist)
6266 {
6267     /* Return the address of the UV that is reserved to hold 0 if the inversion
6268      * list contains 0.  This has to be the last element of the heading, as the
6269      * list proper starts with either it if 0, or the next element if not.
6270      * (But we force it to contain either 0 or 1) */
6271
6272     PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
6273
6274     return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
6275 }
6276
6277 #ifndef PERL_IN_XSUB_RE
6278 SV*
6279 Perl__new_invlist(pTHX_ IV initial_size)
6280 {
6281
6282     /* Return a pointer to a newly constructed inversion list, with enough
6283      * space to store 'initial_size' elements.  If that number is negative, a
6284      * system default is used instead */
6285
6286     SV* new_list;
6287
6288     if (initial_size < 0) {
6289         initial_size = INVLIST_INITIAL_LEN;
6290     }
6291
6292     /* Allocate the initial space */
6293     new_list = newSV(TO_INTERNAL_SIZE(initial_size));
6294     invlist_set_len(new_list, 0);
6295
6296     /* Force iterinit() to be used to get iteration to work */
6297     *get_invlist_iter_addr(new_list) = UV_MAX;
6298
6299     /* This should force a segfault if a method doesn't initialize this
6300      * properly */
6301     *get_invlist_zero_addr(new_list) = UV_MAX;
6302
6303     return new_list;
6304 }
6305 #endif
6306
6307 STATIC void
6308 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
6309 {
6310     /* Grow the maximum size of an inversion list */
6311
6312     PERL_ARGS_ASSERT_INVLIST_EXTEND;
6313
6314     SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
6315 }
6316
6317 PERL_STATIC_INLINE void
6318 S_invlist_trim(pTHX_ SV* const invlist)
6319 {
6320     PERL_ARGS_ASSERT_INVLIST_TRIM;
6321
6322     /* Change the length of the inversion list to how many entries it currently
6323      * has */
6324
6325     SvPV_shrink_to_cur((SV *) invlist);
6326 }
6327
6328 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6329  * etc */
6330 #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
6331 #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
6332
6333 #ifndef PERL_IN_XSUB_RE
6334 void
6335 Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
6336 {
6337    /* Subject to change or removal.  Append the range from 'start' to 'end' at
6338     * the end of the inversion list.  The range must be above any existing
6339     * ones. */
6340
6341     UV* array;
6342     UV max = invlist_max(invlist);
6343     UV len = invlist_len(invlist);
6344
6345     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6346
6347     if (len == 0) { /* Empty lists must be initialized */
6348         array = _invlist_array_init(invlist, start == 0);
6349     }
6350     else {
6351         /* Here, the existing list is non-empty. The current max entry in the
6352          * list is generally the first value not in the set, except when the
6353          * set extends to the end of permissible values, in which case it is
6354          * the first entry in that final set, and so this call is an attempt to
6355          * append out-of-order */
6356
6357         UV final_element = len - 1;
6358         array = invlist_array(invlist);
6359         if (array[final_element] > start
6360             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
6361         {
6362             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
6363                        array[final_element], start,
6364                        ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
6365         }
6366
6367         /* Here, it is a legal append.  If the new range begins with the first
6368          * value not in the set, it is extending the set, so the new first
6369          * value not in the set is one greater than the newly extended range.
6370          * */
6371         if (array[final_element] == start) {
6372             if (end != UV_MAX) {
6373                 array[final_element] = end + 1;
6374             }
6375             else {
6376                 /* But if the end is the maximum representable on the machine,
6377                  * just let the range that this would extend to have no end */
6378                 invlist_set_len(invlist, len - 1);
6379             }
6380             return;
6381         }
6382     }
6383
6384     /* Here the new range doesn't extend any existing set.  Add it */
6385
6386     len += 2;   /* Includes an element each for the start and end of range */
6387
6388     /* If overflows the existing space, extend, which may cause the array to be
6389      * moved */
6390     if (max < len) {
6391         invlist_extend(invlist, len);
6392         invlist_set_len(invlist, len);  /* Have to set len here to avoid assert
6393                                            failure in invlist_array() */
6394         array = invlist_array(invlist);
6395     }
6396     else {
6397         invlist_set_len(invlist, len);
6398     }
6399
6400     /* The next item on the list starts the range, the one after that is
6401      * one past the new range.  */
6402     array[len - 2] = start;
6403     if (end != UV_MAX) {
6404         array[len - 1] = end + 1;
6405     }
6406     else {
6407         /* But if the end is the maximum representable on the machine, just let
6408          * the range have no end */
6409         invlist_set_len(invlist, len - 1);
6410     }
6411 }
6412
6413 STATIC IV
6414 S_invlist_search(pTHX_ SV* const invlist, const UV cp)
6415 {
6416     /* Searches the inversion list for the entry that contains the input code
6417      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
6418      * return value is the index into the list's array of the range that
6419      * contains <cp> */
6420
6421     IV low = 0;
6422     IV high = invlist_len(invlist);
6423     const UV * const array = invlist_array(invlist);
6424
6425     PERL_ARGS_ASSERT_INVLIST_SEARCH;
6426
6427     /* If list is empty or the code point is before the first element, return
6428      * failure. */
6429     if (high == 0 || cp < array[0]) {
6430         return -1;
6431     }
6432
6433     /* Binary search.  What we are looking for is <i> such that
6434      *  array[i] <= cp < array[i+1]
6435      * The loop below converges on the i+1. */
6436     while (low < high) {
6437         IV mid = (low + high) / 2;
6438         if (array[mid] <= cp) {
6439             low = mid + 1;
6440
6441             /* We could do this extra test to exit the loop early.
6442             if (cp < array[low]) {
6443                 return mid;
6444             }
6445             */
6446         }
6447         else { /* cp < array[mid] */
6448             high = mid;
6449         }
6450     }
6451
6452     return high - 1;
6453 }
6454
6455 void
6456 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
6457 {
6458     /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
6459      * but is used when the swash has an inversion list.  This makes this much
6460      * faster, as it uses a binary search instead of a linear one.  This is
6461      * intimately tied to that function, and perhaps should be in utf8.c,
6462      * except it is intimately tied to inversion lists as well.  It assumes
6463      * that <swatch> is all 0's on input */
6464
6465     UV current = start;
6466     const IV len = invlist_len(invlist);
6467     IV i;
6468     const UV * array;
6469
6470     PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
6471
6472     if (len == 0) { /* Empty inversion list */
6473         return;
6474     }
6475
6476     array = invlist_array(invlist);
6477
6478     /* Find which element it is */
6479     i = invlist_search(invlist, start);
6480
6481     /* We populate from <start> to <end> */
6482     while (current < end) {
6483         UV upper;
6484
6485         /* The inversion list gives the results for every possible code point
6486          * after the first one in the list.  Only those ranges whose index is
6487          * even are ones that the inversion list matches.  For the odd ones,
6488          * and if the initial code point is not in the list, we have to skip
6489          * forward to the next element */
6490         if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
6491             i++;
6492             if (i >= len) { /* Finished if beyond the end of the array */
6493                 return;
6494             }
6495             current = array[i];
6496             if (current >= end) {   /* Finished if beyond the end of what we
6497                                        are populating */
6498                 return;
6499             }
6500         }
6501         assert(current >= start);
6502
6503         /* The current range ends one below the next one, except don't go past
6504          * <end> */
6505         i++;
6506         upper = (i < len && array[i] < end) ? array[i] : end;
6507
6508         /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
6509          * for each code point in it */
6510         for (; current < upper; current++) {
6511             const STRLEN offset = (STRLEN)(current - start);
6512             swatch[offset >> 3] |= 1 << (offset & 7);
6513         }
6514
6515         /* Quit if at the end of the list */
6516         if (i >= len) {
6517
6518             /* But first, have to deal with the highest possible code point on
6519              * the platform.  The previous code assumes that <end> is one
6520              * beyond where we want to populate, but that is impossible at the
6521              * platform's infinity, so have to handle it specially */
6522             if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
6523             {
6524                 const STRLEN offset = (STRLEN)(end - start);
6525                 swatch[offset >> 3] |= 1 << (offset & 7);
6526             }
6527             return;
6528         }
6529
6530         /* Advance to the next range, which will be for code points not in the
6531          * inversion list */
6532         current = array[i];
6533     }
6534
6535     return;
6536 }
6537
6538 void
6539 Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
6540 {
6541     /* Take the union of two inversion lists and point <output> to it.  *output
6542      * should be defined upon input, and if it points to one of the two lists,
6543      * the reference count to that list will be decremented.
6544      * The basis for this comes from "Unicode Demystified" Chapter 13 by
6545      * Richard Gillam, published by Addison-Wesley, and explained at some
6546      * length there.  The preface says to incorporate its examples into your
6547      * code at your own risk.
6548      *
6549      * The algorithm is like a merge sort.
6550      *
6551      * XXX A potential performance improvement is to keep track as we go along
6552      * if only one of the inputs contributes to the result, meaning the other
6553      * is a subset of that one.  In that case, we can skip the final copy and
6554      * return the larger of the input lists, but then outside code might need
6555      * to keep track of whether to free the input list or not */
6556
6557     UV* array_a;    /* a's array */
6558     UV* array_b;
6559     UV len_a;       /* length of a's array */
6560     UV len_b;
6561
6562     SV* u;                      /* the resulting union */
6563     UV* array_u;
6564     UV len_u;
6565
6566     UV i_a = 0;             /* current index into a's array */
6567     UV i_b = 0;
6568     UV i_u = 0;
6569
6570     /* running count, as explained in the algorithm source book; items are
6571      * stopped accumulating and are output when the count changes to/from 0.
6572      * The count is incremented when we start a range that's in the set, and
6573      * decremented when we start a range that's not in the set.  So its range
6574      * is 0 to 2.  Only when the count is zero is something not in the set.
6575      */
6576     UV count = 0;
6577
6578     PERL_ARGS_ASSERT__INVLIST_UNION;
6579     assert(a != b);
6580
6581     /* If either one is empty, the union is the other one */
6582     len_a = invlist_len(a);
6583     if (len_a == 0) {
6584         if (*output == a) {
6585             SvREFCNT_dec(a);
6586         }
6587         if (*output != b) {
6588             *output = invlist_clone(b);
6589         } /* else *output already = b; */
6590         return;
6591     }
6592     else if ((len_b = invlist_len(b)) == 0) {
6593         if (*output == b) {
6594             SvREFCNT_dec(b);
6595         }
6596         if (*output != a) {
6597             *output = invlist_clone(a);
6598         }
6599         /* else *output already = a; */
6600         return;
6601     }
6602
6603     /* Here both lists exist and are non-empty */
6604     array_a = invlist_array(a);
6605     array_b = invlist_array(b);
6606
6607     /* Size the union for the worst case: that the sets are completely
6608      * disjoint */
6609     u = _new_invlist(len_a + len_b);
6610
6611     /* Will contain U+0000 if either component does */
6612     array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
6613                                       || (len_b > 0 && array_b[0] == 0));
6614
6615     /* Go through each list item by item, stopping when exhausted one of
6616      * them */
6617     while (i_a < len_a && i_b < len_b) {
6618         UV cp;      /* The element to potentially add to the union's array */
6619         bool cp_in_set;   /* is it in the the input list's set or not */
6620
6621         /* We need to take one or the other of the two inputs for the union.
6622          * Since we are merging two sorted lists, we take the smaller of the
6623          * next items.  In case of a tie, we take the one that is in its set
6624          * first.  If we took one not in the set first, it would decrement the
6625          * count, possibly to 0 which would cause it to be output as ending the
6626          * range, and the next time through we would take the same number, and
6627          * output it again as beginning the next range.  By doing it the
6628          * opposite way, there is no possibility that the count will be
6629          * momentarily decremented to 0, and thus the two adjoining ranges will
6630          * be seamlessly merged.  (In a tie and both are in the set or both not
6631          * in the set, it doesn't matter which we take first.) */
6632         if (array_a[i_a] < array_b[i_b]
6633             || (array_a[i_a] == array_b[i_b]
6634                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
6635         {
6636             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
6637             cp= array_a[i_a++];
6638         }
6639         else {
6640             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
6641             cp= array_b[i_b++];
6642         }
6643
6644         /* Here, have chosen which of the two inputs to look at.  Only output
6645          * if the running count changes to/from 0, which marks the
6646          * beginning/end of a range in that's in the set */
6647         if (cp_in_set) {
6648             if (count == 0) {
6649                 array_u[i_u++] = cp;
6650             }
6651             count++;
6652         }
6653         else {
6654             count--;
6655             if (count == 0) {
6656                 array_u[i_u++] = cp;
6657             }
6658         }
6659     }
6660
6661     /* Here, we are finished going through at least one of the lists, which
6662      * means there is something remaining in at most one.  We check if the list
6663      * that hasn't been exhausted is positioned such that we are in the middle
6664      * of a range in its set or not.  (i_a and i_b point to the element beyond
6665      * the one we care about.) If in the set, we decrement 'count'; if 0, there
6666      * is potentially more to output.
6667      * There are four cases:
6668      *  1) Both weren't in their sets, count is 0, and remains 0.  What's left
6669      *     in the union is entirely from the non-exhausted set.
6670      *  2) Both were in their sets, count is 2.  Nothing further should
6671      *     be output, as everything that remains will be in the exhausted
6672      *     list's set, hence in the union; decrementing to 1 but not 0 insures
6673      *     that
6674      *  3) the exhausted was in its set, non-exhausted isn't, count is 1.
6675      *     Nothing further should be output because the union includes
6676      *     everything from the exhausted set.  Not decrementing ensures that.
6677      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6678      *     decrementing to 0 insures that we look at the remainder of the
6679      *     non-exhausted set */
6680     if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
6681         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
6682     {
6683         count--;
6684     }
6685
6686     /* The final length is what we've output so far, plus what else is about to
6687      * be output.  (If 'count' is non-zero, then the input list we exhausted
6688      * has everything remaining up to the machine's limit in its set, and hence
6689      * in the union, so there will be no further output. */
6690     len_u = i_u;
6691     if (count == 0) {
6692         /* At most one of the subexpressions will be non-zero */
6693         len_u += (len_a - i_a) + (len_b - i_b);
6694     }
6695
6696     /* Set result to final length, which can change the pointer to array_u, so
6697      * re-find it */
6698     if (len_u != invlist_len(u)) {
6699         invlist_set_len(u, len_u);
6700         invlist_trim(u);
6701         array_u = invlist_array(u);
6702     }
6703
6704     /* When 'count' is 0, the list that was exhausted (if one was shorter than
6705      * the other) ended with everything above it not in its set.  That means
6706      * that the remaining part of the union is precisely the same as the
6707      * non-exhausted list, so can just copy it unchanged.  (If both list were
6708      * exhausted at the same time, then the operations below will be both 0.)
6709      */
6710     if (count == 0) {
6711         IV copy_count; /* At most one will have a non-zero copy count */
6712         if ((copy_count = len_a - i_a) > 0) {
6713             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6714         }
6715         else if ((copy_count = len_b - i_b) > 0) {
6716             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6717         }
6718     }
6719
6720     /*  We may be removing a reference to one of the inputs */
6721     if (a == *output || b == *output) {
6722         SvREFCNT_dec(*output);
6723     }
6724
6725     *output = u;
6726     return;
6727 }
6728
6729 void
6730 Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i)
6731 {
6732     /* Take the intersection of two inversion lists and point <i> to it.  *i
6733      * should be defined upon input, and if it points to one of the two lists,
6734      * the reference count to that list will be decremented.
6735      * The basis for this comes from "Unicode Demystified" Chapter 13 by
6736      * Richard Gillam, published by Addison-Wesley, and explained at some
6737      * length there.  The preface says to incorporate its examples into your
6738      * code at your own risk.  In fact, it had bugs
6739      *
6740      * The algorithm is like a merge sort, and is essentially the same as the
6741      * union above
6742      */
6743
6744     UV* array_a;                /* a's array */
6745     UV* array_b;
6746     UV len_a;   /* length of a's array */
6747     UV len_b;
6748
6749     SV* r;                   /* the resulting intersection */
6750     UV* array_r;
6751     UV len_r;
6752
6753     UV i_a = 0;             /* current index into a's array */
6754     UV i_b = 0;
6755     UV i_r = 0;
6756
6757     /* running count, as explained in the algorithm source book; items are
6758      * stopped accumulating and are output when the count changes to/from 2.
6759      * The count is incremented when we start a range that's in the set, and
6760      * decremented when we start a range that's not in the set.  So its range
6761      * is 0 to 2.  Only when the count is 2 is something in the intersection.
6762      */
6763     UV count = 0;
6764
6765     PERL_ARGS_ASSERT__INVLIST_INTERSECTION;
6766     assert(a != b);
6767
6768     /* If either one is empty, the intersection is null */
6769     len_a = invlist_len(a);
6770     if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
6771
6772         /* If the result is the same as one of the inputs, the input is being
6773          * overwritten */
6774         if (*i == a) {
6775             SvREFCNT_dec(a);
6776         }
6777         else if (*i == b) {
6778             SvREFCNT_dec(b);
6779         }
6780
6781         *i = _new_invlist(0);
6782         return;
6783     }
6784
6785     /* Here both lists exist and are non-empty */
6786     array_a = invlist_array(a);
6787     array_b = invlist_array(b);
6788
6789     /* Size the intersection for the worst case: that the intersection ends up
6790      * fragmenting everything to be completely disjoint */
6791     r= _new_invlist(len_a + len_b);
6792
6793     /* Will contain U+0000 iff both components do */
6794     array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
6795                                      && len_b > 0 && array_b[0] == 0);
6796
6797     /* Go through each list item by item, stopping when exhausted one of
6798      * them */
6799     while (i_a < len_a && i_b < len_b) {
6800         UV cp;      /* The element to potentially add to the intersection's
6801                        array */
6802         bool cp_in_set; /* Is it in the input list's set or not */
6803
6804         /* We need to take one or the other of the two inputs for the
6805          * intersection.  Since we are merging two sorted lists, we take the
6806          * smaller of the next items.  In case of a tie, we take the one that
6807          * is not in its set first (a difference from the union algorithm).  If
6808          * we took one in the set first, it would increment the count, possibly
6809          * to 2 which would cause it to be output as starting a range in the
6810          * intersection, and the next time through we would take that same
6811          * number, and output it again as ending the set.  By doing it the
6812          * opposite of this, there is no possibility that the count will be
6813          * momentarily incremented to 2.  (In a tie and both are in the set or
6814          * both not in the set, it doesn't matter which we take first.) */
6815         if (array_a[i_a] < array_b[i_b]
6816             || (array_a[i_a] == array_b[i_b]
6817                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
6818         {
6819             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
6820             cp= array_a[i_a++];
6821         }
6822         else {
6823             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
6824             cp= array_b[i_b++];
6825         }
6826
6827         /* Here, have chosen which of the two inputs to look at.  Only output
6828          * if the running count changes to/from 2, which marks the
6829          * beginning/end of a range that's in the intersection */
6830         if (cp_in_set) {
6831             count++;
6832             if (count == 2) {
6833                 array_r[i_r++] = cp;
6834             }
6835         }
6836         else {
6837             if (count == 2) {
6838                 array_r[i_r++] = cp;
6839             }
6840             count--;
6841         }
6842     }
6843
6844     /* Here, we are finished going through at least one of the lists, which
6845      * means there is something remaining in at most one.  We check if the list
6846      * that has been exhausted is positioned such that we are in the middle
6847      * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
6848      * the ones we care about.)  There are four cases:
6849      *  1) Both weren't in their sets, count is 0, and remains 0.  There's
6850      *     nothing left in the intersection.
6851      *  2) Both were in their sets, count is 2 and perhaps is incremented to
6852      *     above 2.  What should be output is exactly that which is in the
6853      *     non-exhausted set, as everything it has is also in the intersection
6854      *     set, and everything it doesn't have can't be in the intersection
6855      *  3) The exhausted was in its set, non-exhausted isn't, count is 1, and
6856      *     gets incremented to 2.  Like the previous case, the intersection is
6857      *     everything that remains in the non-exhausted set.
6858      *  4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
6859      *     remains 1.  And the intersection has nothing more. */
6860     if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
6861         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
6862     {
6863         count++;
6864     }
6865
6866     /* The final length is what we've output so far plus what else is in the
6867      * intersection.  At most one of the subexpressions below will be non-zero */
6868     len_r = i_r;
6869     if (count >= 2) {
6870         len_r += (len_a - i_a) + (len_b - i_b);
6871     }
6872
6873     /* Set result to final length, which can change the pointer to array_r, so
6874      * re-find it */
6875     if (len_r != invlist_len(r)) {
6876         invlist_set_len(r, len_r);
6877         invlist_trim(r);
6878         array_r = invlist_array(r);
6879     }
6880
6881     /* Finish outputting any remaining */
6882     if (count >= 2) { /* At most one will have a non-zero copy count */
6883         IV copy_count;
6884         if ((copy_count = len_a - i_a) > 0) {
6885             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
6886         }
6887         else if ((copy_count = len_b - i_b) > 0) {
6888             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
6889         }
6890     }
6891
6892     /*  We may be removing a reference to one of the inputs */
6893     if (a == *i || b == *i) {
6894         SvREFCNT_dec(*i);
6895     }
6896
6897     *i = r;
6898     return;
6899 }
6900
6901 #endif
6902
6903 STATIC SV*
6904 S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
6905 {
6906     /* Add the range from 'start' to 'end' inclusive to the inversion list's
6907      * set.  A pointer to the inversion list is returned.  This may actually be
6908      * a new list, in which case the passed in one has been destroyed.  The
6909      * passed in inversion list can be NULL, in which case a new one is created
6910      * with just the one range in it */
6911
6912     SV* range_invlist;
6913     UV len;
6914
6915     if (invlist == NULL) {
6916         invlist = _new_invlist(2);
6917         len = 0;
6918     }
6919     else {
6920         len = invlist_len(invlist);
6921     }
6922
6923     /* If comes after the final entry, can just append it to the end */
6924     if (len == 0
6925         || start >= invlist_array(invlist)
6926                                     [invlist_len(invlist) - 1])
6927     {
6928         _append_range_to_invlist(invlist, start, end);
6929         return invlist;
6930     }
6931
6932     /* Here, can't just append things, create and return a new inversion list
6933      * which is the union of this range and the existing inversion list */
6934     range_invlist = _new_invlist(2);
6935     _append_range_to_invlist(range_invlist, start, end);
6936
6937     _invlist_union(invlist, range_invlist, &invlist);
6938
6939     /* The temporary can be freed */
6940     SvREFCNT_dec(range_invlist);
6941
6942     return invlist;
6943 }
6944
6945 PERL_STATIC_INLINE SV*
6946 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
6947     return add_range_to_invlist(invlist, cp, cp);
6948 }
6949
6950 #ifndef PERL_IN_XSUB_RE
6951 void
6952 Perl__invlist_invert(pTHX_ SV* const invlist)
6953 {
6954     /* Complement the input inversion list.  This adds a 0 if the list didn't
6955      * have a zero; removes it otherwise.  As described above, the data
6956      * structure is set up so that this is very efficient */
6957
6958     UV* len_pos = get_invlist_len_addr(invlist);
6959
6960     PERL_ARGS_ASSERT__INVLIST_INVERT;
6961
6962     /* The inverse of matching nothing is matching everything */
6963     if (*len_pos == 0) {
6964         _append_range_to_invlist(invlist, 0, UV_MAX);
6965         return;
6966     }
6967
6968     /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
6969      * zero element was a 0, so it is being removed, so the length decrements
6970      * by 1; and vice-versa.  SvCUR is unaffected */
6971     if (*get_invlist_zero_addr(invlist) ^= 1) {
6972         (*len_pos)--;
6973     }
6974     else {
6975         (*len_pos)++;
6976     }
6977 }
6978
6979 void
6980 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
6981 {
6982     /* Complement the input inversion list (which must be a Unicode property,
6983      * all of which don't match above the Unicode maximum code point.)  And
6984      * Perl has chosen to not have the inversion match above that either.  This
6985      * adds a 0x110000 if the list didn't end with it, and removes it if it did
6986      */
6987
6988     UV len;
6989     UV* array;
6990
6991     PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
6992
6993     _invlist_invert(invlist);
6994
6995     len = invlist_len(invlist);
6996
6997     if (len != 0) { /* If empty do nothing */
6998         array = invlist_array(invlist);
6999         if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7000             /* Add 0x110000.  First, grow if necessary */
7001             len++;
7002             if (invlist_max(invlist) < len) {
7003                 invlist_extend(invlist, len);
7004                 array = invlist_array(invlist);
7005             }
7006             invlist_set_len(invlist, len);
7007             array[len - 1] = PERL_UNICODE_MAX + 1;
7008         }
7009         else {  /* Remove the 0x110000 */
7010             invlist_set_len(invlist, len - 1);
7011         }
7012     }
7013
7014     return;
7015 }
7016 #endif
7017
7018 PERL_STATIC_INLINE SV*
7019 S_invlist_clone(pTHX_ SV* const invlist)
7020 {
7021
7022     /* Return a new inversion list that is a copy of the input one, which is
7023      * unchanged */
7024
7025     /* Need to allocate extra space to accommodate Perl's addition of a
7026      * trailing NUL to SvPV's, since it thinks they are always strings */
7027     SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
7028     STRLEN length = SvCUR(invlist);
7029
7030     PERL_ARGS_ASSERT_INVLIST_CLONE;
7031
7032     SvCUR_set(new_invlist, length); /* This isn't done automatically */
7033     Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
7034
7035     return new_invlist;
7036 }
7037
7038 #ifndef PERL_IN_XSUB_RE
7039 void
7040 Perl__invlist_subtract(pTHX_ SV* const a, SV* const b, SV** result)
7041 {
7042     /* Point <result> to an inversion list which consists of all elements in
7043      * <a> that aren't also in <b>.  *result should be defined upon input, and
7044      * if it points to C<b> its reference count will be decremented. */
7045
7046     PERL_ARGS_ASSERT__INVLIST_SUBTRACT;
7047     assert(a != b);
7048
7049     /* Subtracting nothing retains the original */
7050     if (invlist_len(b) == 0) {
7051
7052         if (*result == b) {
7053             SvREFCNT_dec(b);
7054         }
7055
7056         /* If the result is not to be the same variable as the original, create
7057          * a copy */
7058         if (*result != a) {
7059             *result = invlist_clone(a);
7060         }
7061     } else {
7062         SV *b_copy = invlist_clone(b);
7063         _invlist_invert(b_copy);        /* Everything not in 'b' */
7064
7065         if (*result == b) {
7066             SvREFCNT_dec(b);
7067         }
7068
7069         _invlist_intersection(a, b_copy, result);    /* Everything in 'a' not in
7070                                                        'b' */
7071         SvREFCNT_dec(b_copy);
7072     }
7073
7074     return;
7075 }
7076 #endif
7077
7078 PERL_STATIC_INLINE UV*
7079 S_get_invlist_iter_addr(pTHX_ SV* invlist)
7080 {
7081     /* Return the address of the UV that contains the current iteration
7082      * position */
7083
7084     PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
7085
7086     return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
7087 }
7088
7089 PERL_STATIC_INLINE void
7090 S_invlist_iterinit(pTHX_ SV* invlist)   /* Initialize iterator for invlist */
7091 {
7092     PERL_ARGS_ASSERT_INVLIST_ITERINIT;
7093
7094     *get_invlist_iter_addr(invlist) = 0;
7095 }
7096
7097 STATIC bool
7098 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
7099 {
7100     /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
7101      * This call sets in <*start> and <*end>, the next range in <invlist>.
7102      * Returns <TRUE> if successful and the next call will return the next
7103      * range; <FALSE> if was already at the end of the list.  If the latter,
7104      * <*start> and <*end> are unchanged, and the next call to this function
7105      * will start over at the beginning of the list */
7106
7107     UV* pos = get_invlist_iter_addr(invlist);
7108     UV len = invlist_len(invlist);
7109     UV *array;
7110
7111     PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
7112
7113     if (*pos >= len) {
7114         *pos = UV_MAX;  /* Force iternit() to be required next time */
7115         return FALSE;
7116     }
7117
7118     array = invlist_array(invlist);
7119
7120     *start = array[(*pos)++];
7121
7122     if (*pos >= len) {
7123         *end = UV_MAX;
7124     }
7125     else {
7126         *end = array[(*pos)++] - 1;
7127     }
7128
7129     return TRUE;
7130 }
7131
7132 #ifndef PERL_IN_XSUB_RE
7133 SV *
7134 Perl__invlist_contents(pTHX_ SV* const invlist)
7135 {
7136     /* Get the contents of an inversion list into a string SV so that they can
7137      * be printed out.  It uses the format traditionally done for debug tracing
7138      */
7139
7140     UV start, end;
7141     SV* output = newSVpvs("\n");
7142
7143     PERL_ARGS_ASSERT__INVLIST_CONTENTS;
7144
7145     invlist_iterinit(invlist);
7146     while (invlist_iternext(invlist, &start, &end)) {
7147         if (end == UV_MAX) {
7148             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
7149         }
7150         else if (end != start) {
7151             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
7152                     start,       end);
7153         }
7154         else {
7155             Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
7156         }
7157     }
7158
7159     return output;
7160 }
7161 #endif
7162
7163 #if 0
7164 void
7165 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
7166 {
7167     /* Dumps out the ranges in an inversion list.  The string 'header'
7168      * if present is output on a line before the first range */
7169
7170     UV start, end;
7171
7172     if (header && strlen(header)) {
7173         PerlIO_printf(Perl_debug_log, "%s\n", header);
7174     }
7175     invlist_iterinit(invlist);
7176     while (invlist_iternext(invlist, &start, &end)) {
7177         if (end == UV_MAX) {
7178             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
7179         }
7180         else {
7181             PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
7182         }
7183     }
7184 }
7185 #endif
7186
7187 #undef HEADER_LENGTH
7188 #undef INVLIST_INITIAL_LENGTH
7189 #undef TO_INTERNAL_SIZE
7190 #undef FROM_INTERNAL_SIZE
7191 #undef INVLIST_LEN_OFFSET
7192 #undef INVLIST_ZERO_OFFSET
7193 #undef INVLIST_ITER_OFFSET
7194
7195 /* End of inversion list object */
7196
7197 /*
7198  - reg - regular expression, i.e. main body or parenthesized thing
7199  *
7200  * Caller must absorb opening parenthesis.
7201  *
7202  * Combining parenthesis handling with the base level of regular expression
7203  * is a trifle forced, but the need to tie the tails of the branches to what
7204  * follows makes it hard to avoid.
7205  */
7206 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
7207 #ifdef DEBUGGING
7208 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
7209 #else
7210 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
7211 #endif
7212
7213 STATIC regnode *
7214 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
7215     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
7216 {
7217     dVAR;
7218     register regnode *ret;              /* Will be the head of the group. */
7219     register regnode *br;
7220     register regnode *lastbr;
7221     register regnode *ender = NULL;
7222     register I32 parno = 0;
7223     I32 flags;
7224     U32 oregflags = RExC_flags;
7225     bool have_branch = 0;
7226     bool is_open = 0;
7227     I32 freeze_paren = 0;
7228     I32 after_freeze = 0;
7229
7230     /* for (?g), (?gc), and (?o) warnings; warning
7231        about (?c) will warn about (?g) -- japhy    */
7232
7233 #define WASTED_O  0x01
7234 #define WASTED_G  0x02
7235 #define WASTED_C  0x04
7236 #define WASTED_GC (0x02|0x04)
7237     I32 wastedflags = 0x00;
7238
7239     char * parse_start = RExC_parse; /* MJD */
7240     char * const oregcomp_parse = RExC_parse;
7241
7242     GET_RE_DEBUG_FLAGS_DECL;
7243
7244     PERL_ARGS_ASSERT_REG;
7245     DEBUG_PARSE("reg ");
7246
7247     *flagp = 0;                         /* Tentatively. */
7248
7249
7250     /* Make an OPEN node, if parenthesized. */
7251     if (paren) {
7252         if ( *RExC_parse == '*') { /* (*VERB:ARG) */
7253             char *start_verb = RExC_parse;
7254             STRLEN verb_len = 0;
7255             char *start_arg = NULL;
7256             unsigned char op = 0;
7257             int argok = 1;
7258             int internal_argval = 0; /* internal_argval is only useful if !argok */
7259             while ( *RExC_parse && *RExC_parse != ')' ) {
7260                 if ( *RExC_parse == ':' ) {
7261                     start_arg = RExC_parse + 1;
7262                     break;
7263                 }
7264                 RExC_parse++;
7265             }
7266             ++start_verb;
7267             verb_len = RExC_parse - start_verb;
7268             if ( start_arg ) {
7269                 RExC_parse++;
7270                 while ( *RExC_parse && *RExC_parse != ')' )
7271                     RExC_parse++;
7272                 if ( *RExC_parse != ')' )
7273                     vFAIL("Unterminated verb pattern argument");
7274                 if ( RExC_parse == start_arg )
7275                     start_arg = NULL;
7276             } else {
7277                 if ( *RExC_parse != ')' )
7278                     vFAIL("Unterminated verb pattern");
7279             }
7280
7281             switch ( *start_verb ) {
7282             case 'A':  /* (*ACCEPT) */
7283                 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
7284                     op = ACCEPT;
7285                     internal_argval = RExC_nestroot;
7286                 }
7287                 break;
7288             case 'C':  /* (*COMMIT) */
7289                 if ( memEQs(start_verb,verb_len,"COMMIT") )
7290                     op = COMMIT;
7291                 break;
7292             case 'F':  /* (*FAIL) */
7293                 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
7294                     op = OPFAIL;
7295                     argok = 0;
7296                 }
7297                 break;
7298             case ':':  /* (*:NAME) */
7299             case 'M':  /* (*MARK:NAME) */
7300                 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
7301                     op = MARKPOINT;
7302                     argok = -1;
7303                 }
7304                 break;
7305             case 'P':  /* (*PRUNE) */
7306                 if ( memEQs(start_verb,verb_len,"PRUNE") )
7307                     op = PRUNE;
7308                 break;
7309             case 'S':   /* (*SKIP) */
7310                 if ( memEQs(start_verb,verb_len,"SKIP") )
7311                     op = SKIP;
7312                 break;
7313             case 'T':  /* (*THEN) */
7314                 /* [19:06] <TimToady> :: is then */
7315                 if ( memEQs(start_verb,verb_len,"THEN") ) {
7316                     op = CUTGROUP;
7317                     RExC_seen |= REG_SEEN_CUTGROUP;
7318                 }
7319                 break;
7320             }
7321             if ( ! op ) {
7322                 RExC_parse++;
7323                 vFAIL3("Unknown verb pattern '%.*s'",
7324                     verb_len, start_verb);
7325             }
7326             if ( argok ) {
7327                 if ( start_arg && internal_argval ) {
7328                     vFAIL3("Verb pattern '%.*s' may not have an argument",
7329                         verb_len, start_verb);
7330                 } else if ( argok < 0 && !start_arg ) {
7331                     vFAIL3("Verb pattern '%.*s' has a mandatory argument",
7332                         verb_len, start_verb);
7333                 } else {
7334                     ret = reganode(pRExC_state, op, internal_argval);
7335                     if ( ! internal_argval && ! SIZE_ONLY ) {
7336                         if (start_arg) {
7337                             SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
7338                             ARG(ret) = add_data( pRExC_state, 1, "S" );
7339                             RExC_rxi->data->data[ARG(ret)]=(void*)sv;
7340                             ret->flags = 0;
7341                         } else {
7342                             ret->flags = 1;
7343                         }
7344                     }
7345                 }
7346                 if (!internal_argval)
7347                     RExC_seen |= REG_SEEN_VERBARG;
7348             } else if ( start_arg ) {
7349                 vFAIL3("Verb pattern '%.*s' may not have an argument",
7350                         verb_len, start_verb);
7351             } else {
7352                 ret = reg_node(pRExC_state, op);
7353             }
7354             nextchar(pRExC_state);
7355             return ret;
7356         } else
7357         if (*RExC_parse == '?') { /* (?...) */
7358             bool is_logical = 0;
7359             const char * const seqstart = RExC_parse;
7360             bool has_use_defaults = FALSE;
7361
7362             RExC_parse++;
7363             paren = *RExC_parse++;
7364             ret = NULL;                 /* For look-ahead/behind. */
7365             switch (paren) {
7366
7367             case 'P':   /* (?P...) variants for those used to PCRE/Python */
7368                 paren = *RExC_parse++;
7369                 if ( paren == '<')         /* (?P<...>) named capture */
7370                     goto named_capture;
7371                 else if (paren == '>') {   /* (?P>name) named recursion */
7372                     goto named_recursion;
7373                 }
7374                 else if (paren == '=') {   /* (?P=...)  named backref */
7375                     /* this pretty much dupes the code for \k<NAME> in regatom(), if
7376                        you change this make sure you change that */
7377                     char* name_start = RExC_parse;
7378                     U32 num = 0;
7379                     SV *sv_dat = reg_scan_name(pRExC_state,
7380                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7381                     if (RExC_parse == name_start || *RExC_parse != ')')
7382                         vFAIL2("Sequence %.3s... not terminated",parse_start);
7383
7384                     if (!SIZE_ONLY) {
7385                         num = add_data( pRExC_state, 1, "S" );
7386                         RExC_rxi->data->data[num]=(void*)sv_dat;
7387                         SvREFCNT_inc_simple_void(sv_dat);
7388                     }
7389                     RExC_sawback = 1;
7390                     ret = reganode(pRExC_state,
7391                                    ((! FOLD)
7392                                      ? NREF
7393                                      : (MORE_ASCII_RESTRICTED)
7394                                        ? NREFFA
7395                                        : (AT_LEAST_UNI_SEMANTICS)
7396                                          ? NREFFU
7397                                          : (LOC)
7398                                            ? NREFFL
7399                                            : NREFF),
7400                                     num);
7401                     *flagp |= HASWIDTH;
7402
7403                     Set_Node_Offset(ret, parse_start+1);
7404                     Set_Node_Cur_Length(ret); /* MJD */
7405
7406                     nextchar(pRExC_state);
7407                     return ret;
7408                 }
7409                 RExC_parse++;
7410                 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7411                 /*NOTREACHED*/
7412             case '<':           /* (?<...) */
7413                 if (*RExC_parse == '!')
7414                     paren = ',';
7415                 else if (*RExC_parse != '=')
7416               named_capture:
7417                 {               /* (?<...>) */
7418                     char *name_start;
7419                     SV *svname;
7420                     paren= '>';
7421             case '\'':          /* (?'...') */
7422                     name_start= RExC_parse;
7423                     svname = reg_scan_name(pRExC_state,
7424                         SIZE_ONLY ?  /* reverse test from the others */
7425                         REG_RSN_RETURN_NAME :
7426                         REG_RSN_RETURN_NULL);
7427                     if (RExC_parse == name_start) {
7428                         RExC_parse++;
7429                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7430                         /*NOTREACHED*/
7431                     }
7432                     if (*RExC_parse != paren)
7433                         vFAIL2("Sequence (?%c... not terminated",
7434                             paren=='>' ? '<' : paren);
7435                     if (SIZE_ONLY) {
7436                         HE *he_str;
7437                         SV *sv_dat = NULL;
7438                         if (!svname) /* shouldn't happen */
7439                             Perl_croak(aTHX_
7440                                 "panic: reg_scan_name returned NULL");
7441                         if (!RExC_paren_names) {
7442                             RExC_paren_names= newHV();
7443                             sv_2mortal(MUTABLE_SV(RExC_paren_names));
7444 #ifdef DEBUGGING
7445                             RExC_paren_name_list= newAV();
7446                             sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
7447 #endif
7448                         }
7449                         he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
7450                         if ( he_str )
7451                             sv_dat = HeVAL(he_str);
7452                         if ( ! sv_dat ) {
7453                             /* croak baby croak */
7454                             Perl_croak(aTHX_
7455                                 "panic: paren_name hash element allocation failed");
7456                         } else if ( SvPOK(sv_dat) ) {
7457                             /* (?|...) can mean we have dupes so scan to check
7458                                its already been stored. Maybe a flag indicating
7459                                we are inside such a construct would be useful,
7460                                but the arrays are likely to be quite small, so
7461                                for now we punt -- dmq */
7462                             IV count = SvIV(sv_dat);
7463                             I32 *pv = (I32*)SvPVX(sv_dat);
7464                             IV i;
7465                             for ( i = 0 ; i < count ; i++ ) {
7466                                 if ( pv[i] == RExC_npar ) {
7467                                     count = 0;
7468                                     break;
7469                                 }
7470                             }
7471                             if ( count ) {
7472                                 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
7473                                 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
7474                                 pv[count] = RExC_npar;
7475                                 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
7476                             }
7477                         } else {
7478                             (void)SvUPGRADE(sv_dat,SVt_PVNV);
7479                             sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
7480                             SvIOK_on(sv_dat);
7481                             SvIV_set(sv_dat, 1);
7482                         }
7483 #ifdef DEBUGGING
7484                         /* Yes this does cause a memory leak in debugging Perls */
7485                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
7486                             SvREFCNT_dec(svname);
7487 #endif
7488
7489                         /*sv_dump(sv_dat);*/
7490                     }
7491                     nextchar(pRExC_state);
7492                     paren = 1;
7493                     goto capturing_parens;
7494                 }
7495                 RExC_seen |= REG_SEEN_LOOKBEHIND;
7496                 RExC_in_lookbehind++;
7497                 RExC_parse++;
7498             case '=':           /* (?=...) */
7499                 RExC_seen_zerolen++;
7500                 break;
7501             case '!':           /* (?!...) */
7502                 RExC_seen_zerolen++;
7503                 if (*RExC_parse == ')') {
7504                     ret=reg_node(pRExC_state, OPFAIL);
7505                     nextchar(pRExC_state);
7506                     return ret;
7507                 }
7508                 break;
7509             case '|':           /* (?|...) */
7510                 /* branch reset, behave like a (?:...) except that
7511                    buffers in alternations share the same numbers */
7512                 paren = ':';
7513                 after_freeze = freeze_paren = RExC_npar;
7514                 break;
7515             case ':':           /* (?:...) */
7516             case '>':           /* (?>...) */
7517                 break;
7518             case '$':           /* (?$...) */
7519             case '@':           /* (?@...) */
7520                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
7521                 break;
7522             case '#':           /* (?#...) */
7523                 while (*RExC_parse && *RExC_parse != ')')
7524                     RExC_parse++;
7525                 if (*RExC_parse != ')')
7526                     FAIL("Sequence (?#... not terminated");
7527                 nextchar(pRExC_state);
7528                 *flagp = TRYAGAIN;
7529                 return NULL;
7530             case '0' :           /* (?0) */
7531             case 'R' :           /* (?R) */
7532                 if (*RExC_parse != ')')
7533                     FAIL("Sequence (?R) not terminated");
7534                 ret = reg_node(pRExC_state, GOSTART);
7535                 *flagp |= POSTPONED;
7536                 nextchar(pRExC_state);
7537                 return ret;
7538                 /*notreached*/
7539             { /* named and numeric backreferences */
7540                 I32 num;
7541             case '&':            /* (?&NAME) */
7542                 parse_start = RExC_parse - 1;
7543               named_recursion:
7544                 {
7545                     SV *sv_dat = reg_scan_name(pRExC_state,
7546                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7547                      num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
7548                 }
7549                 goto gen_recurse_regop;
7550                 /* NOT REACHED */
7551             case '+':
7552                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7553                     RExC_parse++;
7554                     vFAIL("Illegal pattern");
7555                 }
7556                 goto parse_recursion;
7557                 /* NOT REACHED*/
7558             case '-': /* (?-1) */
7559                 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7560                     RExC_parse--; /* rewind to let it be handled later */
7561                     goto parse_flags;
7562                 }
7563                 /*FALLTHROUGH */
7564             case '1': case '2': case '3': case '4': /* (?1) */
7565             case '5': case '6': case '7': case '8': case '9':
7566                 RExC_parse--;
7567               parse_recursion:
7568                 num = atoi(RExC_parse);
7569                 parse_start = RExC_parse - 1; /* MJD */
7570                 if (*RExC_parse == '-')
7571                     RExC_parse++;
7572                 while (isDIGIT(*RExC_parse))
7573                         RExC_parse++;
7574                 if (*RExC_parse!=')')
7575                     vFAIL("Expecting close bracket");
7576
7577               gen_recurse_regop:
7578                 if ( paren == '-' ) {
7579                     /*
7580                     Diagram of capture buffer numbering.
7581                     Top line is the normal capture buffer numbers
7582                     Bottom line is the negative indexing as from
7583                     the X (the (?-2))
7584
7585                     +   1 2    3 4 5 X          6 7
7586                        /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
7587                     -   5 4    3 2 1 X          x x
7588
7589                     */
7590                     num = RExC_npar + num;
7591                     if (num < 1)  {
7592                         RExC_parse++;
7593                         vFAIL("Reference to nonexistent group");
7594                     }
7595                 } else if ( paren == '+' ) {
7596                     num = RExC_npar + num - 1;
7597                 }
7598
7599                 ret = reganode(pRExC_state, GOSUB, num);
7600                 if (!SIZE_ONLY) {
7601                     if (num > (I32)RExC_rx->nparens) {
7602                         RExC_parse++;
7603                         vFAIL("Reference to nonexistent group");
7604                     }
7605                     ARG2L_SET( ret, RExC_recurse_count++);
7606                     RExC_emit++;
7607                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7608                         "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
7609                 } else {
7610                     RExC_size++;
7611                 }
7612                 RExC_seen |= REG_SEEN_RECURSE;
7613                 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
7614                 Set_Node_Offset(ret, parse_start); /* MJD */
7615
7616                 *flagp |= POSTPONED;
7617                 nextchar(pRExC_state);
7618                 return ret;
7619             } /* named and numeric backreferences */
7620             /* NOT REACHED */
7621
7622             case '?':           /* (??...) */
7623                 is_logical = 1;
7624                 if (*RExC_parse != '{') {
7625                     RExC_parse++;
7626                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7627                     /*NOTREACHED*/
7628                 }
7629                 *flagp |= POSTPONED;
7630                 paren = *RExC_parse++;
7631                 /* FALL THROUGH */
7632             case '{':           /* (?{...}) */
7633             {
7634                 I32 count = 1;
7635                 U32 n = 0;
7636                 char c;
7637                 char *s = RExC_parse;
7638
7639                 RExC_seen_zerolen++;
7640                 RExC_seen |= REG_SEEN_EVAL;
7641                 while (count && (c = *RExC_parse)) {
7642                     if (c == '\\') {
7643                         if (RExC_parse[1])
7644                             RExC_parse++;
7645                     }
7646                     else if (c == '{')
7647                         count++;
7648                     else if (c == '}')
7649                         count--;
7650                     RExC_parse++;
7651                 }
7652                 if (*RExC_parse != ')') {
7653                     RExC_parse = s;
7654                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
7655                 }
7656                 if (!SIZE_ONLY) {
7657                     PAD *pad;
7658                     OP_4tree *sop, *rop;
7659                     SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
7660
7661                     ENTER;
7662                     Perl_save_re_context(aTHX);
7663                     rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
7664                     sop->op_private |= OPpREFCOUNTED;
7665                     /* re_dup will OpREFCNT_inc */
7666                     OpREFCNT_set(sop, 1);
7667                     LEAVE;
7668
7669                     n = add_data(pRExC_state, 3, "nop");
7670                     RExC_rxi->data->data[n] = (void*)rop;
7671                     RExC_rxi->data->data[n+1] = (void*)sop;
7672                     RExC_rxi->data->data[n+2] = (void*)pad;
7673                     SvREFCNT_dec(sv);
7674                 }
7675                 else {                                          /* First pass */
7676                     if (PL_reginterp_cnt < ++RExC_seen_evals
7677                         && IN_PERL_RUNTIME)
7678                         /* No compiled RE interpolated, has runtime
7679                            components ===> unsafe.  */
7680                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
7681                     if (PL_tainting && PL_tainted)
7682                         FAIL("Eval-group in insecure regular expression");
7683 #if PERL_VERSION > 8
7684                     if (IN_PERL_COMPILETIME)
7685                         PL_cv_has_eval = 1;
7686 #endif
7687                 }
7688
7689                 nextchar(pRExC_state);
7690                 if (is_logical) {
7691                     ret = reg_node(pRExC_state, LOGICAL);
7692                     if (!SIZE_ONLY)
7693                         ret->flags = 2;
7694                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
7695                     /* deal with the length of this later - MJD */
7696                     return ret;
7697                 }
7698                 ret = reganode(pRExC_state, EVAL, n);
7699                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
7700                 Set_Node_Offset(ret, parse_start);
7701                 return ret;
7702             }
7703             case '(':           /* (?(?{...})...) and (?(?=...)...) */
7704             {
7705                 int is_define= 0;
7706                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
7707                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
7708                         || RExC_parse[1] == '<'
7709                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
7710                         I32 flag;
7711
7712                         ret = reg_node(pRExC_state, LOGICAL);
7713                         if (!SIZE_ONLY)
7714                             ret->flags = 1;
7715                         REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
7716                         goto insert_if;
7717                     }
7718                 }
7719                 else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
7720                          || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
7721                 {
7722                     char ch = RExC_parse[0] == '<' ? '>' : '\'';
7723                     char *name_start= RExC_parse++;
7724                     U32 num = 0;
7725                     SV *sv_dat=reg_scan_name(pRExC_state,
7726                         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7727                     if (RExC_parse == name_start || *RExC_parse != ch)
7728                         vFAIL2("Sequence (?(%c... not terminated",
7729                             (ch == '>' ? '<' : ch));
7730                     RExC_parse++;
7731                     if (!SIZE_ONLY) {
7732                         num = add_data( pRExC_state, 1, "S" );
7733                         RExC_rxi->data->data[num]=(void*)sv_dat;
7734                         SvREFCNT_inc_simple_void(sv_dat);
7735                     }
7736                     ret = reganode(pRExC_state,NGROUPP,num);
7737                     goto insert_if_check_paren;
7738                 }
7739                 else if (RExC_parse[0] == 'D' &&
7740                          RExC_parse[1] == 'E' &&
7741                          RExC_parse[2] == 'F' &&
7742                          RExC_parse[3] == 'I' &&
7743                          RExC_parse[4] == 'N' &&
7744                          RExC_parse[5] == 'E')
7745                 {
7746                     ret = reganode(pRExC_state,DEFINEP,0);
7747                     RExC_parse +=6 ;
7748                     is_define = 1;
7749                     goto insert_if_check_paren;
7750                 }
7751                 else if (RExC_parse[0] == 'R') {
7752                     RExC_parse++;
7753                     parno = 0;
7754                     if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
7755                         parno = atoi(RExC_parse++);
7756                         while (isDIGIT(*RExC_parse))
7757                             RExC_parse++;
7758                     } else if (RExC_parse[0] == '&') {
7759                         SV *sv_dat;
7760                         RExC_parse++;
7761                         sv_dat = reg_scan_name(pRExC_state,
7762                             SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7763                         parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
7764                     }
7765                     ret = reganode(pRExC_state,INSUBP,parno);
7766                     goto insert_if_check_paren;
7767                 }
7768                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
7769                     /* (?(1)...) */
7770                     char c;
7771                     parno = atoi(RExC_parse++);
7772
7773                     while (isDIGIT(*RExC_parse))
7774                         RExC_parse++;
7775                     ret = reganode(pRExC_state, GROUPP, parno);
7776
7777                  insert_if_check_paren:
7778                     if ((c = *nextchar(pRExC_state)) != ')')
7779                         vFAIL("Switch condition not recognized");
7780                   insert_if:
7781                     REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
7782                     br = regbranch(pRExC_state, &flags, 1,depth+1);
7783                     if (br == NULL)
7784                         br = reganode(pRExC_state, LONGJMP, 0);
7785                     else
7786                         REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
7787                     c = *nextchar(pRExC_state);
7788                     if (flags&HASWIDTH)
7789                         *flagp |= HASWIDTH;
7790                     if (c == '|') {
7791                         if (is_define)
7792                             vFAIL("(?(DEFINE)....) does not allow branches");
7793                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
7794                         regbranch(pRExC_state, &flags, 1,depth+1);
7795                         REGTAIL(pRExC_state, ret, lastbr);
7796                         if (flags&HASWIDTH)
7797                             *flagp |= HASWIDTH;
7798                         c = *nextchar(pRExC_state);
7799                     }
7800                     else
7801                         lastbr = NULL;
7802                     if (c != ')')
7803                         vFAIL("Switch (?(condition)... contains too many branches");
7804                     ender = reg_node(pRExC_state, TAIL);
7805                     REGTAIL(pRExC_state, br, ender);
7806                     if (lastbr) {
7807                         REGTAIL(pRExC_state, lastbr, ender);
7808                         REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
7809                     }
7810                     else
7811                         REGTAIL(pRExC_state, ret, ender);
7812                     RExC_size++; /* XXX WHY do we need this?!!
7813                                     For large programs it seems to be required
7814                                     but I can't figure out why. -- dmq*/
7815                     return ret;
7816                 }
7817                 else {
7818                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
7819                 }
7820             }
7821             case 0:
7822                 RExC_parse--; /* for vFAIL to print correctly */
7823                 vFAIL("Sequence (? incomplete");
7824                 break;
7825             case DEFAULT_PAT_MOD:   /* Use default flags with the exceptions
7826                                        that follow */
7827                 has_use_defaults = TRUE;
7828                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
7829                 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
7830                                                 ? REGEX_UNICODE_CHARSET
7831                                                 : REGEX_DEPENDS_CHARSET);
7832                 goto parse_flags;
7833             default:
7834                 --RExC_parse;
7835                 parse_flags:      /* (?i) */
7836             {
7837                 U32 posflags = 0, negflags = 0;
7838                 U32 *flagsp = &posflags;
7839                 char has_charset_modifier = '\0';
7840                 regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
7841                                     ? REGEX_UNICODE_CHARSET
7842                                     : REGEX_DEPENDS_CHARSET;
7843
7844                 while (*RExC_parse) {
7845                     /* && strchr("iogcmsx", *RExC_parse) */
7846                     /* (?g), (?gc) and (?o) are useless here
7847                        and must be globally applied -- japhy */
7848                     switch (*RExC_parse) {
7849                     CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
7850                     case LOCALE_PAT_MOD:
7851                         if (has_charset_modifier) {
7852                             goto excess_modifier;
7853                         }
7854                         else if (flagsp == &negflags) {
7855                             goto neg_modifier;
7856                         }
7857                         cs = REGEX_LOCALE_CHARSET;
7858                         has_charset_modifier = LOCALE_PAT_MOD;
7859                         RExC_contains_locale = 1;
7860                         break;
7861                     case UNICODE_PAT_MOD:
7862                         if (has_charset_modifier) {
7863                             goto excess_modifier;
7864                         }
7865                         else if (flagsp == &negflags) {
7866                             goto neg_modifier;
7867                         }
7868                         cs = REGEX_UNICODE_CHARSET;
7869                         has_charset_modifier = UNICODE_PAT_MOD;
7870                         break;
7871                     case ASCII_RESTRICT_PAT_MOD:
7872                         if (flagsp == &negflags) {
7873                             goto neg_modifier;
7874                         }
7875                         if (has_charset_modifier) {
7876                             if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
7877                                 goto excess_modifier;
7878                             }
7879                             /* Doubled modifier implies more restricted */
7880                             cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
7881                         }
7882                         else {
7883                             cs = REGEX_ASCII_RESTRICTED_CHARSET;
7884                         }
7885                         has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
7886                         break;
7887                     case DEPENDS_PAT_MOD:
7888                         if (has_use_defaults) {
7889                             goto fail_modifiers;
7890                         }
7891                         else if (flagsp == &negflags) {
7892                             goto neg_modifier;
7893                         }
7894                         else if (has_charset_modifier) {
7895                             goto excess_modifier;
7896                         }
7897
7898                         /* The dual charset means unicode semantics if the
7899                          * pattern (or target, not known until runtime) are
7900                          * utf8, or something in the pattern indicates unicode
7901                          * semantics */
7902                         cs = (RExC_utf8 || RExC_uni_semantics)
7903                              ? REGEX_UNICODE_CHARSET
7904                              : REGEX_DEPENDS_CHARSET;
7905                         has_charset_modifier = DEPENDS_PAT_MOD;
7906                         break;
7907                     excess_modifier:
7908                         RExC_parse++;
7909                         if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
7910                             vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
7911                         }
7912                         else if (has_charset_modifier == *(RExC_parse - 1)) {
7913                             vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
7914                         }
7915                         else {
7916                             vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
7917                         }
7918                         /*NOTREACHED*/
7919                     neg_modifier:
7920                         RExC_parse++;
7921                         vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
7922                         /*NOTREACHED*/
7923                     case ONCE_PAT_MOD: /* 'o' */
7924                     case GLOBAL_PAT_MOD: /* 'g' */
7925                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7926                             const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
7927                             if (! (wastedflags & wflagbit) ) {
7928                                 wastedflags |= wflagbit;
7929                                 vWARN5(
7930                                     RExC_parse + 1,
7931                                     "Useless (%s%c) - %suse /%c modifier",
7932                                     flagsp == &negflags ? "?-" : "?",
7933                                     *RExC_parse,
7934                                     flagsp == &negflags ? "don't " : "",
7935                                     *RExC_parse
7936                                 );
7937                             }
7938                         }
7939                         break;
7940
7941                     case CONTINUE_PAT_MOD: /* 'c' */
7942                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7943                             if (! (wastedflags & WASTED_C) ) {
7944                                 wastedflags |= WASTED_GC;
7945                                 vWARN3(
7946                                     RExC_parse + 1,
7947                                     "Useless (%sc) - %suse /gc modifier",
7948                                     flagsp == &negflags ? "?-" : "?",
7949                                     flagsp == &negflags ? "don't " : ""
7950                                 );
7951                             }
7952                         }
7953                         break;
7954                     case KEEPCOPY_PAT_MOD: /* 'p' */
7955                         if (flagsp == &negflags) {
7956                             if (SIZE_ONLY)
7957                                 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
7958                         } else {
7959                             *flagsp |= RXf_PMf_KEEPCOPY;
7960                         }
7961                         break;
7962                     case '-':
7963                         /* A flag is a default iff it is following a minus, so
7964                          * if there is a minus, it means will be trying to
7965                          * re-specify a default which is an error */
7966                         if (has_use_defaults || flagsp == &negflags) {
7967             fail_modifiers:
7968                             RExC_parse++;
7969                             vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7970                             /*NOTREACHED*/
7971                         }
7972                         flagsp = &negflags;
7973                         wastedflags = 0;  /* reset so (?g-c) warns twice */
7974                         break;
7975                     case ':':
7976                         paren = ':';
7977                         /*FALLTHROUGH*/
7978                     case ')':
7979                         RExC_flags |= posflags;
7980                         RExC_flags &= ~negflags;
7981                         set_regex_charset(&RExC_flags, cs);
7982                         if (paren != ':') {
7983                             oregflags |= posflags;
7984                             oregflags &= ~negflags;
7985                             set_regex_charset(&oregflags, cs);
7986                         }
7987                         nextchar(pRExC_state);
7988                         if (paren != ':') {
7989                             *flagp = TRYAGAIN;
7990                             return NULL;
7991                         } else {
7992                             ret = NULL;
7993                             goto parse_rest;
7994                         }
7995                         /*NOTREACHED*/
7996                     default:
7997                         RExC_parse++;
7998                         vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7999                         /*NOTREACHED*/
8000                     }
8001                     ++RExC_parse;
8002                 }
8003             }} /* one for the default block, one for the switch */
8004         }
8005         else {                  /* (...) */
8006           capturing_parens:
8007             parno = RExC_npar;
8008             RExC_npar++;
8009
8010             ret = reganode(pRExC_state, OPEN, parno);
8011             if (!SIZE_ONLY ){
8012                 if (!RExC_nestroot)
8013                     RExC_nestroot = parno;
8014                 if (RExC_seen & REG_SEEN_RECURSE
8015                     && !RExC_open_parens[parno-1])
8016                 {
8017                     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8018                         "Setting open paren #%"IVdf" to %d\n",
8019                         (IV)parno, REG_NODE_NUM(ret)));
8020                     RExC_open_parens[parno-1]= ret;
8021                 }
8022             }
8023             Set_Node_Length(ret, 1); /* MJD */
8024             Set_Node_Offset(ret, RExC_parse); /* MJD */
8025             is_open = 1;
8026         }
8027     }
8028     else                        /* ! paren */
8029         ret = NULL;
8030
8031    parse_rest:
8032     /* Pick up the branches, linking them together. */
8033     parse_start = RExC_parse;   /* MJD */
8034     br = regbranch(pRExC_state, &flags, 1,depth+1);
8035
8036     /*     branch_len = (paren != 0); */
8037
8038     if (br == NULL)
8039         return(NULL);
8040     if (*RExC_parse == '|') {
8041         if (!SIZE_ONLY && RExC_extralen) {
8042             reginsert(pRExC_state, BRANCHJ, br, depth+1);
8043         }
8044         else {                  /* MJD */
8045             reginsert(pRExC_state, BRANCH, br, depth+1);
8046             Set_Node_Length(br, paren != 0);
8047             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
8048         }
8049         have_branch = 1;
8050         if (SIZE_ONLY)
8051             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
8052     }
8053     else if (paren == ':') {
8054         *flagp |= flags&SIMPLE;
8055     }
8056     if (is_open) {                              /* Starts with OPEN. */
8057         REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
8058     }
8059     else if (paren != '?')              /* Not Conditional */
8060         ret = br;
8061     *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8062     lastbr = br;
8063     while (*RExC_parse == '|') {
8064         if (!SIZE_ONLY && RExC_extralen) {
8065             ender = reganode(pRExC_state, LONGJMP,0);
8066             REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
8067         }
8068         if (SIZE_ONLY)
8069             RExC_extralen += 2;         /* Account for LONGJMP. */
8070         nextchar(pRExC_state);
8071         if (freeze_paren) {
8072             if (RExC_npar > after_freeze)
8073                 after_freeze = RExC_npar;
8074             RExC_npar = freeze_paren;
8075         }
8076         br = regbranch(pRExC_state, &flags, 0, depth+1);
8077
8078         if (br == NULL)
8079             return(NULL);
8080         REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
8081         lastbr = br;
8082         *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8083     }
8084
8085     if (have_branch || paren != ':') {
8086         /* Make a closing node, and hook it on the end. */
8087         switch (paren) {
8088         case ':':
8089             ender = reg_node(pRExC_state, TAIL);
8090             break;
8091         case 1:
8092             ender = reganode(pRExC_state, CLOSE, parno);
8093             if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
8094                 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8095                         "Setting close paren #%"IVdf" to %d\n",
8096                         (IV)parno, REG_NODE_NUM(ender)));
8097                 RExC_close_parens[parno-1]= ender;
8098                 if (RExC_nestroot == parno)
8099                     RExC_nestroot = 0;
8100             }
8101             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
8102             Set_Node_Length(ender,1); /* MJD */
8103             break;
8104         case '<':
8105         case ',':
8106         case '=':
8107         case '!':
8108             *flagp &= ~HASWIDTH;
8109             /* FALL THROUGH */
8110         case '>':
8111             ender = reg_node(pRExC_state, SUCCEED);
8112             break;
8113         case 0:
8114             ender = reg_node(pRExC_state, END);
8115             if (!SIZE_ONLY) {
8116                 assert(!RExC_opend); /* there can only be one! */
8117                 RExC_opend = ender;
8118             }
8119             break;
8120         }
8121         REGTAIL(pRExC_state, lastbr, ender);
8122
8123         if (have_branch && !SIZE_ONLY) {
8124             if (depth==1)
8125                 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
8126
8127             /* Hook the tails of the branches to the closing node. */
8128             for (br = ret; br; br = regnext(br)) {
8129                 const U8 op = PL_regkind[OP(br)];
8130                 if (op == BRANCH) {
8131                     REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
8132                 }
8133                 else if (op == BRANCHJ) {
8134                     REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
8135                 }
8136             }
8137         }
8138     }
8139
8140     {
8141         const char *p;
8142         static const char parens[] = "=!<,>";
8143
8144         if (paren && (p = strchr(parens, paren))) {
8145             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
8146             int flag = (p - parens) > 1;
8147
8148             if (paren == '>')
8149                 node = SUSPEND, flag = 0;
8150             reginsert(pRExC_state, node,ret, depth+1);
8151             Set_Node_Cur_Length(ret);
8152             Set_Node_Offset(ret, parse_start + 1);
8153             ret->flags = flag;
8154             REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
8155         }
8156     }
8157
8158     /* Check for proper termination. */
8159     if (paren) {
8160         RExC_flags = oregflags;
8161         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
8162             RExC_parse = oregcomp_parse;
8163             vFAIL("Unmatched (");
8164         }
8165     }
8166     else if (!paren && RExC_parse < RExC_end) {
8167         if (*RExC_parse == ')') {
8168             RExC_parse++;
8169             vFAIL("Unmatched )");
8170         }
8171         else
8172             FAIL("Junk on end of regexp");      /* "Can't happen". */
8173         /* NOTREACHED */
8174     }
8175
8176     if (RExC_in_lookbehind) {
8177         RExC_in_lookbehind--;
8178     }
8179     if (after_freeze > RExC_npar)
8180         RExC_npar = after_freeze;
8181     return(ret);
8182 }
8183
8184 /*
8185  - regbranch - one alternative of an | operator
8186  *
8187  * Implements the concatenation operator.
8188  */
8189 STATIC regnode *
8190 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
8191 {
8192     dVAR;
8193     register regnode *ret;
8194     register regnode *chain = NULL;
8195     register regnode *latest;
8196     I32 flags = 0, c = 0;
8197     GET_RE_DEBUG_FLAGS_DECL;
8198
8199     PERL_ARGS_ASSERT_REGBRANCH;
8200
8201     DEBUG_PARSE("brnc");
8202
8203     if (first)
8204         ret = NULL;
8205     else {
8206         if (!SIZE_ONLY && RExC_extralen)
8207             ret = reganode(pRExC_state, BRANCHJ,0);
8208         else {
8209             ret = reg_node(pRExC_state, BRANCH);
8210             Set_Node_Length(ret, 1);
8211         }
8212     }
8213
8214     if (!first && SIZE_ONLY)
8215         RExC_extralen += 1;                     /* BRANCHJ */
8216
8217     *flagp = WORST;                     /* Tentatively. */
8218
8219     RExC_parse--;
8220     nextchar(pRExC_state);
8221     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
8222         flags &= ~TRYAGAIN;
8223         latest = regpiece(pRExC_state, &flags,depth+1);
8224         if (latest == NULL) {
8225             if (flags & TRYAGAIN)
8226                 continue;
8227             return(NULL);
8228         }
8229         else if (ret == NULL)
8230             ret = latest;
8231         *flagp |= flags&(HASWIDTH|POSTPONED);
8232         if (chain == NULL)      /* First piece. */
8233             *flagp |= flags&SPSTART;
8234         else {
8235             RExC_naughty++;
8236             REGTAIL(pRExC_state, chain, latest);
8237         }
8238         chain = latest;
8239         c++;
8240     }
8241     if (chain == NULL) {        /* Loop ran zero times. */
8242         chain = reg_node(pRExC_state, NOTHING);
8243         if (ret == NULL)
8244             ret = chain;
8245     }
8246     if (c == 1) {
8247         *flagp |= flags&SIMPLE;
8248     }
8249
8250     return ret;
8251 }
8252
8253 /*
8254  - regpiece - something followed by possible [*+?]
8255  *
8256  * Note that the branching code sequences used for ? and the general cases
8257  * of * and + are somewhat optimized:  they use the same NOTHING node as
8258  * both the endmarker for their branch list and the body of the last branch.
8259  * It might seem that this node could be dispensed with entirely, but the
8260  * endmarker role is not redundant.
8261  */
8262 STATIC regnode *
8263 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8264 {
8265     dVAR;
8266     register regnode *ret;
8267     register char op;
8268     register char *next;
8269     I32 flags;
8270     const char * const origparse = RExC_parse;
8271     I32 min;
8272     I32 max = REG_INFTY;
8273 #ifdef RE_TRACK_PATTERN_OFFSETS
8274     char *parse_start;
8275 #endif
8276     const char *maxpos = NULL;
8277     GET_RE_DEBUG_FLAGS_DECL;
8278
8279     PERL_ARGS_ASSERT_REGPIECE;
8280
8281     DEBUG_PARSE("piec");
8282
8283     ret = regatom(pRExC_state, &flags,depth+1);
8284     if (ret == NULL) {
8285         if (flags & TRYAGAIN)
8286             *flagp |= TRYAGAIN;
8287         return(NULL);
8288     }
8289
8290     op = *RExC_parse;
8291
8292     if (op == '{' && regcurly(RExC_parse)) {
8293         maxpos = NULL;
8294 #ifdef RE_TRACK_PATTERN_OFFSETS
8295         parse_start = RExC_parse; /* MJD */
8296 #endif
8297         next = RExC_parse + 1;
8298         while (isDIGIT(*next) || *next == ',') {
8299             if (*next == ',') {
8300                 if (maxpos)
8301                     break;
8302                 else
8303                     maxpos = next;
8304             }
8305             next++;
8306         }
8307         if (*next == '}') {             /* got one */
8308             if (!maxpos)
8309                 maxpos = next;
8310             RExC_parse++;
8311             min = atoi(RExC_parse);
8312             if (*maxpos == ',')
8313                 maxpos++;
8314             else
8315                 maxpos = RExC_parse;
8316             max = atoi(maxpos);
8317             if (!max && *maxpos != '0')
8318                 max = REG_INFTY;                /* meaning "infinity" */
8319             else if (max >= REG_INFTY)
8320                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
8321             RExC_parse = next;
8322             nextchar(pRExC_state);
8323
8324         do_curly:
8325             if ((flags&SIMPLE)) {
8326                 RExC_naughty += 2 + RExC_naughty / 2;
8327                 reginsert(pRExC_state, CURLY, ret, depth+1);
8328                 Set_Node_Offset(ret, parse_start+1); /* MJD */
8329                 Set_Node_Cur_Length(ret);
8330             }
8331             else {
8332                 regnode * const w = reg_node(pRExC_state, WHILEM);
8333
8334                 w->flags = 0;
8335                 REGTAIL(pRExC_state, ret, w);
8336                 if (!SIZE_ONLY && RExC_extralen) {
8337                     reginsert(pRExC_state, LONGJMP,ret, depth+1);
8338                     reginsert(pRExC_state, NOTHING,ret, depth+1);
8339                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
8340                 }
8341                 reginsert(pRExC_state, CURLYX,ret, depth+1);
8342                                 /* MJD hk */
8343                 Set_Node_Offset(ret, parse_start+1);
8344                 Set_Node_Length(ret,
8345                                 op == '{' ? (RExC_parse - parse_start) : 1);
8346
8347                 if (!SIZE_ONLY && RExC_extralen)
8348                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
8349                 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
8350                 if (SIZE_ONLY)
8351                     RExC_whilem_seen++, RExC_extralen += 3;
8352                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
8353             }
8354             ret->flags = 0;
8355
8356             if (min > 0)
8357                 *flagp = WORST;
8358             if (max > 0)
8359                 *flagp |= HASWIDTH;
8360             if (max < min)
8361                 vFAIL("Can't do {n,m} with n > m");
8362             if (!SIZE_ONLY) {
8363                 ARG1_SET(ret, (U16)min);
8364                 ARG2_SET(ret, (U16)max);
8365             }
8366
8367             goto nest_check;
8368         }
8369     }
8370
8371     if (!ISMULT1(op)) {
8372         *flagp = flags;
8373         return(ret);
8374     }
8375
8376 #if 0                           /* Now runtime fix should be reliable. */
8377
8378     /* if this is reinstated, don't forget to put this back into perldiag:
8379
8380             =item Regexp *+ operand could be empty at {#} in regex m/%s/
8381
8382            (F) The part of the regexp subject to either the * or + quantifier
8383            could match an empty string. The {#} shows in the regular
8384            expression about where the problem was discovered.
8385
8386     */
8387
8388     if (!(flags&HASWIDTH) && op != '?')
8389       vFAIL("Regexp *+ operand could be empty");
8390 #endif
8391
8392 #ifdef RE_TRACK_PATTERN_OFFSETS
8393     parse_start = RExC_parse;
8394 #endif
8395     nextchar(pRExC_state);
8396
8397     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
8398
8399     if (op == '*' && (flags&SIMPLE)) {
8400         reginsert(pRExC_state, STAR, ret, depth+1);
8401         ret->flags = 0;
8402         RExC_naughty += 4;
8403     }
8404     else if (op == '*') {
8405         min = 0;
8406         goto do_curly;
8407     }
8408     else if (op == '+' && (flags&SIMPLE)) {
8409         reginsert(pRExC_state, PLUS, ret, depth+1);
8410         ret->flags = 0;
8411         RExC_naughty += 3;
8412     }
8413     else if (op == '+') {
8414         min = 1;
8415         goto do_curly;
8416     }
8417     else if (op == '?') {
8418         min = 0; max = 1;
8419         goto do_curly;
8420     }
8421   nest_check:
8422     if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
8423         ckWARN3reg(RExC_parse,
8424                    "%.*s matches null string many times",
8425                    (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
8426                    origparse);
8427     }
8428
8429     if (RExC_parse < RExC_end && *RExC_parse == '?') {
8430         nextchar(pRExC_state);
8431         reginsert(pRExC_state, MINMOD, ret, depth+1);
8432         REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
8433     }
8434 #ifndef REG_ALLOW_MINMOD_SUSPEND
8435     else
8436 #endif
8437     if (RExC_parse < RExC_end && *RExC_parse == '+') {
8438         regnode *ender;
8439         nextchar(pRExC_state);
8440         ender = reg_node(pRExC_state, SUCCEED);
8441         REGTAIL(pRExC_state, ret, ender);
8442         reginsert(pRExC_state, SUSPEND, ret, depth+1);
8443         ret->flags = 0;
8444         ender = reg_node(pRExC_state, TAIL);
8445         REGTAIL(pRExC_state, ret, ender);
8446         /*ret= ender;*/
8447     }
8448
8449     if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
8450         RExC_parse++;
8451         vFAIL("Nested quantifiers");
8452     }
8453
8454     return(ret);
8455 }
8456
8457
8458 /* reg_namedseq(pRExC_state,UVp, UV depth)
8459
8460    This is expected to be called by a parser routine that has
8461    recognized '\N' and needs to handle the rest. RExC_parse is
8462    expected to point at the first char following the N at the time
8463    of the call.
8464
8465    The \N may be inside (indicated by valuep not being NULL) or outside a
8466    character class.
8467
8468    \N may begin either a named sequence, or if outside a character class, mean
8469    to match a non-newline.  For non single-quoted regexes, the tokenizer has
8470    attempted to decide which, and in the case of a named sequence converted it
8471    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
8472    where c1... are the characters in the sequence.  For single-quoted regexes,
8473    the tokenizer passes the \N sequence through unchanged; this code will not
8474    attempt to determine this nor expand those.  The net effect is that if the
8475    beginning of the passed-in pattern isn't '{U+' or there is no '}', it
8476    signals that this \N occurrence means to match a non-newline.
8477
8478    Only the \N{U+...} form should occur in a character class, for the same
8479    reason that '.' inside a character class means to just match a period: it
8480    just doesn't make sense.
8481
8482    If valuep is non-null then it is assumed that we are parsing inside
8483    of a charclass definition and the first codepoint in the resolved
8484    string is returned via *valuep and the routine will return NULL.
8485    In this mode if a multichar string is returned from the charnames
8486    handler, a warning will be issued, and only the first char in the
8487    sequence will be examined. If the string returned is zero length
8488    then the value of *valuep is undefined and NON-NULL will
8489    be returned to indicate failure. (This will NOT be a valid pointer
8490    to a regnode.)
8491
8492    If valuep is null then it is assumed that we are parsing normal text and a
8493    new EXACT node is inserted into the program containing the resolved string,
8494    and a pointer to the new node is returned.  But if the string is zero length
8495    a NOTHING node is emitted instead.
8496
8497    On success RExC_parse is set to the char following the endbrace.
8498    Parsing failures will generate a fatal error via vFAIL(...)
8499  */
8500 STATIC regnode *
8501 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
8502 {
8503     char * endbrace;    /* '}' following the name */
8504     regnode *ret = NULL;
8505     char* p;
8506
8507     GET_RE_DEBUG_FLAGS_DECL;
8508
8509     PERL_ARGS_ASSERT_REG_NAMEDSEQ;
8510
8511     GET_RE_DEBUG_FLAGS;
8512
8513     /* The [^\n] meaning of \N ignores spaces and comments under the /x
8514      * modifier.  The other meaning does not */
8515     p = (RExC_flags & RXf_PMf_EXTENDED)
8516         ? regwhite( pRExC_state, RExC_parse )
8517         : RExC_parse;
8518
8519     /* Disambiguate between \N meaning a named character versus \N meaning
8520      * [^\n].  The former is assumed when it can't be the latter. */
8521     if (*p != '{' || regcurly(p)) {
8522         RExC_parse = p;
8523         if (valuep) {
8524             /* no bare \N in a charclass */
8525             vFAIL("\\N in a character class must be a named character: \\N{...}");
8526         }
8527         nextchar(pRExC_state);
8528         ret = reg_node(pRExC_state, REG_ANY);
8529         *flagp |= HASWIDTH|SIMPLE;
8530         RExC_naughty++;
8531         RExC_parse--;
8532         Set_Node_Length(ret, 1); /* MJD */
8533         return ret;
8534     }
8535
8536     /* Here, we have decided it should be a named sequence */
8537
8538     /* The test above made sure that the next real character is a '{', but
8539      * under the /x modifier, it could be separated by space (or a comment and
8540      * \n) and this is not allowed (for consistency with \x{...} and the
8541      * tokenizer handling of \N{NAME}). */
8542     if (*RExC_parse != '{') {
8543         vFAIL("Missing braces on \\N{}");
8544     }
8545
8546     RExC_parse++;       /* Skip past the '{' */
8547
8548     if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
8549         || ! (endbrace == RExC_parse            /* nothing between the {} */
8550               || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
8551                   && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
8552     {
8553         if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
8554         vFAIL("\\N{NAME} must be resolved by the lexer");
8555     }
8556
8557     if (endbrace == RExC_parse) {   /* empty: \N{} */
8558         if (! valuep) {
8559             RExC_parse = endbrace + 1;
8560             return reg_node(pRExC_state,NOTHING);
8561         }
8562
8563         if (SIZE_ONLY) {
8564             ckWARNreg(RExC_parse,
8565                     "Ignoring zero length \\N{} in character class"
8566             );
8567             RExC_parse = endbrace + 1;
8568         }
8569         *valuep = 0;
8570         return (regnode *) &RExC_parse; /* Invalid regnode pointer */
8571     }
8572
8573     REQUIRE_UTF8;       /* named sequences imply Unicode semantics */
8574     RExC_parse += 2;    /* Skip past the 'U+' */
8575
8576     if (valuep) {   /* In a bracketed char class */
8577         /* We only pay attention to the first char of
8578         multichar strings being returned. I kinda wonder
8579         if this makes sense as it does change the behaviour
8580         from earlier versions, OTOH that behaviour was broken
8581         as well. XXX Solution is to recharacterize as
8582         [rest-of-class]|multi1|multi2... */
8583
8584         STRLEN length_of_hex;
8585         I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8586             | PERL_SCAN_DISALLOW_PREFIX
8587             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
8588
8589         char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
8590         if (endchar < endbrace) {
8591             ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
8592         }
8593
8594         length_of_hex = (STRLEN)(endchar - RExC_parse);
8595         *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
8596
8597         /* The tokenizer should have guaranteed validity, but it's possible to
8598          * bypass it by using single quoting, so check */
8599         if (length_of_hex == 0
8600             || length_of_hex != (STRLEN)(endchar - RExC_parse) )
8601         {
8602             RExC_parse += length_of_hex;        /* Includes all the valid */
8603             RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
8604                             ? UTF8SKIP(RExC_parse)
8605                             : 1;
8606             /* Guard against malformed utf8 */
8607             if (RExC_parse >= endchar) RExC_parse = endchar;
8608             vFAIL("Invalid hexadecimal number in \\N{U+...}");
8609         }
8610
8611         RExC_parse = endbrace + 1;
8612         if (endchar == endbrace) return NULL;
8613
8614         ret = (regnode *) &RExC_parse;  /* Invalid regnode pointer */
8615     }
8616     else {      /* Not a char class */
8617
8618         /* What is done here is to convert this to a sub-pattern of the form
8619          * (?:\x{char1}\x{char2}...)
8620          * and then call reg recursively.  That way, it retains its atomicness,
8621          * while not having to worry about special handling that some code
8622          * points may have.  toke.c has converted the original Unicode values
8623          * to native, so that we can just pass on the hex values unchanged.  We
8624          * do have to set a flag to keep recoding from happening in the
8625          * recursion */
8626
8627         SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
8628         STRLEN len;
8629         char *endchar;      /* Points to '.' or '}' ending cur char in the input
8630                                stream */
8631         char *orig_end = RExC_end;
8632
8633         while (RExC_parse < endbrace) {
8634
8635             /* Code points are separated by dots.  If none, there is only one
8636              * code point, and is terminated by the brace */
8637             endchar = RExC_parse + strcspn(RExC_parse, ".}");
8638
8639             /* Convert to notation the rest of the code understands */
8640             sv_catpv(substitute_parse, "\\x{");
8641             sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
8642             sv_catpv(substitute_parse, "}");
8643
8644             /* Point to the beginning of the next character in the sequence. */
8645             RExC_parse = endchar + 1;
8646         }
8647         sv_catpv(substitute_parse, ")");
8648
8649         RExC_parse = SvPV(substitute_parse, len);
8650
8651         /* Don't allow empty number */
8652         if (len < 8) {
8653             vFAIL("Invalid hexadecimal number in \\N{U+...}");
8654         }
8655         RExC_end = RExC_parse + len;
8656
8657         /* The values are Unicode, and therefore not subject to recoding */
8658         RExC_override_recoding = 1;
8659
8660         ret = reg(pRExC_state, 1, flagp, depth+1);
8661
8662         RExC_parse = endbrace;
8663         RExC_end = orig_end;
8664         RExC_override_recoding = 0;
8665
8666         nextchar(pRExC_state);
8667     }
8668
8669     return ret;
8670 }
8671
8672
8673 /*
8674  * reg_recode
8675  *
8676  * It returns the code point in utf8 for the value in *encp.
8677  *    value: a code value in the source encoding
8678  *    encp:  a pointer to an Encode object
8679  *
8680  * If the result from Encode is not a single character,
8681  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
8682  */
8683 STATIC UV
8684 S_reg_recode(pTHX_ const char value, SV **encp)
8685 {
8686     STRLEN numlen = 1;
8687     SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
8688     const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
8689     const STRLEN newlen = SvCUR(sv);
8690     UV uv = UNICODE_REPLACEMENT;
8691
8692     PERL_ARGS_ASSERT_REG_RECODE;
8693
8694     if (newlen)
8695         uv = SvUTF8(sv)
8696              ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
8697              : *(U8*)s;
8698
8699     if (!newlen || numlen != newlen) {
8700         uv = UNICODE_REPLACEMENT;
8701         *encp = NULL;
8702     }
8703     return uv;
8704 }
8705
8706
8707 /*
8708  - regatom - the lowest level
8709
8710    Try to identify anything special at the start of the pattern. If there
8711    is, then handle it as required. This may involve generating a single regop,
8712    such as for an assertion; or it may involve recursing, such as to
8713    handle a () structure.
8714
8715    If the string doesn't start with something special then we gobble up
8716    as much literal text as we can.
8717
8718    Once we have been able to handle whatever type of thing started the
8719    sequence, we return.
8720
8721    Note: we have to be careful with escapes, as they can be both literal
8722    and special, and in the case of \10 and friends can either, depending
8723    on context. Specifically there are two separate switches for handling
8724    escape sequences, with the one for handling literal escapes requiring
8725    a dummy entry for all of the special escapes that are actually handled
8726    by the other.
8727 */
8728
8729 STATIC regnode *
8730 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8731 {
8732     dVAR;
8733     register regnode *ret = NULL;
8734     I32 flags;
8735     char *parse_start = RExC_parse;
8736     U8 op;
8737     GET_RE_DEBUG_FLAGS_DECL;
8738     DEBUG_PARSE("atom");
8739     *flagp = WORST;             /* Tentatively. */
8740
8741     PERL_ARGS_ASSERT_REGATOM;
8742
8743 tryagain:
8744     switch ((U8)*RExC_parse) {
8745     case '^':
8746         RExC_seen_zerolen++;
8747         nextchar(pRExC_state);
8748         if (RExC_flags & RXf_PMf_MULTILINE)
8749             ret = reg_node(pRExC_state, MBOL);
8750         else if (RExC_flags & RXf_PMf_SINGLELINE)
8751             ret = reg_node(pRExC_state, SBOL);
8752         else
8753             ret = reg_node(pRExC_state, BOL);
8754         Set_Node_Length(ret, 1); /* MJD */
8755         break;
8756     case '$':
8757         nextchar(pRExC_state);
8758         if (*RExC_parse)
8759             RExC_seen_zerolen++;
8760         if (RExC_flags & RXf_PMf_MULTILINE)
8761             ret = reg_node(pRExC_state, MEOL);
8762         else if (RExC_flags & RXf_PMf_SINGLELINE)
8763             ret = reg_node(pRExC_state, SEOL);
8764         else
8765             ret = reg_node(pRExC_state, EOL);
8766         Set_Node_Length(ret, 1); /* MJD */
8767         break;
8768     case '.':
8769         nextchar(pRExC_state);
8770         if (RExC_flags & RXf_PMf_SINGLELINE)
8771             ret = reg_node(pRExC_state, SANY);
8772         else
8773             ret = reg_node(pRExC_state, REG_ANY);
8774         *flagp |= HASWIDTH|SIMPLE;
8775         RExC_naughty++;
8776         Set_Node_Length(ret, 1); /* MJD */
8777         break;
8778     case '[':
8779     {
8780         char * const oregcomp_parse = ++RExC_parse;
8781         ret = regclass(pRExC_state,depth+1);
8782         if (*RExC_parse != ']') {
8783             RExC_parse = oregcomp_parse;
8784             vFAIL("Unmatched [");
8785         }
8786         nextchar(pRExC_state);
8787         *flagp |= HASWIDTH|SIMPLE;
8788         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
8789         break;
8790     }
8791     case '(':
8792         nextchar(pRExC_state);
8793         ret = reg(pRExC_state, 1, &flags,depth+1);
8794         if (ret == NULL) {
8795                 if (flags & TRYAGAIN) {
8796                     if (RExC_parse == RExC_end) {
8797                          /* Make parent create an empty node if needed. */
8798                         *flagp |= TRYAGAIN;
8799                         return(NULL);
8800                     }
8801                     goto tryagain;
8802                 }
8803                 return(NULL);
8804         }
8805         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
8806         break;
8807     case '|':
8808     case ')':
8809         if (flags & TRYAGAIN) {
8810             *flagp |= TRYAGAIN;
8811             return NULL;
8812         }
8813         vFAIL("Internal urp");
8814                                 /* Supposed to be caught earlier. */
8815         break;
8816     case '{':
8817         if (!regcurly(RExC_parse)) {
8818             RExC_parse++;
8819             goto defchar;
8820         }
8821         /* FALL THROUGH */
8822     case '?':
8823     case '+':
8824     case '*':
8825         RExC_parse++;
8826         vFAIL("Quantifier follows nothing");
8827         break;
8828     case '\\':
8829         /* Special Escapes
8830
8831            This switch handles escape sequences that resolve to some kind
8832            of special regop and not to literal text. Escape sequnces that
8833            resolve to literal text are handled below in the switch marked
8834            "Literal Escapes".
8835
8836            Every entry in this switch *must* have a corresponding entry
8837            in the literal escape switch. However, the opposite is not
8838            required, as the default for this switch is to jump to the
8839            literal text handling code.
8840         */
8841         switch ((U8)*++RExC_parse) {
8842         /* Special Escapes */
8843         case 'A':
8844             RExC_seen_zerolen++;
8845             ret = reg_node(pRExC_state, SBOL);
8846             *flagp |= SIMPLE;
8847             goto finish_meta_pat;
8848         case 'G':
8849             ret = reg_node(pRExC_state, GPOS);
8850             RExC_seen |= REG_SEEN_GPOS;
8851             *flagp |= SIMPLE;
8852             goto finish_meta_pat;
8853         case 'K':
8854             RExC_seen_zerolen++;
8855             ret = reg_node(pRExC_state, KEEPS);
8856             *flagp |= SIMPLE;
8857             /* XXX:dmq : disabling in-place substitution seems to
8858              * be necessary here to avoid cases of memory corruption, as
8859              * with: C<$_="x" x 80; s/x\K/y/> -- rgs
8860              */
8861             RExC_seen |= REG_SEEN_LOOKBEHIND;
8862             goto finish_meta_pat;
8863         case 'Z':
8864             ret = reg_node(pRExC_state, SEOL);
8865             *flagp |= SIMPLE;
8866             RExC_seen_zerolen++;                /* Do not optimize RE away */
8867             goto finish_meta_pat;
8868         case 'z':
8869             ret = reg_node(pRExC_state, EOS);
8870             *flagp |= SIMPLE;
8871             RExC_seen_zerolen++;                /* Do not optimize RE away */
8872             goto finish_meta_pat;
8873         case 'C':
8874             ret = reg_node(pRExC_state, CANY);
8875             RExC_seen |= REG_SEEN_CANY;
8876             *flagp |= HASWIDTH|SIMPLE;
8877             goto finish_meta_pat;
8878         case 'X':
8879             ret = reg_node(pRExC_state, CLUMP);
8880             *flagp |= HASWIDTH;
8881             goto finish_meta_pat;
8882         case 'w':
8883             switch (get_regex_charset(RExC_flags)) {
8884                 case REGEX_LOCALE_CHARSET:
8885                     op = ALNUML;
8886                     break;
8887                 case REGEX_UNICODE_CHARSET:
8888                     op = ALNUMU;
8889                     break;
8890                 case REGEX_ASCII_RESTRICTED_CHARSET:
8891                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8892                     op = ALNUMA;
8893                     break;
8894                 case REGEX_DEPENDS_CHARSET:
8895                     op = ALNUM;
8896                     break;
8897                 default:
8898                     goto bad_charset;
8899             }
8900             ret = reg_node(pRExC_state, op);
8901             *flagp |= HASWIDTH|SIMPLE;
8902             goto finish_meta_pat;
8903         case 'W':
8904             switch (get_regex_charset(RExC_flags)) {
8905                 case REGEX_LOCALE_CHARSET:
8906                     op = NALNUML;
8907                     break;
8908                 case REGEX_UNICODE_CHARSET:
8909                     op = NALNUMU;
8910                     break;
8911                 case REGEX_ASCII_RESTRICTED_CHARSET:
8912                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8913                     op = NALNUMA;
8914                     break;
8915                 case REGEX_DEPENDS_CHARSET:
8916                     op = NALNUM;
8917                     break;
8918                 default:
8919                     goto bad_charset;
8920             }
8921             ret = reg_node(pRExC_state, op);
8922             *flagp |= HASWIDTH|SIMPLE;
8923             goto finish_meta_pat;
8924         case 'b':
8925             RExC_seen_zerolen++;
8926             RExC_seen |= REG_SEEN_LOOKBEHIND;
8927             switch (get_regex_charset(RExC_flags)) {
8928                 case REGEX_LOCALE_CHARSET:
8929                     op = BOUNDL;
8930                     break;
8931                 case REGEX_UNICODE_CHARSET:
8932                     op = BOUNDU;
8933                     break;
8934                 case REGEX_ASCII_RESTRICTED_CHARSET:
8935                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8936                     op = BOUNDA;
8937                     break;
8938                 case REGEX_DEPENDS_CHARSET:
8939                     op = BOUND;
8940                     break;
8941                 default:
8942                     goto bad_charset;
8943             }
8944             ret = reg_node(pRExC_state, op);
8945             FLAGS(ret) = get_regex_charset(RExC_flags);
8946             *flagp |= SIMPLE;
8947             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8948                 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
8949             }
8950             goto finish_meta_pat;
8951         case 'B':
8952             RExC_seen_zerolen++;
8953             RExC_seen |= REG_SEEN_LOOKBEHIND;
8954             switch (get_regex_charset(RExC_flags)) {
8955                 case REGEX_LOCALE_CHARSET:
8956                     op = NBOUNDL;
8957                     break;
8958                 case REGEX_UNICODE_CHARSET:
8959                     op = NBOUNDU;
8960                     break;
8961                 case REGEX_ASCII_RESTRICTED_CHARSET:
8962                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8963                     op = NBOUNDA;
8964                     break;
8965                 case REGEX_DEPENDS_CHARSET:
8966                     op = NBOUND;
8967                     break;
8968                 default:
8969                     goto bad_charset;
8970             }
8971             ret = reg_node(pRExC_state, op);
8972             FLAGS(ret) = get_regex_charset(RExC_flags);
8973             *flagp |= SIMPLE;
8974             if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8975                 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
8976             }
8977             goto finish_meta_pat;
8978         case 's':
8979             switch (get_regex_charset(RExC_flags)) {
8980                 case REGEX_LOCALE_CHARSET:
8981                     op = SPACEL;
8982                     break;
8983                 case REGEX_UNICODE_CHARSET:
8984                     op = SPACEU;
8985                     break;
8986                 case REGEX_ASCII_RESTRICTED_CHARSET:
8987                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8988                     op = SPACEA;
8989                     break;
8990                 case REGEX_DEPENDS_CHARSET:
8991                     op = SPACE;
8992                     break;
8993                 default:
8994                     goto bad_charset;
8995             }
8996             ret = reg_node(pRExC_state, op);
8997             *flagp |= HASWIDTH|SIMPLE;
8998             goto finish_meta_pat;
8999         case 'S':
9000             switch (get_regex_charset(RExC_flags)) {
9001                 case REGEX_LOCALE_CHARSET:
9002                     op = NSPACEL;
9003                     break;
9004                 case REGEX_UNICODE_CHARSET:
9005                     op = NSPACEU;
9006                     break;
9007                 case REGEX_ASCII_RESTRICTED_CHARSET:
9008                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9009                     op = NSPACEA;
9010                     break;
9011                 case REGEX_DEPENDS_CHARSET:
9012                     op = NSPACE;
9013                     break;
9014                 default:
9015                     goto bad_charset;
9016             }
9017             ret = reg_node(pRExC_state, op);
9018             *flagp |= HASWIDTH|SIMPLE;
9019             goto finish_meta_pat;
9020         case 'd':
9021             switch (get_regex_charset(RExC_flags)) {
9022                 case REGEX_LOCALE_CHARSET:
9023                     op = DIGITL;
9024                     break;
9025                 case REGEX_ASCII_RESTRICTED_CHARSET:
9026                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9027                     op = DIGITA;
9028                     break;
9029                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9030                 case REGEX_UNICODE_CHARSET:
9031                     op = DIGIT;
9032                     break;
9033                 default:
9034                     goto bad_charset;
9035             }
9036             ret = reg_node(pRExC_state, op);
9037             *flagp |= HASWIDTH|SIMPLE;
9038             goto finish_meta_pat;
9039         case 'D':
9040             switch (get_regex_charset(RExC_flags)) {
9041                 case REGEX_LOCALE_CHARSET:
9042                     op = NDIGITL;
9043                     break;
9044                 case REGEX_ASCII_RESTRICTED_CHARSET:
9045                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9046                     op = NDIGITA;
9047                     break;
9048                 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9049                 case REGEX_UNICODE_CHARSET:
9050                     op = NDIGIT;
9051                     break;
9052                 default:
9053                     goto bad_charset;
9054             }
9055             ret = reg_node(pRExC_state, op);
9056             *flagp |= HASWIDTH|SIMPLE;
9057             goto finish_meta_pat;
9058         case 'R':
9059             ret = reg_node(pRExC_state, LNBREAK);
9060             *flagp |= HASWIDTH|SIMPLE;
9061             goto finish_meta_pat;
9062         case 'h':
9063             ret = reg_node(pRExC_state, HORIZWS);
9064             *flagp |= HASWIDTH|SIMPLE;
9065             goto finish_meta_pat;
9066         case 'H':
9067             ret = reg_node(pRExC_state, NHORIZWS);
9068             *flagp |= HASWIDTH|SIMPLE;
9069             goto finish_meta_pat;
9070         case 'v':
9071             ret = reg_node(pRExC_state, VERTWS);
9072             *flagp |= HASWIDTH|SIMPLE;
9073             goto finish_meta_pat;
9074         case 'V':
9075             ret = reg_node(pRExC_state, NVERTWS);
9076             *flagp |= HASWIDTH|SIMPLE;
9077          finish_meta_pat:
9078             nextchar(pRExC_state);
9079             Set_Node_Length(ret, 2); /* MJD */
9080             break;
9081         case 'p':
9082         case 'P':
9083             {
9084                 char* const oldregxend = RExC_end;
9085 #ifdef DEBUGGING
9086                 char* parse_start = RExC_parse - 2;
9087 #endif
9088
9089                 if (RExC_parse[1] == '{') {
9090                   /* a lovely hack--pretend we saw [\pX] instead */
9091                     RExC_end = strchr(RExC_parse, '}');
9092                     if (!RExC_end) {
9093                         const U8 c = (U8)*RExC_parse;
9094                         RExC_parse += 2;
9095                         RExC_end = oldregxend;
9096                         vFAIL2("Missing right brace on \\%c{}", c);
9097                     }
9098                     RExC_end++;
9099                 }
9100                 else {
9101                     RExC_end = RExC_parse + 2;
9102                     if (RExC_end > oldregxend)
9103                         RExC_end = oldregxend;
9104                 }
9105                 RExC_parse--;
9106
9107                 ret = regclass(pRExC_state,depth+1);
9108
9109                 RExC_end = oldregxend;
9110                 RExC_parse--;
9111
9112                 Set_Node_Offset(ret, parse_start + 2);
9113                 Set_Node_Cur_Length(ret);
9114                 nextchar(pRExC_state);
9115                 *flagp |= HASWIDTH|SIMPLE;
9116             }
9117             break;
9118         case 'N':
9119             /* Handle \N and \N{NAME} here and not below because it can be
9120             multicharacter. join_exact() will join them up later on.
9121             Also this makes sure that things like /\N{BLAH}+/ and
9122             \N{BLAH} being multi char Just Happen. dmq*/
9123             ++RExC_parse;
9124             ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
9125             break;
9126         case 'k':    /* Handle \k<NAME> and \k'NAME' */
9127         parse_named_seq:
9128         {
9129             char ch= RExC_parse[1];
9130             if (ch != '<' && ch != '\'' && ch != '{') {
9131                 RExC_parse++;
9132                 vFAIL2("Sequence %.2s... not terminated",parse_start);
9133             } else {
9134                 /* this pretty much dupes the code for (?P=...) in reg(), if
9135                    you change this make sure you change that */
9136                 char* name_start = (RExC_parse += 2);
9137                 U32 num = 0;
9138                 SV *sv_dat = reg_scan_name(pRExC_state,
9139                     SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9140                 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
9141                 if (RExC_parse == name_start || *RExC_parse != ch)
9142                     vFAIL2("Sequence %.3s... not terminated",parse_start);
9143
9144                 if (!SIZE_ONLY) {
9145                     num = add_data( pRExC_state, 1, "S" );
9146                     RExC_rxi->data->data[num]=(void*)sv_dat;
9147                     SvREFCNT_inc_simple_void(sv_dat);
9148                 }
9149
9150                 RExC_sawback = 1;
9151                 ret = reganode(pRExC_state,
9152                                ((! FOLD)
9153                                  ? NREF
9154                                  : (MORE_ASCII_RESTRICTED)
9155                                    ? NREFFA
9156                                    : (AT_LEAST_UNI_SEMANTICS)
9157                                      ? NREFFU
9158                                      : (LOC)
9159                                        ? NREFFL
9160                                        : NREFF),
9161                                 num);
9162                 *flagp |= HASWIDTH;
9163
9164                 /* override incorrect value set in reganode MJD */
9165                 Set_Node_Offset(ret, parse_start+1);
9166                 Set_Node_Cur_Length(ret); /* MJD */
9167                 nextchar(pRExC_state);
9168
9169             }
9170             break;
9171         }
9172         case 'g':
9173         case '1': case '2': case '3': case '4':
9174         case '5': case '6': case '7': case '8': case '9':
9175             {
9176                 I32 num;
9177                 bool isg = *RExC_parse == 'g';
9178                 bool isrel = 0;
9179                 bool hasbrace = 0;
9180                 if (isg) {
9181                     RExC_parse++;
9182                     if (*RExC_parse == '{') {
9183                         RExC_parse++;
9184                         hasbrace = 1;
9185                     }
9186                     if (*RExC_parse == '-') {
9187                         RExC_parse++;
9188                         isrel = 1;
9189                     }
9190                     if (hasbrace && !isDIGIT(*RExC_parse)) {
9191                         if (isrel) RExC_parse--;
9192                         RExC_parse -= 2;
9193                         goto parse_named_seq;
9194                 }   }
9195                 num = atoi(RExC_parse);
9196                 if (isg && num == 0)
9197                     vFAIL("Reference to invalid group 0");
9198                 if (isrel) {
9199                     num = RExC_npar - num;
9200                     if (num < 1)
9201                         vFAIL("Reference to nonexistent or unclosed group");
9202                 }
9203                 if (!isg && num > 9 && num >= RExC_npar)
9204                     goto defchar;
9205                 else {
9206                     char * const parse_start = RExC_parse - 1; /* MJD */
9207                     while (isDIGIT(*RExC_parse))
9208                         RExC_parse++;
9209                     if (parse_start == RExC_parse - 1)
9210                         vFAIL("Unterminated \\g... pattern");
9211                     if (hasbrace) {
9212                         if (*RExC_parse != '}')
9213                             vFAIL("Unterminated \\g{...} pattern");
9214                         RExC_parse++;
9215                     }
9216                     if (!SIZE_ONLY) {
9217                         if (num > (I32)RExC_rx->nparens)
9218                             vFAIL("Reference to nonexistent group");
9219                     }
9220                     RExC_sawback = 1;
9221                     ret = reganode(pRExC_state,
9222                                    ((! FOLD)
9223                                      ? REF
9224                                      : (MORE_ASCII_RESTRICTED)
9225                                        ? REFFA
9226                                        : (AT_LEAST_UNI_SEMANTICS)
9227                                          ? REFFU
9228                                          : (LOC)
9229                                            ? REFFL
9230                                            : REFF),
9231                                     num);
9232                     *flagp |= HASWIDTH;
9233
9234                     /* override incorrect value set in reganode MJD */
9235                     Set_Node_Offset(ret, parse_start+1);
9236                     Set_Node_Cur_Length(ret); /* MJD */
9237                     RExC_parse--;
9238                     nextchar(pRExC_state);
9239                 }
9240             }
9241             break;
9242         case '\0':
9243             if (RExC_parse >= RExC_end)
9244                 FAIL("Trailing \\");
9245             /* FALL THROUGH */
9246         default:
9247             /* Do not generate "unrecognized" warnings here, we fall
9248                back into the quick-grab loop below */
9249             parse_start--;
9250             goto defchar;
9251         }
9252         break;
9253
9254     case '#':
9255         if (RExC_flags & RXf_PMf_EXTENDED) {
9256             if ( reg_skipcomment( pRExC_state ) )
9257                 goto tryagain;
9258         }
9259         /* FALL THROUGH */
9260
9261     default:
9262
9263             parse_start = RExC_parse - 1;
9264
9265             RExC_parse++;
9266
9267         defchar: {
9268             register STRLEN len;
9269             register UV ender;
9270             register char *p;
9271             char *s;
9272             STRLEN foldlen;
9273             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
9274             regnode * orig_emit;
9275             U8 node_type;
9276
9277             /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node?  If so,
9278              * it is folded to 'ss' even if not utf8 */
9279             bool is_exactfu_sharp_s;
9280
9281             ender = 0;
9282             orig_emit = RExC_emit; /* Save the original output node position in
9283                                       case we need to output a different node
9284                                       type */
9285             node_type = ((! FOLD) ? EXACT
9286                         : (LOC)
9287                           ? EXACTFL
9288                           : (MORE_ASCII_RESTRICTED)
9289                             ? EXACTFA
9290                             : (AT_LEAST_UNI_SEMANTICS)
9291                               ? EXACTFU
9292                               : EXACTF);
9293             ret = reg_node(pRExC_state, node_type);
9294             s = STRING(ret);
9295
9296             /* XXX The node can hold up to 255 bytes, yet this only goes to
9297              * 127.  I (khw) do not know why.  Keeping it somewhat less than
9298              * 255 allows us to not have to worry about overflow due to
9299              * converting to utf8 and fold expansion, but that value is
9300              * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
9301              * split up by this limit into a single one using the real max of
9302              * 255.  Even at 127, this breaks under rare circumstances.  If
9303              * folding, we do not want to split a node at a character that is a
9304              * non-final in a multi-char fold, as an input string could just
9305              * happen to want to match across the node boundary.  The join
9306              * would solve that problem if the join actually happens.  But a
9307              * series of more than two nodes in a row each of 127 would cause
9308              * the first join to succeed to get to 254, but then there wouldn't
9309              * be room for the next one, which could at be one of those split
9310              * multi-char folds.  I don't know of any fool-proof solution.  One
9311              * could back off to end with only a code point that isn't such a
9312              * non-final, but it is possible for there not to be any in the
9313              * entire node. */
9314             for (len = 0, p = RExC_parse - 1;
9315                  len < 127 && p < RExC_end;
9316                  len++)
9317             {
9318                 char * const oldp = p;
9319
9320                 if (RExC_flags & RXf_PMf_EXTENDED)
9321                     p = regwhite( pRExC_state, p );
9322                 switch ((U8)*p) {
9323                 case '^':
9324                 case '$':
9325                 case '.':
9326                 case '[':
9327                 case '(':
9328                 case ')':
9329                 case '|':
9330                     goto loopdone;
9331                 case '\\':
9332                     /* Literal Escapes Switch
9333
9334                        This switch is meant to handle escape sequences that
9335                        resolve to a literal character.
9336
9337                        Every escape sequence that represents something
9338                        else, like an assertion or a char class, is handled
9339                        in the switch marked 'Special Escapes' above in this
9340                        routine, but also has an entry here as anything that
9341                        isn't explicitly mentioned here will be treated as
9342                        an unescaped equivalent literal.
9343                     */
9344
9345                     switch ((U8)*++p) {
9346                     /* These are all the special escapes. */
9347                     case 'A':             /* Start assertion */
9348                     case 'b': case 'B':   /* Word-boundary assertion*/
9349                     case 'C':             /* Single char !DANGEROUS! */
9350                     case 'd': case 'D':   /* digit class */
9351                     case 'g': case 'G':   /* generic-backref, pos assertion */
9352                     case 'h': case 'H':   /* HORIZWS */
9353                     case 'k': case 'K':   /* named backref, keep marker */
9354                     case 'N':             /* named char sequence */
9355                     case 'p': case 'P':   /* Unicode property */
9356                               case 'R':   /* LNBREAK */
9357                     case 's': case 'S':   /* space class */
9358                     case 'v': case 'V':   /* VERTWS */
9359                     case 'w': case 'W':   /* word class */
9360                     case 'X':             /* eXtended Unicode "combining character sequence" */
9361                     case 'z': case 'Z':   /* End of line/string assertion */
9362                         --p;
9363                         goto loopdone;
9364
9365                     /* Anything after here is an escape that resolves to a
9366                        literal. (Except digits, which may or may not)
9367                      */
9368                     case 'n':
9369                         ender = '\n';
9370                         p++;
9371                         break;
9372                     case 'r':
9373                         ender = '\r';
9374                         p++;
9375                         break;
9376                     case 't':
9377                         ender = '\t';
9378                         p++;
9379                         break;
9380                     case 'f':
9381                         ender = '\f';
9382                         p++;
9383                         break;
9384                     case 'e':
9385                           ender = ASCII_TO_NATIVE('\033');
9386                         p++;
9387                         break;
9388                     case 'a':
9389                           ender = ASCII_TO_NATIVE('\007');
9390                         p++;
9391                         break;
9392                     case 'o':
9393                         {
9394                             STRLEN brace_len = len;
9395                             UV result;
9396                             const char* error_msg;
9397
9398                             bool valid = grok_bslash_o(p,
9399                                                        &result,
9400                                                        &brace_len,
9401                                                        &error_msg,
9402                                                        1);
9403                             p += brace_len;
9404                             if (! valid) {
9405                                 RExC_parse = p; /* going to die anyway; point
9406                                                    to exact spot of failure */
9407                                 vFAIL(error_msg);
9408                             }
9409                             else
9410                             {
9411                                 ender = result;
9412                             }
9413                             if (PL_encoding && ender < 0x100) {
9414                                 goto recode_encoding;
9415                             }
9416                             if (ender > 0xff) {
9417                                 REQUIRE_UTF8;
9418                             }
9419                             break;
9420                         }
9421                     case 'x':
9422                         if (*++p == '{') {
9423                             char* const e = strchr(p, '}');
9424
9425                             if (!e) {
9426                                 RExC_parse = p + 1;
9427                                 vFAIL("Missing right brace on \\x{}");
9428                             }
9429                             else {
9430                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9431                                     | PERL_SCAN_DISALLOW_PREFIX;
9432                                 STRLEN numlen = e - p - 1;
9433                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
9434                                 if (ender > 0xff)
9435                                     REQUIRE_UTF8;
9436                                 p = e + 1;
9437                             }
9438                         }
9439                         else {
9440                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9441                             STRLEN numlen = 2;
9442                             ender = grok_hex(p, &numlen, &flags, NULL);
9443                             p += numlen;
9444                         }
9445                         if (PL_encoding && ender < 0x100)
9446                             goto recode_encoding;
9447                         break;
9448                     case 'c':
9449                         p++;
9450                         ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
9451                         break;
9452                     case '0': case '1': case '2': case '3':case '4':
9453                     case '5': case '6': case '7': case '8':case '9':
9454                         if (*p == '0' ||
9455                             (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
9456                         {
9457                             I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9458                             STRLEN numlen = 3;
9459                             ender = grok_oct(p, &numlen, &flags, NULL);
9460                             if (ender > 0xff) {
9461                                 REQUIRE_UTF8;
9462                             }
9463                             p += numlen;
9464                         }
9465                         else {
9466                             --p;
9467                             goto loopdone;
9468                         }
9469                         if (PL_encoding && ender < 0x100)
9470                             goto recode_encoding;
9471                         break;
9472                     recode_encoding:
9473                         if (! RExC_override_recoding) {
9474                             SV* enc = PL_encoding;
9475                             ender = reg_recode((const char)(U8)ender, &enc);
9476                             if (!enc && SIZE_ONLY)
9477                                 ckWARNreg(p, "Invalid escape in the specified encoding");
9478                             REQUIRE_UTF8;
9479                         }
9480                         break;
9481                     case '\0':
9482                         if (p >= RExC_end)
9483                             FAIL("Trailing \\");
9484                         /* FALL THROUGH */
9485                     default:
9486                         if (!SIZE_ONLY&& isALPHA(*p)) {
9487                             /* Include any { following the alpha to emphasize
9488                              * that it could be part of an escape at some point
9489                              * in the future */
9490                             int len = (*(p + 1) == '{') ? 2 : 1;
9491                             ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
9492                         }
9493                         goto normal_default;
9494                     }
9495                     break;
9496                 default:
9497                   normal_default:
9498                     if (UTF8_IS_START(*p) && UTF) {
9499                         STRLEN numlen;
9500                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9501                                                &numlen, UTF8_ALLOW_DEFAULT);
9502                         p += numlen;
9503                     }
9504                     else
9505                         ender = (U8) *p++;
9506                     break;
9507                 } /* End of switch on the literal */
9508
9509                 is_exactfu_sharp_s = (node_type == EXACTFU
9510                                       && ender == LATIN_SMALL_LETTER_SHARP_S);
9511                 if ( RExC_flags & RXf_PMf_EXTENDED)
9512                     p = regwhite( pRExC_state, p );
9513                 if ((UTF && FOLD) || is_exactfu_sharp_s) {
9514                     /* Prime the casefolded buffer.  Locale rules, which apply
9515                      * only to code points < 256, aren't known until execution,
9516                      * so for them, just output the original character using
9517                      * utf8.  If we start to fold non-UTF patterns, be sure to
9518                      * update join_exact() */
9519                     if (LOC && ender < 256) {
9520                         if (UNI_IS_INVARIANT(ender)) {
9521                             *tmpbuf = (U8) ender;
9522                             foldlen = 1;
9523                         } else {
9524                             *tmpbuf = UTF8_TWO_BYTE_HI(ender);
9525                             *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
9526                             foldlen = 2;
9527                         }
9528                     }
9529                     else if (isASCII(ender)) {  /* Note: Here can't also be LOC
9530                                                  */
9531                         ender = toLOWER(ender);
9532                         *tmpbuf = (U8) ender;
9533                         foldlen = 1;
9534                     }
9535                     else if (! MORE_ASCII_RESTRICTED && ! LOC) {
9536
9537                         /* Locale and /aa require more selectivity about the
9538                          * fold, so are handled below.  Otherwise, here, just
9539                          * use the fold */
9540                         ender = toFOLD_uni(ender, tmpbuf, &foldlen);
9541                     }
9542                     else {
9543                         /* Under locale rules or /aa we are not to mix,
9544                          * respectively, ords < 256 or ASCII with non-.  So
9545                          * reject folds that mix them, using only the
9546                          * non-folded code point.  So do the fold to a
9547                          * temporary, and inspect each character in it. */
9548                         U8 trialbuf[UTF8_MAXBYTES_CASE+1];
9549                         U8* s = trialbuf;
9550                         UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
9551                         U8* e = s + foldlen;
9552                         bool fold_ok = TRUE;
9553
9554                         while (s < e) {
9555                             if (isASCII(*s)
9556                                 || (LOC && (UTF8_IS_INVARIANT(*s)
9557                                            || UTF8_IS_DOWNGRADEABLE_START(*s))))
9558                             {
9559                                 fold_ok = FALSE;
9560                                 break;
9561                             }
9562                             s += UTF8SKIP(s);
9563                         }
9564                         if (fold_ok) {
9565                             Copy(trialbuf, tmpbuf, foldlen, U8);
9566                             ender = tmpender;
9567                         }
9568                         else {
9569                             uvuni_to_utf8(tmpbuf, ender);
9570                             foldlen = UNISKIP(ender);
9571                         }
9572                     }
9573                 }
9574                 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
9575                     if (len)
9576                         p = oldp;
9577                     else if (UTF || is_exactfu_sharp_s) {
9578                          if (FOLD) {
9579                               /* Emit all the Unicode characters. */
9580                               STRLEN numlen;
9581                               for (foldbuf = tmpbuf;
9582                                    foldlen;
9583                                    foldlen -= numlen) {
9584                                    ender = utf8_to_uvchr(foldbuf, &numlen);
9585                                    if (numlen > 0) {
9586                                         const STRLEN unilen = reguni(pRExC_state, ender, s);
9587                                         s       += unilen;
9588                                         len     += unilen;
9589                                         /* In EBCDIC the numlen
9590                                          * and unilen can differ. */
9591                                         foldbuf += numlen;
9592                                         if (numlen >= foldlen)
9593                                              break;
9594                                    }
9595                                    else
9596                                         break; /* "Can't happen." */
9597                               }
9598                          }
9599                          else {
9600                               const STRLEN unilen = reguni(pRExC_state, ender, s);
9601                               if (unilen > 0) {
9602                                    s   += unilen;
9603                                    len += unilen;
9604                               }
9605                          }
9606                     }
9607                     else {
9608                         len++;
9609                         REGC((char)ender, s++);
9610                     }
9611                     break;
9612                 }
9613                 if (UTF || is_exactfu_sharp_s) {
9614                      if (FOLD) {
9615                           /* Emit all the Unicode characters. */
9616                           STRLEN numlen;
9617                           for (foldbuf = tmpbuf;
9618                                foldlen;
9619                                foldlen -= numlen) {
9620                                ender = utf8_to_uvchr(foldbuf, &numlen);
9621                                if (numlen > 0) {
9622                                     const STRLEN unilen = reguni(pRExC_state, ender, s);
9623                                     len     += unilen;
9624                                     s       += unilen;
9625                                     /* In EBCDIC the numlen
9626                                      * and unilen can differ. */
9627                                     foldbuf += numlen;
9628                                     if (numlen >= foldlen)
9629                                          break;
9630                                }
9631                                else
9632                                     break;
9633                           }
9634                      }
9635                      else {
9636                           const STRLEN unilen = reguni(pRExC_state, ender, s);
9637                           if (unilen > 0) {
9638                                s   += unilen;
9639                                len += unilen;
9640                           }
9641                      }
9642                      len--;
9643                 }
9644                 else {
9645                     REGC((char)ender, s++);
9646                 }
9647             }
9648         loopdone:   /* Jumped to when encounters something that shouldn't be in
9649                        the node */
9650             RExC_parse = p - 1;
9651             Set_Node_Cur_Length(ret); /* MJD */
9652             nextchar(pRExC_state);
9653             {
9654                 /* len is STRLEN which is unsigned, need to copy to signed */
9655                 IV iv = len;
9656                 if (iv < 0)
9657                     vFAIL("Internal disaster");
9658             }
9659             if (len > 0)
9660                 *flagp |= HASWIDTH;
9661             if (len == 1 && UNI_IS_INVARIANT(ender))
9662                 *flagp |= SIMPLE;
9663
9664             if (SIZE_ONLY)
9665                 RExC_size += STR_SZ(len);
9666             else {
9667                 STR_LEN(ret) = len;
9668                 RExC_emit += STR_SZ(len);
9669             }
9670         }
9671         break;
9672     }
9673
9674     return(ret);
9675
9676 /* Jumped to when an unrecognized character set is encountered */
9677 bad_charset:
9678     Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9679     return(NULL);
9680 }
9681
9682 STATIC char *
9683 S_regwhite( RExC_state_t *pRExC_state, char *p )
9684 {
9685     const char *e = RExC_end;
9686
9687     PERL_ARGS_ASSERT_REGWHITE;
9688
9689     while (p < e) {
9690         if (isSPACE(*p))
9691             ++p;
9692         else if (*p == '#') {
9693             bool ended = 0;
9694             do {
9695                 if (*p++ == '\n') {
9696                     ended = 1;
9697                     break;
9698                 }
9699             } while (p < e);
9700             if (!ended)
9701                 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9702         }
9703         else
9704             break;
9705     }
9706     return p;
9707 }
9708
9709 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9710    Character classes ([:foo:]) can also be negated ([:^foo:]).
9711    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9712    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9713    but trigger failures because they are currently unimplemented. */
9714
9715 #define POSIXCC_DONE(c)   ((c) == ':')
9716 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9717 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9718
9719 STATIC I32
9720 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9721 {
9722     dVAR;
9723     I32 namedclass = OOB_NAMEDCLASS;
9724
9725     PERL_ARGS_ASSERT_REGPPOSIXCC;
9726
9727     if (value == '[' && RExC_parse + 1 < RExC_end &&
9728         /* I smell either [: or [= or [. -- POSIX has been here, right? */
9729         POSIXCC(UCHARAT(RExC_parse))) {
9730         const char c = UCHARAT(RExC_parse);
9731         char* const s = RExC_parse++;
9732
9733         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9734             RExC_parse++;
9735         if (RExC_parse == RExC_end)
9736             /* Grandfather lone [:, [=, [. */
9737             RExC_parse = s;
9738         else {
9739             const char* const t = RExC_parse++; /* skip over the c */
9740             assert(*t == c);
9741
9742             if (UCHARAT(RExC_parse) == ']') {
9743                 const char *posixcc = s + 1;
9744                 RExC_parse++; /* skip over the ending ] */
9745
9746                 if (*s == ':') {
9747                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9748                     const I32 skip = t - posixcc;
9749
9750                     /* Initially switch on the length of the name.  */
9751                     switch (skip) {
9752                     case 4:
9753                         if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9754                             namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9755                         break;
9756                     case 5:
9757                         /* Names all of length 5.  */
9758                         /* alnum alpha ascii blank cntrl digit graph lower
9759                            print punct space upper  */
9760                         /* Offset 4 gives the best switch position.  */
9761                         switch (posixcc[4]) {
9762                         case 'a':
9763                             if (memEQ(posixcc, "alph", 4)) /* alpha */
9764                                 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
9765                             break;
9766                         case 'e':
9767                             if (memEQ(posixcc, "spac", 4)) /* space */
9768                                 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
9769                             break;
9770                         case 'h':
9771                             if (memEQ(posixcc, "grap", 4)) /* graph */
9772                                 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
9773                             break;
9774                         case 'i':
9775                             if (memEQ(posixcc, "asci", 4)) /* ascii */
9776                                 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
9777                             break;
9778                         case 'k':
9779                             if (memEQ(posixcc, "blan", 4)) /* blank */
9780                                 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
9781                             break;
9782                         case 'l':
9783                             if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9784                                 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
9785                             break;
9786                         case 'm':
9787                             if (memEQ(posixcc, "alnu", 4)) /* alnum */
9788                                 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
9789                             break;
9790                         case 'r':
9791                             if (memEQ(posixcc, "lowe", 4)) /* lower */
9792                                 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9793                             else if (memEQ(posixcc, "uppe", 4)) /* upper */
9794                                 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
9795                             break;
9796                         case 't':
9797                             if (memEQ(posixcc, "digi", 4)) /* digit */
9798                                 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9799                             else if (memEQ(posixcc, "prin", 4)) /* print */
9800                                 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9801                             else if (memEQ(posixcc, "punc", 4)) /* punct */
9802                                 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
9803                             break;
9804                         }
9805                         break;
9806                     case 6:
9807                         if (memEQ(posixcc, "xdigit", 6))
9808                             namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
9809                         break;
9810                     }
9811
9812                     if (namedclass == OOB_NAMEDCLASS)
9813                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9814                                       t - s - 1, s + 1);
9815                     assert (posixcc[skip] == ':');
9816                     assert (posixcc[skip+1] == ']');
9817                 } else if (!SIZE_ONLY) {
9818                     /* [[=foo=]] and [[.foo.]] are still future. */
9819
9820                     /* adjust RExC_parse so the warning shows after
9821                        the class closes */
9822                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
9823                         RExC_parse++;
9824                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9825                 }
9826             } else {
9827                 /* Maternal grandfather:
9828                  * "[:" ending in ":" but not in ":]" */
9829                 RExC_parse = s;
9830             }
9831         }
9832     }
9833
9834     return namedclass;
9835 }
9836
9837 STATIC void
9838 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
9839 {
9840     dVAR;
9841
9842     PERL_ARGS_ASSERT_CHECKPOSIXCC;
9843
9844     if (POSIXCC(UCHARAT(RExC_parse))) {
9845         const char *s = RExC_parse;
9846         const char  c = *s++;
9847
9848         while (isALNUM(*s))
9849             s++;
9850         if (*s && c == *s && s[1] == ']') {
9851             ckWARN3reg(s+2,
9852                        "POSIX syntax [%c %c] belongs inside character classes",
9853                        c, c);
9854
9855             /* [[=foo=]] and [[.foo.]] are still future. */
9856             if (POSIXCC_NOTYET(c)) {
9857                 /* adjust RExC_parse so the error shows after
9858                    the class closes */
9859                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
9860                     NOOP;
9861                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9862             }
9863         }
9864     }
9865 }
9866
9867 /* No locale test, and always Unicode semantics, no ignore-case differences */
9868 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                                          \
9869 ANYOF_##NAME:                                                                  \
9870         for (value = 0; value < 256; value++)                                  \
9871             if (TEST)                                                          \
9872             stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9873     yesno = '+';                                                               \
9874     what = WORD;                                                               \
9875     break;                                                                     \
9876 case ANYOF_N##NAME:                                                            \
9877         for (value = 0; value < 256; value++)                                  \
9878             if (!TEST)                                                         \
9879             stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9880     yesno = '!';                                                               \
9881     what = WORD;                                                               \
9882     break
9883
9884 /* Like the above, but there are differences if we are in uni-8-bit or not, so
9885  * there are two tests passed in, to use depending on that. There aren't any
9886  * cases where the label is different from the name, so no need for that
9887  * parameter.
9888  * Sets 'what' to WORD which is the property name for non-bitmap code points;
9889  * But, uses FOLD_WORD instead if /i has been selected, to allow a different
9890  * property name */
9891 #define _C_C_T_(NAME, TEST_8, TEST_7, WORD, FOLD_WORD)                         \
9892 ANYOF_##NAME:                                                                  \
9893     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);                               \
9894     else if (UNI_SEMANTICS) {                                                  \
9895         for (value = 0; value < 256; value++) {                                \
9896             if (TEST_8(value)) stored +=                                       \
9897                       set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);  \
9898         }                                                                      \
9899     }                                                                          \
9900     else {                                                                     \
9901         for (value = 0; value < 128; value++) {                                \
9902             if (TEST_7(UNI_TO_NATIVE(value))) stored +=                        \
9903                 set_regclass_bit(pRExC_state, ret,                     \
9904                                    (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);                 \
9905         }                                                                      \
9906     }                                                                          \
9907     yesno = '+';                                                               \
9908     if (FOLD) {                                                                \
9909         what = FOLD_WORD;                                                      \
9910     }                                                                          \
9911     else {                                                                     \
9912         what = WORD;                                                           \
9913     }                                                                          \
9914     break;                                                                     \
9915 case ANYOF_N##NAME:                                                            \
9916     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);                              \
9917     else if (UNI_SEMANTICS) {                                                  \
9918         for (value = 0; value < 256; value++) {                                \
9919             if (! TEST_8(value)) stored +=                                     \
9920                     set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);    \
9921         }                                                                      \
9922     }                                                                          \
9923     else {                                                                     \
9924         for (value = 0; value < 128; value++) {                                \
9925             if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit(  \
9926                         pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);    \
9927         }                                                                      \
9928         if (AT_LEAST_ASCII_RESTRICTED) {                                       \
9929             for (value = 128; value < 256; value++) {                          \
9930              stored += set_regclass_bit(                                     \
9931                            pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9932             }                                                                  \
9933             ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;                             \
9934         }                                                                      \
9935         else {                                                                 \
9936             /* For a non-ut8 target string with DEPENDS semantics, all above   \
9937              * ASCII Latin1 code points match the complement of any of the     \
9938              * classes.  But in utf8, they have their Unicode semantics, so    \
9939              * can't just set them in the bitmap, or else regexec.c will think \
9940              * they matched when they shouldn't. */                            \
9941             ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;                     \
9942         }                                                                      \
9943     }                                                                          \
9944     yesno = '!';                                                               \
9945     if (FOLD) {                                                                \
9946         what = FOLD_WORD;                                                      \
9947     }                                                                          \
9948     else {                                                                     \
9949         what = WORD;                                                           \
9950     }                                                                          \
9951     break
9952
9953 STATIC U8
9954 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
9955 {
9956
9957     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
9958      * Locale folding is done at run-time, so this function should not be
9959      * called for nodes that are for locales.
9960      *
9961      * This function sets the bit corresponding to the fold of the input
9962      * 'value', if not already set.  The fold of 'f' is 'F', and the fold of
9963      * 'F' is 'f'.
9964      *
9965      * It also knows about the characters that are in the bitmap that have
9966      * folds that are matchable only outside it, and sets the appropriate lists
9967      * and flags.
9968      *
9969      * It returns the number of bits that actually changed from 0 to 1 */
9970
9971     U8 stored = 0;
9972     U8 fold;
9973
9974     PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
9975
9976     fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
9977                                     : PL_fold[value];
9978
9979     /* It assumes the bit for 'value' has already been set */
9980     if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
9981         ANYOF_BITMAP_SET(node, fold);
9982         stored++;
9983     }
9984     if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
9985         /* Certain Latin1 characters have matches outside the bitmap.  To get
9986          * here, 'value' is one of those characters.   None of these matches is
9987          * valid for ASCII characters under /aa, which have been excluded by
9988          * the 'if' above.  The matches fall into three categories:
9989          * 1) They are singly folded-to or -from an above 255 character, as
9990          *    LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
9991          *    WITH DIAERESIS;
9992          * 2) They are part of a multi-char fold with another character in the
9993          *    bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
9994          * 3) They are part of a multi-char fold with a character not in the
9995          *    bitmap, such as various ligatures.
9996          * We aren't dealing fully with multi-char folds, except we do deal
9997          * with the pattern containing a character that has a multi-char fold
9998          * (not so much the inverse).
9999          * For types 1) and 3), the matches only happen when the target string
10000          * is utf8; that's not true for 2), and we set a flag for it.
10001          *
10002          * The code below adds to the passed in inversion list the single fold
10003          * closures for 'value'.  The values are hard-coded here so that an
10004          * innocent-looking character class, like /[ks]/i won't have to go out
10005          * to disk to find the possible matches.  XXX It would be better to
10006          * generate these via regen, in case a new version of the Unicode
10007          * standard adds new mappings, though that is not really likely. */
10008         switch (value) {
10009             case 'k':
10010             case 'K':
10011                 /* KELVIN SIGN */
10012                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
10013                 break;
10014             case 's':
10015             case 'S':
10016                 /* LATIN SMALL LETTER LONG S */
10017                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
10018                 break;
10019             case MICRO_SIGN:
10020                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10021                                                  GREEK_SMALL_LETTER_MU);
10022                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10023                                                  GREEK_CAPITAL_LETTER_MU);
10024                 break;
10025             case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
10026             case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
10027                 /* ANGSTROM SIGN */
10028                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
10029                 if (DEPENDS_SEMANTICS) {    /* See DEPENDS comment below */
10030                     *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10031                                                      PL_fold_latin1[value]);
10032                 }
10033                 break;
10034             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
10035                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10036                                         LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
10037                 break;
10038             case LATIN_SMALL_LETTER_SHARP_S:
10039                 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10040                                         LATIN_CAPITAL_LETTER_SHARP_S);
10041
10042                 /* Under /a, /d, and /u, this can match the two chars "ss" */
10043                 if (! MORE_ASCII_RESTRICTED) {
10044                     add_alternate(alternate_ptr, (U8 *) "ss", 2);
10045
10046                     /* And under /u or /a, it can match even if the target is
10047                      * not utf8 */
10048                     if (AT_LEAST_UNI_SEMANTICS) {
10049                         ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
10050                     }
10051                 }
10052                 break;
10053             case 'F': case 'f':
10054             case 'I': case 'i':
10055             case 'L': case 'l':
10056             case 'T': case 't':
10057             case 'A': case 'a':
10058             case 'H': case 'h':
10059             case 'J': case 'j':
10060             case 'N': case 'n':
10061             case 'W': case 'w':
10062             case 'Y': case 'y':
10063                 /* These all are targets of multi-character folds from code
10064                  * points that require UTF8 to express, so they can't match
10065                  * unless the target string is in UTF-8, so no action here is
10066                  * necessary, as regexec.c properly handles the general case
10067                  * for UTF-8 matching */
10068                 break;
10069             default:
10070                 /* Use deprecated warning to increase the chances of this
10071                  * being output */
10072                 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
10073                 break;
10074         }
10075     }
10076     else if (DEPENDS_SEMANTICS
10077             && ! isASCII(value)
10078             && PL_fold_latin1[value] != value)
10079     {
10080            /* Under DEPENDS rules, non-ASCII Latin1 characters match their
10081             * folds only when the target string is in UTF-8.  We add the fold
10082             * here to the list of things to match outside the bitmap, which
10083             * won't be looked at unless it is UTF8 (or else if something else
10084             * says to look even if not utf8, but those things better not happen
10085             * under DEPENDS semantics. */
10086         *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
10087     }
10088
10089     return stored;
10090 }
10091
10092
10093 PERL_STATIC_INLINE U8
10094 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10095 {
10096     /* This inline function sets a bit in the bitmap if not already set, and if
10097      * appropriate, its fold, returning the number of bits that actually
10098      * changed from 0 to 1 */
10099
10100     U8 stored;
10101
10102     PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
10103
10104     if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set */
10105         return 0;
10106     }
10107
10108     ANYOF_BITMAP_SET(node, value);
10109     stored = 1;
10110
10111     if (FOLD && ! LOC) {        /* Locale folds aren't known until runtime */
10112         stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
10113     }
10114
10115     return stored;
10116 }
10117
10118 STATIC void
10119 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
10120 {
10121     /* Adds input 'string' with length 'len' to the ANYOF node's unicode
10122      * alternate list, pointed to by 'alternate_ptr'.  This is an array of
10123      * the multi-character folds of characters in the node */
10124     SV *sv;
10125
10126     PERL_ARGS_ASSERT_ADD_ALTERNATE;
10127
10128     if (! *alternate_ptr) {
10129         *alternate_ptr = newAV();
10130     }
10131     sv = newSVpvn_utf8((char*)string, len, TRUE);
10132     av_push(*alternate_ptr, sv);
10133     return;
10134 }
10135
10136 /*
10137    parse a class specification and produce either an ANYOF node that
10138    matches the pattern or perhaps will be optimized into an EXACTish node
10139    instead. The node contains a bit map for the first 256 characters, with the
10140    corresponding bit set if that character is in the list.  For characters
10141    above 255, a range list is used */
10142
10143 STATIC regnode *
10144 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
10145 {
10146     dVAR;
10147     register UV nextvalue;
10148     register IV prevvalue = OOB_UNICODE;
10149     register IV range = 0;
10150     UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
10151     register regnode *ret;
10152     STRLEN numlen;
10153     IV namedclass;
10154     char *rangebegin = NULL;
10155     bool need_class = 0;
10156     bool allow_full_fold = TRUE;   /* Assume wants multi-char folding */
10157     SV *listsv = NULL;
10158     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
10159                                       than just initialized.  */
10160     SV* properties = NULL;    /* Code points that match \p{} \P{} */
10161     UV element_count = 0;   /* Number of distinct elements in the class.
10162                                Optimizations may be possible if this is tiny */
10163     UV n;
10164
10165     /* Unicode properties are stored in a swash; this holds the current one
10166      * being parsed.  If this swash is the only above-latin1 component of the
10167      * character class, an optimization is to pass it directly on to the
10168      * execution engine.  Otherwise, it is set to NULL to indicate that there
10169      * are other things in the class that have to be dealt with at execution
10170      * time */
10171     SV* swash = NULL;           /* Code points that match \p{} \P{} */
10172
10173     /* Set if a component of this character class is user-defined; just passed
10174      * on to the engine */
10175     UV has_user_defined_property = 0;
10176
10177     /* code points this node matches that can't be stored in the bitmap */
10178     SV* nonbitmap = NULL;
10179
10180     /* The items that are to match that aren't stored in the bitmap, but are a
10181      * result of things that are stored there.  This is the fold closure of
10182      * such a character, either because it has DEPENDS semantics and shouldn't
10183      * be matched unless the target string is utf8, or is a code point that is
10184      * too large for the bit map, as for example, the fold of the MICRO SIGN is
10185      * above 255.  This all is solely for performance reasons.  By having this
10186      * code know the outside-the-bitmap folds that the bitmapped characters are
10187      * involved with, we don't have to go out to disk to find the list of
10188      * matches, unless the character class includes code points that aren't
10189      * storable in the bit map.  That means that a character class with an 's'
10190      * in it, for example, doesn't need to go out to disk to find everything
10191      * that matches.  A 2nd list is used so that the 'nonbitmap' list is kept
10192      * empty unless there is something whose fold we don't know about, and will
10193      * have to go out to the disk to find. */
10194     SV* l1_fold_invlist = NULL;
10195
10196     /* List of multi-character folds that are matched by this node */
10197     AV* unicode_alternate  = NULL;
10198 #ifdef EBCDIC
10199     UV literal_endpoint = 0;
10200 #endif
10201     UV stored = 0;  /* how many chars stored in the bitmap */
10202
10203     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
10204         case we need to change the emitted regop to an EXACT. */
10205     const char * orig_parse = RExC_parse;
10206     GET_RE_DEBUG_FLAGS_DECL;
10207
10208     PERL_ARGS_ASSERT_REGCLASS;
10209 #ifndef DEBUGGING
10210     PERL_UNUSED_ARG(depth);
10211 #endif
10212
10213     DEBUG_PARSE("clas");
10214
10215     /* Assume we are going to generate an ANYOF node. */
10216     ret = reganode(pRExC_state, ANYOF, 0);
10217
10218
10219     if (!SIZE_ONLY) {
10220         ANYOF_FLAGS(ret) = 0;
10221     }
10222
10223     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
10224         RExC_naughty++;
10225         RExC_parse++;
10226         if (!SIZE_ONLY)
10227             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
10228
10229         /* We have decided to not allow multi-char folds in inverted character
10230          * classes, due to the confusion that can happen, especially with
10231          * classes that are designed for a non-Unicode world:  You have the
10232          * peculiar case that:
10233             "s s" =~ /^[^\xDF]+$/i => Y
10234             "ss"  =~ /^[^\xDF]+$/i => N
10235          *
10236          * See [perl #89750] */
10237         allow_full_fold = FALSE;
10238     }
10239
10240     if (SIZE_ONLY) {
10241         RExC_size += ANYOF_SKIP;
10242         listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
10243     }
10244     else {
10245         RExC_emit += ANYOF_SKIP;
10246         if (LOC) {
10247             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
10248         }
10249         ANYOF_BITMAP_ZERO(ret);
10250         listsv = newSVpvs("# comment\n");
10251         initial_listsv_len = SvCUR(listsv);
10252     }
10253
10254     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10255
10256     if (!SIZE_ONLY && POSIXCC(nextvalue))
10257         checkposixcc(pRExC_state);
10258
10259     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
10260     if (UCHARAT(RExC_parse) == ']')
10261         goto charclassloop;
10262
10263 parseit:
10264     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
10265
10266     charclassloop:
10267
10268         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
10269
10270         if (!range) {
10271             rangebegin = RExC_parse;
10272             element_count++;
10273         }
10274         if (UTF) {
10275             value = utf8n_to_uvchr((U8*)RExC_parse,
10276                                    RExC_end - RExC_parse,
10277                                    &numlen, UTF8_ALLOW_DEFAULT);
10278             RExC_parse += numlen;
10279         }
10280         else
10281             value = UCHARAT(RExC_parse++);
10282
10283         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10284         if (value == '[' && POSIXCC(nextvalue))
10285             namedclass = regpposixcc(pRExC_state, value);
10286         else if (value == '\\') {
10287             if (UTF) {
10288                 value = utf8n_to_uvchr((U8*)RExC_parse,
10289                                    RExC_end - RExC_parse,
10290                                    &numlen, UTF8_ALLOW_DEFAULT);
10291                 RExC_parse += numlen;
10292             }
10293             else
10294                 value = UCHARAT(RExC_parse++);
10295             /* Some compilers cannot handle switching on 64-bit integer
10296              * values, therefore value cannot be an UV.  Yes, this will
10297              * be a problem later if we want switch on Unicode.
10298              * A similar issue a little bit later when switching on
10299              * namedclass. --jhi */
10300             switch ((I32)value) {
10301             case 'w':   namedclass = ANYOF_ALNUM;       break;
10302             case 'W':   namedclass = ANYOF_NALNUM;      break;
10303             case 's':   namedclass = ANYOF_SPACE;       break;
10304             case 'S':   namedclass = ANYOF_NSPACE;      break;
10305             case 'd':   namedclass = ANYOF_DIGIT;       break;
10306             case 'D':   namedclass = ANYOF_NDIGIT;      break;
10307             case 'v':   namedclass = ANYOF_VERTWS;      break;
10308             case 'V':   namedclass = ANYOF_NVERTWS;     break;
10309             case 'h':   namedclass = ANYOF_HORIZWS;     break;
10310             case 'H':   namedclass = ANYOF_NHORIZWS;    break;
10311             case 'N':  /* Handle \N{NAME} in class */
10312                 {
10313                     /* We only pay attention to the first char of
10314                     multichar strings being returned. I kinda wonder
10315                     if this makes sense as it does change the behaviour
10316                     from earlier versions, OTOH that behaviour was broken
10317                     as well. */
10318                     UV v; /* value is register so we cant & it /grrr */
10319                     if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
10320                         goto parseit;
10321                     }
10322                     value= v;
10323                 }
10324                 break;
10325             case 'p':
10326             case 'P':
10327                 {
10328                 char *e;
10329                 if (RExC_parse >= RExC_end)
10330                     vFAIL2("Empty \\%c{}", (U8)value);
10331                 if (*RExC_parse == '{') {
10332                     const U8 c = (U8)value;
10333                     e = strchr(RExC_parse++, '}');
10334                     if (!e)
10335                         vFAIL2("Missing right brace on \\%c{}", c);
10336                     while (isSPACE(UCHARAT(RExC_parse)))
10337                         RExC_parse++;
10338                     if (e == RExC_parse)
10339                         vFAIL2("Empty \\%c{}", c);
10340                     n = e - RExC_parse;
10341                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
10342                         n--;
10343                 }
10344                 else {
10345                     e = RExC_parse;
10346                     n = 1;
10347                 }
10348                 if (!SIZE_ONLY) {
10349                     SV** invlistsvp;
10350                     SV* invlist;
10351                     char* name;
10352                     if (UCHARAT(RExC_parse) == '^') {
10353                          RExC_parse++;
10354                          n--;
10355                          value = value == 'p' ? 'P' : 'p'; /* toggle */
10356                          while (isSPACE(UCHARAT(RExC_parse))) {
10357                               RExC_parse++;
10358                               n--;
10359                          }
10360                     }
10361                     /* Try to get the definition of the property into
10362                      * <invlist>.  If /i is in effect, the effective property
10363                      * will have its name be <__NAME_i>.  The design is
10364                      * discussed in commit
10365                      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
10366                     Newx(name, n + sizeof("_i__\n"), char);
10367
10368                     sprintf(name, "%s%.*s%s\n",
10369                                     (FOLD) ? "__" : "",
10370                                     (int)n,
10371                                     RExC_parse,
10372                                     (FOLD) ? "_i" : ""
10373                     );
10374
10375                     /* Look up the property name, and get its swash and
10376                      * inversion list, if the property is found  */
10377                     if (swash) {
10378                         SvREFCNT_dec(swash);
10379                     }
10380                     swash = _core_swash_init("utf8", name, &PL_sv_undef,
10381                                              1, /* binary */
10382                                              0, /* not tr/// */
10383                                              TRUE, /* this routine will handle
10384                                                       undefined properties */
10385                                              NULL, FALSE /* No inversion list */
10386                                             );
10387                     if (   ! swash
10388                         || ! SvROK(swash)
10389                         || ! SvTYPE(SvRV(swash)) == SVt_PVHV
10390                         || ! (invlistsvp =
10391                                 hv_fetchs(MUTABLE_HV(SvRV(swash)),
10392                                 "INVLIST", FALSE))
10393                         || ! (invlist = *invlistsvp))
10394                     {
10395                         if (swash) {
10396                             SvREFCNT_dec(swash);
10397                             swash = NULL;
10398                         }
10399
10400                         /* Here didn't find it.  It could be a user-defined
10401                          * property that will be available at run-time.  Add it
10402                          * to the list to look up then */
10403                         Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
10404                                         (value == 'p' ? '+' : '!'),
10405                                         name);
10406                         has_user_defined_property = 1;
10407
10408                         /* We don't know yet, so have to assume that the
10409                          * property could match something in the Latin1 range,
10410                          * hence something that isn't utf8 */
10411                         ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
10412                     }
10413                     else {
10414
10415                         /* Here, did get the swash and its inversion list.  If
10416                          * the swash is from a user-defined property, then this
10417                          * whole character class should be regarded as such */
10418                         SV** user_defined_svp =
10419                                             hv_fetchs(MUTABLE_HV(SvRV(swash)),
10420                                                         "USER_DEFINED", FALSE);
10421                         if (user_defined_svp) {
10422                             has_user_defined_property
10423                                                     |= SvUV(*user_defined_svp);
10424                         }
10425
10426                         /* Invert if asking for the complement */
10427                         if (value == 'P') {
10428
10429                             /* Add to any existing list */
10430                             if (! properties) {
10431                                 properties = invlist_clone(invlist);
10432                                 _invlist_invert(properties);
10433                             }
10434                             else {
10435                                 invlist = invlist_clone(invlist);
10436                                 _invlist_invert(invlist);
10437                                 _invlist_union(properties, invlist, &properties);
10438                                 SvREFCNT_dec(invlist);
10439                             }
10440
10441                             /* The swash can't be used as-is, because we've
10442                              * inverted things; delay removing it to here after
10443                              * have copied its invlist above */
10444                             SvREFCNT_dec(swash);
10445                             swash = NULL;
10446                         }
10447                         else {
10448                             if (! properties) {
10449                                 properties = invlist_clone(invlist);
10450                             }
10451                             else {
10452                                 _invlist_union(properties, invlist, &properties);
10453                             }
10454                         }
10455                     }
10456                     Safefree(name);
10457                 }
10458                 RExC_parse = e + 1;
10459                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
10460
10461                 /* \p means they want Unicode semantics */
10462                 RExC_uni_semantics = 1;
10463                 }
10464                 break;
10465             case 'n':   value = '\n';                   break;
10466             case 'r':   value = '\r';                   break;
10467             case 't':   value = '\t';                   break;
10468             case 'f':   value = '\f';                   break;
10469             case 'b':   value = '\b';                   break;
10470             case 'e':   value = ASCII_TO_NATIVE('\033');break;
10471             case 'a':   value = ASCII_TO_NATIVE('\007');break;
10472             case 'o':
10473                 RExC_parse--;   /* function expects to be pointed at the 'o' */
10474                 {
10475                     const char* error_msg;
10476                     bool valid = grok_bslash_o(RExC_parse,
10477                                                &value,
10478                                                &numlen,
10479                                                &error_msg,
10480                                                SIZE_ONLY);
10481                     RExC_parse += numlen;
10482                     if (! valid) {
10483                         vFAIL(error_msg);
10484                     }
10485                 }
10486                 if (PL_encoding && value < 0x100) {
10487                     goto recode_encoding;
10488                 }
10489                 break;
10490             case 'x':
10491                 if (*RExC_parse == '{') {
10492                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
10493                         | PERL_SCAN_DISALLOW_PREFIX;
10494                     char * const e = strchr(RExC_parse++, '}');
10495                     if (!e)
10496                         vFAIL("Missing right brace on \\x{}");
10497
10498                     numlen = e - RExC_parse;
10499                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10500                     RExC_parse = e + 1;
10501                 }
10502                 else {
10503                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
10504                     numlen = 2;
10505                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10506                     RExC_parse += numlen;
10507                 }
10508                 if (PL_encoding && value < 0x100)
10509                     goto recode_encoding;
10510                 break;
10511             case 'c':
10512                 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
10513                 break;
10514             case '0': case '1': case '2': case '3': case '4':
10515             case '5': case '6': case '7':
10516                 {
10517                     /* Take 1-3 octal digits */
10518                     I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10519                     numlen = 3;
10520                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
10521                     RExC_parse += numlen;
10522                     if (PL_encoding && value < 0x100)
10523                         goto recode_encoding;
10524                     break;
10525                 }
10526             recode_encoding:
10527                 if (! RExC_override_recoding) {
10528                     SV* enc = PL_encoding;
10529                     value = reg_recode((const char)(U8)value, &enc);
10530                     if (!enc && SIZE_ONLY)
10531                         ckWARNreg(RExC_parse,
10532                                   "Invalid escape in the specified encoding");
10533                     break;
10534                 }
10535             default:
10536                 /* Allow \_ to not give an error */
10537                 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
10538                     ckWARN2reg(RExC_parse,
10539                                "Unrecognized escape \\%c in character class passed through",
10540                                (int)value);
10541                 }
10542                 break;
10543             }
10544         } /* end of \blah */
10545 #ifdef EBCDIC
10546         else
10547             literal_endpoint++;
10548 #endif
10549
10550         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
10551
10552             /* What matches in a locale is not known until runtime, so need to
10553              * (one time per class) allocate extra space to pass to regexec.
10554              * The space will contain a bit for each named class that is to be
10555              * matched against.  This isn't needed for \p{} and pseudo-classes,
10556              * as they are not affected by locale, and hence are dealt with
10557              * separately */
10558             if (LOC && namedclass < ANYOF_MAX && ! need_class) {
10559                 need_class = 1;
10560                 if (SIZE_ONLY) {
10561                     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10562                 }
10563                 else {
10564                     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10565                     ANYOF_CLASS_ZERO(ret);
10566                 }
10567                 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
10568             }
10569
10570             /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
10571              * literal, as is the character that began the false range, i.e.
10572              * the 'a' in the examples */
10573             if (range) {
10574                 if (!SIZE_ONLY) {
10575                     const int w =
10576                         RExC_parse >= rangebegin ?
10577                         RExC_parse - rangebegin : 0;
10578                     ckWARN4reg(RExC_parse,
10579                                "False [] range \"%*.*s\"",
10580                                w, w, rangebegin);
10581
10582                     stored +=
10583                          set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
10584                     if (prevvalue < 256) {
10585                         stored +=
10586                          set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
10587                     }
10588                     else {
10589                         nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
10590                     }
10591                 }
10592
10593                 range = 0; /* this was not a true range */
10594             }
10595
10596             if (!SIZE_ONLY) {
10597                 const char *what = NULL;
10598                 char yesno = 0;
10599
10600                 /* Possible truncation here but in some 64-bit environments
10601                  * the compiler gets heartburn about switch on 64-bit values.
10602                  * A similar issue a little earlier when switching on value.
10603                  * --jhi */
10604                 switch ((I32)namedclass) {
10605
10606                 case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum", "XPosixAlnum");
10607                 case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha", "XPosixAlpha");
10608                 case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank", "XPosixBlank");
10609                 case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl", "XPosixCntrl");
10610                 case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph", "XPosixGraph");
10611                 case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower", "__XPosixLower_i");
10612                 case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint", "XPosixPrint");
10613                 case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace", "XPosixSpace");
10614                 case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct", "XPosixPunct");
10615                 case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper", "__XPosixUpper_i");
10616                 /* \s, \w match all unicode if utf8. */
10617                 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl", "SpacePerl");
10618                 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word", "Word");
10619                 case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit", "XPosixXDigit");
10620                 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
10621                 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
10622                 case ANYOF_ASCII:
10623                     if (LOC)
10624                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
10625                     else {
10626                         for (value = 0; value < 128; value++)
10627                             stored +=
10628                               set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
10629                     }
10630                     yesno = '+';
10631                     what = NULL;        /* Doesn't match outside ascii, so
10632                                            don't want to add +utf8:: */
10633                     break;
10634                 case ANYOF_NASCII:
10635                     if (LOC)
10636                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
10637                     else {
10638                         for (value = 128; value < 256; value++)
10639                             stored +=
10640                               set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
10641                     }
10642                     ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
10643                     yesno = '!';
10644                     what = "ASCII";
10645                     break;
10646                 case ANYOF_DIGIT:
10647                     if (LOC)
10648                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
10649                     else {
10650                         /* consecutive digits assumed */
10651                         for (value = '0'; value <= '9'; value++)
10652                             stored +=
10653                               set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
10654                     }
10655                     yesno = '+';
10656                     what = "Digit";
10657                     break;
10658                 case ANYOF_NDIGIT:
10659                     if (LOC)
10660                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
10661                     else {
10662                         /* consecutive digits assumed */
10663                         for (value = 0; value < '0'; value++)
10664                             stored +=
10665                               set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
10666                         for (value = '9' + 1; value < 256; value++)
10667                             stored +=
10668                               set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
10669                     }
10670                     yesno = '!';
10671                     what = "Digit";
10672                     if (AT_LEAST_ASCII_RESTRICTED ) {
10673                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
10674                     }
10675                     break;
10676                 case ANYOF_MAX:
10677                     /* this is to handle \p and \P */
10678                     break;
10679                 default:
10680                     vFAIL("Invalid [::] class");
10681                     break;
10682                 }
10683                 if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
10684                     /* Strings such as "+utf8::isWord\n" */
10685                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", yesno, what);
10686                 }
10687
10688                 continue;
10689             }
10690         } /* end of namedclass \blah */
10691
10692         if (range) {
10693             if (prevvalue > (IV)value) /* b-a */ {
10694                 const int w = RExC_parse - rangebegin;
10695                 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
10696                 range = 0; /* not a valid range */
10697             }
10698         }
10699         else {
10700             prevvalue = value; /* save the beginning of the range */
10701             if (RExC_parse+1 < RExC_end
10702                 && *RExC_parse == '-'
10703                 && RExC_parse[1] != ']')
10704             {
10705                 RExC_parse++;
10706
10707                 /* a bad range like \w-, [:word:]- ? */
10708                 if (namedclass > OOB_NAMEDCLASS) {
10709                     if (ckWARN(WARN_REGEXP)) {
10710                         const int w =
10711                             RExC_parse >= rangebegin ?
10712                             RExC_parse - rangebegin : 0;
10713                         vWARN4(RExC_parse,
10714                                "False [] range \"%*.*s\"",
10715                                w, w, rangebegin);
10716                     }
10717                     if (!SIZE_ONLY)
10718                         stored +=
10719                             set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
10720                 } else
10721                     range = 1;  /* yeah, it's a range! */
10722                 continue;       /* but do it the next time */
10723             }
10724         }
10725
10726         /* non-Latin1 code point implies unicode semantics.  Must be set in
10727          * pass1 so is there for the whole of pass 2 */
10728         if (value > 255) {
10729             RExC_uni_semantics = 1;
10730         }
10731
10732         /* now is the next time */
10733         if (!SIZE_ONLY) {
10734             if (prevvalue < 256) {
10735                 const IV ceilvalue = value < 256 ? value : 255;
10736                 IV i;
10737 #ifdef EBCDIC
10738                 /* In EBCDIC [\x89-\x91] should include
10739                  * the \x8e but [i-j] should not. */
10740                 if (literal_endpoint == 2 &&
10741                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
10742                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
10743                 {
10744                     if (isLOWER(prevvalue)) {
10745                         for (i = prevvalue; i <= ceilvalue; i++)
10746                             if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
10747                                 stored +=
10748                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
10749                             }
10750                     } else {
10751                         for (i = prevvalue; i <= ceilvalue; i++)
10752                             if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
10753                                 stored +=
10754                                   set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
10755                             }
10756                     }
10757                 }
10758                 else
10759 #endif
10760                       for (i = prevvalue; i <= ceilvalue; i++) {
10761                         stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
10762                       }
10763           }
10764           if (value > 255) {
10765             const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
10766             const UV natvalue      = NATIVE_TO_UNI(value);
10767             nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
10768         }
10769 #ifdef EBCDIC
10770             literal_endpoint = 0;
10771 #endif
10772         }
10773
10774         range = 0; /* this range (if it was one) is done now */
10775     }
10776
10777
10778
10779     if (SIZE_ONLY)
10780         return ret;
10781     /****** !SIZE_ONLY AFTER HERE *********/
10782
10783     /* If folding and there are code points above 255, we calculate all
10784      * characters that could fold to or from the ones already on the list */
10785     if (FOLD && nonbitmap) {
10786         UV start, end;  /* End points of code point ranges */
10787
10788         SV* fold_intersection = NULL;
10789
10790         /* This is a list of all the characters that participate in folds
10791             * (except marks, etc in multi-char folds */
10792         if (! PL_utf8_foldable) {
10793             SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
10794             PL_utf8_foldable = _swash_to_invlist(swash);
10795             SvREFCNT_dec(swash);
10796         }
10797
10798         /* This is a hash that for a particular fold gives all characters
10799             * that are involved in it */
10800         if (! PL_utf8_foldclosures) {
10801
10802             /* If we were unable to find any folds, then we likely won't be
10803              * able to find the closures.  So just create an empty list.
10804              * Folding will effectively be restricted to the non-Unicode rules
10805              * hard-coded into Perl.  (This case happens legitimately during
10806              * compilation of Perl itself before the Unicode tables are
10807              * generated) */
10808             if (invlist_len(PL_utf8_foldable) == 0) {
10809                 PL_utf8_foldclosures = newHV();
10810             } else {
10811                 /* If the folds haven't been read in, call a fold function
10812                     * to force that */
10813                 if (! PL_utf8_tofold) {
10814                     U8 dummy[UTF8_MAXBYTES+1];
10815                     STRLEN dummy_len;
10816
10817                     /* This particular string is above \xff in both UTF-8 and
10818                      * UTFEBCDIC */
10819                     to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
10820                     assert(PL_utf8_tofold); /* Verify that worked */
10821                 }
10822                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
10823             }
10824         }
10825
10826         /* Only the characters in this class that participate in folds need be
10827          * checked.  Get the intersection of this class and all the possible
10828          * characters that are foldable.  This can quickly narrow down a large
10829          * class */
10830         _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
10831
10832         /* Now look at the foldable characters in this class individually */
10833         invlist_iterinit(fold_intersection);
10834         while (invlist_iternext(fold_intersection, &start, &end)) {
10835             UV j;
10836
10837             /* Look at every character in the range */
10838             for (j = start; j <= end; j++) {
10839
10840                 /* Get its fold */
10841                 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10842                 STRLEN foldlen;
10843                 const UV f =
10844                     _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
10845
10846                 if (foldlen > (STRLEN)UNISKIP(f)) {
10847
10848                     /* Any multicharacter foldings (disallowed in lookbehind
10849                      * patterns) require the following transform: [ABCDEF] ->
10850                      * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
10851                      * folds into "rst", all other characters fold to single
10852                      * characters.  We save away these multicharacter foldings,
10853                      * to be later saved as part of the additional "s" data. */
10854                     if (! RExC_in_lookbehind) {
10855                         U8* loc = foldbuf;
10856                         U8* e = foldbuf + foldlen;
10857
10858                         /* If any of the folded characters of this are in the
10859                          * Latin1 range, tell the regex engine that this can
10860                          * match a non-utf8 target string.  The only multi-byte
10861                          * fold whose source is in the Latin1 range (U+00DF)
10862                          * applies only when the target string is utf8, or
10863                          * under unicode rules */
10864                         if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
10865                             while (loc < e) {
10866
10867                                 /* Can't mix ascii with non- under /aa */
10868                                 if (MORE_ASCII_RESTRICTED
10869                                     && (isASCII(*loc) != isASCII(j)))
10870                                 {
10871                                     goto end_multi_fold;
10872                                 }
10873                                 if (UTF8_IS_INVARIANT(*loc)
10874                                     || UTF8_IS_DOWNGRADEABLE_START(*loc))
10875                                 {
10876                                     /* Can't mix above and below 256 under LOC
10877                                      */
10878                                     if (LOC) {
10879                                         goto end_multi_fold;
10880                                     }
10881                                     ANYOF_FLAGS(ret)
10882                                             |= ANYOF_NONBITMAP_NON_UTF8;
10883                                     break;
10884                                 }
10885                                 loc += UTF8SKIP(loc);
10886                             }
10887                         }
10888
10889                         add_alternate(&unicode_alternate, foldbuf, foldlen);
10890                     end_multi_fold: ;
10891                     }
10892
10893                     /* This is special-cased, as it is the only letter which
10894                      * has both a multi-fold and single-fold in Latin1.  All
10895                      * the other chars that have single and multi-folds are
10896                      * always in utf8, and the utf8 folding algorithm catches
10897                      * them */
10898                     if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
10899                         stored += set_regclass_bit(pRExC_state,
10900                                         ret,
10901                                         LATIN_SMALL_LETTER_SHARP_S,
10902                                         &l1_fold_invlist, &unicode_alternate);
10903                     }
10904                 }
10905                 else {
10906                     /* Single character fold.  Add everything in its fold
10907                      * closure to the list that this node should match */
10908                     SV** listp;
10909
10910                     /* The fold closures data structure is a hash with the keys
10911                      * being every character that is folded to, like 'k', and
10912                      * the values each an array of everything that folds to its
10913                      * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
10914                     if ((listp = hv_fetch(PL_utf8_foldclosures,
10915                                     (char *) foldbuf, foldlen, FALSE)))
10916                     {
10917                         AV* list = (AV*) *listp;
10918                         IV k;
10919                         for (k = 0; k <= av_len(list); k++) {
10920                             SV** c_p = av_fetch(list, k, FALSE);
10921                             UV c;
10922                             if (c_p == NULL) {
10923                                 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
10924                             }
10925                             c = SvUV(*c_p);
10926
10927                             /* /aa doesn't allow folds between ASCII and non-;
10928                              * /l doesn't allow them between above and below
10929                              * 256 */
10930                             if ((MORE_ASCII_RESTRICTED
10931                                  && (isASCII(c) != isASCII(j)))
10932                                     || (LOC && ((c < 256) != (j < 256))))
10933                             {
10934                                 continue;
10935                             }
10936
10937                             if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
10938                                 stored += set_regclass_bit(pRExC_state,
10939                                         ret,
10940                                         (U8) c,
10941                                         &l1_fold_invlist, &unicode_alternate);
10942                             }
10943                                 /* It may be that the code point is already in
10944                                  * this range or already in the bitmap, in
10945                                  * which case we need do nothing */
10946                             else if ((c < start || c > end)
10947                                         && (c > 255
10948                                             || ! ANYOF_BITMAP_TEST(ret, c)))
10949                             {
10950                                 nonbitmap = add_cp_to_invlist(nonbitmap, c);
10951                             }
10952                         }
10953                     }
10954                 }
10955             }
10956         }
10957         SvREFCNT_dec(fold_intersection);
10958     }
10959
10960     /* Combine the two lists into one. */
10961     if (l1_fold_invlist) {
10962         if (nonbitmap) {
10963             _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
10964             SvREFCNT_dec(l1_fold_invlist);
10965         }
10966         else {
10967             nonbitmap = l1_fold_invlist;
10968         }
10969     }
10970
10971     /* And combine the result (if any) with any inversion list from properties.
10972      * The lists are kept separate up to now because we don't want to fold the
10973      * properties */
10974     if (properties) {
10975         if (nonbitmap) {
10976             _invlist_union(nonbitmap, properties, &nonbitmap);
10977             SvREFCNT_dec(properties);
10978         }
10979         else {
10980             nonbitmap = properties;
10981         }
10982     }
10983
10984     /* Here, <nonbitmap> contains all the code points we can determine at
10985      * compile time that we haven't put into the bitmap.  Go through it, and
10986      * for things that belong in the bitmap, put them there, and delete from
10987      * <nonbitmap> */
10988     if (nonbitmap) {
10989
10990         /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
10991          * possibly only should match when the target string is UTF-8 */
10992         UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
10993
10994         /* This gets set if we actually need to modify things */
10995         bool change_invlist = FALSE;
10996
10997         UV start, end;
10998
10999         /* Start looking through <nonbitmap> */
11000         invlist_iterinit(nonbitmap);
11001         while (invlist_iternext(nonbitmap, &start, &end)) {
11002             UV high;
11003             int i;
11004
11005             /* Quit if are above what we should change */
11006             if (start > max_cp_to_set) {
11007                 break;
11008             }
11009
11010             change_invlist = TRUE;
11011
11012             /* Set all the bits in the range, up to the max that we are doing */
11013             high = (end < max_cp_to_set) ? end : max_cp_to_set;
11014             for (i = start; i <= (int) high; i++) {
11015                 if (! ANYOF_BITMAP_TEST(ret, i)) {
11016                     ANYOF_BITMAP_SET(ret, i);
11017                     stored++;
11018                     prevvalue = value;
11019                     value = i;
11020                 }
11021             }
11022         }
11023
11024         /* Done with loop; set <nonbitmap> to not include any code points that
11025          * are in the bitmap */
11026         if (change_invlist) {
11027             SV* keep_list = _new_invlist(2);
11028             _append_range_to_invlist(keep_list, max_cp_to_set + 1, UV_MAX);
11029             _invlist_intersection(nonbitmap, keep_list, &nonbitmap);
11030             SvREFCNT_dec(keep_list);
11031         }
11032
11033         /* If have completely emptied it, remove it completely */
11034         if (invlist_len(nonbitmap) == 0) {
11035             SvREFCNT_dec(nonbitmap);
11036             nonbitmap = NULL;
11037         }
11038     }
11039
11040     /* Here, we have calculated what code points should be in the character
11041      * class.  <nonbitmap> does not overlap the bitmap except possibly in the
11042      * case of DEPENDS rules.
11043      *
11044      * Now we can see about various optimizations.  Fold calculation (which we
11045      * did above) needs to take place before inversion.  Otherwise /[^k]/i
11046      * would invert to include K, which under /i would match k, which it
11047      * shouldn't. */
11048
11049     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
11050      * set the FOLD flag yet, so this does optimize those.  It doesn't
11051      * optimize locale.  Doing so perhaps could be done as long as there is
11052      * nothing like \w in it; some thought also would have to be given to the
11053      * interaction with above 0x100 chars */
11054     if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
11055         && ! LOC
11056         && ! unicode_alternate
11057         /* In case of /d, there are some things that should match only when in
11058          * not in the bitmap, i.e., they require UTF8 to match.  These are
11059          * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
11060          * case, they don't require UTF8, so can invert here */
11061         && (! nonbitmap
11062             || ! DEPENDS_SEMANTICS
11063             || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11064         && SvCUR(listsv) == initial_listsv_len)
11065     {
11066         int i;
11067         if (! nonbitmap) {
11068             for (i = 0; i < 256; ++i) {
11069                 if (ANYOF_BITMAP_TEST(ret, i)) {
11070                     ANYOF_BITMAP_CLEAR(ret, i);
11071                 }
11072                 else {
11073                     ANYOF_BITMAP_SET(ret, i);
11074                     prevvalue = value;
11075                     value = i;
11076                 }
11077             }
11078             /* The inversion means that everything above 255 is matched */
11079             ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
11080         }
11081         else {
11082             /* Here, also has things outside the bitmap that may overlap with
11083              * the bitmap.  We have to sync them up, so that they get inverted
11084              * in both places.  Earlier, we removed all overlaps except in the
11085              * case of /d rules, so no syncing is needed except for this case
11086              */
11087             SV *remove_list = NULL;
11088
11089             if (DEPENDS_SEMANTICS) {
11090                 UV start, end;
11091
11092                 /* Set the bits that correspond to the ones that aren't in the
11093                  * bitmap.  Otherwise, when we invert, we'll miss these.
11094                  * Earlier, we removed from the nonbitmap all code points
11095                  * < 128, so there is no extra work here */
11096                 invlist_iterinit(nonbitmap);
11097                 while (invlist_iternext(nonbitmap, &start, &end)) {
11098                     if (start > 255) {  /* The bit map goes to 255 */
11099                         break;
11100                     }
11101                     if (end > 255) {
11102                         end = 255;
11103                     }
11104                     for (i = start; i <= (int) end; ++i) {
11105                         ANYOF_BITMAP_SET(ret, i);
11106                         prevvalue = value;
11107                         value = i;
11108                     }
11109                 }
11110             }
11111
11112             /* Now invert both the bitmap and the nonbitmap.  Anything in the
11113              * bitmap has to also be removed from the non-bitmap, but again,
11114              * there should not be overlap unless is /d rules. */
11115             _invlist_invert(nonbitmap);
11116
11117             for (i = 0; i < 256; ++i) {
11118                 if (ANYOF_BITMAP_TEST(ret, i)) {
11119                     ANYOF_BITMAP_CLEAR(ret, i);
11120                     if (DEPENDS_SEMANTICS) {
11121                         if (! remove_list) {
11122                             remove_list = _new_invlist(2);
11123                         }
11124                         remove_list = add_cp_to_invlist(remove_list, i);
11125                     }
11126                 }
11127                 else {
11128                     ANYOF_BITMAP_SET(ret, i);
11129                     prevvalue = value;
11130                     value = i;
11131                 }
11132             }
11133
11134             /* And do the removal */
11135             if (DEPENDS_SEMANTICS) {
11136                 if (remove_list) {
11137                     _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
11138                     SvREFCNT_dec(remove_list);
11139                 }
11140             }
11141             else {
11142                 /* There is no overlap for non-/d, so just delete anything
11143                  * below 256 */
11144                 SV* keep_list = _new_invlist(2);
11145                 _append_range_to_invlist(keep_list, 256, UV_MAX);
11146                 _invlist_intersection(nonbitmap, keep_list, &nonbitmap);
11147                 SvREFCNT_dec(keep_list);
11148             }
11149         }
11150
11151         stored = 256 - stored;
11152
11153         /* Clear the invert flag since have just done it here */
11154         ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
11155     }
11156
11157     /* Folding in the bitmap is taken care of above, but not for locale (for
11158      * which we have to wait to see what folding is in effect at runtime), and
11159      * for some things not in the bitmap (only the upper latin folds in this
11160      * case, as all other single-char folding has been set above).  Set
11161      * run-time fold flag for these */
11162     if (FOLD && (LOC
11163                 || (DEPENDS_SEMANTICS
11164                     && nonbitmap
11165                     && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11166                 || unicode_alternate))
11167     {
11168         ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
11169     }
11170
11171     /* A single character class can be "optimized" into an EXACTish node.
11172      * Note that since we don't currently count how many characters there are
11173      * outside the bitmap, we are XXX missing optimization possibilities for
11174      * them.  This optimization can't happen unless this is a truly single
11175      * character class, which means that it can't be an inversion into a
11176      * many-character class, and there must be no possibility of there being
11177      * things outside the bitmap.  'stored' (only) for locales doesn't include
11178      * \w, etc, so have to make a special test that they aren't present
11179      *
11180      * Similarly A 2-character class of the very special form like [bB] can be
11181      * optimized into an EXACTFish node, but only for non-locales, and for
11182      * characters which only have the two folds; so things like 'fF' and 'Ii'
11183      * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
11184      * FI'. */
11185     if (! nonbitmap
11186         && ! unicode_alternate
11187         && SvCUR(listsv) == initial_listsv_len
11188         && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
11189         && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11190                               || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
11191             || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11192                                  && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
11193                                  /* If the latest code point has a fold whose
11194                                   * bit is set, it must be the only other one */
11195                                 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
11196                                  && ANYOF_BITMAP_TEST(ret, prevvalue)))))
11197     {
11198         /* Note that the information needed to decide to do this optimization
11199          * is not currently available until the 2nd pass, and that the actually
11200          * used EXACTish node takes less space than the calculated ANYOF node,
11201          * and hence the amount of space calculated in the first pass is larger
11202          * than actually used, so this optimization doesn't gain us any space.
11203          * But an EXACT node is faster than an ANYOF node, and can be combined
11204          * with any adjacent EXACT nodes later by the optimizer for further
11205          * gains.  The speed of executing an EXACTF is similar to an ANYOF
11206          * node, so the optimization advantage comes from the ability to join
11207          * it to adjacent EXACT nodes */
11208
11209         const char * cur_parse= RExC_parse;
11210         U8 op;
11211         RExC_emit = (regnode *)orig_emit;
11212         RExC_parse = (char *)orig_parse;
11213
11214         if (stored == 1) {
11215
11216             /* A locale node with one point can be folded; all the other cases
11217              * with folding will have two points, since we calculate them above
11218              */
11219             if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
11220                  op = EXACTFL;
11221             }
11222             else {
11223                 op = EXACT;
11224             }
11225         }
11226         else {   /* else 2 chars in the bit map: the folds of each other */
11227
11228             /* Use the folded value, which for the cases where we get here,
11229              * is just the lower case of the current one (which may resolve to
11230              * itself, or to the other one */
11231             value = toLOWER_LATIN1(value);
11232
11233             /* To join adjacent nodes, they must be the exact EXACTish type.
11234              * Try to use the most likely type, by using EXACTFA if possible,
11235              * then EXACTFU if the regex calls for it, or is required because
11236              * the character is non-ASCII.  (If <value> is ASCII, its fold is
11237              * also ASCII for the cases where we get here.) */
11238             if (MORE_ASCII_RESTRICTED && isASCII(value)) {
11239                 op = EXACTFA;
11240             }
11241             else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
11242                 op = EXACTFU;
11243             }
11244             else {    /* Otherwise, more likely to be EXACTF type */
11245                 op = EXACTF;
11246             }
11247         }
11248
11249         ret = reg_node(pRExC_state, op);
11250         RExC_parse = (char *)cur_parse;
11251         if (UTF && ! NATIVE_IS_INVARIANT(value)) {
11252             *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
11253             *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
11254             STR_LEN(ret)= 2;
11255             RExC_emit += STR_SZ(2);
11256         }
11257         else {
11258             *STRING(ret)= (char)value;
11259             STR_LEN(ret)= 1;
11260             RExC_emit += STR_SZ(1);
11261         }
11262         SvREFCNT_dec(listsv);
11263         return ret;
11264     }
11265
11266     /* If there is a swash and more than one element, we can't use the swash in
11267      * the optimization below. */
11268     if (swash && element_count > 1) {
11269         SvREFCNT_dec(swash);
11270         swash = NULL;
11271     }
11272     if (! nonbitmap
11273         && SvCUR(listsv) == initial_listsv_len
11274         && ! unicode_alternate)
11275     {
11276         ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
11277         SvREFCNT_dec(listsv);
11278         SvREFCNT_dec(unicode_alternate);
11279     }
11280     else {
11281         /* av[0] stores the character class description in its textual form:
11282          *       used later (regexec.c:Perl_regclass_swash()) to initialize the
11283          *       appropriate swash, and is also useful for dumping the regnode.
11284          * av[1] if NULL, is a placeholder to later contain the swash computed
11285          *       from av[0].  But if no further computation need be done, the
11286          *       swash is stored there now.
11287          * av[2] stores the multicharacter foldings, used later in
11288          *       regexec.c:S_reginclass().
11289          * av[3] stores the nonbitmap inversion list for use in addition or
11290          *       instead of av[0]; not used if av[1] isn't NULL
11291          * av[4] is set if any component of the class is from a user-defined
11292          *       property; not used if av[1] isn't NULL */
11293         AV * const av = newAV();
11294         SV *rv;
11295
11296         av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
11297                         ? &PL_sv_undef
11298                         : listsv);
11299         if (swash) {
11300             av_store(av, 1, swash);
11301             SvREFCNT_dec(nonbitmap);
11302         }
11303         else {
11304             av_store(av, 1, NULL);
11305             if (nonbitmap) {
11306                 av_store(av, 3, nonbitmap);
11307                 av_store(av, 4, newSVuv(has_user_defined_property));
11308             }
11309         }
11310
11311         /* Store any computed multi-char folds only if we are allowing
11312          * them */
11313         if (allow_full_fold) {
11314             av_store(av, 2, MUTABLE_SV(unicode_alternate));
11315             if (unicode_alternate) { /* This node is variable length */
11316                 OP(ret) = ANYOFV;
11317             }
11318         }
11319         else {
11320             av_store(av, 2, NULL);
11321         }
11322         rv = newRV_noinc(MUTABLE_SV(av));
11323         n = add_data(pRExC_state, 1, "s");
11324         RExC_rxi->data->data[n] = (void*)rv;
11325         ARG_SET(ret, n);
11326     }
11327     return ret;
11328 }
11329 #undef _C_C_T_
11330
11331
11332 /* reg_skipcomment()
11333
11334    Absorbs an /x style # comments from the input stream.
11335    Returns true if there is more text remaining in the stream.
11336    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
11337    terminates the pattern without including a newline.
11338
11339    Note its the callers responsibility to ensure that we are
11340    actually in /x mode
11341
11342 */
11343
11344 STATIC bool
11345 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
11346 {
11347     bool ended = 0;
11348
11349     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
11350
11351     while (RExC_parse < RExC_end)
11352         if (*RExC_parse++ == '\n') {
11353             ended = 1;
11354             break;
11355         }
11356     if (!ended) {
11357         /* we ran off the end of the pattern without ending
11358            the comment, so we have to add an \n when wrapping */
11359         RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11360         return 0;
11361     } else
11362         return 1;
11363 }
11364
11365 /* nextchar()
11366
11367    Advances the parse position, and optionally absorbs
11368    "whitespace" from the inputstream.
11369
11370    Without /x "whitespace" means (?#...) style comments only,
11371    with /x this means (?#...) and # comments and whitespace proper.
11372
11373    Returns the RExC_parse point from BEFORE the scan occurs.
11374
11375    This is the /x friendly way of saying RExC_parse++.
11376 */
11377
11378 STATIC char*
11379 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
11380 {
11381     char* const retval = RExC_parse++;
11382
11383     PERL_ARGS_ASSERT_NEXTCHAR;
11384
11385     for (;;) {
11386         if (RExC_end - RExC_parse >= 3
11387             && *RExC_parse == '('
11388             && RExC_parse[1] == '?'
11389             && RExC_parse[2] == '#')
11390         {
11391             while (*RExC_parse != ')') {
11392                 if (RExC_parse == RExC_end)
11393                     FAIL("Sequence (?#... not terminated");
11394                 RExC_parse++;
11395             }
11396             RExC_parse++;
11397             continue;
11398         }
11399         if (RExC_flags & RXf_PMf_EXTENDED) {
11400             if (isSPACE(*RExC_parse)) {
11401                 RExC_parse++;
11402                 continue;
11403             }
11404             else if (*RExC_parse == '#') {
11405                 if ( reg_skipcomment( pRExC_state ) )
11406                     continue;
11407             }
11408         }
11409         return retval;
11410     }
11411 }
11412
11413 /*
11414 - reg_node - emit a node
11415 */
11416 STATIC regnode *                        /* Location. */
11417 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
11418 {
11419     dVAR;
11420     register regnode *ptr;
11421     regnode * const ret = RExC_emit;
11422     GET_RE_DEBUG_FLAGS_DECL;
11423
11424     PERL_ARGS_ASSERT_REG_NODE;
11425
11426     if (SIZE_ONLY) {
11427         SIZE_ALIGN(RExC_size);
11428         RExC_size += 1;
11429         return(ret);
11430     }
11431     if (RExC_emit >= RExC_emit_bound)
11432         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11433                    op, RExC_emit, RExC_emit_bound);
11434
11435     NODE_ALIGN_FILL(ret);
11436     ptr = ret;
11437     FILL_ADVANCE_NODE(ptr, op);
11438 #ifdef RE_TRACK_PATTERN_OFFSETS
11439     if (RExC_offsets) {         /* MJD */
11440         MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
11441               "reg_node", __LINE__,
11442               PL_reg_name[op],
11443               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
11444                 ? "Overwriting end of array!\n" : "OK",
11445               (UV)(RExC_emit - RExC_emit_start),
11446               (UV)(RExC_parse - RExC_start),
11447               (UV)RExC_offsets[0]));
11448         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
11449     }
11450 #endif
11451     RExC_emit = ptr;
11452     return(ret);
11453 }
11454
11455 /*
11456 - reganode - emit a node with an argument
11457 */
11458 STATIC regnode *                        /* Location. */
11459 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
11460 {
11461     dVAR;
11462     register regnode *ptr;
11463     regnode * const ret = RExC_emit;
11464     GET_RE_DEBUG_FLAGS_DECL;
11465
11466     PERL_ARGS_ASSERT_REGANODE;
11467
11468     if (SIZE_ONLY) {
11469         SIZE_ALIGN(RExC_size);
11470         RExC_size += 2;
11471         /*
11472            We can't do this:
11473
11474            assert(2==regarglen[op]+1);
11475
11476            Anything larger than this has to allocate the extra amount.
11477            If we changed this to be:
11478
11479            RExC_size += (1 + regarglen[op]);
11480
11481            then it wouldn't matter. Its not clear what side effect
11482            might come from that so its not done so far.
11483            -- dmq
11484         */
11485         return(ret);
11486     }
11487     if (RExC_emit >= RExC_emit_bound)
11488         Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11489                    op, RExC_emit, RExC_emit_bound);
11490
11491     NODE_ALIGN_FILL(ret);
11492     ptr = ret;
11493     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
11494 #ifdef RE_TRACK_PATTERN_OFFSETS
11495     if (RExC_offsets) {         /* MJD */
11496         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
11497               "reganode",
11498               __LINE__,
11499               PL_reg_name[op],
11500               (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
11501               "Overwriting end of array!\n" : "OK",
11502               (UV)(RExC_emit - RExC_emit_start),
11503               (UV)(RExC_parse - RExC_start),
11504               (UV)RExC_offsets[0]));
11505         Set_Cur_Node_Offset;
11506     }
11507 #endif
11508     RExC_emit = ptr;
11509     return(ret);
11510 }
11511
11512 /*
11513 - reguni - emit (if appropriate) a Unicode character
11514 */
11515 STATIC STRLEN
11516 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
11517 {
11518     dVAR;
11519
11520     PERL_ARGS_ASSERT_REGUNI;
11521
11522     return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
11523 }
11524
11525 /*
11526 - reginsert - insert an operator in front of already-emitted operand
11527 *
11528 * Means relocating the operand.
11529 */
11530 STATIC void
11531 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
11532 {
11533     dVAR;
11534     register regnode *src;
11535     register regnode *dst;
11536     register regnode *place;
11537     const int offset = regarglen[(U8)op];
11538     const int size = NODE_STEP_REGNODE + offset;
11539     GET_RE_DEBUG_FLAGS_DECL;
11540
11541     PERL_ARGS_ASSERT_REGINSERT;
11542     PERL_UNUSED_ARG(depth);
11543 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
11544     DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
11545     if (SIZE_ONLY) {
11546         RExC_size += size;
11547         return;
11548     }
11549
11550     src = RExC_emit;
11551     RExC_emit += size;
11552     dst = RExC_emit;
11553     if (RExC_open_parens) {
11554         int paren;
11555         /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
11556         for ( paren=0 ; paren < RExC_npar ; paren++ ) {
11557             if ( RExC_open_parens[paren] >= opnd ) {
11558                 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
11559                 RExC_open_parens[paren] += size;
11560             } else {
11561                 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
11562             }
11563             if ( RExC_close_parens[paren] >= opnd ) {
11564                 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
11565                 RExC_close_parens[paren] += size;
11566             } else {
11567                 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
11568             }
11569         }
11570     }
11571
11572     while (src > opnd) {
11573         StructCopy(--src, --dst, regnode);
11574 #ifdef RE_TRACK_PATTERN_OFFSETS
11575         if (RExC_offsets) {     /* MJD 20010112 */
11576             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
11577                   "reg_insert",
11578                   __LINE__,
11579                   PL_reg_name[op],
11580                   (UV)(dst - RExC_emit_start) > RExC_offsets[0]
11581                     ? "Overwriting end of array!\n" : "OK",
11582                   (UV)(src - RExC_emit_start),
11583                   (UV)(dst - RExC_emit_start),
11584                   (UV)RExC_offsets[0]));
11585             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
11586             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
11587         }
11588 #endif
11589     }
11590
11591
11592     place = opnd;               /* Op node, where operand used to be. */
11593 #ifdef RE_TRACK_PATTERN_OFFSETS
11594     if (RExC_offsets) {         /* MJD */
11595         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
11596               "reginsert",
11597               __LINE__,
11598               PL_reg_name[op],
11599               (UV)(place - RExC_emit_start) > RExC_offsets[0]
11600               ? "Overwriting end of array!\n" : "OK",
11601               (UV)(place - RExC_emit_start),
11602               (UV)(RExC_parse - RExC_start),
11603               (UV)RExC_offsets[0]));
11604         Set_Node_Offset(place, RExC_parse);
11605         Set_Node_Length(place, 1);
11606     }
11607 #endif
11608     src = NEXTOPER(place);
11609     FILL_ADVANCE_NODE(place, op);
11610     Zero(src, offset, regnode);
11611 }
11612
11613 /*
11614 - regtail - set the next-pointer at the end of a node chain of p to val.
11615 - SEE ALSO: regtail_study
11616 */
11617 /* TODO: All three parms should be const */
11618 STATIC void
11619 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
11620 {
11621     dVAR;
11622     register regnode *scan;
11623     GET_RE_DEBUG_FLAGS_DECL;
11624
11625     PERL_ARGS_ASSERT_REGTAIL;
11626 #ifndef DEBUGGING
11627     PERL_UNUSED_ARG(depth);
11628 #endif
11629
11630     if (SIZE_ONLY)
11631         return;
11632
11633     /* Find last node. */
11634     scan = p;
11635     for (;;) {
11636         regnode * const temp = regnext(scan);
11637         DEBUG_PARSE_r({
11638             SV * const mysv=sv_newmortal();
11639             DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
11640             regprop(RExC_rx, mysv, scan);
11641             PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
11642                 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
11643                     (temp == NULL ? "->" : ""),
11644                     (temp == NULL ? PL_reg_name[OP(val)] : "")
11645             );
11646         });
11647         if (temp == NULL)
11648             break;
11649         scan = temp;
11650     }
11651
11652     if (reg_off_by_arg[OP(scan)]) {
11653         ARG_SET(scan, val - scan);
11654     }
11655     else {
11656         NEXT_OFF(scan) = val - scan;
11657     }
11658 }
11659
11660 #ifdef DEBUGGING
11661 /*
11662 - regtail_study - set the next-pointer at the end of a node chain of p to val.
11663 - Look for optimizable sequences at the same time.
11664 - currently only looks for EXACT chains.
11665
11666 This is experimental code. The idea is to use this routine to perform
11667 in place optimizations on branches and groups as they are constructed,
11668 with the long term intention of removing optimization from study_chunk so
11669 that it is purely analytical.
11670
11671 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
11672 to control which is which.
11673
11674 */
11675 /* TODO: All four parms should be const */
11676
11677 STATIC U8
11678 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
11679 {
11680     dVAR;
11681     register regnode *scan;
11682     U8 exact = PSEUDO;
11683 #ifdef EXPERIMENTAL_INPLACESCAN
11684     I32 min = 0;
11685 #endif
11686     GET_RE_DEBUG_FLAGS_DECL;
11687
11688     PERL_ARGS_ASSERT_REGTAIL_STUDY;
11689
11690
11691     if (SIZE_ONLY)
11692         return exact;
11693
11694     /* Find last node. */
11695
11696     scan = p;
11697     for (;;) {
11698         regnode * const temp = regnext(scan);
11699 #ifdef EXPERIMENTAL_INPLACESCAN
11700         if (PL_regkind[OP(scan)] == EXACT) {
11701             bool has_exactf_sharp_s;    /* Unexamined in this routine */
11702             if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
11703                 return EXACT;
11704         }
11705 #endif
11706         if ( exact ) {
11707             switch (OP(scan)) {
11708                 case EXACT:
11709                 case EXACTF:
11710                 case EXACTFA:
11711                 case EXACTFU:
11712                 case EXACTFU_SS:
11713                 case EXACTFU_NO_TRIE:
11714                 case EXACTFL:
11715                         if( exact == PSEUDO )
11716                             exact= OP(scan);
11717                         else if ( exact != OP(scan) )
11718                             exact= 0;
11719                 case NOTHING:
11720                     break;
11721                 default:
11722                     exact= 0;
11723             }
11724         }
11725         DEBUG_PARSE_r({
11726             SV * const mysv=sv_newmortal();
11727             DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
11728             regprop(RExC_rx, mysv, scan);
11729             PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
11730                 SvPV_nolen_const(mysv),
11731                 REG_NODE_NUM(scan),
11732                 PL_reg_name[exact]);
11733         });
11734         if (temp == NULL)
11735             break;
11736         scan = temp;
11737     }
11738     DEBUG_PARSE_r({
11739         SV * const mysv_val=sv_newmortal();
11740         DEBUG_PARSE_MSG("");
11741         regprop(RExC_rx, mysv_val, val);
11742         PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
11743                       SvPV_nolen_const(mysv_val),
11744                       (IV)REG_NODE_NUM(val),
11745                       (IV)(val - scan)
11746         );
11747     });
11748     if (reg_off_by_arg[OP(scan)]) {
11749         ARG_SET(scan, val - scan);
11750     }
11751     else {
11752         NEXT_OFF(scan) = val - scan;
11753     }
11754
11755     return exact;
11756 }
11757 #endif
11758
11759 /*
11760  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
11761  */
11762 #ifdef DEBUGGING
11763 static void
11764 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
11765 {
11766     int bit;
11767     int set=0;
11768     regex_charset cs;
11769
11770     for (bit=0; bit<32; bit++) {
11771         if (flags & (1<<bit)) {
11772             if ((1<<bit) & RXf_PMf_CHARSET) {   /* Output separately, below */
11773                 continue;
11774             }
11775             if (!set++ && lead)
11776                 PerlIO_printf(Perl_debug_log, "%s",lead);
11777             PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
11778         }
11779     }
11780     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
11781             if (!set++ && lead) {
11782                 PerlIO_printf(Perl_debug_log, "%s",lead);
11783             }
11784             switch (cs) {
11785                 case REGEX_UNICODE_CHARSET:
11786                     PerlIO_printf(Perl_debug_log, "UNICODE");
11787                     break;
11788                 case REGEX_LOCALE_CHARSET:
11789                     PerlIO_printf(Perl_debug_log, "LOCALE");
11790                     break;
11791                 case REGEX_ASCII_RESTRICTED_CHARSET:
11792                     PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
11793                     break;
11794                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
11795                     PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
11796                     break;
11797                 default:
11798                     PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
11799                     break;
11800             }
11801     }
11802     if (lead)  {
11803         if (set)
11804             PerlIO_printf(Perl_debug_log, "\n");
11805         else
11806             PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
11807     }
11808 }
11809 #endif
11810
11811 void
11812 Perl_regdump(pTHX_ const regexp *r)
11813 {
11814 #ifdef DEBUGGING
11815     dVAR;
11816     SV * const sv = sv_newmortal();
11817     SV *dsv= sv_newmortal();
11818     RXi_GET_DECL(r,ri);
11819     GET_RE_DEBUG_FLAGS_DECL;
11820
11821     PERL_ARGS_ASSERT_REGDUMP;
11822
11823     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
11824
11825     /* Header fields of interest. */
11826     if (r->anchored_substr) {
11827         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
11828             RE_SV_DUMPLEN(r->anchored_substr), 30);
11829         PerlIO_printf(Perl_debug_log,
11830                       "anchored %s%s at %"IVdf" ",
11831                       s, RE_SV_TAIL(r->anchored_substr),
11832                       (IV)r->anchored_offset);
11833     } else if (r->anchored_utf8) {
11834         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
11835             RE_SV_DUMPLEN(r->anchored_utf8), 30);
11836         PerlIO_printf(Perl_debug_log,
11837                       "anchored utf8 %s%s at %"IVdf" ",
11838                       s, RE_SV_TAIL(r->anchored_utf8),
11839                       (IV)r->anchored_offset);
11840     }
11841     if (r->float_substr) {
11842         RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
11843             RE_SV_DUMPLEN(r->float_substr), 30);
11844         PerlIO_printf(Perl_debug_log,
11845                       "floating %s%s at %"IVdf"..%"UVuf" ",
11846                       s, RE_SV_TAIL(r->float_substr),
11847                       (IV)r->float_min_offset, (UV)r->float_max_offset);
11848     } else if (r->float_utf8) {
11849         RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
11850             RE_SV_DUMPLEN(r->float_utf8), 30);
11851         PerlIO_printf(Perl_debug_log,
11852                       "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
11853                       s, RE_SV_TAIL(r->float_utf8),
11854                       (IV)r->float_min_offset, (UV)r->float_max_offset);
11855     }
11856     if (r->check_substr || r->check_utf8)
11857         PerlIO_printf(Perl_debug_log,
11858                       (const char *)
11859                       (r->check_substr == r->float_substr
11860                        && r->check_utf8 == r->float_utf8
11861                        ? "(checking floating" : "(checking anchored"));
11862     if (r->extflags & RXf_NOSCAN)
11863         PerlIO_printf(Perl_debug_log, " noscan");
11864     if (r->extflags & RXf_CHECK_ALL)
11865         PerlIO_printf(Perl_debug_log, " isall");
11866     if (r->check_substr || r->check_utf8)
11867         PerlIO_printf(Perl_debug_log, ") ");
11868
11869     if (ri->regstclass) {
11870         regprop(r, sv, ri->regstclass);
11871         PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
11872     }
11873     if (r->extflags & RXf_ANCH) {
11874         PerlIO_printf(Perl_debug_log, "anchored");
11875         if (r->extflags & RXf_ANCH_BOL)
11876             PerlIO_printf(Perl_debug_log, "(BOL)");
11877         if (r->extflags & RXf_ANCH_MBOL)
11878             PerlIO_printf(Perl_debug_log, "(MBOL)");
11879         if (r->extflags & RXf_ANCH_SBOL)
11880             PerlIO_printf(Perl_debug_log, "(SBOL)");
11881         if (r->extflags & RXf_ANCH_GPOS)
11882             PerlIO_printf(Perl_debug_log, "(GPOS)");
11883         PerlIO_putc(Perl_debug_log, ' ');
11884     }
11885     if (r->extflags & RXf_GPOS_SEEN)
11886         PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
11887     if (r->intflags & PREGf_SKIP)
11888         PerlIO_printf(Perl_debug_log, "plus ");
11889     if (r->intflags & PREGf_IMPLICIT)
11890         PerlIO_printf(Perl_debug_log, "implicit ");
11891     PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
11892     if (r->extflags & RXf_EVAL_SEEN)
11893         PerlIO_printf(Perl_debug_log, "with eval ");
11894     PerlIO_printf(Perl_debug_log, "\n");
11895     DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
11896 #else
11897     PERL_ARGS_ASSERT_REGDUMP;
11898     PERL_UNUSED_CONTEXT;
11899     PERL_UNUSED_ARG(r);
11900 #endif  /* DEBUGGING */
11901 }
11902
11903 /*
11904 - regprop - printable representation of opcode
11905 */
11906 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
11907 STMT_START { \
11908         if (do_sep) {                           \
11909             Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
11910             if (flags & ANYOF_INVERT)           \
11911                 /*make sure the invert info is in each */ \
11912                 sv_catpvs(sv, "^");             \
11913             do_sep = 0;                         \
11914         }                                       \
11915 } STMT_END
11916
11917 void
11918 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
11919 {
11920 #ifdef DEBUGGING
11921     dVAR;
11922     register int k;
11923     RXi_GET_DECL(prog,progi);
11924     GET_RE_DEBUG_FLAGS_DECL;
11925
11926     PERL_ARGS_ASSERT_REGPROP;
11927
11928     sv_setpvs(sv, "");
11929
11930     if (OP(o) > REGNODE_MAX)            /* regnode.type is unsigned */
11931         /* It would be nice to FAIL() here, but this may be called from
11932            regexec.c, and it would be hard to supply pRExC_state. */
11933         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
11934     sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
11935
11936     k = PL_regkind[OP(o)];
11937
11938     if (k == EXACT) {
11939         sv_catpvs(sv, " ");
11940         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
11941          * is a crude hack but it may be the best for now since
11942          * we have no flag "this EXACTish node was UTF-8"
11943          * --jhi */
11944         pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
11945                   PERL_PV_ESCAPE_UNI_DETECT |
11946                   PERL_PV_ESCAPE_NONASCII   |
11947                   PERL_PV_PRETTY_ELLIPSES   |
11948                   PERL_PV_PRETTY_LTGT       |
11949                   PERL_PV_PRETTY_NOCLEAR
11950                   );
11951     } else if (k == TRIE) {
11952         /* print the details of the trie in dumpuntil instead, as
11953          * progi->data isn't available here */
11954         const char op = OP(o);
11955         const U32 n = ARG(o);
11956         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
11957                (reg_ac_data *)progi->data->data[n] :
11958                NULL;
11959         const reg_trie_data * const trie
11960             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
11961
11962         Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
11963         DEBUG_TRIE_COMPILE_r(
11964             Perl_sv_catpvf(aTHX_ sv,
11965                 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
11966                 (UV)trie->startstate,
11967                 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
11968                 (UV)trie->wordcount,
11969                 (UV)trie->minlen,
11970                 (UV)trie->maxlen,
11971                 (UV)TRIE_CHARCOUNT(trie),
11972                 (UV)trie->uniquecharcount
11973             )
11974         );
11975         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
11976             int i;
11977             int rangestart = -1;
11978             U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
11979             sv_catpvs(sv, "[");
11980             for (i = 0; i <= 256; i++) {
11981                 if (i < 256 && BITMAP_TEST(bitmap,i)) {
11982                     if (rangestart == -1)
11983                         rangestart = i;
11984                 } else if (rangestart != -1) {
11985                     if (i <= rangestart + 3)
11986                         for (; rangestart < i; rangestart++)
11987                             put_byte(sv, rangestart);
11988                     else {
11989                         put_byte(sv, rangestart);
11990                         sv_catpvs(sv, "-");
11991                         put_byte(sv, i - 1);
11992                     }
11993                     rangestart = -1;
11994                 }
11995             }
11996             sv_catpvs(sv, "]");
11997         }
11998
11999     } else if (k == CURLY) {
12000         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
12001             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
12002         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
12003     }
12004     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
12005         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
12006     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
12007         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
12008         if ( RXp_PAREN_NAMES(prog) ) {
12009             if ( k != REF || (OP(o) < NREF)) {
12010                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
12011                 SV **name= av_fetch(list, ARG(o), 0 );
12012                 if (name)
12013                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12014             }
12015             else {
12016                 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
12017                 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
12018                 I32 *nums=(I32*)SvPVX(sv_dat);
12019                 SV **name= av_fetch(list, nums[0], 0 );
12020                 I32 n;
12021                 if (name) {
12022                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
12023                         Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
12024                                     (n ? "," : ""), (IV)nums[n]);
12025                     }
12026                     Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12027                 }
12028             }
12029         }
12030     } else if (k == GOSUB)
12031         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
12032     else if (k == VERB) {
12033         if (!o->flags)
12034             Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
12035                            SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
12036     } else if (k == LOGICAL)
12037         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
12038     else if (k == ANYOF) {
12039         int i, rangestart = -1;
12040         const U8 flags = ANYOF_FLAGS(o);
12041         int do_sep = 0;
12042
12043         /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
12044         static const char * const anyofs[] = {
12045             "\\w",
12046             "\\W",
12047             "\\s",
12048             "\\S",
12049             "\\d",
12050             "\\D",
12051             "[:alnum:]",
12052             "[:^alnum:]",
12053             "[:alpha:]",
12054             "[:^alpha:]",
12055             "[:ascii:]",
12056             "[:^ascii:]",
12057             "[:cntrl:]",
12058             "[:^cntrl:]",
12059             "[:graph:]",
12060             "[:^graph:]",
12061             "[:lower:]",
12062             "[:^lower:]",
12063             "[:print:]",
12064             "[:^print:]",
12065             "[:punct:]",
12066             "[:^punct:]",
12067             "[:upper:]",
12068             "[:^upper:]",
12069             "[:xdigit:]",
12070             "[:^xdigit:]",
12071             "[:space:]",
12072             "[:^space:]",
12073             "[:blank:]",
12074             "[:^blank:]"
12075         };
12076
12077         if (flags & ANYOF_LOCALE)
12078             sv_catpvs(sv, "{loc}");
12079         if (flags & ANYOF_LOC_NONBITMAP_FOLD)
12080             sv_catpvs(sv, "{i}");
12081         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
12082         if (flags & ANYOF_INVERT)
12083             sv_catpvs(sv, "^");
12084
12085         /* output what the standard cp 0-255 bitmap matches */
12086         for (i = 0; i <= 256; i++) {
12087             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
12088                 if (rangestart == -1)
12089                     rangestart = i;
12090             } else if (rangestart != -1) {
12091                 if (i <= rangestart + 3)
12092                     for (; rangestart < i; rangestart++)
12093                         put_byte(sv, rangestart);
12094                 else {
12095                     put_byte(sv, rangestart);
12096                     sv_catpvs(sv, "-");
12097                     put_byte(sv, i - 1);
12098                 }
12099                 do_sep = 1;
12100                 rangestart = -1;
12101             }
12102         }
12103
12104         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12105         /* output any special charclass tests (used entirely under use locale) */
12106         if (ANYOF_CLASS_TEST_ANY_SET(o))
12107             for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
12108                 if (ANYOF_CLASS_TEST(o,i)) {
12109                     sv_catpv(sv, anyofs[i]);
12110                     do_sep = 1;
12111                 }
12112
12113         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12114
12115         if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
12116             sv_catpvs(sv, "{non-utf8-latin1-all}");
12117         }
12118
12119         /* output information about the unicode matching */
12120         if (flags & ANYOF_UNICODE_ALL)
12121             sv_catpvs(sv, "{unicode_all}");
12122         else if (ANYOF_NONBITMAP(o))
12123             sv_catpvs(sv, "{unicode}");
12124         if (flags & ANYOF_NONBITMAP_NON_UTF8)
12125             sv_catpvs(sv, "{outside bitmap}");
12126
12127         if (ANYOF_NONBITMAP(o)) {
12128             SV *lv; /* Set if there is something outside the bit map */
12129             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
12130             bool byte_output = FALSE;   /* If something in the bitmap has been
12131                                            output */
12132
12133             if (lv && lv != &PL_sv_undef) {
12134                 if (sw) {
12135                     U8 s[UTF8_MAXBYTES_CASE+1];
12136
12137                     for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
12138                         uvchr_to_utf8(s, i);
12139
12140                         if (i < 256
12141                             && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
12142                                                                things already
12143                                                                output as part
12144                                                                of the bitmap */
12145                             && swash_fetch(sw, s, TRUE))
12146                         {
12147                             if (rangestart == -1)
12148                                 rangestart = i;
12149                         } else if (rangestart != -1) {
12150                             byte_output = TRUE;
12151                             if (i <= rangestart + 3)
12152                                 for (; rangestart < i; rangestart++) {
12153                                     put_byte(sv, rangestart);
12154                                 }
12155                             else {
12156                                 put_byte(sv, rangestart);
12157                                 sv_catpvs(sv, "-");
12158                                 put_byte(sv, i-1);
12159                             }
12160                             rangestart = -1;
12161                         }
12162                     }
12163                 }
12164
12165                 {
12166                     char *s = savesvpv(lv);
12167                     char * const origs = s;
12168
12169                     while (*s && *s != '\n')
12170                         s++;
12171
12172                     if (*s == '\n') {
12173                         const char * const t = ++s;
12174
12175                         if (byte_output) {
12176                             sv_catpvs(sv, " ");
12177                         }
12178
12179                         while (*s) {
12180                             if (*s == '\n') {
12181
12182                                 /* Truncate very long output */
12183                                 if (s - origs > 256) {
12184                                     Perl_sv_catpvf(aTHX_ sv,
12185                                                    "%.*s...",
12186                                                    (int) (s - origs - 1),
12187                                                    t);
12188                                     goto out_dump;
12189                                 }
12190                                 *s = ' ';
12191                             }
12192                             else if (*s == '\t') {
12193                                 *s = '-';
12194                             }
12195                             s++;
12196                         }
12197                         if (s[-1] == ' ')
12198                             s[-1] = 0;
12199
12200                         sv_catpv(sv, t);
12201                     }
12202
12203                 out_dump:
12204
12205                     Safefree(origs);
12206                 }
12207                 SvREFCNT_dec(lv);
12208             }
12209         }
12210
12211         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
12212     }
12213     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
12214         Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
12215 #else
12216     PERL_UNUSED_CONTEXT;
12217     PERL_UNUSED_ARG(sv);
12218     PERL_UNUSED_ARG(o);
12219     PERL_UNUSED_ARG(prog);
12220 #endif  /* DEBUGGING */
12221 }
12222
12223 SV *
12224 Perl_re_intuit_string(pTHX_ REGEXP * const r)
12225 {                               /* Assume that RE_INTUIT is set */
12226     dVAR;
12227     struct regexp *const prog = (struct regexp *)SvANY(r);
12228     GET_RE_DEBUG_FLAGS_DECL;
12229
12230     PERL_ARGS_ASSERT_RE_INTUIT_STRING;
12231     PERL_UNUSED_CONTEXT;
12232
12233     DEBUG_COMPILE_r(
12234         {
12235             const char * const s = SvPV_nolen_const(prog->check_substr
12236                       ? prog->check_substr : prog->check_utf8);
12237
12238             if (!PL_colorset) reginitcolors();
12239             PerlIO_printf(Perl_debug_log,
12240                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
12241                       PL_colors[4],
12242                       prog->check_substr ? "" : "utf8 ",
12243                       PL_colors[5],PL_colors[0],
12244                       s,
12245                       PL_colors[1],
12246                       (strlen(s) > 60 ? "..." : ""));
12247         } );
12248
12249     return prog->check_substr ? prog->check_substr : prog->check_utf8;
12250 }
12251
12252 /*
12253    pregfree()
12254
12255    handles refcounting and freeing the perl core regexp structure. When
12256    it is necessary to actually free the structure the first thing it
12257    does is call the 'free' method of the regexp_engine associated to
12258    the regexp, allowing the handling of the void *pprivate; member
12259    first. (This routine is not overridable by extensions, which is why
12260    the extensions free is called first.)
12261
12262    See regdupe and regdupe_internal if you change anything here.
12263 */
12264 #ifndef PERL_IN_XSUB_RE
12265 void
12266 Perl_pregfree(pTHX_ REGEXP *r)
12267 {
12268     SvREFCNT_dec(r);
12269 }
12270
12271 void
12272 Perl_pregfree2(pTHX_ REGEXP *rx)
12273 {
12274     dVAR;
12275     struct regexp *const r = (struct regexp *)SvANY(rx);
12276     GET_RE_DEBUG_FLAGS_DECL;
12277
12278     PERL_ARGS_ASSERT_PREGFREE2;
12279
12280     if (r->mother_re) {
12281         ReREFCNT_dec(r->mother_re);
12282     } else {
12283         CALLREGFREE_PVT(rx); /* free the private data */
12284         SvREFCNT_dec(RXp_PAREN_NAMES(r));
12285     }
12286     if (r->substrs) {
12287         SvREFCNT_dec(r->anchored_substr);
12288         SvREFCNT_dec(r->anchored_utf8);
12289         SvREFCNT_dec(r->float_substr);
12290         SvREFCNT_dec(r->float_utf8);
12291         Safefree(r->substrs);
12292     }
12293     RX_MATCH_COPY_FREE(rx);
12294 #ifdef PERL_OLD_COPY_ON_WRITE
12295     SvREFCNT_dec(r->saved_copy);
12296 #endif
12297     Safefree(r->offs);
12298 }
12299
12300 /*  reg_temp_copy()
12301
12302     This is a hacky workaround to the structural issue of match results
12303     being stored in the regexp structure which is in turn stored in
12304     PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
12305     could be PL_curpm in multiple contexts, and could require multiple
12306     result sets being associated with the pattern simultaneously, such
12307     as when doing a recursive match with (??{$qr})
12308
12309     The solution is to make a lightweight copy of the regexp structure
12310     when a qr// is returned from the code executed by (??{$qr}) this
12311     lightweight copy doesn't actually own any of its data except for
12312     the starp/end and the actual regexp structure itself.
12313
12314 */
12315
12316
12317 REGEXP *
12318 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
12319 {
12320     struct regexp *ret;
12321     struct regexp *const r = (struct regexp *)SvANY(rx);
12322     register const I32 npar = r->nparens+1;
12323
12324     PERL_ARGS_ASSERT_REG_TEMP_COPY;
12325
12326     if (!ret_x)
12327         ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
12328     ret = (struct regexp *)SvANY(ret_x);
12329
12330     (void)ReREFCNT_inc(rx);
12331     /* We can take advantage of the existing "copied buffer" mechanism in SVs
12332        by pointing directly at the buffer, but flagging that the allocated
12333        space in the copy is zero. As we've just done a struct copy, it's now
12334        a case of zero-ing that, rather than copying the current length.  */
12335     SvPV_set(ret_x, RX_WRAPPED(rx));
12336     SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
12337     memcpy(&(ret->xpv_cur), &(r->xpv_cur),
12338            sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
12339     SvLEN_set(ret_x, 0);
12340     SvSTASH_set(ret_x, NULL);
12341     SvMAGIC_set(ret_x, NULL);
12342     Newx(ret->offs, npar, regexp_paren_pair);
12343     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12344     if (r->substrs) {
12345         Newx(ret->substrs, 1, struct reg_substr_data);
12346         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12347
12348         SvREFCNT_inc_void(ret->anchored_substr);
12349         SvREFCNT_inc_void(ret->anchored_utf8);
12350         SvREFCNT_inc_void(ret->float_substr);
12351         SvREFCNT_inc_void(ret->float_utf8);
12352
12353         /* check_substr and check_utf8, if non-NULL, point to either their
12354            anchored or float namesakes, and don't hold a second reference.  */
12355     }
12356     RX_MATCH_COPIED_off(ret_x);
12357 #ifdef PERL_OLD_COPY_ON_WRITE
12358     ret->saved_copy = NULL;
12359 #endif
12360     ret->mother_re = rx;
12361
12362     return ret_x;
12363 }
12364 #endif
12365
12366 /* regfree_internal()
12367
12368    Free the private data in a regexp. This is overloadable by
12369    extensions. Perl takes care of the regexp structure in pregfree(),
12370    this covers the *pprivate pointer which technically perl doesn't
12371    know about, however of course we have to handle the
12372    regexp_internal structure when no extension is in use.
12373
12374    Note this is called before freeing anything in the regexp
12375    structure.
12376  */
12377
12378 void
12379 Perl_regfree_internal(pTHX_ REGEXP * const rx)
12380 {
12381     dVAR;
12382     struct regexp *const r = (struct regexp *)SvANY(rx);
12383     RXi_GET_DECL(r,ri);
12384     GET_RE_DEBUG_FLAGS_DECL;
12385
12386     PERL_ARGS_ASSERT_REGFREE_INTERNAL;
12387
12388     DEBUG_COMPILE_r({
12389         if (!PL_colorset)
12390             reginitcolors();
12391         {
12392             SV *dsv= sv_newmortal();
12393             RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
12394                 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
12395             PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
12396                 PL_colors[4],PL_colors[5],s);
12397         }
12398     });
12399 #ifdef RE_TRACK_PATTERN_OFFSETS
12400     if (ri->u.offsets)
12401         Safefree(ri->u.offsets);             /* 20010421 MJD */
12402 #endif
12403     if (ri->data) {
12404         int n = ri->data->count;
12405         PAD* new_comppad = NULL;
12406         PAD* old_comppad;
12407         PADOFFSET refcnt;
12408
12409         while (--n >= 0) {
12410           /* If you add a ->what type here, update the comment in regcomp.h */
12411             switch (ri->data->what[n]) {
12412             case 'a':
12413             case 's':
12414             case 'S':
12415             case 'u':
12416                 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
12417                 break;
12418             case 'f':
12419                 Safefree(ri->data->data[n]);
12420                 break;
12421             case 'p':
12422                 new_comppad = MUTABLE_AV(ri->data->data[n]);
12423                 break;
12424             case 'o':
12425                 if (new_comppad == NULL)
12426                     Perl_croak(aTHX_ "panic: pregfree comppad");
12427                 PAD_SAVE_LOCAL(old_comppad,
12428                     /* Watch out for global destruction's random ordering. */
12429                     (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
12430                 );
12431                 OP_REFCNT_LOCK;
12432                 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
12433                 OP_REFCNT_UNLOCK;
12434                 if (!refcnt)
12435                     op_free((OP_4tree*)ri->data->data[n]);
12436
12437                 PAD_RESTORE_LOCAL(old_comppad);
12438                 SvREFCNT_dec(MUTABLE_SV(new_comppad));
12439                 new_comppad = NULL;
12440                 break;
12441             case 'n':
12442                 break;
12443             case 'T':
12444                 { /* Aho Corasick add-on structure for a trie node.
12445                      Used in stclass optimization only */
12446                     U32 refcount;
12447                     reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
12448                     OP_REFCNT_LOCK;
12449                     refcount = --aho->refcount;
12450                     OP_REFCNT_UNLOCK;
12451                     if ( !refcount ) {
12452                         PerlMemShared_free(aho->states);
12453                         PerlMemShared_free(aho->fail);
12454                          /* do this last!!!! */
12455                         PerlMemShared_free(ri->data->data[n]);
12456                         PerlMemShared_free(ri->regstclass);
12457                     }
12458                 }
12459                 break;
12460             case 't':
12461                 {
12462                     /* trie structure. */
12463                     U32 refcount;
12464                     reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
12465                     OP_REFCNT_LOCK;
12466                     refcount = --trie->refcount;
12467                     OP_REFCNT_UNLOCK;
12468                     if ( !refcount ) {
12469                         PerlMemShared_free(trie->charmap);
12470                         PerlMemShared_free(trie->states);
12471                         PerlMemShared_free(trie->trans);
12472                         if (trie->bitmap)
12473                             PerlMemShared_free(trie->bitmap);
12474                         if (trie->jump)
12475                             PerlMemShared_free(trie->jump);
12476                         PerlMemShared_free(trie->wordinfo);
12477                         /* do this last!!!! */
12478                         PerlMemShared_free(ri->data->data[n]);
12479                     }
12480                 }
12481                 break;
12482             default:
12483                 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
12484             }
12485         }
12486         Safefree(ri->data->what);
12487         Safefree(ri->data);
12488     }
12489
12490     Safefree(ri);
12491 }
12492
12493 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
12494 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
12495 #define SAVEPVN(p,n)    ((p) ? savepvn(p,n) : NULL)
12496
12497 /*
12498    re_dup - duplicate a regexp.
12499
12500    This routine is expected to clone a given regexp structure. It is only
12501    compiled under USE_ITHREADS.
12502
12503    After all of the core data stored in struct regexp is duplicated
12504    the regexp_engine.dupe method is used to copy any private data
12505    stored in the *pprivate pointer. This allows extensions to handle
12506    any duplication it needs to do.
12507
12508    See pregfree() and regfree_internal() if you change anything here.
12509 */
12510 #if defined(USE_ITHREADS)
12511 #ifndef PERL_IN_XSUB_RE
12512 void
12513 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
12514 {
12515     dVAR;
12516     I32 npar;
12517     const struct regexp *r = (const struct regexp *)SvANY(sstr);
12518     struct regexp *ret = (struct regexp *)SvANY(dstr);
12519
12520     PERL_ARGS_ASSERT_RE_DUP_GUTS;
12521
12522     npar = r->nparens+1;
12523     Newx(ret->offs, npar, regexp_paren_pair);
12524     Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12525     if(ret->swap) {
12526         /* no need to copy these */
12527         Newx(ret->swap, npar, regexp_paren_pair);
12528     }
12529
12530     if (ret->substrs) {
12531         /* Do it this way to avoid reading from *r after the StructCopy().
12532            That way, if any of the sv_dup_inc()s dislodge *r from the L1
12533            cache, it doesn't matter.  */
12534         const bool anchored = r->check_substr
12535             ? r->check_substr == r->anchored_substr
12536             : r->check_utf8 == r->anchored_utf8;
12537         Newx(ret->substrs, 1, struct reg_substr_data);
12538         StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12539
12540         ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
12541         ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
12542         ret->float_substr = sv_dup_inc(ret->float_substr, param);
12543         ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
12544
12545         /* check_substr and check_utf8, if non-NULL, point to either their
12546            anchored or float namesakes, and don't hold a second reference.  */
12547
12548         if (ret->check_substr) {
12549             if (anchored) {
12550                 assert(r->check_utf8 == r->anchored_utf8);
12551                 ret->check_substr = ret->anchored_substr;
12552                 ret->check_utf8 = ret->anchored_utf8;
12553             } else {
12554                 assert(r->check_substr == r->float_substr);
12555                 assert(r->check_utf8 == r->float_utf8);
12556                 ret->check_substr = ret->float_substr;
12557                 ret->check_utf8 = ret->float_utf8;
12558             }
12559         } else if (ret->check_utf8) {
12560             if (anchored) {
12561                 ret->check_utf8 = ret->anchored_utf8;
12562             } else {
12563                 ret->check_utf8 = ret->float_utf8;
12564             }
12565         }
12566     }
12567
12568     RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
12569
12570     if (ret->pprivate)
12571         RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
12572
12573     if (RX_MATCH_COPIED(dstr))
12574         ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
12575     else
12576         ret->subbeg = NULL;
12577 #ifdef PERL_OLD_COPY_ON_WRITE
12578     ret->saved_copy = NULL;
12579 #endif
12580
12581     if (ret->mother_re) {
12582         if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
12583             /* Our storage points directly to our mother regexp, but that's
12584                1: a buffer in a different thread
12585                2: something we no longer hold a reference on
12586                so we need to copy it locally.  */
12587             /* Note we need to use SvCUR(), rather than
12588                SvLEN(), on our mother_re, because it, in
12589                turn, may well be pointing to its own mother_re.  */
12590             SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
12591                                    SvCUR(ret->mother_re)+1));
12592             SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
12593         }
12594         ret->mother_re      = NULL;
12595     }
12596     ret->gofs = 0;
12597 }
12598 #endif /* PERL_IN_XSUB_RE */
12599
12600 /*
12601    regdupe_internal()
12602
12603    This is the internal complement to regdupe() which is used to copy
12604    the structure pointed to by the *pprivate pointer in the regexp.
12605    This is the core version of the extension overridable cloning hook.
12606    The regexp structure being duplicated will be copied by perl prior
12607    to this and will be provided as the regexp *r argument, however
12608    with the /old/ structures pprivate pointer value. Thus this routine
12609    may override any copying normally done by perl.
12610
12611    It returns a pointer to the new regexp_internal structure.
12612 */
12613
12614 void *
12615 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
12616 {
12617     dVAR;
12618     struct regexp *const r = (struct regexp *)SvANY(rx);
12619     regexp_internal *reti;
12620     int len;
12621     RXi_GET_DECL(r,ri);
12622
12623     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
12624
12625     len = ProgLen(ri);
12626
12627     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
12628     Copy(ri->program, reti->program, len+1, regnode);
12629
12630
12631     reti->regstclass = NULL;
12632
12633     if (ri->data) {
12634         struct reg_data *d;
12635         const int count = ri->data->count;
12636         int i;
12637
12638         Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
12639                 char, struct reg_data);
12640         Newx(d->what, count, U8);
12641
12642         d->count = count;
12643         for (i = 0; i < count; i++) {
12644             d->what[i] = ri->data->what[i];
12645             switch (d->what[i]) {
12646                 /* legal options are one of: sSfpontTua
12647                    see also regcomp.h and pregfree() */
12648             case 'a': /* actually an AV, but the dup function is identical.  */
12649             case 's':
12650             case 'S':
12651             case 'p': /* actually an AV, but the dup function is identical.  */
12652             case 'u': /* actually an HV, but the dup function is identical.  */
12653                 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
12654                 break;
12655             case 'f':
12656                 /* This is cheating. */
12657                 Newx(d->data[i], 1, struct regnode_charclass_class);
12658                 StructCopy(ri->data->data[i], d->data[i],
12659                             struct regnode_charclass_class);
12660                 reti->regstclass = (regnode*)d->data[i];
12661                 break;
12662             case 'o':
12663                 /* Compiled op trees are readonly and in shared memory,
12664                    and can thus be shared without duplication. */
12665                 OP_REFCNT_LOCK;
12666                 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
12667                 OP_REFCNT_UNLOCK;
12668                 break;
12669             case 'T':
12670                 /* Trie stclasses are readonly and can thus be shared
12671                  * without duplication. We free the stclass in pregfree
12672                  * when the corresponding reg_ac_data struct is freed.
12673                  */
12674                 reti->regstclass= ri->regstclass;
12675                 /* Fall through */
12676             case 't':
12677                 OP_REFCNT_LOCK;
12678                 ((reg_trie_data*)ri->data->data[i])->refcount++;
12679                 OP_REFCNT_UNLOCK;
12680                 /* Fall through */
12681             case 'n':
12682                 d->data[i] = ri->data->data[i];
12683                 break;
12684             default:
12685                 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
12686             }
12687         }
12688
12689         reti->data = d;
12690     }
12691     else
12692         reti->data = NULL;
12693
12694     reti->name_list_idx = ri->name_list_idx;
12695
12696 #ifdef RE_TRACK_PATTERN_OFFSETS
12697     if (ri->u.offsets) {
12698         Newx(reti->u.offsets, 2*len+1, U32);
12699         Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
12700     }
12701 #else
12702     SetProgLen(reti,len);
12703 #endif
12704
12705     return (void*)reti;
12706 }
12707
12708 #endif    /* USE_ITHREADS */
12709
12710 #ifndef PERL_IN_XSUB_RE
12711
12712 /*
12713  - regnext - dig the "next" pointer out of a node
12714  */
12715 regnode *
12716 Perl_regnext(pTHX_ register regnode *p)
12717 {
12718     dVAR;
12719     register I32 offset;
12720
12721     if (!p)
12722         return(NULL);
12723
12724     if (OP(p) > REGNODE_MAX) {          /* regnode.type is unsigned */
12725         Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
12726     }
12727
12728     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
12729     if (offset == 0)
12730         return(NULL);
12731
12732     return(p+offset);
12733 }
12734 #endif
12735
12736 STATIC void
12737 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
12738 {
12739     va_list args;
12740     STRLEN l1 = strlen(pat1);
12741     STRLEN l2 = strlen(pat2);
12742     char buf[512];
12743     SV *msv;
12744     const char *message;
12745
12746     PERL_ARGS_ASSERT_RE_CROAK2;
12747
12748     if (l1 > 510)
12749         l1 = 510;
12750     if (l1 + l2 > 510)
12751         l2 = 510 - l1;
12752     Copy(pat1, buf, l1 , char);
12753     Copy(pat2, buf + l1, l2 , char);
12754     buf[l1 + l2] = '\n';
12755     buf[l1 + l2 + 1] = '\0';
12756 #ifdef I_STDARG
12757     /* ANSI variant takes additional second argument */
12758     va_start(args, pat2);
12759 #else
12760     va_start(args);
12761 #endif
12762     msv = vmess(buf, &args);
12763     va_end(args);
12764     message = SvPV_const(msv,l1);
12765     if (l1 > 512)
12766         l1 = 512;
12767     Copy(message, buf, l1 , char);
12768     buf[l1-1] = '\0';                   /* Overwrite \n */
12769     Perl_croak(aTHX_ "%s", buf);
12770 }
12771
12772 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
12773
12774 #ifndef PERL_IN_XSUB_RE
12775 void
12776 Perl_save_re_context(pTHX)
12777 {
12778     dVAR;
12779
12780     struct re_save_state *state;
12781
12782     SAVEVPTR(PL_curcop);
12783     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
12784
12785     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
12786     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
12787     SSPUSHUV(SAVEt_RE_STATE);
12788
12789     Copy(&PL_reg_state, state, 1, struct re_save_state);
12790
12791     PL_reg_start_tmp = 0;
12792     PL_reg_start_tmpl = 0;
12793     PL_reg_oldsaved = NULL;
12794     PL_reg_oldsavedlen = 0;
12795     PL_reg_maxiter = 0;
12796     PL_reg_leftiter = 0;
12797     PL_reg_poscache = NULL;
12798     PL_reg_poscache_size = 0;
12799 #ifdef PERL_OLD_COPY_ON_WRITE
12800     PL_nrs = NULL;
12801 #endif
12802
12803     /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
12804     if (PL_curpm) {
12805         const REGEXP * const rx = PM_GETRE(PL_curpm);
12806         if (rx) {
12807             U32 i;
12808             for (i = 1; i <= RX_NPARENS(rx); i++) {
12809                 char digits[TYPE_CHARS(long)];
12810                 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
12811                 GV *const *const gvp
12812                     = (GV**)hv_fetch(PL_defstash, digits, len, 0);
12813
12814                 if (gvp) {
12815                     GV * const gv = *gvp;
12816                     if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
12817                         save_scalar(gv);
12818                 }
12819             }
12820         }
12821     }
12822 }
12823 #endif
12824
12825 static void
12826 clear_re(pTHX_ void *r)
12827 {
12828     dVAR;
12829     ReREFCNT_dec((REGEXP *)r);
12830 }
12831
12832 #ifdef DEBUGGING
12833
12834 STATIC void
12835 S_put_byte(pTHX_ SV *sv, int c)
12836 {
12837     PERL_ARGS_ASSERT_PUT_BYTE;
12838
12839     /* Our definition of isPRINT() ignores locales, so only bytes that are
12840        not part of UTF-8 are considered printable. I assume that the same
12841        holds for UTF-EBCDIC.
12842        Also, code point 255 is not printable in either (it's E0 in EBCDIC,
12843        which Wikipedia says:
12844
12845        EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
12846        ones (binary 1111 1111, hexadecimal FF). It is similar, but not
12847        identical, to the ASCII delete (DEL) or rubout control character.
12848        ) So the old condition can be simplified to !isPRINT(c)  */
12849     if (!isPRINT(c)) {
12850         if (c < 256) {
12851             Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
12852         }
12853         else {
12854             Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
12855         }
12856     }
12857     else {
12858         const char string = c;
12859         if (c == '-' || c == ']' || c == '\\' || c == '^')
12860             sv_catpvs(sv, "\\");
12861         sv_catpvn(sv, &string, 1);
12862     }
12863 }
12864
12865
12866 #define CLEAR_OPTSTART \
12867     if (optstart) STMT_START { \
12868             DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
12869             optstart=NULL; \
12870     } STMT_END
12871
12872 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
12873
12874 STATIC const regnode *
12875 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
12876             const regnode *last, const regnode *plast,
12877             SV* sv, I32 indent, U32 depth)
12878 {
12879     dVAR;
12880     register U8 op = PSEUDO;    /* Arbitrary non-END op. */
12881     register const regnode *next;
12882     const regnode *optstart= NULL;
12883
12884     RXi_GET_DECL(r,ri);
12885     GET_RE_DEBUG_FLAGS_DECL;
12886
12887     PERL_ARGS_ASSERT_DUMPUNTIL;
12888
12889 #ifdef DEBUG_DUMPUNTIL
12890     PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
12891         last ? last-start : 0,plast ? plast-start : 0);
12892 #endif
12893
12894     if (plast && plast < last)
12895         last= plast;
12896
12897     while (PL_regkind[op] != END && (!last || node < last)) {
12898         /* While that wasn't END last time... */
12899         NODE_ALIGN(node);
12900         op = OP(node);
12901         if (op == CLOSE || op == WHILEM)
12902             indent--;
12903         next = regnext((regnode *)node);
12904
12905         /* Where, what. */
12906         if (OP(node) == OPTIMIZED) {
12907             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
12908                 optstart = node;
12909             else
12910                 goto after_print;
12911         } else
12912             CLEAR_OPTSTART;
12913
12914         regprop(r, sv, node);
12915         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
12916                       (int)(2*indent + 1), "", SvPVX_const(sv));
12917
12918         if (OP(node) != OPTIMIZED) {
12919             if (next == NULL)           /* Next ptr. */
12920                 PerlIO_printf(Perl_debug_log, " (0)");
12921             else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
12922                 PerlIO_printf(Perl_debug_log, " (FAIL)");
12923             else
12924                 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
12925             (void)PerlIO_putc(Perl_debug_log, '\n');
12926         }
12927
12928       after_print:
12929         if (PL_regkind[(U8)op] == BRANCHJ) {
12930             assert(next);
12931             {
12932                 register const regnode *nnode = (OP(next) == LONGJMP
12933                                              ? regnext((regnode *)next)
12934                                              : next);
12935                 if (last && nnode > last)
12936                     nnode = last;
12937                 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
12938             }
12939         }
12940         else if (PL_regkind[(U8)op] == BRANCH) {
12941             assert(next);
12942             DUMPUNTIL(NEXTOPER(node), next);
12943         }
12944         else if ( PL_regkind[(U8)op]  == TRIE ) {
12945             const regnode *this_trie = node;
12946             const char op = OP(node);
12947             const U32 n = ARG(node);
12948             const reg_ac_data * const ac = op>=AHOCORASICK ?
12949                (reg_ac_data *)ri->data->data[n] :
12950                NULL;
12951             const reg_trie_data * const trie =
12952                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
12953 #ifdef DEBUGGING
12954             AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
12955 #endif
12956             const regnode *nextbranch= NULL;
12957             I32 word_idx;
12958             sv_setpvs(sv, "");
12959             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
12960                 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
12961
12962                 PerlIO_printf(Perl_debug_log, "%*s%s ",
12963                    (int)(2*(indent+3)), "",
12964                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
12965                             PL_colors[0], PL_colors[1],
12966                             (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
12967                             PERL_PV_PRETTY_ELLIPSES    |
12968                             PERL_PV_PRETTY_LTGT
12969                             )
12970                             : "???"
12971                 );
12972                 if (trie->jump) {
12973                     U16 dist= trie->jump[word_idx+1];
12974                     PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
12975                                   (UV)((dist ? this_trie + dist : next) - start));
12976                     if (dist) {
12977                         if (!nextbranch)
12978                             nextbranch= this_trie + trie->jump[0];
12979                         DUMPUNTIL(this_trie + dist, nextbranch);
12980                     }
12981                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
12982                         nextbranch= regnext((regnode *)nextbranch);
12983                 } else {
12984                     PerlIO_printf(Perl_debug_log, "\n");
12985                 }
12986             }
12987             if (last && next > last)
12988                 node= last;
12989             else
12990                 node= next;
12991         }
12992         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
12993             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
12994                     NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
12995         }
12996         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
12997             assert(next);
12998             DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
12999         }
13000         else if ( op == PLUS || op == STAR) {
13001             DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
13002         }
13003         else if (PL_regkind[(U8)op] == ANYOF) {
13004             /* arglen 1 + class block */
13005             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
13006                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
13007             node = NEXTOPER(node);
13008         }
13009         else if (PL_regkind[(U8)op] == EXACT) {
13010             /* Literal string, where present. */
13011             node += NODE_SZ_STR(node) - 1;
13012             node = NEXTOPER(node);
13013         }
13014         else {
13015             node = NEXTOPER(node);
13016             node += regarglen[(U8)op];
13017         }
13018         if (op == CURLYX || op == OPEN)
13019             indent++;
13020     }
13021     CLEAR_OPTSTART;
13022 #ifdef DEBUG_DUMPUNTIL
13023     PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
13024 #endif
13025     return node;
13026 }
13027
13028 #endif  /* DEBUGGING */
13029
13030 /*
13031  * Local variables:
13032  * c-indentation-style: bsd
13033  * c-basic-offset: 4
13034  * indent-tabs-mode: t
13035  * End:
13036  *
13037  * ex: set ts=8 sts=4 sw=4 noet:
13038  */