src/5021003/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  * Copyright (c) 1986 by University of Toronto.
  42  * Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  * Permission is granted to anyone to use this software for any
  45  * purpose on any computer system, and to redistribute it freely,
  46  * subject to the following restrictions:
  47  *
  48  * 1. The author is not responsible for the consequences of use of
  49  *  this software, no matter how awful, even if they arise
  50  *  from defects in it.
  51  *
  52  * 2. The origin of this software must not be misrepresented, either
  53  *  by explicit claim or by omission.
  54  *
  55  * 3. Altered versions must be plainly marked as such, and must not
  56  *  be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  95  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  97  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 100
 101 #ifndef STATIC
 102 #define STATIC static
 103 #endif
 104
 105
 106 struct RExC_state_t {
 107  U32  flags;   /* RXf_* are we folding, multilining? */
 108  U32  pm_flags;  /* PMf_* stuff from the calling PMOP */
 109  char *precomp;  /* uncompiled string. */
 110  REGEXP *rx_sv;   /* The SV that is the regexp. */
 111  regexp *rx;                    /* perl core regexp structure */
 112  regexp_internal *rxi;           /* internal data for regexp object
 113           pprivate field */
 114  char *start;   /* Start of input for compile */
 115  char *end;   /* End of input for compile */
 116  char *parse;   /* Input-scan pointer. */
 117  SSize_t whilem_seen;  /* number of WHILEM in this expr */
 118  regnode *emit_start;  /* Start of emitted-code area */
 119  regnode *emit_bound;  /* First regnode outside of the
 120           allocated space */
 121  regnode *emit;   /* Code-emit pointer; if = &emit_dummy,
 122           implies compiling, so don't emit */
 123  regnode_ssc emit_dummy;  /* placeholder for emit to point to;
 124           large enough for the largest
 125           non-EXACTish node, so can use it as
 126           scratch in pass1 */
 127  I32  naughty;  /* How bad is this pattern? */
 128  I32  sawback;  /* Did we see \1, ...? */
 129  U32  seen;
 130  SSize_t size;   /* Code size. */
 131  I32                npar;            /* Capture buffer count, (OPEN) plus
 132           one. ("par" 0 is the whole
 133           pattern)*/
 134  I32  nestroot;  /* root parens we are in - used by
 135           accept */
 136  I32  extralen;
 137  I32  seen_zerolen;
 138  regnode **open_parens;  /* pointers to open parens */
 139  regnode **close_parens;  /* pointers to close parens */
 140  regnode *opend;   /* END node in program */
 141  I32  utf8;  /* whether the pattern is utf8 or not */
 142  I32  orig_utf8; /* whether the pattern was originally in utf8 */
 143         /* XXX use this for future optimisation of case
 144         * where pattern must be upgraded to utf8. */
 145  I32  uni_semantics; /* If a d charset modifier should use unicode
 146         rules, even if the pattern is not in
 147         utf8 */
 148  HV  *paren_names;  /* Paren names */
 149
 150  regnode **recurse;  /* Recurse regops */
 151  I32  recurse_count;  /* Number of recurse regops */
 152  U8          *study_chunk_recursed;  /* bitmap of which parens we have moved
 153           through */
 154  U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 155  I32  in_lookbehind;
 156  I32  contains_locale;
 157  I32  contains_i;
 158  I32  override_recoding;
 159  I32  in_multi_char_class;
 160  struct reg_code_block *code_blocks; /* positions of literal (?{})
 161            within pattern */
 162  int  num_code_blocks; /* size of code_blocks[] */
 163  int  code_index;  /* next code_blocks[] slot */
 164  SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 165 #ifdef ADD_TO_REGEXEC
 166  char  *starttry;  /* -Dr: where regtry was called. */
 167 #define RExC_starttry (pRExC_state->starttry)
 168 #endif
 169  SV  *runtime_code_qr; /* qr with the runtime code blocks */
 170 #ifdef DEBUGGING
 171  const char  *lastparse;
 172  I32         lastnum;
 173  AV          *paren_name_list;       /* idx -> name */
 174 #define RExC_lastparse (pRExC_state->lastparse)
 175 #define RExC_lastnum (pRExC_state->lastnum)
 176 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 177 #endif
 178 };
 179
 180 #define RExC_flags (pRExC_state->flags)
 181 #define RExC_pm_flags (pRExC_state->pm_flags)
 182 #define RExC_precomp (pRExC_state->precomp)
 183 #define RExC_rx_sv (pRExC_state->rx_sv)
 184 #define RExC_rx  (pRExC_state->rx)
 185 #define RExC_rxi (pRExC_state->rxi)
 186 #define RExC_start (pRExC_state->start)
 187 #define RExC_end (pRExC_state->end)
 188 #define RExC_parse (pRExC_state->parse)
 189 #define RExC_whilem_seen (pRExC_state->whilem_seen)
 190 #ifdef RE_TRACK_PATTERN_OFFSETS
 191 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
 192               others */
 193 #endif
 194 #define RExC_emit (pRExC_state->emit)
 195 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 196 #define RExC_emit_start (pRExC_state->emit_start)
 197 #define RExC_emit_bound (pRExC_state->emit_bound)
 198 #define RExC_naughty (pRExC_state->naughty)
 199 #define RExC_sawback (pRExC_state->sawback)
 200 #define RExC_seen (pRExC_state->seen)
 201 #define RExC_size (pRExC_state->size)
 202 #define RExC_maxlen        (pRExC_state->maxlen)
 203 #define RExC_npar (pRExC_state->npar)
 204 #define RExC_nestroot   (pRExC_state->nestroot)
 205 #define RExC_extralen (pRExC_state->extralen)
 206 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
 207 #define RExC_utf8 (pRExC_state->utf8)
 208 #define RExC_uni_semantics (pRExC_state->uni_semantics)
 209 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
 210 #define RExC_open_parens (pRExC_state->open_parens)
 211 #define RExC_close_parens (pRExC_state->close_parens)
 212 #define RExC_opend (pRExC_state->opend)
 213 #define RExC_paren_names (pRExC_state->paren_names)
 214 #define RExC_recurse (pRExC_state->recurse)
 215 #define RExC_recurse_count (pRExC_state->recurse_count)
 216 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 217 #define RExC_study_chunk_recursed_bytes  \
 218         (pRExC_state->study_chunk_recursed_bytes)
 219 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
 220 #define RExC_contains_locale (pRExC_state->contains_locale)
 221 #define RExC_contains_i (pRExC_state->contains_i)
 222 #define RExC_override_recoding (pRExC_state->override_recoding)
 223 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 224
 225
 226 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
 227 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 228   ((*s) == '{' && regcurly(s)))
 229
 230 /*
 231  * Flags to be passed up and down.
 232  */
 233 #define WORST  0 /* Worst case. */
 234 #define HASWIDTH 0x01 /* Known to match non-null strings. */
 235
 236 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 237  * character.  (There needs to be a case: in the switch statement in regexec.c
 238  * for any node marked SIMPLE.)  Note that this is not the same thing as
 239  * REGNODE_SIMPLE */
 240 #define SIMPLE  0x02
 241 #define SPSTART  0x04 /* Starts with * or + */
 242 #define POSTPONED 0x08    /* (?1),(?&name), (??{...}) or similar */
 243 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
 244 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 245
 246 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 247
 248 /* whether trie related optimizations are enabled */
 249 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 250 #define TRIE_STUDY_OPT
 251 #define FULL_TRIE_STUDY
 252 #define TRIE_STCLASS
 253 #endif
 254
 255
 256
 257 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 258 #define PBITVAL(paren) (1 << ((paren) & 7))
 259 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 260 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 261 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 262
 263 #define REQUIRE_UTF8 STMT_START {                                       \
 264          if (!UTF) {                           \
 265           *flagp = RESTART_UTF8;            \
 266           return NULL;                      \
 267          }                                     \
 268       } STMT_END
 269
 270 /* This converts the named class defined in regcomp.h to its equivalent class
 271  * number defined in handy.h. */
 272 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 273 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 274
 275 #define _invlist_union_complement_2nd(a, b, output) \
 276       _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 277 #define _invlist_intersection_complement_2nd(a, b, output) \
 278     _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 279
 280 /* About scan_data_t.
 281
 282   During optimisation we recurse through the regexp program performing
 283   various inplace (keyhole style) optimisations. In addition study_chunk
 284   and scan_commit populate this data structure with information about
 285   what strings MUST appear in the pattern. We look for the longest
 286   string that must appear at a fixed location, and we look for the
 287   longest string that may appear at a floating location. So for instance
 288   in the pattern:
 289
 290  /FOO[xX]A.*B[xX]BAR/
 291
 292   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 293   strings (because they follow a .* construct). study_chunk will identify
 294   both FOO and BAR as being the longest fixed and floating strings respectively.
 295
 296   The strings can be composites, for instance
 297
 298  /(f)(o)(o)/
 299
 300   will result in a composite fixed substring 'foo'.
 301
 302   For each string some basic information is maintained:
 303
 304   - offset or min_offset
 305  This is the position the string must appear at, or not before.
 306  It also implicitly (when combined with minlenp) tells us how many
 307  characters must match before the string we are searching for.
 308  Likewise when combined with minlenp and the length of the string it
 309  tells us how many characters must appear after the string we have
 310  found.
 311
 312   - max_offset
 313  Only used for floating strings. This is the rightmost point that
 314  the string can appear at. If set to SSize_t_MAX it indicates that the
 315  string can occur infinitely far to the right.
 316
 317   - minlenp
 318  A pointer to the minimum number of characters of the pattern that the
 319  string was found inside. This is important as in the case of positive
 320  lookahead or positive lookbehind we can have multiple patterns
 321  involved. Consider
 322
 323  /(?=FOO).*F/
 324
 325  The minimum length of the pattern overall is 3, the minimum length
 326  of the lookahead part is 3, but the minimum length of the part that
 327  will actually match is 1. So 'FOO's minimum length is 3, but the
 328  minimum length for the F is 1. This is important as the minimum length
 329  is used to determine offsets in front of and behind the string being
 330  looked for.  Since strings can be composites this is the length of the
 331  pattern at the time it was committed with a scan_commit. Note that
 332  the length is calculated by study_chunk, so that the minimum lengths
 333  are not known until the full pattern has been compiled, thus the
 334  pointer to the value.
 335
 336   - lookbehind
 337
 338  In the case of lookbehind the string being searched for can be
 339  offset past the start point of the final matching string.
 340  If this value was just blithely removed from the min_offset it would
 341  invalidate some of the calculations for how many chars must match
 342  before or after (as they are derived from min_offset and minlen and
 343  the length of the string being searched for).
 344  When the final pattern is compiled and the data is moved from the
 345  scan_data_t structure into the regexp structure the information
 346  about lookbehind is factored in, with the information that would
 347  have been lost precalculated in the end_shift field for the
 348  associated string.
 349
 350   The fields pos_min and pos_delta are used to store the minimum offset
 351   and the delta to the maximum offset at the current point in the pattern.
 352
 353 */
 354
 355 typedef struct scan_data_t {
 356  /*I32 len_min;      unused */
 357  /*I32 len_delta;    unused */
 358  SSize_t pos_min;
 359  SSize_t pos_delta;
 360  SV *last_found;
 361  SSize_t last_end;     /* min value, <0 unless valid. */
 362  SSize_t last_start_min;
 363  SSize_t last_start_max;
 364  SV **longest;     /* Either &l_fixed, or &l_float. */
 365  SV *longest_fixed;      /* longest fixed string found in pattern */
 366  SSize_t offset_fixed;   /* offset where it starts */
 367  SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 368  I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 369  SV *longest_float;      /* longest floating string found in pattern */
 370  SSize_t offset_float_min; /* earliest point in string it can appear */
 371  SSize_t offset_float_max; /* latest point in string it can appear */
 372  SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 373  SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 374  I32 flags;
 375  I32 whilem_c;
 376  SSize_t *last_closep;
 377  regnode_ssc *start_class;
 378 } scan_data_t;
 379
 380 /* The below is perhaps overboard, but this allows us to save a test at the
 381  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 382  * and 'a' differ by a single bit; the same with the upper and lower case of
 383  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 384  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 385  * then inverts it to form a mask, with just a single 0, in the bit position
 386  * where the upper- and lowercase differ.  XXX There are about 40 other
 387  * instances in the Perl core where this micro-optimization could be used.
 388  * Should decide if maintenance cost is worse, before changing those
 389  *
 390  * Returns a boolean as to whether or not 'v' is either a lowercase or
 391  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 392  * compile-time constant, the generated code is better than some optimizing
 393  * compilers figure out, amounting to a mask and test.  The results are
 394  * meaningless if 'c' is not one of [A-Za-z] */
 395 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 396        (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 397
 398 /*
 399  * Forward declarations for pregcomp()'s friends.
 400  */
 401
 402 static const scan_data_t zero_scan_data =
 403   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 404
 405 #define SF_BEFORE_EOL  (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 406 #define SF_BEFORE_SEOL  0x0001
 407 #define SF_BEFORE_MEOL  0x0002
 408 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 409 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 410
 411 #define SF_FIX_SHIFT_EOL (+2)
 412 #define SF_FL_SHIFT_EOL  (+4)
 413
 414 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 415 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 416
 417 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 418 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 419 #define SF_IS_INF  0x0040
 420 #define SF_HAS_PAR  0x0080
 421 #define SF_IN_PAR  0x0100
 422 #define SF_HAS_EVAL  0x0200
 423 #define SCF_DO_SUBSTR  0x0400
 424 #define SCF_DO_STCLASS_AND 0x0800
 425 #define SCF_DO_STCLASS_OR 0x1000
 426 #define SCF_DO_STCLASS  (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 427 #define SCF_WHILEM_VISITED_POS 0x2000
 428
 429 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 430 #define SCF_SEEN_ACCEPT         0x8000
 431 #define SCF_TRIE_DOING_RESTUDY 0x10000
 432
 433 #define UTF cBOOL(RExC_utf8)
 434
 435 /* The enums for all these are ordered so things work out correctly */
 436 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 437 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 438              == REGEX_DEPENDS_CHARSET)
 439 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 440 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 441              >= REGEX_UNICODE_CHARSET)
 442 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 443            == REGEX_ASCII_RESTRICTED_CHARSET)
 444 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 445            >= REGEX_ASCII_RESTRICTED_CHARSET)
 446 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 447           == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 448
 449 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 450
 451 /* For programs that want to be strictly Unicode compatible by dying if any
 452  * attempt is made to match a non-Unicode code point against a Unicode
 453  * property.  */
 454 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 455
 456 #define OOB_NAMEDCLASS  -1
 457
 458 /* There is no code point that is out-of-bounds, so this is problematic.  But
 459  * its only current use is to initialize a variable that is always set before
 460  * looked at. */
 461 #define OOB_UNICODE  0xDEADBEEF
 462
 463 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 464 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 465
 466
 467 /* length of regex to show in messages that don't mark a position within */
 468 #define RegexLengthToShowInErrorMessages 127
 469
 470 /*
 471  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 472  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 473  * op/pragma/warn/regcomp.
 474  */
 475 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 476 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 477
 478 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 479       " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 480
 481 #define REPORT_LOCATION_ARGS(offset)            \
 482     UTF8fARG(UTF, offset, RExC_precomp), \
 483     UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 484
 485 /*
 486  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 487  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 488  * "...".
 489  */
 490 #define _FAIL(code) STMT_START {     \
 491  const char *ellipses = "";      \
 492  IV len = RExC_end - RExC_precomp;     \
 493                   \
 494  if (!SIZE_ONLY)       \
 495   SAVEFREESV(RExC_rx_sv);      \
 496  if (len > RegexLengthToShowInErrorMessages) {   \
 497   /* chop 10 shorter than the max, to ensure meaning of "..." */ \
 498   len = RegexLengthToShowInErrorMessages - 10;   \
 499   ellipses = "...";      \
 500  }         \
 501  code;                                                               \
 502 } STMT_END
 503
 504 #define FAIL(msg) _FAIL(       \
 505  Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",     \
 506    msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 507
 508 #define FAIL2(msg,arg) _FAIL(       \
 509  Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",     \
 510    arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 511
 512 /*
 513  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 514  */
 515 #define Simple_vFAIL(m) STMT_START {     \
 516  const IV offset = RExC_parse - RExC_precomp;   \
 517  Perl_croak(aTHX_ "%s" REPORT_LOCATION,    \
 518    m, REPORT_LOCATION_ARGS(offset)); \
 519 } STMT_END
 520
 521 /*
 522  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 523  */
 524 #define vFAIL(m) STMT_START {    \
 525  if (!SIZE_ONLY)     \
 526   SAVEFREESV(RExC_rx_sv);    \
 527  Simple_vFAIL(m);     \
 528 } STMT_END
 529
 530 /*
 531  * Like Simple_vFAIL(), but accepts two arguments.
 532  */
 533 #define Simple_vFAIL2(m,a1) STMT_START {   \
 534  const IV offset = RExC_parse - RExC_precomp;   \
 535  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,   \
 536      REPORT_LOCATION_ARGS(offset)); \
 537 } STMT_END
 538
 539 /*
 540  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 541  */
 542 #define vFAIL2(m,a1) STMT_START {   \
 543  if (!SIZE_ONLY)     \
 544   SAVEFREESV(RExC_rx_sv);    \
 545  Simple_vFAIL2(m, a1);    \
 546 } STMT_END
 547
 548
 549 /*
 550  * Like Simple_vFAIL(), but accepts three arguments.
 551  */
 552 #define Simple_vFAIL3(m, a1, a2) STMT_START {   \
 553  const IV offset = RExC_parse - RExC_precomp;  \
 554  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
 555    REPORT_LOCATION_ARGS(offset)); \
 556 } STMT_END
 557
 558 /*
 559  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 560  */
 561 #define vFAIL3(m,a1,a2) STMT_START {   \
 562  if (!SIZE_ONLY)     \
 563   SAVEFREESV(RExC_rx_sv);    \
 564  Simple_vFAIL3(m, a1, a2);    \
 565 } STMT_END
 566
 567 /*
 568  * Like Simple_vFAIL(), but accepts four arguments.
 569  */
 570 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {  \
 571  const IV offset = RExC_parse - RExC_precomp;  \
 572  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,  \
 573    REPORT_LOCATION_ARGS(offset)); \
 574 } STMT_END
 575
 576 #define vFAIL4(m,a1,a2,a3) STMT_START {   \
 577  if (!SIZE_ONLY)     \
 578   SAVEFREESV(RExC_rx_sv);    \
 579  Simple_vFAIL4(m, a1, a2, a3);   \
 580 } STMT_END
 581
 582 /* A specialized version of vFAIL2 that works with UTF8f */
 583 #define vFAIL2utf8f(m, a1) STMT_START { \
 584  const IV offset = RExC_parse - RExC_precomp;   \
 585  if (!SIZE_ONLY)                                \
 586   SAVEFREESV(RExC_rx_sv);                    \
 587  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 588    REPORT_LOCATION_ARGS(offset));         \
 589 } STMT_END
 590
 591
 592 /* m is not necessarily a "literal string", in this macro */
 593 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 594  const IV offset = loc - RExC_precomp;                               \
 595  Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 596    m, REPORT_LOCATION_ARGS(offset));       \
 597 } STMT_END
 598
 599 #define ckWARNreg(loc,m) STMT_START {     \
 600  const IV offset = loc - RExC_precomp;    \
 601  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 602    REPORT_LOCATION_ARGS(offset));  \
 603 } STMT_END
 604
 605 #define vWARN_dep(loc, m) STMT_START {            \
 606  const IV offset = loc - RExC_precomp;    \
 607  Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
 608    REPORT_LOCATION_ARGS(offset));         \
 609 } STMT_END
 610
 611 #define ckWARNdep(loc,m) STMT_START {            \
 612  const IV offset = loc - RExC_precomp;    \
 613  Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                 \
 614    m REPORT_LOCATION,      \
 615    REPORT_LOCATION_ARGS(offset));  \
 616 } STMT_END
 617
 618 #define ckWARNregdep(loc,m) STMT_START {    \
 619  const IV offset = loc - RExC_precomp;    \
 620  Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 621    m REPORT_LOCATION,      \
 622    REPORT_LOCATION_ARGS(offset));  \
 623 } STMT_END
 624
 625 #define ckWARN2reg_d(loc,m, a1) STMT_START {    \
 626  const IV offset = loc - RExC_precomp;    \
 627  Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),   \
 628    m REPORT_LOCATION,      \
 629    a1, REPORT_LOCATION_ARGS(offset)); \
 630 } STMT_END
 631
 632 #define ckWARN2reg(loc, m, a1) STMT_START {    \
 633  const IV offset = loc - RExC_precomp;    \
 634  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 635    a1, REPORT_LOCATION_ARGS(offset)); \
 636 } STMT_END
 637
 638 #define vWARN3(loc, m, a1, a2) STMT_START {    \
 639  const IV offset = loc - RExC_precomp;    \
 640  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 641    a1, a2, REPORT_LOCATION_ARGS(offset)); \
 642 } STMT_END
 643
 644 #define ckWARN3reg(loc, m, a1, a2) STMT_START {    \
 645  const IV offset = loc - RExC_precomp;    \
 646  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 647    a1, a2, REPORT_LOCATION_ARGS(offset)); \
 648 } STMT_END
 649
 650 #define vWARN4(loc, m, a1, a2, a3) STMT_START {    \
 651  const IV offset = loc - RExC_precomp;    \
 652  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 653    a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 654 } STMT_END
 655
 656 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {   \
 657  const IV offset = loc - RExC_precomp;    \
 658  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 659    a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 660 } STMT_END
 661
 662 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {   \
 663  const IV offset = loc - RExC_precomp;    \
 664  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 665    a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 666 } STMT_END
 667
 668
 669 /* Allow for side effects in s */
 670 #define REGC(c,s) STMT_START {   \
 671  if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 672 } STMT_END
 673
 674 /* Macros for recording node offsets.   20001227 mjd@plover.com
 675  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 676  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 677  * Element 0 holds the number n.
 678  * Position is 1 indexed.
 679  */
 680 #ifndef RE_TRACK_PATTERN_OFFSETS
 681 #define Set_Node_Offset_To_R(node,byte)
 682 #define Set_Node_Offset(node,byte)
 683 #define Set_Cur_Node_Offset
 684 #define Set_Node_Length_To_R(node,len)
 685 #define Set_Node_Length(node,len)
 686 #define Set_Node_Cur_Length(node,start)
 687 #define Node_Offset(n)
 688 #define Node_Length(n)
 689 #define Set_Node_Offset_Length(node,offset,len)
 690 #define ProgLen(ri) ri->u.proglen
 691 #define SetProgLen(ri,x) ri->u.proglen = x
 692 #else
 693 #define ProgLen(ri) ri->u.offsets[0]
 694 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 695 #define Set_Node_Offset_To_R(node,byte) STMT_START {   \
 696  if (! SIZE_ONLY) {       \
 697   MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",  \
 698      __LINE__, (int)(node), (int)(byte)));  \
 699   if((node) < 0) {      \
 700    Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 701           (int)(node));                  \
 702   } else {       \
 703    RExC_offsets[2*(node)-1] = (byte);    \
 704   }        \
 705  }         \
 706 } STMT_END
 707
 708 #define Set_Node_Offset(node,byte) \
 709  Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 710 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 711
 712 #define Set_Node_Length_To_R(node,len) STMT_START {   \
 713  if (! SIZE_ONLY) {       \
 714   MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",  \
 715     __LINE__, (int)(node), (int)(len)));   \
 716   if((node) < 0) {      \
 717    Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 718           (int)(node));                  \
 719   } else {       \
 720    RExC_offsets[2*(node)] = (len);    \
 721   }        \
 722  }         \
 723 } STMT_END
 724
 725 #define Set_Node_Length(node,len) \
 726  Set_Node_Length_To_R((node)-RExC_emit_start, len)
 727 #define Set_Node_Cur_Length(node, start)                \
 728  Set_Node_Length(node, RExC_parse - start)
 729
 730 /* Get offsets and lengths */
 731 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 732 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 733
 734 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
 735  Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
 736  Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
 737 } STMT_END
 738 #endif
 739
 740 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 741 #define EXPERIMENTAL_INPLACESCAN
 742 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 743
 744 #define DEBUG_RExC_seen() \
 745   DEBUG_OPTIMISE_MORE_r({                                             \
 746    PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 747                    \
 748    if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 749     PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 750                    \
 751    if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 752     PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 753                    \
 754    if (RExC_seen & REG_GPOS_SEEN)                                  \
 755     PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 756                    \
 757    if (RExC_seen & REG_CANY_SEEN)                                  \
 758     PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 759                    \
 760    if (RExC_seen & REG_RECURSE_SEEN)                               \
 761     PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 762                    \
 763    if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 764     PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 765                    \
 766    if (RExC_seen & REG_VERBARG_SEEN)                               \
 767     PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 768                    \
 769    if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 770     PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 771                    \
 772    if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 773     PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 774                    \
 775    if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 776     PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 777                    \
 778    if (RExC_seen & REG_GOSTART_SEEN)                               \
 779     PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 780                    \
 781    if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 782     PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 783                    \
 784    PerlIO_printf(Perl_debug_log,"\n");                             \
 785   });
 786
 787 #define DEBUG_STUDYDATA(str,data,depth)                              \
 788 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 789  PerlIO_printf(Perl_debug_log,                                    \
 790   "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 791   " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 792   (int)(depth)*2, "",                                          \
 793   (IV)((data)->pos_min),                                       \
 794   (IV)((data)->pos_delta),                                     \
 795   (UV)((data)->flags),                                         \
 796   (IV)((data)->whilem_c),                                      \
 797   (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 798   is_inf ? "INF " : ""                                         \
 799  );                                                               \
 800  if ((data)->last_found)                                          \
 801   PerlIO_printf(Perl_debug_log,                                \
 802    "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 803    " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 804    SvPVX_const((data)->last_found),                         \
 805    (IV)((data)->last_end),                                  \
 806    (IV)((data)->last_start_min),                            \
 807    (IV)((data)->last_start_max),                            \
 808    ((data)->longest &&                                      \
 809    (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 810    SvPVX_const((data)->longest_fixed),                      \
 811    (IV)((data)->offset_fixed),                              \
 812    ((data)->longest &&                                      \
 813    (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 814    SvPVX_const((data)->longest_float),                      \
 815    (IV)((data)->offset_float_min),                          \
 816    (IV)((data)->offset_float_max)                           \
 817   );                                                           \
 818  PerlIO_printf(Perl_debug_log,"\n");                              \
 819 });
 820
 821 /* Mark that we cannot extend a found fixed substring at this point.
 822    Update the longest found anchored substring and the longest found
 823    floating substrings if needed. */
 824
 825 STATIC void
 826 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 827      SSize_t *minlenp, int is_inf)
 828 {
 829  const STRLEN l = CHR_SVLEN(data->last_found);
 830  const STRLEN old_l = CHR_SVLEN(*data->longest);
 831  GET_RE_DEBUG_FLAGS_DECL;
 832
 833  PERL_ARGS_ASSERT_SCAN_COMMIT;
 834
 835  if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 836   SvSetMagicSV(*data->longest, data->last_found);
 837   if (*data->longest == data->longest_fixed) {
 838    data->offset_fixed = l ? data->last_start_min : data->pos_min;
 839    if (data->flags & SF_BEFORE_EOL)
 840     data->flags
 841      |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 842    else
 843     data->flags &= ~SF_FIX_BEFORE_EOL;
 844    data->minlen_fixed=minlenp;
 845    data->lookbehind_fixed=0;
 846   }
 847   else { /* *data->longest == data->longest_float */
 848    data->offset_float_min = l ? data->last_start_min : data->pos_min;
 849    data->offset_float_max = (l
 850          ? data->last_start_max
 851          : (data->pos_delta == SSize_t_MAX
 852           ? SSize_t_MAX
 853           : data->pos_min + data->pos_delta));
 854    if (is_inf
 855     || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 856     data->offset_float_max = SSize_t_MAX;
 857    if (data->flags & SF_BEFORE_EOL)
 858     data->flags
 859      |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 860    else
 861     data->flags &= ~SF_FL_BEFORE_EOL;
 862    data->minlen_float=minlenp;
 863    data->lookbehind_float=0;
 864   }
 865  }
 866  SvCUR_set(data->last_found, 0);
 867  {
 868   SV * const sv = data->last_found;
 869   if (SvUTF8(sv) && SvMAGICAL(sv)) {
 870    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 871    if (mg)
 872     mg->mg_len = 0;
 873   }
 874  }
 875  data->last_end = -1;
 876  data->flags &= ~SF_BEFORE_EOL;
 877  DEBUG_STUDYDATA("commit: ",data,0);
 878 }
 879
 880 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 881  * list that describes which code points it matches */
 882
 883 STATIC void
 884 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 885 {
 886  /* Set the SSC 'ssc' to match an empty string or any code point */
 887
 888  PERL_ARGS_ASSERT_SSC_ANYTHING;
 889
 890  assert(is_ANYOF_SYNTHETIC(ssc));
 891
 892  ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 893  _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 894  ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING;    /* Plus match empty string */
 895 }
 896
 897 STATIC int
 898 S_ssc_is_anything(const regnode_ssc *ssc)
 899 {
 900  /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 901  * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 902  * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 903  * in any way, so there's no point in using it */
 904
 905  UV start, end;
 906  bool ret;
 907
 908  PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 909
 910  assert(is_ANYOF_SYNTHETIC(ssc));
 911
 912  if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
 913   return FALSE;
 914  }
 915
 916  /* See if the list consists solely of the range 0 - Infinity */
 917  invlist_iterinit(ssc->invlist);
 918  ret = invlist_iternext(ssc->invlist, &start, &end)
 919   && start == 0
 920   && end == UV_MAX;
 921
 922  invlist_iterfinish(ssc->invlist);
 923
 924  if (ret) {
 925   return TRUE;
 926  }
 927
 928  /* If e.g., both \w and \W are set, matches everything */
 929  if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 930   int i;
 931   for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 932    if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 933     return TRUE;
 934    }
 935   }
 936  }
 937
 938  return FALSE;
 939 }
 940
 941 STATIC void
 942 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 943 {
 944  /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 945  * string, any code point, or any posix class under locale */
 946
 947  PERL_ARGS_ASSERT_SSC_INIT;
 948
 949  Zero(ssc, 1, regnode_ssc);
 950  set_ANYOF_SYNTHETIC(ssc);
 951  ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
 952  ssc_anything(ssc);
 953
 954  /* If any portion of the regex is to operate under locale rules,
 955  * initialization includes it.  The reason this isn't done for all regexes
 956  * is that the optimizer was written under the assumption that locale was
 957  * all-or-nothing.  Given the complexity and lack of documentation in the
 958  * optimizer, and that there are inadequate test cases for locale, many
 959  * parts of it may not work properly, it is safest to avoid locale unless
 960  * necessary. */
 961  if (RExC_contains_locale) {
 962   ANYOF_POSIXL_SETALL(ssc);
 963  }
 964  else {
 965   ANYOF_POSIXL_ZERO(ssc);
 966  }
 967 }
 968
 969 STATIC int
 970 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
 971       const regnode_ssc *ssc)
 972 {
 973  /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
 974  * to the list of code points matched, and locale posix classes; hence does
 975  * not check its flags) */
 976
 977  UV start, end;
 978  bool ret;
 979
 980  PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
 981
 982  assert(is_ANYOF_SYNTHETIC(ssc));
 983
 984  invlist_iterinit(ssc->invlist);
 985  ret = invlist_iternext(ssc->invlist, &start, &end)
 986   && start == 0
 987   && end == UV_MAX;
 988
 989  invlist_iterfinish(ssc->invlist);
 990
 991  if (! ret) {
 992   return FALSE;
 993  }
 994
 995  if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
 996   return FALSE;
 997  }
 998
 999  return TRUE;
1000 }
1001
1002 STATIC SV*
1003 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1004        const regnode_charclass* const node)
1005 {
1006  /* Returns a mortal inversion list defining which code points are matched
1007  * by 'node', which is of type ANYOF.  Handles complementing the result if
1008  * appropriate.  If some code points aren't knowable at this time, the
1009  * returned list must, and will, contain every code point that is a
1010  * possibility. */
1011
1012  SV* invlist = sv_2mortal(_new_invlist(0));
1013  SV* only_utf8_locale_invlist = NULL;
1014  unsigned int i;
1015  const U32 n = ARG(node);
1016  bool new_node_has_latin1 = FALSE;
1017
1018  PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1019
1020  /* Look at the data structure created by S_set_ANYOF_arg() */
1021  if (n != ANYOF_NONBITMAP_EMPTY) {
1022   SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1023   AV * const av = MUTABLE_AV(SvRV(rv));
1024   SV **const ary = AvARRAY(av);
1025   assert(RExC_rxi->data->what[n] == 's');
1026
1027   if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1028    invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1029   }
1030   else if (ary[0] && ary[0] != &PL_sv_undef) {
1031
1032    /* Here, no compile-time swash, and there are things that won't be
1033    * known until runtime -- we have to assume it could be anything */
1034    return _add_range_to_invlist(invlist, 0, UV_MAX);
1035   }
1036   else if (ary[3] && ary[3] != &PL_sv_undef) {
1037
1038    /* Here no compile-time swash, and no run-time only data.  Use the
1039    * node's inversion list */
1040    invlist = sv_2mortal(invlist_clone(ary[3]));
1041   }
1042
1043   /* Get the code points valid only under UTF-8 locales */
1044   if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1045    && ary[2] && ary[2] != &PL_sv_undef)
1046   {
1047    only_utf8_locale_invlist = ary[2];
1048   }
1049  }
1050
1051  /* An ANYOF node contains a bitmap for the first 256 code points, and an
1052  * inversion list for the others, but if there are code points that should
1053  * match only conditionally on the target string being UTF-8, those are
1054  * placed in the inversion list, and not the bitmap.  Since there are
1055  * circumstances under which they could match, they are included in the
1056  * SSC.  But if the ANYOF node is to be inverted, we have to exclude them
1057  * here, so that when we invert below, the end result actually does include
1058  * them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We have to do this here
1059  * before we add the unconditionally matched code points */
1060  if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1061   _invlist_intersection_complement_2nd(invlist,
1062            PL_UpperLatin1,
1063            &invlist);
1064  }
1065
1066  /* Add in the points from the bit map */
1067  for (i = 0; i < 256; i++) {
1068   if (ANYOF_BITMAP_TEST(node, i)) {
1069    invlist = add_cp_to_invlist(invlist, i);
1070    new_node_has_latin1 = TRUE;
1071   }
1072  }
1073
1074  /* If this can match all upper Latin1 code points, have to add them
1075  * as well */
1076  if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
1077   _invlist_union(invlist, PL_UpperLatin1, &invlist);
1078  }
1079
1080  /* Similarly for these */
1081  if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1082   invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1083  }
1084
1085  if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1086   _invlist_invert(invlist);
1087  }
1088  else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1089
1090   /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1091   * locale.  We can skip this if there are no 0-255 at all. */
1092   _invlist_union(invlist, PL_Latin1, &invlist);
1093  }
1094
1095  /* Similarly add the UTF-8 locale possible matches.  These have to be
1096  * deferred until after the non-UTF-8 locale ones are taken care of just
1097  * above, or it leads to wrong results under ANYOF_INVERT */
1098  if (only_utf8_locale_invlist) {
1099   _invlist_union_maybe_complement_2nd(invlist,
1100            only_utf8_locale_invlist,
1101            ANYOF_FLAGS(node) & ANYOF_INVERT,
1102            &invlist);
1103  }
1104
1105  return invlist;
1106 }
1107
1108 /* These two functions currently do the exact same thing */
1109 #define ssc_init_zero  ssc_init
1110
1111 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1112 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1113
1114 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1115  * should not be inverted.  'and_with->flags & ANYOF_POSIXL' should be 0 if
1116  * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1117
1118 STATIC void
1119 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1120     const regnode_charclass *and_with)
1121 {
1122  /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1123  * another SSC or a regular ANYOF class.  Can create false positives. */
1124
1125  SV* anded_cp_list;
1126  U8  anded_flags;
1127
1128  PERL_ARGS_ASSERT_SSC_AND;
1129
1130  assert(is_ANYOF_SYNTHETIC(ssc));
1131
1132  /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1133  * the code point inversion list and just the relevant flags */
1134  if (is_ANYOF_SYNTHETIC(and_with)) {
1135   anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1136   anded_flags = ANYOF_FLAGS(and_with);
1137
1138   /* XXX This is a kludge around what appears to be deficiencies in the
1139   * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1140   * there are paths through the optimizer where it doesn't get weeded
1141   * out when it should.  And if we don't make some extra provision for
1142   * it like the code just below, it doesn't get added when it should.
1143   * This solution is to add it only when AND'ing, which is here, and
1144   * only when what is being AND'ed is the pristine, original node
1145   * matching anything.  Thus it is like adding it to ssc_anything() but
1146   * only when the result is to be AND'ed.  Probably the same solution
1147   * could be adopted for the same problem we have with /l matching,
1148   * which is solved differently in S_ssc_init(), and that would lead to
1149   * fewer false positives than that solution has.  But if this solution
1150   * creates bugs, the consequences are only that a warning isn't raised
1151   * that should be; while the consequences for having /l bugs is
1152   * incorrect matches */
1153   if (ssc_is_anything((regnode_ssc *)and_with)) {
1154    anded_flags |= ANYOF_WARN_SUPER;
1155   }
1156  }
1157  else {
1158   anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1159   anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1160  }
1161
1162  ANYOF_FLAGS(ssc) &= anded_flags;
1163
1164  /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1165  * C2 is the list of code points in 'and-with'; P2, its posix classes.
1166  * 'and_with' may be inverted.  When not inverted, we have the situation of
1167  * computing:
1168  *  (C1 | P1) & (C2 | P2)
1169  *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1170  *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1171  *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1172  *                    <=  ((C1 & C2) | P1 | P2)
1173  * Alternatively, the last few steps could be:
1174  *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1175  *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1176  *                    <=  (C1 | C2 | (P1 & P2))
1177  * We favor the second approach if either P1 or P2 is non-empty.  This is
1178  * because these components are a barrier to doing optimizations, as what
1179  * they match cannot be known until the moment of matching as they are
1180  * dependent on the current locale, 'AND"ing them likely will reduce or
1181  * eliminate them.
1182  * But we can do better if we know that C1,P1 are in their initial state (a
1183  * frequent occurrence), each matching everything:
1184  *  (<everything>) & (C2 | P2) =  C2 | P2
1185  * Similarly, if C2,P2 are in their initial state (again a frequent
1186  * occurrence), the result is a no-op
1187  *  (C1 | P1) & (<everything>) =  C1 | P1
1188  *
1189  * Inverted, we have
1190  *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1191  *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1192  *                         <=  (C1 & ~C2) | (P1 & ~P2)
1193  * */
1194
1195  if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1196   && ! is_ANYOF_SYNTHETIC(and_with))
1197  {
1198   unsigned int i;
1199
1200   ssc_intersection(ssc,
1201       anded_cp_list,
1202       FALSE /* Has already been inverted */
1203       );
1204
1205   /* If either P1 or P2 is empty, the intersection will be also; can skip
1206   * the loop */
1207   if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1208    ANYOF_POSIXL_ZERO(ssc);
1209   }
1210   else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1211
1212    /* Note that the Posix class component P from 'and_with' actually
1213    * looks like:
1214    *      P = Pa | Pb | ... | Pn
1215    * where each component is one posix class, such as in [\w\s].
1216    * Thus
1217    *      ~P = ~(Pa | Pb | ... | Pn)
1218    *         = ~Pa & ~Pb & ... & ~Pn
1219    *        <= ~Pa | ~Pb | ... | ~Pn
1220    * The last is something we can easily calculate, but unfortunately
1221    * is likely to have many false positives.  We could do better
1222    * in some (but certainly not all) instances if two classes in
1223    * P have known relationships.  For example
1224    *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1225    * So
1226    *      :lower: & :print: = :lower:
1227    * And similarly for classes that must be disjoint.  For example,
1228    * since \s and \w can have no elements in common based on rules in
1229    * the POSIX standard,
1230    *      \w & ^\S = nothing
1231    * Unfortunately, some vendor locales do not meet the Posix
1232    * standard, in particular almost everything by Microsoft.
1233    * The loop below just changes e.g., \w into \W and vice versa */
1234
1235    regnode_charclass_posixl temp;
1236    int add = 1;    /* To calculate the index of the complement */
1237
1238    ANYOF_POSIXL_ZERO(&temp);
1239    for (i = 0; i < ANYOF_MAX; i++) {
1240     assert(i % 2 != 0
1241      || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1242      || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1243
1244     if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1245      ANYOF_POSIXL_SET(&temp, i + add);
1246     }
1247     add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1248    }
1249    ANYOF_POSIXL_AND(&temp, ssc);
1250
1251   } /* else ssc already has no posixes */
1252  } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1253   in its initial state */
1254  else if (! is_ANYOF_SYNTHETIC(and_with)
1255    || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1256  {
1257   /* But if 'ssc' is in its initial state, the result is just 'and_with';
1258   * copy it over 'ssc' */
1259   if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1260    if (is_ANYOF_SYNTHETIC(and_with)) {
1261     StructCopy(and_with, ssc, regnode_ssc);
1262    }
1263    else {
1264     ssc->invlist = anded_cp_list;
1265     ANYOF_POSIXL_ZERO(ssc);
1266     if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1267      ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1268     }
1269    }
1270   }
1271   else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1272     || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1273   {
1274    /* One or the other of P1, P2 is non-empty. */
1275    if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1276     ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1277    }
1278    ssc_union(ssc, anded_cp_list, FALSE);
1279   }
1280   else { /* P1 = P2 = empty */
1281    ssc_intersection(ssc, anded_cp_list, FALSE);
1282   }
1283  }
1284 }
1285
1286 STATIC void
1287 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1288    const regnode_charclass *or_with)
1289 {
1290  /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1291  * another SSC or a regular ANYOF class.  Can create false positives if
1292  * 'or_with' is to be inverted. */
1293
1294  SV* ored_cp_list;
1295  U8 ored_flags;
1296
1297  PERL_ARGS_ASSERT_SSC_OR;
1298
1299  assert(is_ANYOF_SYNTHETIC(ssc));
1300
1301  /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1302  * the code point inversion list and just the relevant flags */
1303  if (is_ANYOF_SYNTHETIC(or_with)) {
1304   ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1305   ored_flags = ANYOF_FLAGS(or_with);
1306  }
1307  else {
1308   ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1309   ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1310  }
1311
1312  ANYOF_FLAGS(ssc) |= ored_flags;
1313
1314  /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1315  * C2 is the list of code points in 'or-with'; P2, its posix classes.
1316  * 'or_with' may be inverted.  When not inverted, we have the simple
1317  * situation of computing:
1318  *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1319  * If P1|P2 yields a situation with both a class and its complement are
1320  * set, like having both \w and \W, this matches all code points, and we
1321  * can delete these from the P component of the ssc going forward.  XXX We
1322  * might be able to delete all the P components, but I (khw) am not certain
1323  * about this, and it is better to be safe.
1324  *
1325  * Inverted, we have
1326  *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1327  *                         <=  (C1 | P1) | ~C2
1328  *                         <=  (C1 | ~C2) | P1
1329  * (which results in actually simpler code than the non-inverted case)
1330  * */
1331
1332  if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1333   && ! is_ANYOF_SYNTHETIC(or_with))
1334  {
1335   /* We ignore P2, leaving P1 going forward */
1336  }   /* else  Not inverted */
1337  else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
1338   ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1339   if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1340    unsigned int i;
1341    for (i = 0; i < ANYOF_MAX; i += 2) {
1342     if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1343     {
1344      ssc_match_all_cp(ssc);
1345      ANYOF_POSIXL_CLEAR(ssc, i);
1346      ANYOF_POSIXL_CLEAR(ssc, i+1);
1347     }
1348    }
1349   }
1350  }
1351
1352  ssc_union(ssc,
1353    ored_cp_list,
1354    FALSE /* Already has been inverted */
1355    );
1356 }
1357
1358 PERL_STATIC_INLINE void
1359 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1360 {
1361  PERL_ARGS_ASSERT_SSC_UNION;
1362
1363  assert(is_ANYOF_SYNTHETIC(ssc));
1364
1365  _invlist_union_maybe_complement_2nd(ssc->invlist,
1366           invlist,
1367           invert2nd,
1368           &ssc->invlist);
1369 }
1370
1371 PERL_STATIC_INLINE void
1372 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1373       SV* const invlist,
1374       const bool invert2nd)
1375 {
1376  PERL_ARGS_ASSERT_SSC_INTERSECTION;
1377
1378  assert(is_ANYOF_SYNTHETIC(ssc));
1379
1380  _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1381            invlist,
1382            invert2nd,
1383            &ssc->invlist);
1384 }
1385
1386 PERL_STATIC_INLINE void
1387 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1388 {
1389  PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1390
1391  assert(is_ANYOF_SYNTHETIC(ssc));
1392
1393  ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1394 }
1395
1396 PERL_STATIC_INLINE void
1397 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1398 {
1399  /* AND just the single code point 'cp' into the SSC 'ssc' */
1400
1401  SV* cp_list = _new_invlist(2);
1402
1403  PERL_ARGS_ASSERT_SSC_CP_AND;
1404
1405  assert(is_ANYOF_SYNTHETIC(ssc));
1406
1407  cp_list = add_cp_to_invlist(cp_list, cp);
1408  ssc_intersection(ssc, cp_list,
1409      FALSE /* Not inverted */
1410      );
1411  SvREFCNT_dec_NN(cp_list);
1412 }
1413
1414 PERL_STATIC_INLINE void
1415 S_ssc_clear_locale(regnode_ssc *ssc)
1416 {
1417  /* Set the SSC 'ssc' to not match any locale things */
1418  PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1419
1420  assert(is_ANYOF_SYNTHETIC(ssc));
1421
1422  ANYOF_POSIXL_ZERO(ssc);
1423  ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1424 }
1425
1426 STATIC void
1427 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1428 {
1429  /* The inversion list in the SSC is marked mortal; now we need a more
1430  * permanent copy, which is stored the same way that is done in a regular
1431  * ANYOF node, with the first 256 code points in a bit map */
1432
1433  SV* invlist = invlist_clone(ssc->invlist);
1434
1435  PERL_ARGS_ASSERT_SSC_FINALIZE;
1436
1437  assert(is_ANYOF_SYNTHETIC(ssc));
1438
1439  /* The code in this file assumes that all but these flags aren't relevant
1440  * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1441  * time we reach here */
1442  assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1443
1444  populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1445
1446  set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1447         NULL, NULL, NULL, FALSE);
1448
1449  /* Make sure is clone-safe */
1450  ssc->invlist = NULL;
1451
1452  if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1453   ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
1454  }
1455
1456  assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1457 }
1458
1459 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1460 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1461 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1462 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1463        ? (TRIE_LIST_CUR( idx ) - 1)           \
1464        : 0 )
1465
1466
1467 #ifdef DEBUGGING
1468 /*
1469    dump_trie(trie,widecharmap,revcharmap)
1470    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1471    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1472
1473    These routines dump out a trie in a somewhat readable format.
1474    The _interim_ variants are used for debugging the interim
1475    tables that are used to generate the final compressed
1476    representation which is what dump_trie expects.
1477
1478    Part of the reason for their existence is to provide a form
1479    of documentation as to how the different representations function.
1480
1481 */
1482
1483 /*
1484   Dumps the final compressed table form of the trie to Perl_debug_log.
1485   Used for debugging make_trie().
1486 */
1487
1488 STATIC void
1489 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1490    AV *revcharmap, U32 depth)
1491 {
1492  U32 state;
1493  SV *sv=sv_newmortal();
1494  int colwidth= widecharmap ? 6 : 4;
1495  U16 word;
1496  GET_RE_DEBUG_FLAGS_DECL;
1497
1498  PERL_ARGS_ASSERT_DUMP_TRIE;
1499
1500  PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1501   (int)depth * 2 + 2,"",
1502   "Match","Base","Ofs" );
1503
1504  for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1505   SV ** const tmp = av_fetch( revcharmap, state, 0);
1506   if ( tmp ) {
1507    PerlIO_printf( Perl_debug_log, "%*s",
1508     colwidth,
1509     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1510        PL_colors[0], PL_colors[1],
1511        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1512        PERL_PV_ESCAPE_FIRSTCHAR
1513     )
1514    );
1515   }
1516  }
1517  PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1518   (int)depth * 2 + 2,"");
1519
1520  for( state = 0 ; state < trie->uniquecharcount ; state++ )
1521   PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1522  PerlIO_printf( Perl_debug_log, "\n");
1523
1524  for( state = 1 ; state < trie->statecount ; state++ ) {
1525   const U32 base = trie->states[ state ].trans.base;
1526
1527   PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1528          (int)depth * 2 + 2,"", (UV)state);
1529
1530   if ( trie->states[ state ].wordnum ) {
1531    PerlIO_printf( Perl_debug_log, " W%4X",
1532           trie->states[ state ].wordnum );
1533   } else {
1534    PerlIO_printf( Perl_debug_log, "%6s", "" );
1535   }
1536
1537   PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1538
1539   if ( base ) {
1540    U32 ofs = 0;
1541
1542    while( ( base + ofs  < trie->uniquecharcount ) ||
1543     ( base + ofs - trie->uniquecharcount < trie->lasttrans
1544      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1545                  != state))
1546      ofs++;
1547
1548    PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1549
1550    for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1551     if ( ( base + ofs >= trie->uniquecharcount )
1552       && ( base + ofs - trie->uniquecharcount
1553               < trie->lasttrans )
1554       && trie->trans[ base + ofs
1555          - trie->uniquecharcount ].check == state )
1556     {
1557     PerlIO_printf( Perl_debug_log, "%*"UVXf,
1558      colwidth,
1559      (UV)trie->trans[ base + ofs
1560            - trie->uniquecharcount ].next );
1561     } else {
1562      PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1563     }
1564    }
1565
1566    PerlIO_printf( Perl_debug_log, "]");
1567
1568   }
1569   PerlIO_printf( Perl_debug_log, "\n" );
1570  }
1571  PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1572         (int)depth*2, "");
1573  for (word=1; word <= trie->wordcount; word++) {
1574   PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1575    (int)word, (int)(trie->wordinfo[word].prev),
1576    (int)(trie->wordinfo[word].len));
1577  }
1578  PerlIO_printf(Perl_debug_log, "\n" );
1579 }
1580 /*
1581   Dumps a fully constructed but uncompressed trie in list form.
1582   List tries normally only are used for construction when the number of
1583   possible chars (trie->uniquecharcount) is very high.
1584   Used for debugging make_trie().
1585 */
1586 STATIC void
1587 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1588       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1589       U32 depth)
1590 {
1591  U32 state;
1592  SV *sv=sv_newmortal();
1593  int colwidth= widecharmap ? 6 : 4;
1594  GET_RE_DEBUG_FLAGS_DECL;
1595
1596  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1597
1598  /* print out the table precompression.  */
1599  PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1600   (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1601   "------:-----+-----------------\n" );
1602
1603  for( state=1 ; state < next_alloc ; state ++ ) {
1604   U16 charid;
1605
1606   PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1607    (int)depth * 2 + 2,"", (UV)state  );
1608   if ( ! trie->states[ state ].wordnum ) {
1609    PerlIO_printf( Perl_debug_log, "%5s| ","");
1610   } else {
1611    PerlIO_printf( Perl_debug_log, "W%4x| ",
1612     trie->states[ state ].wordnum
1613    );
1614   }
1615   for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1616    SV ** const tmp = av_fetch( revcharmap,
1617           TRIE_LIST_ITEM(state,charid).forid, 0);
1618    if ( tmp ) {
1619     PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1620      colwidth,
1621      pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1622        colwidth,
1623        PL_colors[0], PL_colors[1],
1624        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1625        | PERL_PV_ESCAPE_FIRSTCHAR
1626      ) ,
1627      TRIE_LIST_ITEM(state,charid).forid,
1628      (UV)TRIE_LIST_ITEM(state,charid).newstate
1629     );
1630     if (!(charid % 10))
1631      PerlIO_printf(Perl_debug_log, "\n%*s| ",
1632       (int)((depth * 2) + 14), "");
1633    }
1634   }
1635   PerlIO_printf( Perl_debug_log, "\n");
1636  }
1637 }
1638
1639 /*
1640   Dumps a fully constructed but uncompressed trie in table form.
1641   This is the normal DFA style state transition table, with a few
1642   twists to facilitate compression later.
1643   Used for debugging make_trie().
1644 */
1645 STATIC void
1646 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1647       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1648       U32 depth)
1649 {
1650  U32 state;
1651  U16 charid;
1652  SV *sv=sv_newmortal();
1653  int colwidth= widecharmap ? 6 : 4;
1654  GET_RE_DEBUG_FLAGS_DECL;
1655
1656  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1657
1658  /*
1659  print out the table precompression so that we can do a visual check
1660  that they are identical.
1661  */
1662
1663  PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1664
1665  for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1666   SV ** const tmp = av_fetch( revcharmap, charid, 0);
1667   if ( tmp ) {
1668    PerlIO_printf( Perl_debug_log, "%*s",
1669     colwidth,
1670     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1671        PL_colors[0], PL_colors[1],
1672        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1673        PERL_PV_ESCAPE_FIRSTCHAR
1674     )
1675    );
1676   }
1677  }
1678
1679  PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1680
1681  for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1682   PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1683  }
1684
1685  PerlIO_printf( Perl_debug_log, "\n" );
1686
1687  for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1688
1689   PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1690    (int)depth * 2 + 2,"",
1691    (UV)TRIE_NODENUM( state ) );
1692
1693   for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1694    UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1695    if (v)
1696     PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1697    else
1698     PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1699   }
1700   if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1701    PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1702            (UV)trie->trans[ state ].check );
1703   } else {
1704    PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1705            (UV)trie->trans[ state ].check,
1706    trie->states[ TRIE_NODENUM( state ) ].wordnum );
1707   }
1708  }
1709 }
1710
1711 #endif
1712
1713
1714 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1715   startbranch: the first branch in the whole branch sequence
1716   first      : start branch of sequence of branch-exact nodes.
1717    May be the same as startbranch
1718   last       : Thing following the last branch.
1719    May be the same as tail.
1720   tail       : item following the branch sequence
1721   count      : words in the sequence
1722   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
1723   depth      : indent depth
1724
1725 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1726
1727 A trie is an N'ary tree where the branches are determined by digital
1728 decomposition of the key. IE, at the root node you look up the 1st character and
1729 follow that branch repeat until you find the end of the branches. Nodes can be
1730 marked as "accepting" meaning they represent a complete word. Eg:
1731
1732   /he|she|his|hers/
1733
1734 would convert into the following structure. Numbers represent states, letters
1735 following numbers represent valid transitions on the letter from that state, if
1736 the number is in square brackets it represents an accepting state, otherwise it
1737 will be in parenthesis.
1738
1739  +-h->+-e->[3]-+-r->(8)-+-s->[9]
1740  |    |
1741  |   (2)
1742  |    |
1743  (1)   +-i->(6)-+-s->[7]
1744  |
1745  +-s->(3)-+-h->(4)-+-e->[5]
1746
1747  Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1748
1749 This shows that when matching against the string 'hers' we will begin at state 1
1750 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1751 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1752 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1753 single traverse. We store a mapping from accepting to state to which word was
1754 matched, and then when we have multiple possibilities we try to complete the
1755 rest of the regex in the order in which they occured in the alternation.
1756
1757 The only prior NFA like behaviour that would be changed by the TRIE support is
1758 the silent ignoring of duplicate alternations which are of the form:
1759
1760  / (DUPE|DUPE) X? (?{ ... }) Y /x
1761
1762 Thus EVAL blocks following a trie may be called a different number of times with
1763 and without the optimisation. With the optimisations dupes will be silently
1764 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1765 the following demonstrates:
1766
1767  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1768
1769 which prints out 'word' three times, but
1770
1771  'words'=~/(word|word|word)(?{ print $1 })S/
1772
1773 which doesnt print it out at all. This is due to other optimisations kicking in.
1774
1775 Example of what happens on a structural level:
1776
1777 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1778
1779    1: CURLYM[1] {1,32767}(18)
1780    5:   BRANCH(8)
1781    6:     EXACT <ac>(16)
1782    8:   BRANCH(11)
1783    9:     EXACT <ad>(16)
1784   11:   BRANCH(14)
1785   12:     EXACT <ab>(16)
1786   16:   SUCCEED(0)
1787   17:   NOTHING(18)
1788   18: END(0)
1789
1790 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1791 and should turn into:
1792
1793    1: CURLYM[1] {1,32767}(18)
1794    5:   TRIE(16)
1795   [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1796   <ac>
1797   <ad>
1798   <ab>
1799   16:   SUCCEED(0)
1800   17:   NOTHING(18)
1801   18: END(0)
1802
1803 Cases where tail != last would be like /(?foo|bar)baz/:
1804
1805    1: BRANCH(4)
1806    2:   EXACT <foo>(8)
1807    4: BRANCH(7)
1808    5:   EXACT <bar>(8)
1809    7: TAIL(8)
1810    8: EXACT <baz>(10)
1811   10: END(0)
1812
1813 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1814 and would end up looking like:
1815
1816  1: TRIE(8)
1817  [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1818   <foo>
1819   <bar>
1820    7: TAIL(8)
1821    8: EXACT <baz>(10)
1822   10: END(0)
1823
1824  d = uvchr_to_utf8_flags(d, uv, 0);
1825
1826 is the recommended Unicode-aware way of saying
1827
1828  *(d++) = uv;
1829 */
1830
1831 #define TRIE_STORE_REVCHAR(val)                                            \
1832  STMT_START {                                                           \
1833   if (UTF) {          \
1834    SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1835    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);    \
1836    unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1837    SvCUR_set(zlopp, kapow - flrbbbbb);       \
1838    SvPOK_on(zlopp);         \
1839    SvUTF8_on(zlopp);         \
1840    av_push(revcharmap, zlopp);        \
1841   } else {          \
1842    char ooooff = (char)val;                                           \
1843    av_push(revcharmap, newSVpvn(&ooooff, 1));      \
1844   }           \
1845   } STMT_END
1846
1847 /* This gets the next character from the input, folding it if not already
1848  * folded. */
1849 #define TRIE_READ_CHAR STMT_START {                                           \
1850  wordlen++;                                                                \
1851  if ( UTF ) {                                                              \
1852   /* if it is UTF then it is either already folded, or does not need    \
1853   * folding */                                                         \
1854   uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
1855  }                                                                         \
1856  else if (folder == PL_fold_latin1) {                                      \
1857   /* This folder implies Unicode rules, which in the range expressible  \
1858   *  by not UTF is the lower case, with the two exceptions, one of     \
1859   *  which should have been taken care of before calling this */       \
1860   assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
1861   uvc = toLOWER_L1(*uc);                                                \
1862   if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
1863   len = 1;                                                              \
1864  } else {                                                                  \
1865   /* raw data, will be folded later if needed */                        \
1866   uvc = (U32)*uc;                                                       \
1867   len = 1;                                                              \
1868  }                                                                         \
1869 } STMT_END
1870
1871
1872
1873 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1874  if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1875   U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1876   Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1877  }                                                           \
1878  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1879  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1880  TRIE_LIST_CUR( state )++;                                   \
1881 } STMT_END
1882
1883 #define TRIE_LIST_NEW(state) STMT_START {                       \
1884  Newxz( trie->states[ state ].trans.list,               \
1885   4, reg_trie_trans_le );                                 \
1886  TRIE_LIST_CUR( state ) = 1;                                \
1887  TRIE_LIST_LEN( state ) = 4;                                \
1888 } STMT_END
1889
1890 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1891  U16 dupe= trie->states[ state ].wordnum;                    \
1892  regnode * const noper_next = regnext( noper );              \
1893                 \
1894  DEBUG_r({                                                   \
1895   /* store the word for dumping */                        \
1896   SV* tmp;                                                \
1897   if (OP(noper) != NOTHING)                               \
1898    tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1899   else                                                    \
1900    tmp = newSVpvn_utf8( "", 0, UTF );   \
1901   av_push( trie_words, tmp );                             \
1902  });                                                         \
1903                 \
1904  curword++;                                                  \
1905  trie->wordinfo[curword].prev   = 0;                         \
1906  trie->wordinfo[curword].len    = wordlen;                   \
1907  trie->wordinfo[curword].accept = state;                     \
1908                 \
1909  if ( noper_next < tail ) {                                  \
1910   if (!trie->jump)                                        \
1911    trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1912             sizeof(U16) ); \
1913   trie->jump[curword] = (U16)(noper_next - convert);      \
1914   if (!jumper)                                            \
1915    jumper = noper_next;                                \
1916   if (!nextbranch)                                        \
1917    nextbranch= regnext(cur);                           \
1918  }                                                           \
1919                 \
1920  if ( dupe ) {                                               \
1921   /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1922   /* chain, so that when the bits of chain are later    */\
1923   /* linked together, the dups appear in the chain      */\
1924   trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1925   trie->wordinfo[dupe].prev = curword;                    \
1926  } else {                                                    \
1927   /* we haven't inserted this word yet.                */ \
1928   trie->states[ state ].wordnum = curword;                \
1929  }                                                           \
1930 } STMT_END
1931
1932
1933 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)  \
1934  ( ( base + charid >=  ucharcount     \
1935   && base + charid < ubound     \
1936   && state == trie->trans[ base - ucharcount + charid ].check \
1937   && trie->trans[ base - ucharcount + charid ].next )  \
1938   ? trie->trans[ base - ucharcount + charid ].next  \
1939   : ( state==1 ? special : 0 )     \
1940  )
1941
1942 #define MADE_TRIE       1
1943 #define MADE_JUMP_TRIE  2
1944 #define MADE_EXACT_TRIE 4
1945
1946 STATIC I32
1947 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1948     regnode *first, regnode *last, regnode *tail,
1949     U32 word_count, U32 flags, U32 depth)
1950 {
1951  /* first pass, loop through and scan words */
1952  reg_trie_data *trie;
1953  HV *widecharmap = NULL;
1954  AV *revcharmap = newAV();
1955  regnode *cur;
1956  STRLEN len = 0;
1957  UV uvc = 0;
1958  U16 curword = 0;
1959  U32 next_alloc = 0;
1960  regnode *jumper = NULL;
1961  regnode *nextbranch = NULL;
1962  regnode *convert = NULL;
1963  U32 *prev_states; /* temp array mapping each state to previous one */
1964  /* we just use folder as a flag in utf8 */
1965  const U8 * folder = NULL;
1966
1967 #ifdef DEBUGGING
1968  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1969  AV *trie_words = NULL;
1970  /* along with revcharmap, this only used during construction but both are
1971  * useful during debugging so we store them in the struct when debugging.
1972  */
1973 #else
1974  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1975  STRLEN trie_charcount=0;
1976 #endif
1977  SV *re_trie_maxbuff;
1978  GET_RE_DEBUG_FLAGS_DECL;
1979
1980  PERL_ARGS_ASSERT_MAKE_TRIE;
1981 #ifndef DEBUGGING
1982  PERL_UNUSED_ARG(depth);
1983 #endif
1984
1985  switch (flags) {
1986   case EXACT: break;
1987   case EXACTFA:
1988   case EXACTFU_SS:
1989   case EXACTFU: folder = PL_fold_latin1; break;
1990   case EXACTF:  folder = PL_fold; break;
1991   default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1992  }
1993
1994  trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1995  trie->refcount = 1;
1996  trie->startstate = 1;
1997  trie->wordcount = word_count;
1998  RExC_rxi->data->data[ data_slot ] = (void*)trie;
1999  trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2000  if (flags == EXACT)
2001   trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2002  trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2003      trie->wordcount+1, sizeof(reg_trie_wordinfo));
2004
2005  DEBUG_r({
2006   trie_words = newAV();
2007  });
2008
2009  re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2010  assert(re_trie_maxbuff);
2011  if (!SvIOK(re_trie_maxbuff)) {
2012   sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2013  }
2014  DEBUG_TRIE_COMPILE_r({
2015   PerlIO_printf( Perl_debug_log,
2016   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2017   (int)depth * 2 + 2, "",
2018   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2019   REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2020  });
2021
2022    /* Find the node we are going to overwrite */
2023  if ( first == startbranch && OP( last ) != BRANCH ) {
2024   /* whole branch chain */
2025   convert = first;
2026  } else {
2027   /* branch sub-chain */
2028   convert = NEXTOPER( first );
2029  }
2030
2031  /*  -- First loop and Setup --
2032
2033  We first traverse the branches and scan each word to determine if it
2034  contains widechars, and how many unique chars there are, this is
2035  important as we have to build a table with at least as many columns as we
2036  have unique chars.
2037
2038  We use an array of integers to represent the character codes 0..255
2039  (trie->charmap) and we use a an HV* to store Unicode characters. We use
2040  the native representation of the character value as the key and IV's for
2041  the coded index.
2042
2043  *TODO* If we keep track of how many times each character is used we can
2044  remap the columns so that the table compression later on is more
2045  efficient in terms of memory by ensuring the most common value is in the
2046  middle and the least common are on the outside.  IMO this would be better
2047  than a most to least common mapping as theres a decent chance the most
2048  common letter will share a node with the least common, meaning the node
2049  will not be compressible. With a middle is most common approach the worst
2050  case is when we have the least common nodes twice.
2051
2052  */
2053
2054  for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2055   regnode *noper = NEXTOPER( cur );
2056   const U8 *uc = (U8*)STRING( noper );
2057   const U8 *e  = uc + STR_LEN( noper );
2058   int foldlen = 0;
2059   U32 wordlen      = 0;         /* required init */
2060   STRLEN minchars = 0;
2061   STRLEN maxchars = 0;
2062   bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2063            bitmap?*/
2064
2065   if (OP(noper) == NOTHING) {
2066    regnode *noper_next= regnext(noper);
2067    if (noper_next != tail && OP(noper_next) == flags) {
2068     noper = noper_next;
2069     uc= (U8*)STRING(noper);
2070     e= uc + STR_LEN(noper);
2071     trie->minlen= STR_LEN(noper);
2072    } else {
2073     trie->minlen= 0;
2074     continue;
2075    }
2076   }
2077
2078   if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2079    TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2080           regardless of encoding */
2081    if (OP( noper ) == EXACTFU_SS) {
2082     /* false positives are ok, so just set this */
2083     TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2084    }
2085   }
2086   for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2087           branch */
2088    TRIE_CHARCOUNT(trie)++;
2089    TRIE_READ_CHAR;
2090
2091    /* TRIE_READ_CHAR returns the current character, or its fold if /i
2092    * is in effect.  Under /i, this character can match itself, or
2093    * anything that folds to it.  If not under /i, it can match just
2094    * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2095    * all fold to k, and all are single characters.   But some folds
2096    * expand to more than one character, so for example LATIN SMALL
2097    * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2098    * the string beginning at 'uc' is 'ffi', it could be matched by
2099    * three characters, or just by the one ligature character. (It
2100    * could also be matched by two characters: LATIN SMALL LIGATURE FF
2101    * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2102    * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2103    * match.)  The trie needs to know the minimum and maximum number
2104    * of characters that could match so that it can use size alone to
2105    * quickly reject many match attempts.  The max is simple: it is
2106    * the number of folded characters in this branch (since a fold is
2107    * never shorter than what folds to it. */
2108
2109    maxchars++;
2110
2111    /* And the min is equal to the max if not under /i (indicated by
2112    * 'folder' being NULL), or there are no multi-character folds.  If
2113    * there is a multi-character fold, the min is incremented just
2114    * once, for the character that folds to the sequence.  Each
2115    * character in the sequence needs to be added to the list below of
2116    * characters in the trie, but we count only the first towards the
2117    * min number of characters needed.  This is done through the
2118    * variable 'foldlen', which is returned by the macros that look
2119    * for these sequences as the number of bytes the sequence
2120    * occupies.  Each time through the loop, we decrement 'foldlen' by
2121    * how many bytes the current char occupies.  Only when it reaches
2122    * 0 do we increment 'minchars' or look for another multi-character
2123    * sequence. */
2124    if (folder == NULL) {
2125     minchars++;
2126    }
2127    else if (foldlen > 0) {
2128     foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2129    }
2130    else {
2131     minchars++;
2132
2133     /* See if *uc is the beginning of a multi-character fold.  If
2134     * so, we decrement the length remaining to look at, to account
2135     * for the current character this iteration.  (We can use 'uc'
2136     * instead of the fold returned by TRIE_READ_CHAR because for
2137     * non-UTF, the latin1_safe macro is smart enough to account
2138     * for all the unfolded characters, and because for UTF, the
2139     * string will already have been folded earlier in the
2140     * compilation process */
2141     if (UTF) {
2142      if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2143       foldlen -= UTF8SKIP(uc);
2144      }
2145     }
2146     else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2147      foldlen--;
2148     }
2149    }
2150
2151    /* The current character (and any potential folds) should be added
2152    * to the possible matching characters for this position in this
2153    * branch */
2154    if ( uvc < 256 ) {
2155     if ( folder ) {
2156      U8 folded= folder[ (U8) uvc ];
2157      if ( !trie->charmap[ folded ] ) {
2158       trie->charmap[ folded ]=( ++trie->uniquecharcount );
2159       TRIE_STORE_REVCHAR( folded );
2160      }
2161     }
2162     if ( !trie->charmap[ uvc ] ) {
2163      trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2164      TRIE_STORE_REVCHAR( uvc );
2165     }
2166     if ( set_bit ) {
2167      /* store the codepoint in the bitmap, and its folded
2168      * equivalent. */
2169      TRIE_BITMAP_SET(trie, uvc);
2170
2171      /* store the folded codepoint */
2172      if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2173
2174      if ( !UTF ) {
2175       /* store first byte of utf8 representation of
2176       variant codepoints */
2177       if (! UVCHR_IS_INVARIANT(uvc)) {
2178        TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2179       }
2180      }
2181      set_bit = 0; /* We've done our bit :-) */
2182     }
2183    } else {
2184
2185     /* XXX We could come up with the list of code points that fold
2186     * to this using PL_utf8_foldclosures, except not for
2187     * multi-char folds, as there may be multiple combinations
2188     * there that could work, which needs to wait until runtime to
2189     * resolve (The comment about LIGATURE FFI above is such an
2190     * example */
2191
2192     SV** svpp;
2193     if ( !widecharmap )
2194      widecharmap = newHV();
2195
2196     svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2197
2198     if ( !svpp )
2199      Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2200
2201     if ( !SvTRUE( *svpp ) ) {
2202      sv_setiv( *svpp, ++trie->uniquecharcount );
2203      TRIE_STORE_REVCHAR(uvc);
2204     }
2205    }
2206   } /* end loop through characters in this branch of the trie */
2207
2208   /* We take the min and max for this branch and combine to find the min
2209   * and max for all branches processed so far */
2210   if( cur == first ) {
2211    trie->minlen = minchars;
2212    trie->maxlen = maxchars;
2213   } else if (minchars < trie->minlen) {
2214    trie->minlen = minchars;
2215   } else if (maxchars > trie->maxlen) {
2216    trie->maxlen = maxchars;
2217   }
2218  } /* end first pass */
2219  DEBUG_TRIE_COMPILE_r(
2220   PerlIO_printf( Perl_debug_log,
2221     "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2222     (int)depth * 2 + 2,"",
2223     ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2224     (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2225     (int)trie->minlen, (int)trie->maxlen )
2226  );
2227
2228  /*
2229   We now know what we are dealing with in terms of unique chars and
2230   string sizes so we can calculate how much memory a naive
2231   representation using a flat table  will take. If it's over a reasonable
2232   limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2233   conservative but potentially much slower representation using an array
2234   of lists.
2235
2236   At the end we convert both representations into the same compressed
2237   form that will be used in regexec.c for matching with. The latter
2238   is a form that cannot be used to construct with but has memory
2239   properties similar to the list form and access properties similar
2240   to the table form making it both suitable for fast searches and
2241   small enough that its feasable to store for the duration of a program.
2242
2243   See the comment in the code where the compressed table is produced
2244   inplace from the flat tabe representation for an explanation of how
2245   the compression works.
2246
2247  */
2248
2249
2250  Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2251  prev_states[1] = 0;
2252
2253  if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2254              > SvIV(re_trie_maxbuff) )
2255  {
2256   /*
2257    Second Pass -- Array Of Lists Representation
2258
2259    Each state will be represented by a list of charid:state records
2260    (reg_trie_trans_le) the first such element holds the CUR and LEN
2261    points of the allocated array. (See defines above).
2262
2263    We build the initial structure using the lists, and then convert
2264    it into the compressed table form which allows faster lookups
2265    (but cant be modified once converted).
2266   */
2267
2268   STRLEN transcount = 1;
2269
2270   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2271    "%*sCompiling trie using list compiler\n",
2272    (int)depth * 2 + 2, ""));
2273
2274   trie->states = (reg_trie_state *)
2275    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2276         sizeof(reg_trie_state) );
2277   TRIE_LIST_NEW(1);
2278   next_alloc = 2;
2279
2280   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2281
2282    regnode *noper   = NEXTOPER( cur );
2283    U8 *uc           = (U8*)STRING( noper );
2284    const U8 *e      = uc + STR_LEN( noper );
2285    U32 state        = 1;         /* required init */
2286    U16 charid       = 0;         /* sanity init */
2287    U32 wordlen      = 0;         /* required init */
2288
2289    if (OP(noper) == NOTHING) {
2290     regnode *noper_next= regnext(noper);
2291     if (noper_next != tail && OP(noper_next) == flags) {
2292      noper = noper_next;
2293      uc= (U8*)STRING(noper);
2294      e= uc + STR_LEN(noper);
2295     }
2296    }
2297
2298    if (OP(noper) != NOTHING) {
2299     for ( ; uc < e ; uc += len ) {
2300
2301      TRIE_READ_CHAR;
2302
2303      if ( uvc < 256 ) {
2304       charid = trie->charmap[ uvc ];
2305      } else {
2306       SV** const svpp = hv_fetch( widecharmap,
2307              (char*)&uvc,
2308              sizeof( UV ),
2309              0);
2310       if ( !svpp ) {
2311        charid = 0;
2312       } else {
2313        charid=(U16)SvIV( *svpp );
2314       }
2315      }
2316      /* charid is now 0 if we dont know the char read, or
2317      * nonzero if we do */
2318      if ( charid ) {
2319
2320       U16 check;
2321       U32 newstate = 0;
2322
2323       charid--;
2324       if ( !trie->states[ state ].trans.list ) {
2325        TRIE_LIST_NEW( state );
2326       }
2327       for ( check = 1;
2328        check <= TRIE_LIST_USED( state );
2329        check++ )
2330       {
2331        if ( TRIE_LIST_ITEM( state, check ).forid
2332                  == charid )
2333        {
2334         newstate = TRIE_LIST_ITEM( state, check ).newstate;
2335         break;
2336        }
2337       }
2338       if ( ! newstate ) {
2339        newstate = next_alloc++;
2340        prev_states[newstate] = state;
2341        TRIE_LIST_PUSH( state, charid, newstate );
2342        transcount++;
2343       }
2344       state = newstate;
2345      } else {
2346       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2347      }
2348     }
2349    }
2350    TRIE_HANDLE_WORD(state);
2351
2352   } /* end second pass */
2353
2354   /* next alloc is the NEXT state to be allocated */
2355   trie->statecount = next_alloc;
2356   trie->states = (reg_trie_state *)
2357    PerlMemShared_realloc( trie->states,
2358         next_alloc
2359         * sizeof(reg_trie_state) );
2360
2361   /* and now dump it out before we compress it */
2362   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2363               revcharmap, next_alloc,
2364               depth+1)
2365   );
2366
2367   trie->trans = (reg_trie_trans *)
2368    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2369   {
2370    U32 state;
2371    U32 tp = 0;
2372    U32 zp = 0;
2373
2374
2375    for( state=1 ; state < next_alloc ; state ++ ) {
2376     U32 base=0;
2377
2378     /*
2379     DEBUG_TRIE_COMPILE_MORE_r(
2380      PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2381     );
2382     */
2383
2384     if (trie->states[state].trans.list) {
2385      U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2386      U16 maxid=minid;
2387      U16 idx;
2388
2389      for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2390       const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2391       if ( forid < minid ) {
2392        minid=forid;
2393       } else if ( forid > maxid ) {
2394        maxid=forid;
2395       }
2396      }
2397      if ( transcount < tp + maxid - minid + 1) {
2398       transcount *= 2;
2399       trie->trans = (reg_trie_trans *)
2400        PerlMemShared_realloc( trie->trans,
2401              transcount
2402              * sizeof(reg_trie_trans) );
2403       Zero( trie->trans + (transcount / 2),
2404        transcount / 2,
2405        reg_trie_trans );
2406      }
2407      base = trie->uniquecharcount + tp - minid;
2408      if ( maxid == minid ) {
2409       U32 set = 0;
2410       for ( ; zp < tp ; zp++ ) {
2411        if ( ! trie->trans[ zp ].next ) {
2412         base = trie->uniquecharcount + zp - minid;
2413         trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2414                 1).newstate;
2415         trie->trans[ zp ].check = state;
2416         set = 1;
2417         break;
2418        }
2419       }
2420       if ( !set ) {
2421        trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2422                 1).newstate;
2423        trie->trans[ tp ].check = state;
2424        tp++;
2425        zp = tp;
2426       }
2427      } else {
2428       for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2429        const U32 tid = base
2430           - trie->uniquecharcount
2431           + TRIE_LIST_ITEM( state, idx ).forid;
2432        trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2433                 idx ).newstate;
2434        trie->trans[ tid ].check = state;
2435       }
2436       tp += ( maxid - minid + 1 );
2437      }
2438      Safefree(trie->states[ state ].trans.list);
2439     }
2440     /*
2441     DEBUG_TRIE_COMPILE_MORE_r(
2442      PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2443     );
2444     */
2445     trie->states[ state ].trans.base=base;
2446    }
2447    trie->lasttrans = tp + 1;
2448   }
2449  } else {
2450   /*
2451   Second Pass -- Flat Table Representation.
2452
2453   we dont use the 0 slot of either trans[] or states[] so we add 1 to
2454   each.  We know that we will need Charcount+1 trans at most to store
2455   the data (one row per char at worst case) So we preallocate both
2456   structures assuming worst case.
2457
2458   We then construct the trie using only the .next slots of the entry
2459   structs.
2460
2461   We use the .check field of the first entry of the node temporarily
2462   to make compression both faster and easier by keeping track of how
2463   many non zero fields are in the node.
2464
2465   Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2466   transition.
2467
2468   There are two terms at use here: state as a TRIE_NODEIDX() which is
2469   a number representing the first entry of the node, and state as a
2470   TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2471   and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2472   if there are 2 entrys per node. eg:
2473
2474    A B       A B
2475   1. 2 4    1. 3 7
2476   2. 0 3    3. 0 5
2477   3. 0 0    5. 0 0
2478   4. 0 0    7. 0 0
2479
2480   The table is internally in the right hand, idx form. However as we
2481   also have to deal with the states array which is indexed by nodenum
2482   we have to use TRIE_NODENUM() to convert.
2483
2484   */
2485   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2486    "%*sCompiling trie using table compiler\n",
2487    (int)depth * 2 + 2, ""));
2488
2489   trie->trans = (reg_trie_trans *)
2490    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2491         * trie->uniquecharcount + 1,
2492         sizeof(reg_trie_trans) );
2493   trie->states = (reg_trie_state *)
2494    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2495         sizeof(reg_trie_state) );
2496   next_alloc = trie->uniquecharcount + 1;
2497
2498
2499   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2500
2501    regnode *noper   = NEXTOPER( cur );
2502    const U8 *uc     = (U8*)STRING( noper );
2503    const U8 *e      = uc + STR_LEN( noper );
2504
2505    U32 state        = 1;         /* required init */
2506
2507    U16 charid       = 0;         /* sanity init */
2508    U32 accept_state = 0;         /* sanity init */
2509
2510    U32 wordlen      = 0;         /* required init */
2511
2512    if (OP(noper) == NOTHING) {
2513     regnode *noper_next= regnext(noper);
2514     if (noper_next != tail && OP(noper_next) == flags) {
2515      noper = noper_next;
2516      uc= (U8*)STRING(noper);
2517      e= uc + STR_LEN(noper);
2518     }
2519    }
2520
2521    if ( OP(noper) != NOTHING ) {
2522     for ( ; uc < e ; uc += len ) {
2523
2524      TRIE_READ_CHAR;
2525
2526      if ( uvc < 256 ) {
2527       charid = trie->charmap[ uvc ];
2528      } else {
2529       SV* const * const svpp = hv_fetch( widecharmap,
2530               (char*)&uvc,
2531               sizeof( UV ),
2532               0);
2533       charid = svpp ? (U16)SvIV(*svpp) : 0;
2534      }
2535      if ( charid ) {
2536       charid--;
2537       if ( !trie->trans[ state + charid ].next ) {
2538        trie->trans[ state + charid ].next = next_alloc;
2539        trie->trans[ state ].check++;
2540        prev_states[TRIE_NODENUM(next_alloc)]
2541          = TRIE_NODENUM(state);
2542        next_alloc += trie->uniquecharcount;
2543       }
2544       state = trie->trans[ state + charid ].next;
2545      } else {
2546       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2547      }
2548      /* charid is now 0 if we dont know the char read, or
2549      * nonzero if we do */
2550     }
2551    }
2552    accept_state = TRIE_NODENUM( state );
2553    TRIE_HANDLE_WORD(accept_state);
2554
2555   } /* end second pass */
2556
2557   /* and now dump it out before we compress it */
2558   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2559               revcharmap,
2560               next_alloc, depth+1));
2561
2562   {
2563   /*
2564   * Inplace compress the table.*
2565
2566   For sparse data sets the table constructed by the trie algorithm will
2567   be mostly 0/FAIL transitions or to put it another way mostly empty.
2568   (Note that leaf nodes will not contain any transitions.)
2569
2570   This algorithm compresses the tables by eliminating most such
2571   transitions, at the cost of a modest bit of extra work during lookup:
2572
2573   - Each states[] entry contains a .base field which indicates the
2574   index in the state[] array wheres its transition data is stored.
2575
2576   - If .base is 0 there are no valid transitions from that node.
2577
2578   - If .base is nonzero then charid is added to it to find an entry in
2579   the trans array.
2580
2581   -If trans[states[state].base+charid].check!=state then the
2582   transition is taken to be a 0/Fail transition. Thus if there are fail
2583   transitions at the front of the node then the .base offset will point
2584   somewhere inside the previous nodes data (or maybe even into a node
2585   even earlier), but the .check field determines if the transition is
2586   valid.
2587
2588   XXX - wrong maybe?
2589   The following process inplace converts the table to the compressed
2590   table: We first do not compress the root node 1,and mark all its
2591   .check pointers as 1 and set its .base pointer as 1 as well. This
2592   allows us to do a DFA construction from the compressed table later,
2593   and ensures that any .base pointers we calculate later are greater
2594   than 0.
2595
2596   - We set 'pos' to indicate the first entry of the second node.
2597
2598   - We then iterate over the columns of the node, finding the first and
2599   last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2600   and set the .check pointers accordingly, and advance pos
2601   appropriately and repreat for the next node. Note that when we copy
2602   the next pointers we have to convert them from the original
2603   NODEIDX form to NODENUM form as the former is not valid post
2604   compression.
2605
2606   - If a node has no transitions used we mark its base as 0 and do not
2607   advance the pos pointer.
2608
2609   - If a node only has one transition we use a second pointer into the
2610   structure to fill in allocated fail transitions from other states.
2611   This pointer is independent of the main pointer and scans forward
2612   looking for null transitions that are allocated to a state. When it
2613   finds one it writes the single transition into the "hole".  If the
2614   pointer doesnt find one the single transition is appended as normal.
2615
2616   - Once compressed we can Renew/realloc the structures to release the
2617   excess space.
2618
2619   See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2620   specifically Fig 3.47 and the associated pseudocode.
2621
2622   demq
2623   */
2624   const U32 laststate = TRIE_NODENUM( next_alloc );
2625   U32 state, charid;
2626   U32 pos = 0, zp=0;
2627   trie->statecount = laststate;
2628
2629   for ( state = 1 ; state < laststate ; state++ ) {
2630    U8 flag = 0;
2631    const U32 stateidx = TRIE_NODEIDX( state );
2632    const U32 o_used = trie->trans[ stateidx ].check;
2633    U32 used = trie->trans[ stateidx ].check;
2634    trie->trans[ stateidx ].check = 0;
2635
2636    for ( charid = 0;
2637     used && charid < trie->uniquecharcount;
2638     charid++ )
2639    {
2640     if ( flag || trie->trans[ stateidx + charid ].next ) {
2641      if ( trie->trans[ stateidx + charid ].next ) {
2642       if (o_used == 1) {
2643        for ( ; zp < pos ; zp++ ) {
2644         if ( ! trie->trans[ zp ].next ) {
2645          break;
2646         }
2647        }
2648        trie->states[ state ].trans.base
2649              = zp
2650              + trie->uniquecharcount
2651              - charid ;
2652        trie->trans[ zp ].next
2653         = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2654                + charid ].next );
2655        trie->trans[ zp ].check = state;
2656        if ( ++zp > pos ) pos = zp;
2657        break;
2658       }
2659       used--;
2660      }
2661      if ( !flag ) {
2662       flag = 1;
2663       trie->states[ state ].trans.base
2664          = pos + trie->uniquecharcount - charid ;
2665      }
2666      trie->trans[ pos ].next
2667       = SAFE_TRIE_NODENUM(
2668          trie->trans[ stateidx + charid ].next );
2669      trie->trans[ pos ].check = state;
2670      pos++;
2671     }
2672    }
2673   }
2674   trie->lasttrans = pos + 1;
2675   trie->states = (reg_trie_state *)
2676    PerlMemShared_realloc( trie->states, laststate
2677         * sizeof(reg_trie_state) );
2678   DEBUG_TRIE_COMPILE_MORE_r(
2679    PerlIO_printf( Perl_debug_log,
2680     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2681     (int)depth * 2 + 2,"",
2682     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2683      + 1 ),
2684     (IV)next_alloc,
2685     (IV)pos,
2686     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2687    );
2688
2689   } /* end table compress */
2690  }
2691  DEBUG_TRIE_COMPILE_MORE_r(
2692    PerlIO_printf(Perl_debug_log,
2693     "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2694     (int)depth * 2 + 2, "",
2695     (UV)trie->statecount,
2696     (UV)trie->lasttrans)
2697  );
2698  /* resize the trans array to remove unused space */
2699  trie->trans = (reg_trie_trans *)
2700   PerlMemShared_realloc( trie->trans, trie->lasttrans
2701        * sizeof(reg_trie_trans) );
2702
2703  {   /* Modify the program and insert the new TRIE node */
2704   U8 nodetype =(U8)(flags & 0xFF);
2705   char *str=NULL;
2706
2707 #ifdef DEBUGGING
2708   regnode *optimize = NULL;
2709 #ifdef RE_TRACK_PATTERN_OFFSETS
2710
2711   U32 mjd_offset = 0;
2712   U32 mjd_nodelen = 0;
2713 #endif /* RE_TRACK_PATTERN_OFFSETS */
2714 #endif /* DEBUGGING */
2715   /*
2716   This means we convert either the first branch or the first Exact,
2717   depending on whether the thing following (in 'last') is a branch
2718   or not and whther first is the startbranch (ie is it a sub part of
2719   the alternation or is it the whole thing.)
2720   Assuming its a sub part we convert the EXACT otherwise we convert
2721   the whole branch sequence, including the first.
2722   */
2723   /* Find the node we are going to overwrite */
2724   if ( first != startbranch || OP( last ) == BRANCH ) {
2725    /* branch sub-chain */
2726    NEXT_OFF( first ) = (U16)(last - first);
2727 #ifdef RE_TRACK_PATTERN_OFFSETS
2728    DEBUG_r({
2729     mjd_offset= Node_Offset((convert));
2730     mjd_nodelen= Node_Length((convert));
2731    });
2732 #endif
2733    /* whole branch chain */
2734   }
2735 #ifdef RE_TRACK_PATTERN_OFFSETS
2736   else {
2737    DEBUG_r({
2738     const  regnode *nop = NEXTOPER( convert );
2739     mjd_offset= Node_Offset((nop));
2740     mjd_nodelen= Node_Length((nop));
2741    });
2742   }
2743   DEBUG_OPTIMISE_r(
2744    PerlIO_printf(Perl_debug_log,
2745     "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2746     (int)depth * 2 + 2, "",
2747     (UV)mjd_offset, (UV)mjd_nodelen)
2748   );
2749 #endif
2750   /* But first we check to see if there is a common prefix we can
2751   split out as an EXACT and put in front of the TRIE node.  */
2752   trie->startstate= 1;
2753   if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2754    U32 state;
2755    for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2756     U32 ofs = 0;
2757     I32 idx = -1;
2758     U32 count = 0;
2759     const U32 base = trie->states[ state ].trans.base;
2760
2761     if ( trie->states[state].wordnum )
2762       count = 1;
2763
2764     for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2765      if ( ( base + ofs >= trie->uniquecharcount ) &&
2766       ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2767       trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2768      {
2769       if ( ++count > 1 ) {
2770        SV **tmp = av_fetch( revcharmap, ofs, 0);
2771        const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2772        if ( state == 1 ) break;
2773        if ( count == 2 ) {
2774         Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2775         DEBUG_OPTIMISE_r(
2776          PerlIO_printf(Perl_debug_log,
2777           "%*sNew Start State=%"UVuf" Class: [",
2778           (int)depth * 2 + 2, "",
2779           (UV)state));
2780         if (idx >= 0) {
2781          SV ** const tmp = av_fetch( revcharmap, idx, 0);
2782          const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2783
2784          TRIE_BITMAP_SET(trie,*ch);
2785          if ( folder )
2786           TRIE_BITMAP_SET(trie, folder[ *ch ]);
2787          DEBUG_OPTIMISE_r(
2788           PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2789          );
2790         }
2791        }
2792        TRIE_BITMAP_SET(trie,*ch);
2793        if ( folder )
2794         TRIE_BITMAP_SET(trie,folder[ *ch ]);
2795        DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2796       }
2797       idx = ofs;
2798      }
2799     }
2800     if ( count == 1 ) {
2801      SV **tmp = av_fetch( revcharmap, idx, 0);
2802      STRLEN len;
2803      char *ch = SvPV( *tmp, len );
2804      DEBUG_OPTIMISE_r({
2805       SV *sv=sv_newmortal();
2806       PerlIO_printf( Perl_debug_log,
2807        "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2808        (int)depth * 2 + 2, "",
2809        (UV)state, (UV)idx,
2810        pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2811         PL_colors[0], PL_colors[1],
2812         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2813         PERL_PV_ESCAPE_FIRSTCHAR
2814        )
2815       );
2816      });
2817      if ( state==1 ) {
2818       OP( convert ) = nodetype;
2819       str=STRING(convert);
2820       STR_LEN(convert)=0;
2821      }
2822      STR_LEN(convert) += len;
2823      while (len--)
2824       *str++ = *ch++;
2825     } else {
2826 #ifdef DEBUGGING
2827      if (state>1)
2828       DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2829 #endif
2830      break;
2831     }
2832    }
2833    trie->prefixlen = (state-1);
2834    if (str) {
2835     regnode *n = convert+NODE_SZ_STR(convert);
2836     NEXT_OFF(convert) = NODE_SZ_STR(convert);
2837     trie->startstate = state;
2838     trie->minlen -= (state - 1);
2839     trie->maxlen -= (state - 1);
2840 #ifdef DEBUGGING
2841    /* At least the UNICOS C compiler choked on this
2842     * being argument to DEBUG_r(), so let's just have
2843     * it right here. */
2844    if (
2845 #ifdef PERL_EXT_RE_BUILD
2846     1
2847 #else
2848     DEBUG_r_TEST
2849 #endif
2850     ) {
2851     regnode *fix = convert;
2852     U32 word = trie->wordcount;
2853     mjd_nodelen++;
2854     Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2855     while( ++fix < n ) {
2856      Set_Node_Offset_Length(fix, 0, 0);
2857     }
2858     while (word--) {
2859      SV ** const tmp = av_fetch( trie_words, word, 0 );
2860      if (tmp) {
2861       if ( STR_LEN(convert) <= SvCUR(*tmp) )
2862        sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2863       else
2864        sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2865      }
2866     }
2867    }
2868 #endif
2869     if (trie->maxlen) {
2870      convert = n;
2871     } else {
2872      NEXT_OFF(convert) = (U16)(tail - convert);
2873      DEBUG_r(optimize= n);
2874     }
2875    }
2876   }
2877   if (!jumper)
2878    jumper = last;
2879   if ( trie->maxlen ) {
2880    NEXT_OFF( convert ) = (U16)(tail - convert);
2881    ARG_SET( convert, data_slot );
2882    /* Store the offset to the first unabsorbed branch in
2883    jump[0], which is otherwise unused by the jump logic.
2884    We use this when dumping a trie and during optimisation. */
2885    if (trie->jump)
2886     trie->jump[0] = (U16)(nextbranch - convert);
2887
2888    /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2889    *   and there is a bitmap
2890    *   and the first "jump target" node we found leaves enough room
2891    * then convert the TRIE node into a TRIEC node, with the bitmap
2892    * embedded inline in the opcode - this is hypothetically faster.
2893    */
2894    if ( !trie->states[trie->startstate].wordnum
2895     && trie->bitmap
2896     && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2897    {
2898     OP( convert ) = TRIEC;
2899     Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2900     PerlMemShared_free(trie->bitmap);
2901     trie->bitmap= NULL;
2902    } else
2903     OP( convert ) = TRIE;
2904
2905    /* store the type in the flags */
2906    convert->flags = nodetype;
2907    DEBUG_r({
2908    optimize = convert
2909      + NODE_STEP_REGNODE
2910      + regarglen[ OP( convert ) ];
2911    });
2912    /* XXX We really should free up the resource in trie now,
2913     as we won't use them - (which resources?) dmq */
2914   }
2915   /* needed for dumping*/
2916   DEBUG_r(if (optimize) {
2917    regnode *opt = convert;
2918
2919    while ( ++opt < optimize) {
2920     Set_Node_Offset_Length(opt,0,0);
2921    }
2922    /*
2923     Try to clean up some of the debris left after the
2924     optimisation.
2925    */
2926    while( optimize < jumper ) {
2927     mjd_nodelen += Node_Length((optimize));
2928     OP( optimize ) = OPTIMIZED;
2929     Set_Node_Offset_Length(optimize,0,0);
2930     optimize++;
2931    }
2932    Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2933   });
2934  } /* end node insert */
2935  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2936
2937  /*  Finish populating the prev field of the wordinfo array.  Walk back
2938  *  from each accept state until we find another accept state, and if
2939  *  so, point the first word's .prev field at the second word. If the
2940  *  second already has a .prev field set, stop now. This will be the
2941  *  case either if we've already processed that word's accept state,
2942  *  or that state had multiple words, and the overspill words were
2943  *  already linked up earlier.
2944  */
2945  {
2946   U16 word;
2947   U32 state;
2948   U16 prev;
2949
2950   for (word=1; word <= trie->wordcount; word++) {
2951    prev = 0;
2952    if (trie->wordinfo[word].prev)
2953     continue;
2954    state = trie->wordinfo[word].accept;
2955    while (state) {
2956     state = prev_states[state];
2957     if (!state)
2958      break;
2959     prev = trie->states[state].wordnum;
2960     if (prev)
2961      break;
2962    }
2963    trie->wordinfo[word].prev = prev;
2964   }
2965   Safefree(prev_states);
2966  }
2967
2968
2969  /* and now dump out the compressed format */
2970  DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2971
2972  RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2973 #ifdef DEBUGGING
2974  RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2975  RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2976 #else
2977  SvREFCNT_dec_NN(revcharmap);
2978 #endif
2979  return trie->jump
2980   ? MADE_JUMP_TRIE
2981   : trie->startstate>1
2982    ? MADE_EXACT_TRIE
2983    : MADE_TRIE;
2984 }
2985
2986 STATIC regnode *
2987 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
2988 {
2989 /* The Trie is constructed and compressed now so we can build a fail array if
2990  * it's needed
2991
2992    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2993    3.32 in the
2994    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2995    Ullman 1985/88
2996    ISBN 0-201-10088-6
2997
2998    We find the fail state for each state in the trie, this state is the longest
2999    proper suffix of the current state's 'word' that is also a proper prefix of
3000    another word in our trie. State 1 represents the word '' and is thus the
3001    default fail state. This allows the DFA not to have to restart after its
3002    tried and failed a word at a given point, it simply continues as though it
3003    had been matching the other word in the first place.
3004    Consider
3005  'abcdgu'=~/abcdefg|cdgu/
3006    When we get to 'd' we are still matching the first word, we would encounter
3007    'g' which would fail, which would bring us to the state representing 'd' in
3008    the second word where we would try 'g' and succeed, proceeding to match
3009    'cdgu'.
3010  */
3011  /* add a fail transition */
3012  const U32 trie_offset = ARG(source);
3013  reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3014  U32 *q;
3015  const U32 ucharcount = trie->uniquecharcount;
3016  const U32 numstates = trie->statecount;
3017  const U32 ubound = trie->lasttrans + ucharcount;
3018  U32 q_read = 0;
3019  U32 q_write = 0;
3020  U32 charid;
3021  U32 base = trie->states[ 1 ].trans.base;
3022  U32 *fail;
3023  reg_ac_data *aho;
3024  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3025  regnode *stclass;
3026  GET_RE_DEBUG_FLAGS_DECL;
3027
3028  PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3029  PERL_UNUSED_CONTEXT;
3030 #ifndef DEBUGGING
3031  PERL_UNUSED_ARG(depth);
3032 #endif
3033
3034  if ( OP(source) == TRIE ) {
3035   struct regnode_1 *op = (struct regnode_1 *)
3036    PerlMemShared_calloc(1, sizeof(struct regnode_1));
3037   StructCopy(source,op,struct regnode_1);
3038   stclass = (regnode *)op;
3039  } else {
3040   struct regnode_charclass *op = (struct regnode_charclass *)
3041    PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3042   StructCopy(source,op,struct regnode_charclass);
3043   stclass = (regnode *)op;
3044  }
3045  OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
3046
3047  ARG_SET( stclass, data_slot );
3048  aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3049  RExC_rxi->data->data[ data_slot ] = (void*)aho;
3050  aho->trie=trie_offset;
3051  aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3052  Copy( trie->states, aho->states, numstates, reg_trie_state );
3053  Newxz( q, numstates, U32);
3054  aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3055  aho->refcount = 1;
3056  fail = aho->fail;
3057  /* initialize fail[0..1] to be 1 so that we always have
3058  a valid final fail state */
3059  fail[ 0 ] = fail[ 1 ] = 1;
3060
3061  for ( charid = 0; charid < ucharcount ; charid++ ) {
3062   const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3063   if ( newstate ) {
3064    q[ q_write ] = newstate;
3065    /* set to point at the root */
3066    fail[ q[ q_write++ ] ]=1;
3067   }
3068  }
3069  while ( q_read < q_write) {
3070   const U32 cur = q[ q_read++ % numstates ];
3071   base = trie->states[ cur ].trans.base;
3072
3073   for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3074    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3075    if (ch_state) {
3076     U32 fail_state = cur;
3077     U32 fail_base;
3078     do {
3079      fail_state = fail[ fail_state ];
3080      fail_base = aho->states[ fail_state ].trans.base;
3081     } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3082
3083     fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3084     fail[ ch_state ] = fail_state;
3085     if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3086     {
3087       aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3088     }
3089     q[ q_write++ % numstates] = ch_state;
3090    }
3091   }
3092  }
3093  /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3094  when we fail in state 1, this allows us to use the
3095  charclass scan to find a valid start char. This is based on the principle
3096  that theres a good chance the string being searched contains lots of stuff
3097  that cant be a start char.
3098  */
3099  fail[ 0 ] = fail[ 1 ] = 0;
3100  DEBUG_TRIE_COMPILE_r({
3101   PerlIO_printf(Perl_debug_log,
3102      "%*sStclass Failtable (%"UVuf" states): 0",
3103      (int)(depth * 2), "", (UV)numstates
3104   );
3105   for( q_read=1; q_read<numstates; q_read++ ) {
3106    PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3107   }
3108   PerlIO_printf(Perl_debug_log, "\n");
3109  });
3110  Safefree(q);
3111  /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3112  return stclass;
3113 }
3114
3115
3116 #define DEBUG_PEEP(str,scan,depth) \
3117  DEBUG_OPTIMISE_r({if (scan){ \
3118  SV * const mysv=sv_newmortal(); \
3119  regnode *Next = regnext(scan); \
3120  regprop(RExC_rx, mysv, scan, NULL); \
3121  PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3122  (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3123  Next ? (REG_NODE_NUM(Next)) : 0 ); \
3124    }});
3125
3126
3127 /* The below joins as many adjacent EXACTish nodes as possible into a single
3128  * one.  The regop may be changed if the node(s) contain certain sequences that
3129  * require special handling.  The joining is only done if:
3130  * 1) there is room in the current conglomerated node to entirely contain the
3131  *    next one.
3132  * 2) they are the exact same node type
3133  *
3134  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3135  * these get optimized out
3136  *
3137  * If a node is to match under /i (folded), the number of characters it matches
3138  * can be different than its character length if it contains a multi-character
3139  * fold.  *min_subtract is set to the total delta number of characters of the
3140  * input nodes.
3141  *
3142  * And *unfolded_multi_char is set to indicate whether or not the node contains
3143  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3144  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3145  * SMALL LETTER SHARP S, as only if the target string being matched against
3146  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3147  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3148  * whose components are all above the Latin1 range are not run-time locale
3149  * dependent, and have already been folded by the time this function is
3150  * called.)
3151  *
3152  * This is as good a place as any to discuss the design of handling these
3153  * multi-character fold sequences.  It's been wrong in Perl for a very long
3154  * time.  There are three code points in Unicode whose multi-character folds
3155  * were long ago discovered to mess things up.  The previous designs for
3156  * dealing with these involved assigning a special node for them.  This
3157  * approach doesn't always work, as evidenced by this example:
3158  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3159  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3160  * would match just the \xDF, it won't be able to handle the case where a
3161  * successful match would have to cross the node's boundary.  The new approach
3162  * that hopefully generally solves the problem generates an EXACTFU_SS node
3163  * that is "sss" in this case.
3164  *
3165  * It turns out that there are problems with all multi-character folds, and not
3166  * just these three.  Now the code is general, for all such cases.  The
3167  * approach taken is:
3168  * 1)   This routine examines each EXACTFish node that could contain multi-
3169  *      character folded sequences.  Since a single character can fold into
3170  *      such a sequence, the minimum match length for this node is less than
3171  *      the number of characters in the node.  This routine returns in
3172  *      *min_subtract how many characters to subtract from the the actual
3173  *      length of the string to get a real minimum match length; it is 0 if
3174  *      there are no multi-char foldeds.  This delta is used by the caller to
3175  *      adjust the min length of the match, and the delta between min and max,
3176  *      so that the optimizer doesn't reject these possibilities based on size
3177  *      constraints.
3178  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3179  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3180  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3181  *      there is a possible fold length change.  That means that a regular
3182  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3183  *      with length changes, and so can be processed faster.  regexec.c takes
3184  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3185  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3186  *      known until runtime).  This saves effort in regex matching.  However,
3187  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3188  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3189  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3190  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3191  *      possibilities for the non-UTF8 patterns are quite simple, except for
3192  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3193  *      members of a fold-pair, and arrays are set up for all of them so that
3194  *      the other member of the pair can be found quickly.  Code elsewhere in
3195  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3196  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3197  *      described in the next item.
3198  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3199  *      validity of the fold won't be known until runtime, and so must remain
3200  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3201  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3202  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3203  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3204  *      The reason this is a problem is that the optimizer part of regexec.c
3205  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3206  *      that a character in the pattern corresponds to at most a single
3207  *      character in the target string.  (And I do mean character, and not byte
3208  *      here, unlike other parts of the documentation that have never been
3209  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3210  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3211  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3212  *      nodes, violate the assumption, and they are the only instances where it
3213  *      is violated.  I'm reluctant to try to change the assumption, as the
3214  *      code involved is impenetrable to me (khw), so instead the code here
3215  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3216  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3217  *      boolean indicating whether or not the node contains such a fold.  When
3218  *      it is true, the caller sets a flag that later causes the optimizer in
3219  *      this file to not set values for the floating and fixed string lengths,
3220  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3221  *      assumption.  Thus, there is no optimization based on string lengths for
3222  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3223  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3224  *      assumption is wrong only in these cases is that all other non-UTF-8
3225  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3226  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3227  *      EXACTF nodes because we don't know at compile time if it actually
3228  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3229  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3230  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3231  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3232  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3233  *      string would require the pattern to be forced into UTF-8, the overhead
3234  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3235  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3236  *      locale.)
3237  *
3238  *      Similarly, the code that generates tries doesn't currently handle
3239  *      not-already-folded multi-char folds, and it looks like a pain to change
3240  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3241  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3242  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3243  *      using /iaa matching will be doing so almost entirely with ASCII
3244  *      strings, so this should rarely be encountered in practice */
3245
3246 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3247  if (PL_regkind[OP(scan)] == EXACT) \
3248   join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3249
3250 STATIC U32
3251 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3252     UV *min_subtract, bool *unfolded_multi_char,
3253     U32 flags,regnode *val, U32 depth)
3254 {
3255  /* Merge several consecutive EXACTish nodes into one. */
3256  regnode *n = regnext(scan);
3257  U32 stringok = 1;
3258  regnode *next = scan + NODE_SZ_STR(scan);
3259  U32 merged = 0;
3260  U32 stopnow = 0;
3261 #ifdef DEBUGGING
3262  regnode *stop = scan;
3263  GET_RE_DEBUG_FLAGS_DECL;
3264 #else
3265  PERL_UNUSED_ARG(depth);
3266 #endif
3267
3268  PERL_ARGS_ASSERT_JOIN_EXACT;
3269 #ifndef EXPERIMENTAL_INPLACESCAN
3270  PERL_UNUSED_ARG(flags);
3271  PERL_UNUSED_ARG(val);
3272 #endif
3273  DEBUG_PEEP("join",scan,depth);
3274
3275  /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3276  * EXACT ones that are mergeable to the current one. */
3277  while (n
3278   && (PL_regkind[OP(n)] == NOTHING
3279    || (stringok && OP(n) == OP(scan)))
3280   && NEXT_OFF(n)
3281   && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3282  {
3283
3284   if (OP(n) == TAIL || n > next)
3285    stringok = 0;
3286   if (PL_regkind[OP(n)] == NOTHING) {
3287    DEBUG_PEEP("skip:",n,depth);
3288    NEXT_OFF(scan) += NEXT_OFF(n);
3289    next = n + NODE_STEP_REGNODE;
3290 #ifdef DEBUGGING
3291    if (stringok)
3292     stop = n;
3293 #endif
3294    n = regnext(n);
3295   }
3296   else if (stringok) {
3297    const unsigned int oldl = STR_LEN(scan);
3298    regnode * const nnext = regnext(n);
3299
3300    /* XXX I (khw) kind of doubt that this works on platforms (should
3301    * Perl ever run on one) where U8_MAX is above 255 because of lots
3302    * of other assumptions */
3303    /* Don't join if the sum can't fit into a single node */
3304    if (oldl + STR_LEN(n) > U8_MAX)
3305     break;
3306
3307    DEBUG_PEEP("merg",n,depth);
3308    merged++;
3309
3310    NEXT_OFF(scan) += NEXT_OFF(n);
3311    STR_LEN(scan) += STR_LEN(n);
3312    next = n + NODE_SZ_STR(n);
3313    /* Now we can overwrite *n : */
3314    Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3315 #ifdef DEBUGGING
3316    stop = next - 1;
3317 #endif
3318    n = nnext;
3319    if (stopnow) break;
3320   }
3321
3322 #ifdef EXPERIMENTAL_INPLACESCAN
3323   if (flags && !NEXT_OFF(n)) {
3324    DEBUG_PEEP("atch", val, depth);
3325    if (reg_off_by_arg[OP(n)]) {
3326     ARG_SET(n, val - n);
3327    }
3328    else {
3329     NEXT_OFF(n) = val - n;
3330    }
3331    stopnow = 1;
3332   }
3333 #endif
3334  }
3335
3336  *min_subtract = 0;
3337  *unfolded_multi_char = FALSE;
3338
3339  /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3340  * can now analyze for sequences of problematic code points.  (Prior to
3341  * this final joining, sequences could have been split over boundaries, and
3342  * hence missed).  The sequences only happen in folding, hence for any
3343  * non-EXACT EXACTish node */
3344  if (OP(scan) != EXACT) {
3345   U8* s0 = (U8*) STRING(scan);
3346   U8* s = s0;
3347   U8* s_end = s0 + STR_LEN(scan);
3348
3349   int total_count_delta = 0;  /* Total delta number of characters that
3350          multi-char folds expand to */
3351
3352   /* One pass is made over the node's string looking for all the
3353   * possibilities.  To avoid some tests in the loop, there are two main
3354   * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3355   * non-UTF-8 */
3356   if (UTF) {
3357    U8* folded = NULL;
3358
3359    if (OP(scan) == EXACTFL) {
3360     U8 *d;
3361
3362     /* An EXACTFL node would already have been changed to another
3363     * node type unless there is at least one character in it that
3364     * is problematic; likely a character whose fold definition
3365     * won't be known until runtime, and so has yet to be folded.
3366     * For all but the UTF-8 locale, folds are 1-1 in length, but
3367     * to handle the UTF-8 case, we need to create a temporary
3368     * folded copy using UTF-8 locale rules in order to analyze it.
3369     * This is because our macros that look to see if a sequence is
3370     * a multi-char fold assume everything is folded (otherwise the
3371     * tests in those macros would be too complicated and slow).
3372     * Note that here, the non-problematic folds will have already
3373     * been done, so we can just copy such characters.  We actually
3374     * don't completely fold the EXACTFL string.  We skip the
3375     * unfolded multi-char folds, as that would just create work
3376     * below to figure out the size they already are */
3377
3378     Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3379     d = folded;
3380     while (s < s_end) {
3381      STRLEN s_len = UTF8SKIP(s);
3382      if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3383       Copy(s, d, s_len, U8);
3384       d += s_len;
3385      }
3386      else if (is_FOLDS_TO_MULTI_utf8(s)) {
3387       *unfolded_multi_char = TRUE;
3388       Copy(s, d, s_len, U8);
3389       d += s_len;
3390      }
3391      else if (isASCII(*s)) {
3392       *(d++) = toFOLD(*s);
3393      }
3394      else {
3395       STRLEN len;
3396       _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3397       d += len;
3398      }
3399      s += s_len;
3400     }
3401
3402     /* Point the remainder of the routine to look at our temporary
3403     * folded copy */
3404     s = folded;
3405     s_end = d;
3406    } /* End of creating folded copy of EXACTFL string */
3407
3408    /* Examine the string for a multi-character fold sequence.  UTF-8
3409    * patterns have all characters pre-folded by the time this code is
3410    * executed */
3411    while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3412          length sequence we are looking for is 2 */
3413    {
3414     int count = 0;  /* How many characters in a multi-char fold */
3415     int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3416     if (! len) {    /* Not a multi-char fold: get next char */
3417      s += UTF8SKIP(s);
3418      continue;
3419     }
3420
3421     /* Nodes with 'ss' require special handling, except for
3422     * EXACTFA-ish for which there is no multi-char fold to this */
3423     if (len == 2 && *s == 's' && *(s+1) == 's'
3424      && OP(scan) != EXACTFA
3425      && OP(scan) != EXACTFA_NO_TRIE)
3426     {
3427      count = 2;
3428      if (OP(scan) != EXACTFL) {
3429       OP(scan) = EXACTFU_SS;
3430      }
3431      s += 2;
3432     }
3433     else { /* Here is a generic multi-char fold. */
3434      U8* multi_end  = s + len;
3435
3436      /* Count how many characters are in it.  In the case of
3437      * /aa, no folds which contain ASCII code points are
3438      * allowed, so check for those, and skip if found. */
3439      if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3440       count = utf8_length(s, multi_end);
3441       s = multi_end;
3442      }
3443      else {
3444       while (s < multi_end) {
3445        if (isASCII(*s)) {
3446         s++;
3447         goto next_iteration;
3448        }
3449        else {
3450         s += UTF8SKIP(s);
3451        }
3452        count++;
3453       }
3454      }
3455     }
3456
3457     /* The delta is how long the sequence is minus 1 (1 is how long
3458     * the character that folds to the sequence is) */
3459     total_count_delta += count - 1;
3460    next_iteration: ;
3461    }
3462
3463    /* We created a temporary folded copy of the string in EXACTFL
3464    * nodes.  Therefore we need to be sure it doesn't go below zero,
3465    * as the real string could be shorter */
3466    if (OP(scan) == EXACTFL) {
3467     int total_chars = utf8_length((U8*) STRING(scan),
3468           (U8*) STRING(scan) + STR_LEN(scan));
3469     if (total_count_delta > total_chars) {
3470      total_count_delta = total_chars;
3471     }
3472    }
3473
3474    *min_subtract += total_count_delta;
3475    Safefree(folded);
3476   }
3477   else if (OP(scan) == EXACTFA) {
3478
3479    /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3480    * fold to the ASCII range (and there are no existing ones in the
3481    * upper latin1 range).  But, as outlined in the comments preceding
3482    * this function, we need to flag any occurrences of the sharp s.
3483    * This character forbids trie formation (because of added
3484    * complexity) */
3485    while (s < s_end) {
3486     if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3487      OP(scan) = EXACTFA_NO_TRIE;
3488      *unfolded_multi_char = TRUE;
3489      break;
3490     }
3491     s++;
3492     continue;
3493    }
3494   }
3495   else {
3496
3497    /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3498    * folds that are all Latin1.  As explained in the comments
3499    * preceding this function, we look also for the sharp s in EXACTF
3500    * and EXACTFL nodes; it can be in the final position.  Otherwise
3501    * we can stop looking 1 byte earlier because have to find at least
3502    * two characters for a multi-fold */
3503    const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3504        ? s_end
3505        : s_end -1;
3506
3507    while (s < upper) {
3508     int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3509     if (! len) {    /* Not a multi-char fold. */
3510      if (*s == LATIN_SMALL_LETTER_SHARP_S
3511       && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3512      {
3513       *unfolded_multi_char = TRUE;
3514      }
3515      s++;
3516      continue;
3517     }
3518
3519     if (len == 2
3520      && isARG2_lower_or_UPPER_ARG1('s', *s)
3521      && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3522     {
3523
3524      /* EXACTF nodes need to know that the minimum length
3525      * changed so that a sharp s in the string can match this
3526      * ss in the pattern, but they remain EXACTF nodes, as they
3527      * won't match this unless the target string is is UTF-8,
3528      * which we don't know until runtime.  EXACTFL nodes can't
3529      * transform into EXACTFU nodes */
3530      if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3531       OP(scan) = EXACTFU_SS;
3532      }
3533     }
3534
3535     *min_subtract += len - 1;
3536     s += len;
3537    }
3538   }
3539  }
3540
3541 #ifdef DEBUGGING
3542  /* Allow dumping but overwriting the collection of skipped
3543  * ops and/or strings with fake optimized ops */
3544  n = scan + NODE_SZ_STR(scan);
3545  while (n <= stop) {
3546   OP(n) = OPTIMIZED;
3547   FLAGS(n) = 0;
3548   NEXT_OFF(n) = 0;
3549   n++;
3550  }
3551 #endif
3552  DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3553  return stopnow;
3554 }
3555
3556 /* REx optimizer.  Converts nodes into quicker variants "in place".
3557    Finds fixed substrings.  */
3558
3559 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3560    to the position after last scanned or to NULL. */
3561
3562 #define INIT_AND_WITHP \
3563  assert(!and_withp); \
3564  Newx(and_withp,1, regnode_ssc); \
3565  SAVEFREEPV(and_withp)
3566
3567 /* this is a chain of data about sub patterns we are processing that
3568    need to be handled separately/specially in study_chunk. Its so
3569    we can simulate recursion without losing state.  */
3570 struct scan_frame;
3571 typedef struct scan_frame {
3572  regnode *last;  /* last node to process in this frame */
3573  regnode *next;  /* next node to process when last is reached */
3574  struct scan_frame *prev; /*previous frame*/
3575  U32 prev_recursed_depth;
3576  I32 stop; /* what stopparen do we use */
3577 } scan_frame;
3578
3579
3580 STATIC SSize_t
3581 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3582       SSize_t *minlenp, SSize_t *deltap,
3583       regnode *last,
3584       scan_data_t *data,
3585       I32 stopparen,
3586       U32 recursed_depth,
3587       regnode_ssc *and_withp,
3588       U32 flags, U32 depth)
3589       /* scanp: Start here (read-write). */
3590       /* deltap: Write maxlen-minlen here. */
3591       /* last: Stop before this one. */
3592       /* data: string data about the pattern */
3593       /* stopparen: treat close N as END */
3594       /* recursed: which subroutines have we recursed into */
3595       /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3596 {
3597  /* There must be at least this number of characters to match */
3598  SSize_t min = 0;
3599  I32 pars = 0, code;
3600  regnode *scan = *scanp, *next;
3601  SSize_t delta = 0;
3602  int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3603  int is_inf_internal = 0;  /* The studied chunk is infinite */
3604  I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3605  scan_data_t data_fake;
3606  SV *re_trie_maxbuff = NULL;
3607  regnode *first_non_open = scan;
3608  SSize_t stopmin = SSize_t_MAX;
3609  scan_frame *frame = NULL;
3610  GET_RE_DEBUG_FLAGS_DECL;
3611
3612  PERL_ARGS_ASSERT_STUDY_CHUNK;
3613
3614 #ifdef DEBUGGING
3615  StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3616 #endif
3617  if ( depth == 0 ) {
3618   while (first_non_open && OP(first_non_open) == OPEN)
3619    first_non_open=regnext(first_non_open);
3620  }
3621
3622
3623   fake_study_recurse:
3624  while ( scan && OP(scan) != END && scan < last ){
3625   UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3626         node length to get a real minimum (because
3627         the folded version may be shorter) */
3628   bool unfolded_multi_char = FALSE;
3629   /* Peephole optimizer: */
3630   DEBUG_OPTIMISE_MORE_r(
3631   {
3632    PerlIO_printf(Perl_debug_log,
3633     "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3634     ((int) depth*2), "", (long)stopparen,
3635     (unsigned long)depth, (unsigned long)recursed_depth);
3636    if (recursed_depth) {
3637     U32 i;
3638     U32 j;
3639     for ( j = 0 ; j < recursed_depth ; j++ ) {
3640      PerlIO_printf(Perl_debug_log,"[");
3641      for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3642       PerlIO_printf(Perl_debug_log,"%d",
3643        PAREN_TEST(RExC_study_chunk_recursed +
3644          (j * RExC_study_chunk_recursed_bytes), i)
3645        ? 1 : 0
3646       );
3647      PerlIO_printf(Perl_debug_log,"]");
3648     }
3649    }
3650    PerlIO_printf(Perl_debug_log,"\n");
3651   }
3652   );
3653   DEBUG_STUDYDATA("Peep:", data, depth);
3654   DEBUG_PEEP("Peep", scan, depth);
3655
3656
3657   /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3658   * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3659   * by a different invocation of reg() -- Yves
3660   */
3661   JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3662
3663   /* Follow the next-chain of the current node and optimize
3664   away all the NOTHINGs from it.  */
3665   if (OP(scan) != CURLYX) {
3666    const int max = (reg_off_by_arg[OP(scan)]
3667      ? I32_MAX
3668      /* I32 may be smaller than U16 on CRAYs! */
3669      : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3670    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3671    int noff;
3672    regnode *n = scan;
3673
3674    /* Skip NOTHING and LONGJMP. */
3675    while ((n = regnext(n))
3676     && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3677      || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3678     && off + noff < max)
3679     off += noff;
3680    if (reg_off_by_arg[OP(scan)])
3681     ARG(scan) = off;
3682    else
3683     NEXT_OFF(scan) = off;
3684   }
3685
3686
3687
3688   /* The principal pseudo-switch.  Cannot be a switch, since we
3689   look into several different things.  */
3690   if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3691     || OP(scan) == IFTHEN) {
3692    next = regnext(scan);
3693    code = OP(scan);
3694    /* demq: the op(next)==code check is to see if we have
3695    * "branch-branch" AFAICT */
3696
3697    if (OP(next) == code || code == IFTHEN) {
3698     /* NOTE - There is similar code to this block below for
3699     * handling TRIE nodes on a re-study.  If you change stuff here
3700     * check there too. */
3701     SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3702     regnode_ssc accum;
3703     regnode * const startbranch=scan;
3704
3705     if (flags & SCF_DO_SUBSTR) {
3706      /* Cannot merge strings after this. */
3707      scan_commit(pRExC_state, data, minlenp, is_inf);
3708     }
3709
3710     if (flags & SCF_DO_STCLASS)
3711      ssc_init_zero(pRExC_state, &accum);
3712
3713     while (OP(scan) == code) {
3714      SSize_t deltanext, minnext, fake;
3715      I32 f = 0;
3716      regnode_ssc this_class;
3717
3718      num++;
3719      data_fake.flags = 0;
3720      if (data) {
3721       data_fake.whilem_c = data->whilem_c;
3722       data_fake.last_closep = data->last_closep;
3723      }
3724      else
3725       data_fake.last_closep = &fake;
3726
3727      data_fake.pos_delta = delta;
3728      next = regnext(scan);
3729      scan = NEXTOPER(scan);
3730      if (code != BRANCH)
3731       scan = NEXTOPER(scan);
3732      if (flags & SCF_DO_STCLASS) {
3733       ssc_init(pRExC_state, &this_class);
3734       data_fake.start_class = &this_class;
3735       f = SCF_DO_STCLASS_AND;
3736      }
3737      if (flags & SCF_WHILEM_VISITED_POS)
3738       f |= SCF_WHILEM_VISITED_POS;
3739
3740      /* we suppose the run is continuous, last=next...*/
3741      minnext = study_chunk(pRExC_state, &scan, minlenp,
3742          &deltanext, next, &data_fake, stopparen,
3743          recursed_depth, NULL, f,depth+1);
3744      if (min1 > minnext)
3745       min1 = minnext;
3746      if (deltanext == SSize_t_MAX) {
3747       is_inf = is_inf_internal = 1;
3748       max1 = SSize_t_MAX;
3749      } else if (max1 < minnext + deltanext)
3750       max1 = minnext + deltanext;
3751      scan = next;
3752      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3753       pars++;
3754      if (data_fake.flags & SCF_SEEN_ACCEPT) {
3755       if ( stopmin > minnext)
3756        stopmin = min + min1;
3757       flags &= ~SCF_DO_SUBSTR;
3758       if (data)
3759        data->flags |= SCF_SEEN_ACCEPT;
3760      }
3761      if (data) {
3762       if (data_fake.flags & SF_HAS_EVAL)
3763        data->flags |= SF_HAS_EVAL;
3764       data->whilem_c = data_fake.whilem_c;
3765      }
3766      if (flags & SCF_DO_STCLASS)
3767       ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3768     }
3769     if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3770      min1 = 0;
3771     if (flags & SCF_DO_SUBSTR) {
3772      data->pos_min += min1;
3773      if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3774       data->pos_delta = SSize_t_MAX;
3775      else
3776       data->pos_delta += max1 - min1;
3777      if (max1 != min1 || is_inf)
3778       data->longest = &(data->longest_float);
3779     }
3780     min += min1;
3781     if (delta == SSize_t_MAX
3782     || SSize_t_MAX - delta - (max1 - min1) < 0)
3783      delta = SSize_t_MAX;
3784     else
3785      delta += max1 - min1;
3786     if (flags & SCF_DO_STCLASS_OR) {
3787      ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3788      if (min1) {
3789       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3790       flags &= ~SCF_DO_STCLASS;
3791      }
3792     }
3793     else if (flags & SCF_DO_STCLASS_AND) {
3794      if (min1) {
3795       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3796       flags &= ~SCF_DO_STCLASS;
3797      }
3798      else {
3799       /* Switch to OR mode: cache the old value of
3800       * data->start_class */
3801       INIT_AND_WITHP;
3802       StructCopy(data->start_class, and_withp, regnode_ssc);
3803       flags &= ~SCF_DO_STCLASS_AND;
3804       StructCopy(&accum, data->start_class, regnode_ssc);
3805       flags |= SCF_DO_STCLASS_OR;
3806      }
3807     }
3808
3809     if (PERL_ENABLE_TRIE_OPTIMISATION &&
3810       OP( startbranch ) == BRANCH )
3811     {
3812     /* demq.
3813
3814     Assuming this was/is a branch we are dealing with: 'scan'
3815     now points at the item that follows the branch sequence,
3816     whatever it is. We now start at the beginning of the
3817     sequence and look for subsequences of
3818
3819     BRANCH->EXACT=>x1
3820     BRANCH->EXACT=>x2
3821     tail
3822
3823     which would be constructed from a pattern like
3824     /A|LIST|OF|WORDS/
3825
3826     If we can find such a subsequence we need to turn the first
3827     element into a trie and then add the subsequent branch exact
3828     strings to the trie.
3829
3830     We have two cases
3831
3832      1. patterns where the whole set of branches can be
3833       converted.
3834
3835      2. patterns where only a subset can be converted.
3836
3837     In case 1 we can replace the whole set with a single regop
3838     for the trie. In case 2 we need to keep the start and end
3839     branches so
3840
3841      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3842      becomes BRANCH TRIE; BRANCH X;
3843
3844     There is an additional case, that being where there is a
3845     common prefix, which gets split out into an EXACT like node
3846     preceding the TRIE node.
3847
3848     If x(1..n)==tail then we can do a simple trie, if not we make
3849     a "jump" trie, such that when we match the appropriate word
3850     we "jump" to the appropriate tail node. Essentially we turn
3851     a nested if into a case structure of sorts.
3852
3853     */
3854
3855      int made=0;
3856      if (!re_trie_maxbuff) {
3857       re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3858       if (!SvIOK(re_trie_maxbuff))
3859        sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3860      }
3861      if ( SvIV(re_trie_maxbuff)>=0  ) {
3862       regnode *cur;
3863       regnode *first = (regnode *)NULL;
3864       regnode *last = (regnode *)NULL;
3865       regnode *tail = scan;
3866       U8 trietype = 0;
3867       U32 count=0;
3868
3869 #ifdef DEBUGGING
3870       SV * const mysv = sv_newmortal();   /* for dumping */
3871 #endif
3872       /* var tail is used because there may be a TAIL
3873       regop in the way. Ie, the exacts will point to the
3874       thing following the TAIL, but the last branch will
3875       point at the TAIL. So we advance tail. If we
3876       have nested (?:) we may have to move through several
3877       tails.
3878       */
3879
3880       while ( OP( tail ) == TAIL ) {
3881        /* this is the TAIL generated by (?:) */
3882        tail = regnext( tail );
3883       }
3884
3885
3886       DEBUG_TRIE_COMPILE_r({
3887        regprop(RExC_rx, mysv, tail, NULL);
3888        PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3889        (int)depth * 2 + 2, "",
3890        "Looking for TRIE'able sequences. Tail node is: ",
3891        SvPV_nolen_const( mysv )
3892        );
3893       });
3894
3895       /*
3896
3897        Step through the branches
3898         cur represents each branch,
3899         noper is the first thing to be matched as part
3900          of that branch
3901         noper_next is the regnext() of that node.
3902
3903        We normally handle a case like this
3904        /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3905        support building with NOJUMPTRIE, which restricts
3906        the trie logic to structures like /FOO|BAR/.
3907
3908        If noper is a trieable nodetype then the branch is
3909        a possible optimization target. If we are building
3910        under NOJUMPTRIE then we require that noper_next is
3911        the same as scan (our current position in the regex
3912        program).
3913
3914        Once we have two or more consecutive such branches
3915        we can create a trie of the EXACT's contents and
3916        stitch it in place into the program.
3917
3918        If the sequence represents all of the branches in
3919        the alternation we replace the entire thing with a
3920        single TRIE node.
3921
3922        Otherwise when it is a subsequence we need to
3923        stitch it in place and replace only the relevant
3924        branches. This means the first branch has to remain
3925        as it is used by the alternation logic, and its
3926        next pointer, and needs to be repointed at the item
3927        on the branch chain following the last branch we
3928        have optimized away.
3929
3930        This could be either a BRANCH, in which case the
3931        subsequence is internal, or it could be the item
3932        following the branch sequence in which case the
3933        subsequence is at the end (which does not
3934        necessarily mean the first node is the start of the
3935        alternation).
3936
3937        TRIE_TYPE(X) is a define which maps the optype to a
3938        trietype.
3939
3940         optype          |  trietype
3941         ----------------+-----------
3942         NOTHING         | NOTHING
3943         EXACT           | EXACT
3944         EXACTFU         | EXACTFU
3945         EXACTFU_SS      | EXACTFU
3946         EXACTFA         | EXACTFA
3947
3948
3949       */
3950 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3951      ( EXACT == (X) )   ? EXACT :        \
3952      ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
3953      ( EXACTFA == (X) ) ? EXACTFA :        \
3954      0 )
3955
3956       /* dont use tail as the end marker for this traverse */
3957       for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3958        regnode * const noper = NEXTOPER( cur );
3959        U8 noper_type = OP( noper );
3960        U8 noper_trietype = TRIE_TYPE( noper_type );
3961 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3962        regnode * const noper_next = regnext( noper );
3963        U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3964        U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3965 #endif
3966
3967        DEBUG_TRIE_COMPILE_r({
3968         regprop(RExC_rx, mysv, cur, NULL);
3969         PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3970         (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3971
3972         regprop(RExC_rx, mysv, noper, NULL);
3973         PerlIO_printf( Perl_debug_log, " -> %s",
3974          SvPV_nolen_const(mysv));
3975
3976         if ( noper_next ) {
3977         regprop(RExC_rx, mysv, noper_next, NULL);
3978         PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3979          SvPV_nolen_const(mysv));
3980         }
3981         PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3982         REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3983         PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3984         );
3985        });
3986
3987        /* Is noper a trieable nodetype that can be merged
3988        * with the current trie (if there is one)? */
3989        if ( noper_trietype
3990         &&
3991         (
3992           ( noper_trietype == NOTHING)
3993           || ( trietype == NOTHING )
3994           || ( trietype == noper_trietype )
3995         )
3996 #ifdef NOJUMPTRIE
3997         && noper_next == tail
3998 #endif
3999         && count < U16_MAX)
4000        {
4001         /* Handle mergable triable node Either we are
4002         * the first node in a new trieable sequence,
4003         * in which case we do some bookkeeping,
4004         * otherwise we update the end pointer. */
4005         if ( !first ) {
4006          first = cur;
4007          if ( noper_trietype == NOTHING ) {
4008 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4009           regnode * const noper_next = regnext( noper );
4010           U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4011           U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4012 #endif
4013
4014           if ( noper_next_trietype ) {
4015            trietype = noper_next_trietype;
4016           } else if (noper_next_type)  {
4017            /* a NOTHING regop is 1 regop wide.
4018            * We need at least two for a trie
4019            * so we can't merge this in */
4020            first = NULL;
4021           }
4022          } else {
4023           trietype = noper_trietype;
4024          }
4025         } else {
4026          if ( trietype == NOTHING )
4027           trietype = noper_trietype;
4028          last = cur;
4029         }
4030         if (first)
4031          count++;
4032        } /* end handle mergable triable node */
4033        else {
4034         /* handle unmergable node -
4035         * noper may either be a triable node which can
4036         * not be tried together with the current trie,
4037         * or a non triable node */
4038         if ( last ) {
4039          /* If last is set and trietype is not
4040          * NOTHING then we have found at least two
4041          * triable branch sequences in a row of a
4042          * similar trietype so we can turn them
4043          * into a trie. If/when we allow NOTHING to
4044          * start a trie sequence this condition
4045          * will be required, and it isn't expensive
4046          * so we leave it in for now. */
4047          if ( trietype && trietype != NOTHING )
4048           make_trie( pRExC_state,
4049             startbranch, first, cur, tail,
4050             count, trietype, depth+1 );
4051          last = NULL; /* note: we clear/update
4052              first, trietype etc below,
4053              so we dont do it here */
4054         }
4055         if ( noper_trietype
4056 #ifdef NOJUMPTRIE
4057          && noper_next == tail
4058 #endif
4059         ){
4060          /* noper is triable, so we can start a new
4061          * trie sequence */
4062          count = 1;
4063          first = cur;
4064          trietype = noper_trietype;
4065         } else if (first) {
4066          /* if we already saw a first but the
4067          * current node is not triable then we have
4068          * to reset the first information. */
4069          count = 0;
4070          first = NULL;
4071          trietype = 0;
4072         }
4073        } /* end handle unmergable node */
4074       } /* loop over branches */
4075       DEBUG_TRIE_COMPILE_r({
4076        regprop(RExC_rx, mysv, cur, NULL);
4077        PerlIO_printf( Perl_debug_log,
4078        "%*s- %s (%d) <SCAN FINISHED>\n",
4079        (int)depth * 2 + 2,
4080        "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4081
4082       });
4083       if ( last && trietype ) {
4084        if ( trietype != NOTHING ) {
4085         /* the last branch of the sequence was part of
4086         * a trie, so we have to construct it here
4087         * outside of the loop */
4088         made= make_trie( pRExC_state, startbranch,
4089             first, scan, tail, count,
4090             trietype, depth+1 );
4091 #ifdef TRIE_STUDY_OPT
4092         if ( ((made == MADE_EXACT_TRIE &&
4093          startbranch == first)
4094          || ( first_non_open == first )) &&
4095          depth==0 ) {
4096          flags |= SCF_TRIE_RESTUDY;
4097          if ( startbranch == first
4098           && scan == tail )
4099          {
4100           RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4101          }
4102         }
4103 #endif
4104        } else {
4105         /* at this point we know whatever we have is a
4106         * NOTHING sequence/branch AND if 'startbranch'
4107         * is 'first' then we can turn the whole thing
4108         * into a NOTHING
4109         */
4110         if ( startbranch == first ) {
4111          regnode *opt;
4112          /* the entire thing is a NOTHING sequence,
4113          * something like this: (?:|) So we can
4114          * turn it into a plain NOTHING op. */
4115          DEBUG_TRIE_COMPILE_r({
4116           regprop(RExC_rx, mysv, cur, NULL);
4117           PerlIO_printf( Perl_debug_log,
4118           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4119           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4120
4121          });
4122          OP(startbranch)= NOTHING;
4123          NEXT_OFF(startbranch)= tail - startbranch;
4124          for ( opt= startbranch + 1; opt < tail ; opt++ )
4125           OP(opt)= OPTIMIZED;
4126         }
4127        }
4128       } /* end if ( last) */
4129      } /* TRIE_MAXBUF is non zero */
4130
4131     } /* do trie */
4132
4133    }
4134    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4135     scan = NEXTOPER(NEXTOPER(scan));
4136    } else   /* single branch is optimized. */
4137     scan = NEXTOPER(scan);
4138    continue;
4139   } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4140    scan_frame *newframe = NULL;
4141    I32 paren;
4142    regnode *start;
4143    regnode *end;
4144    U32 my_recursed_depth= recursed_depth;
4145
4146    if (OP(scan) != SUSPEND) {
4147     /* set the pointer */
4148     if (OP(scan) == GOSUB) {
4149      paren = ARG(scan);
4150      RExC_recurse[ARG2L(scan)] = scan;
4151      start = RExC_open_parens[paren-1];
4152      end   = RExC_close_parens[paren-1];
4153     } else {
4154      paren = 0;
4155      start = RExC_rxi->program + 1;
4156      end   = RExC_opend;
4157     }
4158     if (!recursed_depth
4159      ||
4160      !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4161     ) {
4162      if (!recursed_depth) {
4163       Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4164      } else {
4165       Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4166        RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4167        RExC_study_chunk_recursed_bytes, U8);
4168      }
4169      /* we havent recursed into this paren yet, so recurse into it */
4170      DEBUG_STUDYDATA("set:", data,depth);
4171      PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4172      my_recursed_depth= recursed_depth + 1;
4173      Newx(newframe,1,scan_frame);
4174     } else {
4175      DEBUG_STUDYDATA("inf:", data,depth);
4176      /* some form of infinite recursion, assume infinite length
4177      * */
4178      if (flags & SCF_DO_SUBSTR) {
4179       scan_commit(pRExC_state, data, minlenp, is_inf);
4180       data->longest = &(data->longest_float);
4181      }
4182      is_inf = is_inf_internal = 1;
4183      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4184       ssc_anything(data->start_class);
4185      flags &= ~SCF_DO_STCLASS;
4186     }
4187    } else {
4188     Newx(newframe,1,scan_frame);
4189     paren = stopparen;
4190     start = scan+2;
4191     end = regnext(scan);
4192    }
4193    if (newframe) {
4194     assert(start);
4195     assert(end);
4196     SAVEFREEPV(newframe);
4197     newframe->next = regnext(scan);
4198     newframe->last = last;
4199     newframe->stop = stopparen;
4200     newframe->prev = frame;
4201     newframe->prev_recursed_depth = recursed_depth;
4202
4203     DEBUG_STUDYDATA("frame-new:",data,depth);
4204     DEBUG_PEEP("fnew", scan, depth);
4205
4206     frame = newframe;
4207     scan =  start;
4208     stopparen = paren;
4209     last = end;
4210     depth = depth + 1;
4211     recursed_depth= my_recursed_depth;
4212
4213     continue;
4214    }
4215   }
4216   else if (OP(scan) == EXACT) {
4217    SSize_t l = STR_LEN(scan);
4218    UV uc;
4219    if (UTF) {
4220     const U8 * const s = (U8*)STRING(scan);
4221     uc = utf8_to_uvchr_buf(s, s + l, NULL);
4222     l = utf8_length(s, s + l);
4223    } else {
4224     uc = *((U8*)STRING(scan));
4225    }
4226    min += l;
4227    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4228     /* The code below prefers earlier match for fixed
4229     offset, later match for variable offset.  */
4230     if (data->last_end == -1) { /* Update the start info. */
4231      data->last_start_min = data->pos_min;
4232      data->last_start_max = is_inf
4233       ? SSize_t_MAX : data->pos_min + data->pos_delta;
4234     }
4235     sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4236     if (UTF)
4237      SvUTF8_on(data->last_found);
4238     {
4239      SV * const sv = data->last_found;
4240      MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4241       mg_find(sv, PERL_MAGIC_utf8) : NULL;
4242      if (mg && mg->mg_len >= 0)
4243       mg->mg_len += utf8_length((U8*)STRING(scan),
4244            (U8*)STRING(scan)+STR_LEN(scan));
4245     }
4246     data->last_end = data->pos_min + l;
4247     data->pos_min += l; /* As in the first entry. */
4248     data->flags &= ~SF_BEFORE_EOL;
4249    }
4250
4251    /* ANDing the code point leaves at most it, and not in locale, and
4252    * can't match null string */
4253    if (flags & SCF_DO_STCLASS_AND) {
4254     ssc_cp_and(data->start_class, uc);
4255     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4256     ssc_clear_locale(data->start_class);
4257    }
4258    else if (flags & SCF_DO_STCLASS_OR) {
4259     ssc_add_cp(data->start_class, uc);
4260     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4261
4262     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4263     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4264    }
4265    flags &= ~SCF_DO_STCLASS;
4266   }
4267   else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is
4268              EXACTFish */
4269    SSize_t l = STR_LEN(scan);
4270    UV uc = *((U8*)STRING(scan));
4271    SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4272              separate code points */
4273    const U8 * s = (U8*)STRING(scan);
4274
4275    /* Search for fixed substrings supports EXACT only. */
4276    if (flags & SCF_DO_SUBSTR) {
4277     assert(data);
4278     scan_commit(pRExC_state, data, minlenp, is_inf);
4279    }
4280    if (UTF) {
4281     uc = utf8_to_uvchr_buf(s, s + l, NULL);
4282     l = utf8_length(s, s + l);
4283    }
4284    if (unfolded_multi_char) {
4285     RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4286    }
4287    min += l - min_subtract;
4288    assert (min >= 0);
4289    delta += min_subtract;
4290    if (flags & SCF_DO_SUBSTR) {
4291     data->pos_min += l - min_subtract;
4292     if (data->pos_min < 0) {
4293      data->pos_min = 0;
4294     }
4295     data->pos_delta += min_subtract;
4296     if (min_subtract) {
4297      data->longest = &(data->longest_float);
4298     }
4299    }
4300
4301    if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
4302     ssc_clear_locale(data->start_class);
4303    }
4304
4305    if (! UTF) {
4306
4307     /* We punt and assume can match anything if the node begins
4308     * with a multi-character fold.  Things are complicated.  For
4309     * example, /ffi/i could match any of:
4310     *  "\N{LATIN SMALL LIGATURE FFI}"
4311     *  "\N{LATIN SMALL LIGATURE FF}I"
4312     *  "F\N{LATIN SMALL LIGATURE FI}"
4313     *  plus several other things; and making sure we have all the
4314     *  possibilities is hard. */
4315     if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
4316      EXACTF_invlist =
4317        _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4318     }
4319     else {
4320
4321      /* Any Latin1 range character can potentially match any
4322      * other depending on the locale */
4323      if (OP(scan) == EXACTFL) {
4324       _invlist_union(EXACTF_invlist, PL_Latin1,
4325                &EXACTF_invlist);
4326      }
4327      else {
4328       /* But otherwise, it matches at least itself.  We can
4329       * quickly tell if it has a distinct fold, and if so,
4330       * it matches that as well */
4331       EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4332       if (IS_IN_SOME_FOLD_L1(uc)) {
4333        EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4334               PL_fold_latin1[uc]);
4335       }
4336      }
4337
4338      /* Some characters match above-Latin1 ones under /i.  This
4339      * is true of EXACTFL ones when the locale is UTF-8 */
4340      if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
4341       && (! isASCII(uc) || (OP(scan) != EXACTFA
4342            && OP(scan) != EXACTFA_NO_TRIE)))
4343      {
4344       add_above_Latin1_folds(pRExC_state,
4345            (U8) uc,
4346            &EXACTF_invlist);
4347      }
4348     }
4349    }
4350    else {  /* Pattern is UTF-8 */
4351     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4352     STRLEN foldlen = UTF8SKIP(s);
4353     const U8* e = s + STR_LEN(scan);
4354     SV** listp;
4355
4356     /* The only code points that aren't folded in a UTF EXACTFish
4357     * node are are the problematic ones in EXACTFL nodes */
4358     if (OP(scan) == EXACTFL
4359      && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
4360     {
4361      /* We need to check for the possibility that this EXACTFL
4362      * node begins with a multi-char fold.  Therefore we fold
4363      * the first few characters of it so that we can make that
4364      * check */
4365      U8 *d = folded;
4366      int i;
4367
4368      for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
4369       if (isASCII(*s)) {
4370        *(d++) = (U8) toFOLD(*s);
4371        s++;
4372       }
4373       else {
4374        STRLEN len;
4375        to_utf8_fold(s, d, &len);
4376        d += len;
4377        s += UTF8SKIP(s);
4378       }
4379      }
4380
4381      /* And set up so the code below that looks in this folded
4382      * buffer instead of the node's string */
4383      e = d;
4384      foldlen = UTF8SKIP(folded);
4385      s = folded;
4386     }
4387
4388     /* When we reach here 's' points to the fold of the first
4389     * character(s) of the node; and 'e' points to far enough along
4390     * the folded string to be just past any possible multi-char
4391     * fold. 'foldlen' is the length in bytes of the first
4392     * character in 's'
4393     *
4394     * Unlike the non-UTF-8 case, the macro for determining if a
4395     * string is a multi-char fold requires all the characters to
4396     * already be folded.  This is because of all the complications
4397     * if not.  Note that they are folded anyway, except in EXACTFL
4398     * nodes.  Like the non-UTF case above, we punt if the node
4399     * begins with a multi-char fold  */
4400
4401     if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
4402      EXACTF_invlist =
4403        _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4404     }
4405     else {  /* Single char fold */
4406
4407      /* It matches all the things that fold to it, which are
4408      * found in PL_utf8_foldclosures (including itself) */
4409      EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4410      if (! PL_utf8_foldclosures) {
4411       _load_PL_utf8_foldclosures();
4412      }
4413      if ((listp = hv_fetch(PL_utf8_foldclosures,
4414           (char *) s, foldlen, FALSE)))
4415      {
4416       AV* list = (AV*) *listp;
4417       IV k;
4418       for (k = 0; k <= av_tindex(list); k++) {
4419        SV** c_p = av_fetch(list, k, FALSE);
4420        UV c;
4421        assert(c_p);
4422
4423        c = SvUV(*c_p);
4424
4425        /* /aa doesn't allow folds between ASCII and non- */
4426        if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
4427         && isASCII(c) != isASCII(uc))
4428        {
4429         continue;
4430        }
4431
4432        EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
4433       }
4434      }
4435     }
4436    }
4437    if (flags & SCF_DO_STCLASS_AND) {
4438     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4439     ANYOF_POSIXL_ZERO(data->start_class);
4440     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4441    }
4442    else if (flags & SCF_DO_STCLASS_OR) {
4443     ssc_union(data->start_class, EXACTF_invlist, FALSE);
4444     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4445
4446     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4447     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4448    }
4449    flags &= ~SCF_DO_STCLASS;
4450    SvREFCNT_dec(EXACTF_invlist);
4451   }
4452   else if (REGNODE_VARIES(OP(scan))) {
4453    SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4454    I32 fl = 0, f = flags;
4455    regnode * const oscan = scan;
4456    regnode_ssc this_class;
4457    regnode_ssc *oclass = NULL;
4458    I32 next_is_eval = 0;
4459
4460    switch (PL_regkind[OP(scan)]) {
4461    case WHILEM:  /* End of (?:...)* . */
4462     scan = NEXTOPER(scan);
4463     goto finish;
4464    case PLUS:
4465     if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4466      next = NEXTOPER(scan);
4467      if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4468       mincount = 1;
4469       maxcount = REG_INFTY;
4470       next = regnext(scan);
4471       scan = NEXTOPER(scan);
4472       goto do_curly;
4473      }
4474     }
4475     if (flags & SCF_DO_SUBSTR)
4476      data->pos_min++;
4477     min++;
4478     /* FALLTHROUGH */
4479    case STAR:
4480     if (flags & SCF_DO_STCLASS) {
4481      mincount = 0;
4482      maxcount = REG_INFTY;
4483      next = regnext(scan);
4484      scan = NEXTOPER(scan);
4485      goto do_curly;
4486     }
4487     if (flags & SCF_DO_SUBSTR) {
4488      scan_commit(pRExC_state, data, minlenp, is_inf);
4489      /* Cannot extend fixed substrings */
4490      data->longest = &(data->longest_float);
4491     }
4492     is_inf = is_inf_internal = 1;
4493     scan = regnext(scan);
4494     goto optimize_curly_tail;
4495    case CURLY:
4496     if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4497      && (scan->flags == stopparen))
4498     {
4499      mincount = 1;
4500      maxcount = 1;
4501     } else {
4502      mincount = ARG1(scan);
4503      maxcount = ARG2(scan);
4504     }
4505     next = regnext(scan);
4506     if (OP(scan) == CURLYX) {
4507      I32 lp = (data ? *(data->last_closep) : 0);
4508      scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4509     }
4510     scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4511     next_is_eval = (OP(scan) == EVAL);
4512    do_curly:
4513     if (flags & SCF_DO_SUBSTR) {
4514      if (mincount == 0)
4515       scan_commit(pRExC_state, data, minlenp, is_inf);
4516      /* Cannot extend fixed substrings */
4517      pos_before = data->pos_min;
4518     }
4519     if (data) {
4520      fl = data->flags;
4521      data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4522      if (is_inf)
4523       data->flags |= SF_IS_INF;
4524     }
4525     if (flags & SCF_DO_STCLASS) {
4526      ssc_init(pRExC_state, &this_class);
4527      oclass = data->start_class;
4528      data->start_class = &this_class;
4529      f |= SCF_DO_STCLASS_AND;
4530      f &= ~SCF_DO_STCLASS_OR;
4531     }
4532     /* Exclude from super-linear cache processing any {n,m}
4533     regops for which the combination of input pos and regex
4534     pos is not enough information to determine if a match
4535     will be possible.
4536
4537     For example, in the regex /foo(bar\s*){4,8}baz/ with the
4538     regex pos at the \s*, the prospects for a match depend not
4539     only on the input position but also on how many (bar\s*)
4540     repeats into the {4,8} we are. */
4541    if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4542      f &= ~SCF_WHILEM_VISITED_POS;
4543
4544     /* This will finish on WHILEM, setting scan, or on NULL: */
4545     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4546         last, data, stopparen, recursed_depth, NULL,
4547         (mincount == 0
4548         ? (f & ~SCF_DO_SUBSTR)
4549         : f)
4550         ,depth+1);
4551
4552     if (flags & SCF_DO_STCLASS)
4553      data->start_class = oclass;
4554     if (mincount == 0 || minnext == 0) {
4555      if (flags & SCF_DO_STCLASS_OR) {
4556       ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4557      }
4558      else if (flags & SCF_DO_STCLASS_AND) {
4559       /* Switch to OR mode: cache the old value of
4560       * data->start_class */
4561       INIT_AND_WITHP;
4562       StructCopy(data->start_class, and_withp, regnode_ssc);
4563       flags &= ~SCF_DO_STCLASS_AND;
4564       StructCopy(&this_class, data->start_class, regnode_ssc);
4565       flags |= SCF_DO_STCLASS_OR;
4566       ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4567      }
4568     } else {  /* Non-zero len */
4569      if (flags & SCF_DO_STCLASS_OR) {
4570       ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4571       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4572      }
4573      else if (flags & SCF_DO_STCLASS_AND)
4574       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4575      flags &= ~SCF_DO_STCLASS;
4576     }
4577     if (!scan)   /* It was not CURLYX, but CURLY. */
4578      scan = next;
4579     if (!(flags & SCF_TRIE_DOING_RESTUDY)
4580      /* ? quantifier ok, except for (?{ ... }) */
4581      && (next_is_eval || !(mincount == 0 && maxcount == 1))
4582      && (minnext == 0) && (deltanext == 0)
4583      && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4584      && maxcount <= REG_INFTY/3) /* Complement check for big
4585             count */
4586     {
4587      /* Fatal warnings may leak the regexp without this: */
4588      SAVEFREESV(RExC_rx_sv);
4589      ckWARNreg(RExC_parse,
4590        "Quantifier unexpected on zero-length expression");
4591      (void)ReREFCNT_inc(RExC_rx_sv);
4592     }
4593
4594     min += minnext * mincount;
4595     is_inf_internal |= deltanext == SSize_t_MAX
4596       || (maxcount == REG_INFTY && minnext + deltanext > 0);
4597     is_inf |= is_inf_internal;
4598     if (is_inf) {
4599      delta = SSize_t_MAX;
4600     } else {
4601      delta += (minnext + deltanext) * maxcount
4602        - minnext * mincount;
4603     }
4604     /* Try powerful optimization CURLYX => CURLYN. */
4605     if (  OP(oscan) == CURLYX && data
4606      && data->flags & SF_IN_PAR
4607      && !(data->flags & SF_HAS_EVAL)
4608      && !deltanext && minnext == 1 ) {
4609      /* Try to optimize to CURLYN.  */
4610      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4611      regnode * const nxt1 = nxt;
4612 #ifdef DEBUGGING
4613      regnode *nxt2;
4614 #endif
4615
4616      /* Skip open. */
4617      nxt = regnext(nxt);
4618      if (!REGNODE_SIMPLE(OP(nxt))
4619       && !(PL_regkind[OP(nxt)] == EXACT
4620        && STR_LEN(nxt) == 1))
4621       goto nogo;
4622 #ifdef DEBUGGING
4623      nxt2 = nxt;
4624 #endif
4625      nxt = regnext(nxt);
4626      if (OP(nxt) != CLOSE)
4627       goto nogo;
4628      if (RExC_open_parens) {
4629       RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4630       RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4631      }
4632      /* Now we know that nxt2 is the only contents: */
4633      oscan->flags = (U8)ARG(nxt);
4634      OP(oscan) = CURLYN;
4635      OP(nxt1) = NOTHING; /* was OPEN. */
4636
4637 #ifdef DEBUGGING
4638      OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4639      NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4640      NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4641      OP(nxt) = OPTIMIZED; /* was CLOSE. */
4642      OP(nxt + 1) = OPTIMIZED; /* was count. */
4643      NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4644 #endif
4645     }
4646    nogo:
4647
4648     /* Try optimization CURLYX => CURLYM. */
4649     if (  OP(oscan) == CURLYX && data
4650      && !(data->flags & SF_HAS_PAR)
4651      && !(data->flags & SF_HAS_EVAL)
4652      && !deltanext /* atom is fixed width */
4653      && minnext != 0 /* CURLYM can't handle zero width */
4654
4655       /* Nor characters whose fold at run-time may be
4656       * multi-character */
4657      && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4658     ) {
4659      /* XXXX How to optimize if data == 0? */
4660      /* Optimize to a simpler form.  */
4661      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4662      regnode *nxt2;
4663
4664      OP(oscan) = CURLYM;
4665      while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4666        && (OP(nxt2) != WHILEM))
4667       nxt = nxt2;
4668      OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4669      /* Need to optimize away parenths. */
4670      if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4671       /* Set the parenth number.  */
4672       regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4673
4674       oscan->flags = (U8)ARG(nxt);
4675       if (RExC_open_parens) {
4676        RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4677        RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4678       }
4679       OP(nxt1) = OPTIMIZED; /* was OPEN. */
4680       OP(nxt) = OPTIMIZED; /* was CLOSE. */
4681
4682 #ifdef DEBUGGING
4683       OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4684       OP(nxt + 1) = OPTIMIZED; /* was count. */
4685       NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4686       NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4687 #endif
4688 #if 0
4689       while ( nxt1 && (OP(nxt1) != WHILEM)) {
4690        regnode *nnxt = regnext(nxt1);
4691        if (nnxt == nxt) {
4692         if (reg_off_by_arg[OP(nxt1)])
4693          ARG_SET(nxt1, nxt2 - nxt1);
4694         else if (nxt2 - nxt1 < U16_MAX)
4695          NEXT_OFF(nxt1) = nxt2 - nxt1;
4696         else
4697          OP(nxt) = NOTHING; /* Cannot beautify */
4698        }
4699        nxt1 = nnxt;
4700       }
4701 #endif
4702       /* Optimize again: */
4703       study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4704          NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4705      }
4706      else
4707       oscan->flags = 0;
4708     }
4709     else if ((OP(oscan) == CURLYX)
4710       && (flags & SCF_WHILEM_VISITED_POS)
4711       /* See the comment on a similar expression above.
4712        However, this time it's not a subexpression
4713        we care about, but the expression itself. */
4714       && (maxcount == REG_INFTY)
4715       && data && ++data->whilem_c < 16) {
4716      /* This stays as CURLYX, we can put the count/of pair. */
4717      /* Find WHILEM (as in regexec.c) */
4718      regnode *nxt = oscan + NEXT_OFF(oscan);
4719
4720      if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4721       nxt += ARG(nxt);
4722      PREVOPER(nxt)->flags = (U8)(data->whilem_c
4723       | (RExC_whilem_seen << 4)); /* On WHILEM */
4724     }
4725     if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4726      pars++;
4727     if (flags & SCF_DO_SUBSTR) {
4728      SV *last_str = NULL;
4729      STRLEN last_chrs = 0;
4730      int counted = mincount != 0;
4731
4732      if (data->last_end > 0 && mincount != 0) { /* Ends with a
4733                 string. */
4734       SSize_t b = pos_before >= data->last_start_min
4735        ? pos_before : data->last_start_min;
4736       STRLEN l;
4737       const char * const s = SvPV_const(data->last_found, l);
4738       SSize_t old = b - data->last_start_min;
4739
4740       if (UTF)
4741        old = utf8_hop((U8*)s, old) - (U8*)s;
4742       l -= old;
4743       /* Get the added string: */
4744       last_str = newSVpvn_utf8(s  + old, l, UTF);
4745       last_chrs = UTF ? utf8_length((U8*)(s + old),
4746            (U8*)(s + old + l)) : l;
4747       if (deltanext == 0 && pos_before == b) {
4748        /* What was added is a constant string */
4749        if (mincount > 1) {
4750
4751         SvGROW(last_str, (mincount * l) + 1);
4752         repeatcpy(SvPVX(last_str) + l,
4753           SvPVX_const(last_str), l,
4754           mincount - 1);
4755         SvCUR_set(last_str, SvCUR(last_str) * mincount);
4756         /* Add additional parts. */
4757         SvCUR_set(data->last_found,
4758           SvCUR(data->last_found) - l);
4759         sv_catsv(data->last_found, last_str);
4760         {
4761          SV * sv = data->last_found;
4762          MAGIC *mg =
4763           SvUTF8(sv) && SvMAGICAL(sv) ?
4764           mg_find(sv, PERL_MAGIC_utf8) : NULL;
4765          if (mg && mg->mg_len >= 0)
4766           mg->mg_len += last_chrs * (mincount-1);
4767         }
4768         last_chrs *= mincount;
4769         data->last_end += l * (mincount - 1);
4770        }
4771       } else {
4772        /* start offset must point into the last copy */
4773        data->last_start_min += minnext * (mincount - 1);
4774        data->last_start_max += is_inf ? SSize_t_MAX
4775         : (maxcount - 1) * (minnext + data->pos_delta);
4776       }
4777      }
4778      /* It is counted once already... */
4779      data->pos_min += minnext * (mincount - counted);
4780 #if 0
4781 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
4782        " SSize_t_MAX=%"UVuf" minnext=%"UVuf
4783        " maxcount=%"UVuf" mincount=%"UVuf"\n",
4784  (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4785  (UV)mincount);
4786 if (deltanext != SSize_t_MAX)
4787 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
4788  (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4789   - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4790 #endif
4791      if (deltanext == SSize_t_MAX
4792       || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4793       data->pos_delta = SSize_t_MAX;
4794      else
4795       data->pos_delta += - counted * deltanext +
4796       (minnext + deltanext) * maxcount - minnext * mincount;
4797      if (mincount != maxcount) {
4798       /* Cannot extend fixed substrings found inside
4799        the group.  */
4800       scan_commit(pRExC_state, data, minlenp, is_inf);
4801       if (mincount && last_str) {
4802        SV * const sv = data->last_found;
4803        MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4804         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4805
4806        if (mg)
4807         mg->mg_len = -1;
4808        sv_setsv(sv, last_str);
4809        data->last_end = data->pos_min;
4810        data->last_start_min = data->pos_min - last_chrs;
4811        data->last_start_max = is_inf
4812         ? SSize_t_MAX
4813         : data->pos_min + data->pos_delta - last_chrs;
4814       }
4815       data->longest = &(data->longest_float);
4816      }
4817      SvREFCNT_dec(last_str);
4818     }
4819     if (data && (fl & SF_HAS_EVAL))
4820      data->flags |= SF_HAS_EVAL;
4821    optimize_curly_tail:
4822     if (OP(oscan) != CURLYX) {
4823      while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4824       && NEXT_OFF(next))
4825       NEXT_OFF(oscan) += NEXT_OFF(next);
4826     }
4827     continue;
4828
4829    default:
4830 #ifdef DEBUGGING
4831     Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4832                  OP(scan));
4833 #endif
4834    case REF:
4835    case CLUMP:
4836     if (flags & SCF_DO_SUBSTR) {
4837      /* Cannot expect anything... */
4838      scan_commit(pRExC_state, data, minlenp, is_inf);
4839      data->longest = &(data->longest_float);
4840     }
4841     is_inf = is_inf_internal = 1;
4842     if (flags & SCF_DO_STCLASS_OR) {
4843      if (OP(scan) == CLUMP) {
4844       /* Actually is any start char, but very few code points
4845       * aren't start characters */
4846       ssc_match_all_cp(data->start_class);
4847      }
4848      else {
4849       ssc_anything(data->start_class);
4850      }
4851     }
4852     flags &= ~SCF_DO_STCLASS;
4853     break;
4854    }
4855   }
4856   else if (OP(scan) == LNBREAK) {
4857    if (flags & SCF_DO_STCLASS) {
4858      if (flags & SCF_DO_STCLASS_AND) {
4859      ssc_intersection(data->start_class,
4860          PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4861      ssc_clear_locale(data->start_class);
4862      ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4863     }
4864     else if (flags & SCF_DO_STCLASS_OR) {
4865      ssc_union(data->start_class,
4866        PL_XPosix_ptrs[_CC_VERTSPACE],
4867        FALSE);
4868      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4869
4870      /* See commit msg for
4871      * 749e076fceedeb708a624933726e7989f2302f6a */
4872      ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4873     }
4874     flags &= ~SCF_DO_STCLASS;
4875    }
4876    min++;
4877    delta++;    /* Because of the 2 char string cr-lf */
4878    if (flags & SCF_DO_SUBSTR) {
4879     /* Cannot expect anything... */
4880     scan_commit(pRExC_state, data, minlenp, is_inf);
4881      data->pos_min += 1;
4882     data->pos_delta += 1;
4883     data->longest = &(data->longest_float);
4884     }
4885   }
4886   else if (REGNODE_SIMPLE(OP(scan))) {
4887
4888    if (flags & SCF_DO_SUBSTR) {
4889     scan_commit(pRExC_state, data, minlenp, is_inf);
4890     data->pos_min++;
4891    }
4892    min++;
4893    if (flags & SCF_DO_STCLASS) {
4894     bool invert = 0;
4895     SV* my_invlist = sv_2mortal(_new_invlist(0));
4896     U8 namedclass;
4897
4898     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4899     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4900
4901     /* Some of the logic below assumes that switching
4902     locale on will only add false positives. */
4903     switch (OP(scan)) {
4904
4905     default:
4906 #ifdef DEBUGGING
4907     Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4908                  OP(scan));
4909 #endif
4910     case CANY:
4911     case SANY:
4912      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4913       ssc_match_all_cp(data->start_class);
4914      break;
4915
4916     case REG_ANY:
4917      {
4918       SV* REG_ANY_invlist = _new_invlist(2);
4919       REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4920                '\n');
4921       if (flags & SCF_DO_STCLASS_OR) {
4922        ssc_union(data->start_class,
4923          REG_ANY_invlist,
4924          TRUE /* TRUE => invert, hence all but \n
4925            */
4926          );
4927       }
4928       else if (flags & SCF_DO_STCLASS_AND) {
4929        ssc_intersection(data->start_class,
4930            REG_ANY_invlist,
4931            TRUE  /* TRUE => invert */
4932            );
4933        ssc_clear_locale(data->start_class);
4934       }
4935       SvREFCNT_dec_NN(REG_ANY_invlist);
4936      }
4937      break;
4938
4939     case ANYOF:
4940      if (flags & SCF_DO_STCLASS_AND)
4941       ssc_and(pRExC_state, data->start_class,
4942         (regnode_charclass *) scan);
4943      else
4944       ssc_or(pRExC_state, data->start_class,
4945               (regnode_charclass *) scan);
4946      break;
4947
4948     case NPOSIXL:
4949      invert = 1;
4950      /* FALLTHROUGH */
4951
4952     case POSIXL:
4953      namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4954      if (flags & SCF_DO_STCLASS_AND) {
4955       bool was_there = cBOOL(
4956           ANYOF_POSIXL_TEST(data->start_class,
4957                 namedclass));
4958       ANYOF_POSIXL_ZERO(data->start_class);
4959       if (was_there) {    /* Do an AND */
4960        ANYOF_POSIXL_SET(data->start_class, namedclass);
4961       }
4962       /* No individual code points can now match */
4963       data->start_class->invlist
4964             = sv_2mortal(_new_invlist(0));
4965      }
4966      else {
4967       int complement = namedclass + ((invert) ? -1 : 1);
4968
4969       assert(flags & SCF_DO_STCLASS_OR);
4970
4971       /* If the complement of this class was already there,
4972       * the result is that they match all code points,
4973       * (\d + \D == everything).  Remove the classes from
4974       * future consideration.  Locale is not relevant in
4975       * this case */
4976       if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4977        ssc_match_all_cp(data->start_class);
4978        ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4979        ANYOF_POSIXL_CLEAR(data->start_class, complement);
4980       }
4981       else {  /* The usual case; just add this class to the
4982         existing set */
4983        ANYOF_POSIXL_SET(data->start_class, namedclass);
4984       }
4985      }
4986      break;
4987
4988     case NPOSIXA:   /* For these, we always know the exact set of
4989         what's matched */
4990      invert = 1;
4991      /* FALLTHROUGH */
4992     case POSIXA:
4993      if (FLAGS(scan) == _CC_ASCII) {
4994       my_invlist = PL_XPosix_ptrs[_CC_ASCII];
4995      }
4996      else {
4997       _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
4998            PL_XPosix_ptrs[_CC_ASCII],
4999            &my_invlist);
5000      }
5001      goto join_posix;
5002
5003     case NPOSIXD:
5004     case NPOSIXU:
5005      invert = 1;
5006      /* FALLTHROUGH */
5007     case POSIXD:
5008     case POSIXU:
5009      my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5010
5011      /* NPOSIXD matches all upper Latin1 code points unless the
5012      * target string being matched is UTF-8, which is
5013      * unknowable until match time.  Since we are going to
5014      * invert, we want to get rid of all of them so that the
5015      * inversion will match all */
5016      if (OP(scan) == NPOSIXD) {
5017       _invlist_subtract(my_invlist, PL_UpperLatin1,
5018           &my_invlist);
5019      }
5020
5021     join_posix:
5022
5023      if (flags & SCF_DO_STCLASS_AND) {
5024       ssc_intersection(data->start_class, my_invlist, invert);
5025       ssc_clear_locale(data->start_class);
5026      }
5027      else {
5028       assert(flags & SCF_DO_STCLASS_OR);
5029       ssc_union(data->start_class, my_invlist, invert);
5030      }
5031     }
5032     if (flags & SCF_DO_STCLASS_OR)
5033      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5034     flags &= ~SCF_DO_STCLASS;
5035    }
5036   }
5037   else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5038    data->flags |= (OP(scan) == MEOL
5039        ? SF_BEFORE_MEOL
5040        : SF_BEFORE_SEOL);
5041    scan_commit(pRExC_state, data, minlenp, is_inf);
5042
5043   }
5044   else if (  PL_regkind[OP(scan)] == BRANCHJ
5045     /* Lookbehind, or need to calculate parens/evals/stclass: */
5046     && (scan->flags || data || (flags & SCF_DO_STCLASS))
5047     && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5048   {
5049    if ( OP(scan) == UNLESSM &&
5050     scan->flags == 0 &&
5051     OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
5052     OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
5053    ) {
5054     regnode *opt;
5055     regnode *upto= regnext(scan);
5056     DEBUG_PARSE_r({
5057      SV * const mysv_val=sv_newmortal();
5058      DEBUG_STUDYDATA("OPFAIL",data,depth);
5059
5060      /*DEBUG_PARSE_MSG("opfail");*/
5061      regprop(RExC_rx, mysv_val, upto, NULL);
5062      PerlIO_printf(Perl_debug_log,
5063       "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
5064       SvPV_nolen_const(mysv_val),
5065       (IV)REG_NODE_NUM(upto),
5066       (IV)(upto - scan)
5067      );
5068     });
5069     OP(scan) = OPFAIL;
5070     NEXT_OFF(scan) = upto - scan;
5071     for (opt= scan + 1; opt < upto ; opt++)
5072      OP(opt) = OPTIMIZED;
5073     scan= upto;
5074     continue;
5075    }
5076    if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5077     || OP(scan) == UNLESSM )
5078    {
5079     /* Negative Lookahead/lookbehind
5080     In this case we can't do fixed string optimisation.
5081     */
5082
5083     SSize_t deltanext, minnext, fake = 0;
5084     regnode *nscan;
5085     regnode_ssc intrnl;
5086     int f = 0;
5087
5088     data_fake.flags = 0;
5089     if (data) {
5090      data_fake.whilem_c = data->whilem_c;
5091      data_fake.last_closep = data->last_closep;
5092     }
5093     else
5094      data_fake.last_closep = &fake;
5095     data_fake.pos_delta = delta;
5096     if ( flags & SCF_DO_STCLASS && !scan->flags
5097      && OP(scan) == IFMATCH ) { /* Lookahead */
5098      ssc_init(pRExC_state, &intrnl);
5099      data_fake.start_class = &intrnl;
5100      f |= SCF_DO_STCLASS_AND;
5101     }
5102     if (flags & SCF_WHILEM_VISITED_POS)
5103      f |= SCF_WHILEM_VISITED_POS;
5104     next = regnext(scan);
5105     nscan = NEXTOPER(NEXTOPER(scan));
5106     minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5107          last, &data_fake, stopparen,
5108          recursed_depth, NULL, f, depth+1);
5109     if (scan->flags) {
5110      if (deltanext) {
5111       FAIL("Variable length lookbehind not implemented");
5112      }
5113      else if (minnext > (I32)U8_MAX) {
5114       FAIL2("Lookbehind longer than %"UVuf" not implemented",
5115        (UV)U8_MAX);
5116      }
5117      scan->flags = (U8)minnext;
5118     }
5119     if (data) {
5120      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5121       pars++;
5122      if (data_fake.flags & SF_HAS_EVAL)
5123       data->flags |= SF_HAS_EVAL;
5124      data->whilem_c = data_fake.whilem_c;
5125     }
5126     if (f & SCF_DO_STCLASS_AND) {
5127      if (flags & SCF_DO_STCLASS_OR) {
5128       /* OR before, AND after: ideally we would recurse with
5129       * data_fake to get the AND applied by study of the
5130       * remainder of the pattern, and then derecurse;
5131       * *** HACK *** for now just treat as "no information".
5132       * See [perl #56690].
5133       */
5134       ssc_init(pRExC_state, data->start_class);
5135      }  else {
5136       /* AND before and after: combine and continue.  These
5137       * assertions are zero-length, so can match an EMPTY
5138       * string */
5139       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5140       ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
5141      }
5142     }
5143    }
5144 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5145    else {
5146     /* Positive Lookahead/lookbehind
5147     In this case we can do fixed string optimisation,
5148     but we must be careful about it. Note in the case of
5149     lookbehind the positions will be offset by the minimum
5150     length of the pattern, something we won't know about
5151     until after the recurse.
5152     */
5153     SSize_t deltanext, fake = 0;
5154     regnode *nscan;
5155     regnode_ssc intrnl;
5156     int f = 0;
5157     /* We use SAVEFREEPV so that when the full compile
5158      is finished perl will clean up the allocated
5159      minlens when it's all done. This way we don't
5160      have to worry about freeing them when we know
5161      they wont be used, which would be a pain.
5162     */
5163     SSize_t *minnextp;
5164     Newx( minnextp, 1, SSize_t );
5165     SAVEFREEPV(minnextp);
5166
5167     if (data) {
5168      StructCopy(data, &data_fake, scan_data_t);
5169      if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5170       f |= SCF_DO_SUBSTR;
5171       if (scan->flags)
5172        scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5173       data_fake.last_found=newSVsv(data->last_found);
5174      }
5175     }
5176     else
5177      data_fake.last_closep = &fake;
5178     data_fake.flags = 0;
5179     data_fake.pos_delta = delta;
5180     if (is_inf)
5181      data_fake.flags |= SF_IS_INF;
5182     if ( flags & SCF_DO_STCLASS && !scan->flags
5183      && OP(scan) == IFMATCH ) { /* Lookahead */
5184      ssc_init(pRExC_state, &intrnl);
5185      data_fake.start_class = &intrnl;
5186      f |= SCF_DO_STCLASS_AND;
5187     }
5188     if (flags & SCF_WHILEM_VISITED_POS)
5189      f |= SCF_WHILEM_VISITED_POS;
5190     next = regnext(scan);
5191     nscan = NEXTOPER(NEXTOPER(scan));
5192
5193     *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5194           &deltanext, last, &data_fake,
5195           stopparen, recursed_depth, NULL,
5196           f,depth+1);
5197     if (scan->flags) {
5198      if (deltanext) {
5199       FAIL("Variable length lookbehind not implemented");
5200      }
5201      else if (*minnextp > (I32)U8_MAX) {
5202       FAIL2("Lookbehind longer than %"UVuf" not implemented",
5203        (UV)U8_MAX);
5204      }
5205      scan->flags = (U8)*minnextp;
5206     }
5207
5208     *minnextp += min;
5209
5210     if (f & SCF_DO_STCLASS_AND) {
5211      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5212      ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
5213     }
5214     if (data) {
5215      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5216       pars++;
5217      if (data_fake.flags & SF_HAS_EVAL)
5218       data->flags |= SF_HAS_EVAL;
5219      data->whilem_c = data_fake.whilem_c;
5220      if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5221       if (RExC_rx->minlen<*minnextp)
5222        RExC_rx->minlen=*minnextp;
5223       scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5224       SvREFCNT_dec_NN(data_fake.last_found);
5225
5226       if ( data_fake.minlen_fixed != minlenp )
5227       {
5228        data->offset_fixed= data_fake.offset_fixed;
5229        data->minlen_fixed= data_fake.minlen_fixed;
5230        data->lookbehind_fixed+= scan->flags;
5231       }
5232       if ( data_fake.minlen_float != minlenp )
5233       {
5234        data->minlen_float= data_fake.minlen_float;
5235        data->offset_float_min=data_fake.offset_float_min;
5236        data->offset_float_max=data_fake.offset_float_max;
5237        data->lookbehind_float+= scan->flags;
5238       }
5239      }
5240     }
5241    }
5242 #endif
5243   }
5244   else if (OP(scan) == OPEN) {
5245    if (stopparen != (I32)ARG(scan))
5246     pars++;
5247   }
5248   else if (OP(scan) == CLOSE) {
5249    if (stopparen == (I32)ARG(scan)) {
5250     break;
5251    }
5252    if ((I32)ARG(scan) == is_par) {
5253     next = regnext(scan);
5254
5255     if ( next && (OP(next) != WHILEM) && next < last)
5256      is_par = 0;  /* Disable optimization */
5257    }
5258    if (data)
5259     *(data->last_closep) = ARG(scan);
5260   }
5261   else if (OP(scan) == EVAL) {
5262     if (data)
5263      data->flags |= SF_HAS_EVAL;
5264   }
5265   else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5266    if (flags & SCF_DO_SUBSTR) {
5267     scan_commit(pRExC_state, data, minlenp, is_inf);
5268     flags &= ~SCF_DO_SUBSTR;
5269    }
5270    if (data && OP(scan)==ACCEPT) {
5271     data->flags |= SCF_SEEN_ACCEPT;
5272     if (stopmin > min)
5273      stopmin = min;
5274    }
5275   }
5276   else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5277   {
5278     if (flags & SCF_DO_SUBSTR) {
5279      scan_commit(pRExC_state, data, minlenp, is_inf);
5280      data->longest = &(data->longest_float);
5281     }
5282     is_inf = is_inf_internal = 1;
5283     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5284      ssc_anything(data->start_class);
5285     flags &= ~SCF_DO_STCLASS;
5286   }
5287   else if (OP(scan) == GPOS) {
5288    if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5289     !(delta || is_inf || (data && data->pos_delta)))
5290    {
5291     if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5292      RExC_rx->intflags |= PREGf_ANCH_GPOS;
5293     if (RExC_rx->gofs < (STRLEN)min)
5294      RExC_rx->gofs = min;
5295    } else {
5296     RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5297     RExC_rx->gofs = 0;
5298    }
5299   }
5300 #ifdef TRIE_STUDY_OPT
5301 #ifdef FULL_TRIE_STUDY
5302   else if (PL_regkind[OP(scan)] == TRIE) {
5303    /* NOTE - There is similar code to this block above for handling
5304    BRANCH nodes on the initial study.  If you change stuff here
5305    check there too. */
5306    regnode *trie_node= scan;
5307    regnode *tail= regnext(scan);
5308    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5309    SSize_t max1 = 0, min1 = SSize_t_MAX;
5310    regnode_ssc accum;
5311
5312    if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5313     /* Cannot merge strings after this. */
5314     scan_commit(pRExC_state, data, minlenp, is_inf);
5315    }
5316    if (flags & SCF_DO_STCLASS)
5317     ssc_init_zero(pRExC_state, &accum);
5318
5319    if (!trie->jump) {
5320     min1= trie->minlen;
5321     max1= trie->maxlen;
5322    } else {
5323     const regnode *nextbranch= NULL;
5324     U32 word;
5325
5326     for ( word=1 ; word <= trie->wordcount ; word++)
5327     {
5328      SSize_t deltanext=0, minnext=0, f = 0, fake;
5329      regnode_ssc this_class;
5330
5331      data_fake.flags = 0;
5332      if (data) {
5333       data_fake.whilem_c = data->whilem_c;
5334       data_fake.last_closep = data->last_closep;
5335      }
5336      else
5337       data_fake.last_closep = &fake;
5338      data_fake.pos_delta = delta;
5339      if (flags & SCF_DO_STCLASS) {
5340       ssc_init(pRExC_state, &this_class);
5341       data_fake.start_class = &this_class;
5342       f = SCF_DO_STCLASS_AND;
5343      }
5344      if (flags & SCF_WHILEM_VISITED_POS)
5345       f |= SCF_WHILEM_VISITED_POS;
5346
5347      if (trie->jump[word]) {
5348       if (!nextbranch)
5349        nextbranch = trie_node + trie->jump[0];
5350       scan= trie_node + trie->jump[word];
5351       /* We go from the jump point to the branch that follows
5352       it. Note this means we need the vestigal unused
5353       branches even though they arent otherwise used. */
5354       minnext = study_chunk(pRExC_state, &scan, minlenp,
5355        &deltanext, (regnode *)nextbranch, &data_fake,
5356        stopparen, recursed_depth, NULL, f,depth+1);
5357      }
5358      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5359       nextbranch= regnext((regnode*)nextbranch);
5360
5361      if (min1 > (SSize_t)(minnext + trie->minlen))
5362       min1 = minnext + trie->minlen;
5363      if (deltanext == SSize_t_MAX) {
5364       is_inf = is_inf_internal = 1;
5365       max1 = SSize_t_MAX;
5366      } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5367       max1 = minnext + deltanext + trie->maxlen;
5368
5369      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5370       pars++;
5371      if (data_fake.flags & SCF_SEEN_ACCEPT) {
5372       if ( stopmin > min + min1)
5373        stopmin = min + min1;
5374       flags &= ~SCF_DO_SUBSTR;
5375       if (data)
5376        data->flags |= SCF_SEEN_ACCEPT;
5377      }
5378      if (data) {
5379       if (data_fake.flags & SF_HAS_EVAL)
5380        data->flags |= SF_HAS_EVAL;
5381       data->whilem_c = data_fake.whilem_c;
5382      }
5383      if (flags & SCF_DO_STCLASS)
5384       ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5385     }
5386    }
5387    if (flags & SCF_DO_SUBSTR) {
5388     data->pos_min += min1;
5389     data->pos_delta += max1 - min1;
5390     if (max1 != min1 || is_inf)
5391      data->longest = &(data->longest_float);
5392    }
5393    min += min1;
5394    delta += max1 - min1;
5395    if (flags & SCF_DO_STCLASS_OR) {
5396     ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5397     if (min1) {
5398      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5399      flags &= ~SCF_DO_STCLASS;
5400     }
5401    }
5402    else if (flags & SCF_DO_STCLASS_AND) {
5403     if (min1) {
5404      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5405      flags &= ~SCF_DO_STCLASS;
5406     }
5407     else {
5408      /* Switch to OR mode: cache the old value of
5409      * data->start_class */
5410      INIT_AND_WITHP;
5411      StructCopy(data->start_class, and_withp, regnode_ssc);
5412      flags &= ~SCF_DO_STCLASS_AND;
5413      StructCopy(&accum, data->start_class, regnode_ssc);
5414      flags |= SCF_DO_STCLASS_OR;
5415     }
5416    }
5417    scan= tail;
5418    continue;
5419   }
5420 #else
5421   else if (PL_regkind[OP(scan)] == TRIE) {
5422    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5423    U8*bang=NULL;
5424
5425    min += trie->minlen;
5426    delta += (trie->maxlen - trie->minlen);
5427    flags &= ~SCF_DO_STCLASS; /* xxx */
5428    if (flags & SCF_DO_SUBSTR) {
5429     /* Cannot expect anything... */
5430     scan_commit(pRExC_state, data, minlenp, is_inf);
5431      data->pos_min += trie->minlen;
5432      data->pos_delta += (trie->maxlen - trie->minlen);
5433     if (trie->maxlen != trie->minlen)
5434      data->longest = &(data->longest_float);
5435     }
5436     if (trie->jump) /* no more substrings -- for now /grr*/
5437    flags &= ~SCF_DO_SUBSTR;
5438   }
5439 #endif /* old or new */
5440 #endif /* TRIE_STUDY_OPT */
5441
5442   /* Else: zero-length, ignore. */
5443   scan = regnext(scan);
5444  }
5445  /* If we are exiting a recursion we can unset its recursed bit
5446  * and allow ourselves to enter it again - no danger of an
5447  * infinite loop there.
5448  if (stopparen > -1 && recursed) {
5449   DEBUG_STUDYDATA("unset:", data,depth);
5450   PAREN_UNSET( recursed, stopparen);
5451  }
5452  */
5453  if (frame) {
5454   DEBUG_STUDYDATA("frame-end:",data,depth);
5455   DEBUG_PEEP("fend", scan, depth);
5456   /* restore previous context */
5457   last = frame->last;
5458   scan = frame->next;
5459   stopparen = frame->stop;
5460   recursed_depth = frame->prev_recursed_depth;
5461   depth = depth - 1;
5462
5463   frame = frame->prev;
5464   goto fake_study_recurse;
5465  }
5466
5467   finish:
5468  assert(!frame);
5469  DEBUG_STUDYDATA("pre-fin:",data,depth);
5470
5471  *scanp = scan;
5472  *deltap = is_inf_internal ? SSize_t_MAX : delta;
5473
5474  if (flags & SCF_DO_SUBSTR && is_inf)
5475   data->pos_delta = SSize_t_MAX - data->pos_min;
5476  if (is_par > (I32)U8_MAX)
5477   is_par = 0;
5478  if (is_par && pars==1 && data) {
5479   data->flags |= SF_IN_PAR;
5480   data->flags &= ~SF_HAS_PAR;
5481  }
5482  else if (pars && data) {
5483   data->flags |= SF_HAS_PAR;
5484   data->flags &= ~SF_IN_PAR;
5485  }
5486  if (flags & SCF_DO_STCLASS_OR)
5487   ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5488  if (flags & SCF_TRIE_RESTUDY)
5489   data->flags |=  SCF_TRIE_RESTUDY;
5490
5491  DEBUG_STUDYDATA("post-fin:",data,depth);
5492
5493  {
5494   SSize_t final_minlen= min < stopmin ? min : stopmin;
5495
5496   if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5497    RExC_maxlen = final_minlen + delta;
5498   }
5499   return final_minlen;
5500  }
5501  /* not-reached */
5502 }
5503
5504 STATIC U32
5505 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5506 {
5507  U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5508
5509  PERL_ARGS_ASSERT_ADD_DATA;
5510
5511  Renewc(RExC_rxi->data,
5512   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5513   char, struct reg_data);
5514  if(count)
5515   Renew(RExC_rxi->data->what, count + n, U8);
5516  else
5517   Newx(RExC_rxi->data->what, n, U8);
5518  RExC_rxi->data->count = count + n;
5519  Copy(s, RExC_rxi->data->what + count, n, U8);
5520  return count;
5521 }
5522
5523 /*XXX: todo make this not included in a non debugging perl, but appears to be
5524  * used anyway there, in 'use re' */
5525 #ifndef PERL_IN_XSUB_RE
5526 void
5527 Perl_reginitcolors(pTHX)
5528 {
5529  const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5530  if (s) {
5531   char *t = savepv(s);
5532   int i = 0;
5533   PL_colors[0] = t;
5534   while (++i < 6) {
5535    t = strchr(t, '\t');
5536    if (t) {
5537     *t = '\0';
5538     PL_colors[i] = ++t;
5539    }
5540    else
5541     PL_colors[i] = t = (char *)"";
5542   }
5543  } else {
5544   int i = 0;
5545   while (i < 6)
5546    PL_colors[i++] = (char *)"";
5547  }
5548  PL_colorset = 1;
5549 }
5550 #endif
5551
5552
5553 #ifdef TRIE_STUDY_OPT
5554 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5555  STMT_START {                                            \
5556   if (                                                \
5557    (data.flags & SCF_TRIE_RESTUDY)               \
5558    && ! restudied++                              \
5559   ) {                                                 \
5560    dOsomething;                                    \
5561    goto reStudy;                                   \
5562   }                                                   \
5563  } STMT_END
5564 #else
5565 #define CHECK_RESTUDY_GOTO_butfirst
5566 #endif
5567
5568 /*
5569  * pregcomp - compile a regular expression into internal code
5570  *
5571  * Decides which engine's compiler to call based on the hint currently in
5572  * scope
5573  */
5574
5575 #ifndef PERL_IN_XSUB_RE
5576
5577 /* return the currently in-scope regex engine (or the default if none)  */
5578
5579 regexp_engine const *
5580 Perl_current_re_engine(pTHX)
5581 {
5582  if (IN_PERL_COMPILETIME) {
5583   HV * const table = GvHV(PL_hintgv);
5584   SV **ptr;
5585
5586   if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5587    return &reh_regexp_engine;
5588   ptr = hv_fetchs(table, "regcomp", FALSE);
5589   if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5590    return &reh_regexp_engine;
5591   return INT2PTR(regexp_engine*,SvIV(*ptr));
5592  }
5593  else {
5594   SV *ptr;
5595   if (!PL_curcop->cop_hints_hash)
5596    return &reh_regexp_engine;
5597   ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5598   if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5599    return &reh_regexp_engine;
5600   return INT2PTR(regexp_engine*,SvIV(ptr));
5601  }
5602 }
5603
5604
5605 REGEXP *
5606 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5607 {
5608  regexp_engine const *eng = current_re_engine();
5609  GET_RE_DEBUG_FLAGS_DECL;
5610
5611  PERL_ARGS_ASSERT_PREGCOMP;
5612
5613  /* Dispatch a request to compile a regexp to correct regexp engine. */
5614  DEBUG_COMPILE_r({
5615   PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5616       PTR2UV(eng));
5617  });
5618  return CALLREGCOMP_ENG(eng, pattern, flags);
5619 }
5620 #endif
5621
5622 /* public(ish) entry point for the perl core's own regex compiling code.
5623  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5624  * pattern rather than a list of OPs, and uses the internal engine rather
5625  * than the current one */
5626
5627 REGEXP *
5628 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5629 {
5630  SV *pat = pattern; /* defeat constness! */
5631  PERL_ARGS_ASSERT_RE_COMPILE;
5632  return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5633 #ifdef PERL_IN_XSUB_RE
5634         &my_reg_engine,
5635 #else
5636         &reh_regexp_engine,
5637 #endif
5638         NULL, NULL, rx_flags, 0);
5639 }
5640
5641
5642 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5643  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5644  * point to the realloced string and length.
5645  *
5646  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5647  * stuff added */
5648
5649 static void
5650 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5651      char **pat_p, STRLEN *plen_p, int num_code_blocks)
5652 {
5653  U8 *const src = (U8*)*pat_p;
5654  U8 *dst;
5655  int n=0;
5656  STRLEN s = 0, d = 0;
5657  bool do_end = 0;
5658  GET_RE_DEBUG_FLAGS_DECL;
5659
5660  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5661   "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5662
5663  Newx(dst, *plen_p * 2 + 1, U8);
5664
5665  while (s < *plen_p) {
5666   if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5667    dst[d]   = src[s];
5668   else {
5669    dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5670    dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
5671   }
5672   if (n < num_code_blocks) {
5673    if (!do_end && pRExC_state->code_blocks[n].start == s) {
5674     pRExC_state->code_blocks[n].start = d;
5675     assert(dst[d] == '(');
5676     do_end = 1;
5677    }
5678    else if (do_end && pRExC_state->code_blocks[n].end == s) {
5679     pRExC_state->code_blocks[n].end = d;
5680     assert(dst[d] == ')');
5681     do_end = 0;
5682     n++;
5683    }
5684   }
5685   s++;
5686   d++;
5687  }
5688  dst[d] = '\0';
5689  *plen_p = d;
5690  *pat_p = (char*) dst;
5691  SAVEFREEPV(*pat_p);
5692  RExC_orig_utf8 = RExC_utf8 = 1;
5693 }
5694
5695
5696
5697 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5698  * while recording any code block indices, and handling overloading,
5699  * nested qr// objects etc.  If pat is null, it will allocate a new
5700  * string, or just return the first arg, if there's only one.
5701  *
5702  * Returns the malloced/updated pat.
5703  * patternp and pat_count is the array of SVs to be concatted;
5704  * oplist is the optional list of ops that generated the SVs;
5705  * recompile_p is a pointer to a boolean that will be set if
5706  *   the regex will need to be recompiled.
5707  * delim, if non-null is an SV that will be inserted between each element
5708  */
5709
5710 static SV*
5711 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5712     SV *pat, SV ** const patternp, int pat_count,
5713     OP *oplist, bool *recompile_p, SV *delim)
5714 {
5715  SV **svp;
5716  int n = 0;
5717  bool use_delim = FALSE;
5718  bool alloced = FALSE;
5719
5720  /* if we know we have at least two args, create an empty string,
5721  * then concatenate args to that. For no args, return an empty string */
5722  if (!pat && pat_count != 1) {
5723   pat = newSVpvs("");
5724   SAVEFREESV(pat);
5725   alloced = TRUE;
5726  }
5727
5728  for (svp = patternp; svp < patternp + pat_count; svp++) {
5729   SV *sv;
5730   SV *rx  = NULL;
5731   STRLEN orig_patlen = 0;
5732   bool code = 0;
5733   SV *msv = use_delim ? delim : *svp;
5734   if (!msv) msv = &PL_sv_undef;
5735
5736   /* if we've got a delimiter, we go round the loop twice for each
5737   * svp slot (except the last), using the delimiter the second
5738   * time round */
5739   if (use_delim) {
5740    svp--;
5741    use_delim = FALSE;
5742   }
5743   else if (delim)
5744    use_delim = TRUE;
5745
5746   if (SvTYPE(msv) == SVt_PVAV) {
5747    /* we've encountered an interpolated array within
5748    * the pattern, e.g. /...@a..../. Expand the list of elements,
5749    * then recursively append elements.
5750    * The code in this block is based on S_pushav() */
5751
5752    AV *const av = (AV*)msv;
5753    const SSize_t maxarg = AvFILL(av) + 1;
5754    SV **array;
5755
5756    if (oplist) {
5757     assert(oplist->op_type == OP_PADAV
5758      || oplist->op_type == OP_RV2AV);
5759     oplist = OP_SIBLING(oplist);
5760    }
5761
5762    if (SvRMAGICAL(av)) {
5763     SSize_t i;
5764
5765     Newx(array, maxarg, SV*);
5766     SAVEFREEPV(array);
5767     for (i=0; i < maxarg; i++) {
5768      SV ** const svp = av_fetch(av, i, FALSE);
5769      array[i] = svp ? *svp : &PL_sv_undef;
5770     }
5771    }
5772    else
5773     array = AvARRAY(av);
5774
5775    pat = S_concat_pat(aTHX_ pRExC_state, pat,
5776         array, maxarg, NULL, recompile_p,
5777         /* $" */
5778         GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5779
5780    continue;
5781   }
5782
5783
5784   /* we make the assumption here that each op in the list of
5785   * op_siblings maps to one SV pushed onto the stack,
5786   * except for code blocks, with have both an OP_NULL and
5787   * and OP_CONST.
5788   * This allows us to match up the list of SVs against the
5789   * list of OPs to find the next code block.
5790   *
5791   * Note that       PUSHMARK PADSV PADSV ..
5792   * is optimised to
5793   *                 PADRANGE PADSV  PADSV  ..
5794   * so the alignment still works. */
5795
5796   if (oplist) {
5797    if (oplist->op_type == OP_NULL
5798     && (oplist->op_flags & OPf_SPECIAL))
5799    {
5800     assert(n < pRExC_state->num_code_blocks);
5801     pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5802     pRExC_state->code_blocks[n].block = oplist;
5803     pRExC_state->code_blocks[n].src_regex = NULL;
5804     n++;
5805     code = 1;
5806     oplist = OP_SIBLING(oplist); /* skip CONST */
5807     assert(oplist);
5808    }
5809    oplist = OP_SIBLING(oplist);;
5810   }
5811
5812   /* apply magic and QR overloading to arg */
5813
5814   SvGETMAGIC(msv);
5815   if (SvROK(msv) && SvAMAGIC(msv)) {
5816    SV *sv = AMG_CALLunary(msv, regexp_amg);
5817    if (sv) {
5818     if (SvROK(sv))
5819      sv = SvRV(sv);
5820     if (SvTYPE(sv) != SVt_REGEXP)
5821      Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5822     msv = sv;
5823    }
5824   }
5825
5826   /* try concatenation overload ... */
5827   if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5828     (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5829   {
5830    sv_setsv(pat, sv);
5831    /* overloading involved: all bets are off over literal
5832    * code. Pretend we haven't seen it */
5833    pRExC_state->num_code_blocks -= n;
5834    n = 0;
5835   }
5836   else  {
5837    /* ... or failing that, try "" overload */
5838    while (SvAMAGIC(msv)
5839      && (sv = AMG_CALLunary(msv, string_amg))
5840      && sv != msv
5841      &&  !(   SvROK(msv)
5842       && SvROK(sv)
5843       && SvRV(msv) == SvRV(sv))
5844    ) {
5845     msv = sv;
5846     SvGETMAGIC(msv);
5847    }
5848    if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5849     msv = SvRV(msv);
5850
5851    if (pat) {
5852     /* this is a partially unrolled
5853     *     sv_catsv_nomg(pat, msv);
5854     * that allows us to adjust code block indices if
5855     * needed */
5856     STRLEN dlen;
5857     char *dst = SvPV_force_nomg(pat, dlen);
5858     orig_patlen = dlen;
5859     if (SvUTF8(msv) && !SvUTF8(pat)) {
5860      S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5861      sv_setpvn(pat, dst, dlen);
5862      SvUTF8_on(pat);
5863     }
5864     sv_catsv_nomg(pat, msv);
5865     rx = msv;
5866    }
5867    else
5868     pat = msv;
5869
5870    if (code)
5871     pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5872   }
5873
5874   /* extract any code blocks within any embedded qr//'s */
5875   if (rx && SvTYPE(rx) == SVt_REGEXP
5876    && RX_ENGINE((REGEXP*)rx)->op_comp)
5877   {
5878
5879    RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5880    if (ri->num_code_blocks) {
5881     int i;
5882     /* the presence of an embedded qr// with code means
5883     * we should always recompile: the text of the
5884     * qr// may not have changed, but it may be a
5885     * different closure than last time */
5886     *recompile_p = 1;
5887     Renew(pRExC_state->code_blocks,
5888      pRExC_state->num_code_blocks + ri->num_code_blocks,
5889      struct reg_code_block);
5890     pRExC_state->num_code_blocks += ri->num_code_blocks;
5891
5892     for (i=0; i < ri->num_code_blocks; i++) {
5893      struct reg_code_block *src, *dst;
5894      STRLEN offset =  orig_patlen
5895       + ReANY((REGEXP *)rx)->pre_prefix;
5896      assert(n < pRExC_state->num_code_blocks);
5897      src = &ri->code_blocks[i];
5898      dst = &pRExC_state->code_blocks[n];
5899      dst->start     = src->start + offset;
5900      dst->end     = src->end   + offset;
5901      dst->block     = src->block;
5902      dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5903            src->src_regex
5904             ? src->src_regex
5905             : (REGEXP*)rx);
5906      n++;
5907     }
5908    }
5909   }
5910  }
5911  /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5912  if (alloced)
5913   SvSETMAGIC(pat);
5914
5915  return pat;
5916 }
5917
5918
5919
5920 /* see if there are any run-time code blocks in the pattern.
5921  * False positives are allowed */
5922
5923 static bool
5924 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5925      char *pat, STRLEN plen)
5926 {
5927  int n = 0;
5928  STRLEN s;
5929
5930  PERL_UNUSED_CONTEXT;
5931
5932  for (s = 0; s < plen; s++) {
5933   if (n < pRExC_state->num_code_blocks
5934    && s == pRExC_state->code_blocks[n].start)
5935   {
5936    s = pRExC_state->code_blocks[n].end;
5937    n++;
5938    continue;
5939   }
5940   /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5941   * positives here */
5942   if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5943    (pat[s+2] == '{'
5944     || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5945   )
5946    return 1;
5947  }
5948  return 0;
5949 }
5950
5951 /* Handle run-time code blocks. We will already have compiled any direct
5952  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5953  * copy of it, but with any literal code blocks blanked out and
5954  * appropriate chars escaped; then feed it into
5955  *
5956  *    eval "qr'modified_pattern'"
5957  *
5958  * For example,
5959  *
5960  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5961  *
5962  * becomes
5963  *
5964  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5965  *
5966  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5967  * and merge them with any code blocks of the original regexp.
5968  *
5969  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5970  * instead, just save the qr and return FALSE; this tells our caller that
5971  * the original pattern needs upgrading to utf8.
5972  */
5973
5974 static bool
5975 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5976  char *pat, STRLEN plen)
5977 {
5978  SV *qr;
5979
5980  GET_RE_DEBUG_FLAGS_DECL;
5981
5982  if (pRExC_state->runtime_code_qr) {
5983   /* this is the second time we've been called; this should
5984   * only happen if the main pattern got upgraded to utf8
5985   * during compilation; re-use the qr we compiled first time
5986   * round (which should be utf8 too)
5987   */
5988   qr = pRExC_state->runtime_code_qr;
5989   pRExC_state->runtime_code_qr = NULL;
5990   assert(RExC_utf8 && SvUTF8(qr));
5991  }
5992  else {
5993   int n = 0;
5994   STRLEN s;
5995   char *p, *newpat;
5996   int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5997   SV *sv, *qr_ref;
5998   dSP;
5999
6000   /* determine how many extra chars we need for ' and \ escaping */
6001   for (s = 0; s < plen; s++) {
6002    if (pat[s] == '\'' || pat[s] == '\\')
6003     newlen++;
6004   }
6005
6006   Newx(newpat, newlen, char);
6007   p = newpat;
6008   *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6009
6010   for (s = 0; s < plen; s++) {
6011    if (n < pRExC_state->num_code_blocks
6012     && s == pRExC_state->code_blocks[n].start)
6013    {
6014     /* blank out literal code block */
6015     assert(pat[s] == '(');
6016     while (s <= pRExC_state->code_blocks[n].end) {
6017      *p++ = '_';
6018      s++;
6019     }
6020     s--;
6021     n++;
6022     continue;
6023    }
6024    if (pat[s] == '\'' || pat[s] == '\\')
6025     *p++ = '\\';
6026    *p++ = pat[s];
6027   }
6028   *p++ = '\'';
6029   if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6030    *p++ = 'x';
6031   *p++ = '\0';
6032   DEBUG_COMPILE_r({
6033    PerlIO_printf(Perl_debug_log,
6034     "%sre-parsing pattern for runtime code:%s %s\n",
6035     PL_colors[4],PL_colors[5],newpat);
6036   });
6037
6038   sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6039   Safefree(newpat);
6040
6041   ENTER;
6042   SAVETMPS;
6043   save_re_context();
6044   PUSHSTACKi(PERLSI_REQUIRE);
6045   /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6046   * parsing qr''; normally only q'' does this. It also alters
6047   * hints handling */
6048   eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6049   SvREFCNT_dec_NN(sv);
6050   SPAGAIN;
6051   qr_ref = POPs;
6052   PUTBACK;
6053   {
6054    SV * const errsv = ERRSV;
6055    if (SvTRUE_NN(errsv))
6056    {
6057     Safefree(pRExC_state->code_blocks);
6058     /* use croak_sv ? */
6059     Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6060    }
6061   }
6062   assert(SvROK(qr_ref));
6063   qr = SvRV(qr_ref);
6064   assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6065   /* the leaving below frees the tmp qr_ref.
6066   * Give qr a life of its own */
6067   SvREFCNT_inc(qr);
6068   POPSTACK;
6069   FREETMPS;
6070   LEAVE;
6071
6072  }
6073
6074  if (!RExC_utf8 && SvUTF8(qr)) {
6075   /* first time through; the pattern got upgraded; save the
6076   * qr for the next time through */
6077   assert(!pRExC_state->runtime_code_qr);
6078   pRExC_state->runtime_code_qr = qr;
6079   return 0;
6080  }
6081
6082
6083  /* extract any code blocks within the returned qr//  */
6084
6085
6086  /* merge the main (r1) and run-time (r2) code blocks into one */
6087  {
6088   RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6089   struct reg_code_block *new_block, *dst;
6090   RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6091   int i1 = 0, i2 = 0;
6092
6093   if (!r2->num_code_blocks) /* we guessed wrong */
6094   {
6095    SvREFCNT_dec_NN(qr);
6096    return 1;
6097   }
6098
6099   Newx(new_block,
6100    r1->num_code_blocks + r2->num_code_blocks,
6101    struct reg_code_block);
6102   dst = new_block;
6103
6104   while (    i1 < r1->num_code_blocks
6105     || i2 < r2->num_code_blocks)
6106   {
6107    struct reg_code_block *src;
6108    bool is_qr = 0;
6109
6110    if (i1 == r1->num_code_blocks) {
6111     src = &r2->code_blocks[i2++];
6112     is_qr = 1;
6113    }
6114    else if (i2 == r2->num_code_blocks)
6115     src = &r1->code_blocks[i1++];
6116    else if (  r1->code_blocks[i1].start
6117      < r2->code_blocks[i2].start)
6118    {
6119     src = &r1->code_blocks[i1++];
6120     assert(src->end < r2->code_blocks[i2].start);
6121    }
6122    else {
6123     assert(  r1->code_blocks[i1].start
6124      > r2->code_blocks[i2].start);
6125     src = &r2->code_blocks[i2++];
6126     is_qr = 1;
6127     assert(src->end < r1->code_blocks[i1].start);
6128    }
6129
6130    assert(pat[src->start] == '(');
6131    assert(pat[src->end]   == ')');
6132    dst->start     = src->start;
6133    dst->end     = src->end;
6134    dst->block     = src->block;
6135    dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6136          : src->src_regex;
6137    dst++;
6138   }
6139   r1->num_code_blocks += r2->num_code_blocks;
6140   Safefree(r1->code_blocks);
6141   r1->code_blocks = new_block;
6142  }
6143
6144  SvREFCNT_dec_NN(qr);
6145  return 1;
6146 }
6147
6148
6149 STATIC bool
6150 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6151      SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6152      SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6153      STRLEN longest_length, bool eol, bool meol)
6154 {
6155  /* This is the common code for setting up the floating and fixed length
6156  * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6157  * as to whether succeeded or not */
6158
6159  I32 t;
6160  SSize_t ml;
6161
6162  if (! (longest_length
6163   || (eol /* Can't have SEOL and MULTI */
6164    && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6165   )
6166    /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6167   || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6168  {
6169   return FALSE;
6170  }
6171
6172  /* copy the information about the longest from the reg_scan_data
6173   over to the program. */
6174  if (SvUTF8(sv_longest)) {
6175   *rx_utf8 = sv_longest;
6176   *rx_substr = NULL;
6177  } else {
6178   *rx_substr = sv_longest;
6179   *rx_utf8 = NULL;
6180  }
6181  /* end_shift is how many chars that must be matched that
6182   follow this item. We calculate it ahead of time as once the
6183   lookbehind offset is added in we lose the ability to correctly
6184   calculate it.*/
6185  ml = minlen ? *(minlen) : (SSize_t)longest_length;
6186  *rx_end_shift = ml - offset
6187   - longest_length + (SvTAIL(sv_longest) != 0)
6188   + lookbehind;
6189
6190  t = (eol/* Can't have SEOL and MULTI */
6191   && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6192  fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6193
6194  return TRUE;
6195 }
6196
6197 /*
6198  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6199  * regular expression into internal code.
6200  * The pattern may be passed either as:
6201  *    a list of SVs (patternp plus pat_count)
6202  *    a list of OPs (expr)
6203  * If both are passed, the SV list is used, but the OP list indicates
6204  * which SVs are actually pre-compiled code blocks
6205  *
6206  * The SVs in the list have magic and qr overloading applied to them (and
6207  * the list may be modified in-place with replacement SVs in the latter
6208  * case).
6209  *
6210  * If the pattern hasn't changed from old_re, then old_re will be
6211  * returned.
6212  *
6213  * eng is the current engine. If that engine has an op_comp method, then
6214  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6215  * do the initial concatenation of arguments and pass on to the external
6216  * engine.
6217  *
6218  * If is_bare_re is not null, set it to a boolean indicating whether the
6219  * arg list reduced (after overloading) to a single bare regex which has
6220  * been returned (i.e. /$qr/).
6221  *
6222  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6223  *
6224  * pm_flags contains the PMf_* flags, typically based on those from the
6225  * pm_flags field of the related PMOP. Currently we're only interested in
6226  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6227  *
6228  * We can't allocate space until we know how big the compiled form will be,
6229  * but we can't compile it (and thus know how big it is) until we've got a
6230  * place to put the code.  So we cheat:  we compile it twice, once with code
6231  * generation turned off and size counting turned on, and once "for real".
6232  * This also means that we don't allocate space until we are sure that the
6233  * thing really will compile successfully, and we never have to move the
6234  * code and thus invalidate pointers into it.  (Note that it has to be in
6235  * one piece because free() must be able to free it all.) [NB: not true in perl]
6236  *
6237  * Beware that the optimization-preparation code in here knows about some
6238  * of the structure of the compiled regexp.  [I'll say.]
6239  */
6240
6241 REGEXP *
6242 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6243      OP *expr, const regexp_engine* eng, REGEXP *old_re,
6244      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6245 {
6246  REGEXP *rx;
6247  struct regexp *r;
6248  regexp_internal *ri;
6249  STRLEN plen;
6250  char *exp;
6251  regnode *scan;
6252  I32 flags;
6253  SSize_t minlen = 0;
6254  U32 rx_flags;
6255  SV *pat;
6256  SV *code_blocksv = NULL;
6257  SV** new_patternp = patternp;
6258
6259  /* these are all flags - maybe they should be turned
6260  * into a single int with different bit masks */
6261  I32 sawlookahead = 0;
6262  I32 sawplus = 0;
6263  I32 sawopen = 0;
6264  I32 sawminmod = 0;
6265
6266  regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6267  bool recompile = 0;
6268  bool runtime_code = 0;
6269  scan_data_t data;
6270  RExC_state_t RExC_state;
6271  RExC_state_t * const pRExC_state = &RExC_state;
6272 #ifdef TRIE_STUDY_OPT
6273  int restudied = 0;
6274  RExC_state_t copyRExC_state;
6275 #endif
6276  GET_RE_DEBUG_FLAGS_DECL;
6277
6278  PERL_ARGS_ASSERT_RE_OP_COMPILE;
6279
6280  DEBUG_r(if (!PL_colorset) reginitcolors());
6281
6282 #ifndef PERL_IN_XSUB_RE
6283  /* Initialize these here instead of as-needed, as is quick and avoids
6284  * having to test them each time otherwise */
6285  if (! PL_AboveLatin1) {
6286   PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6287   PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6288   PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6289   PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6290   PL_HasMultiCharFold =
6291      _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6292  }
6293 #endif
6294
6295  pRExC_state->code_blocks = NULL;
6296  pRExC_state->num_code_blocks = 0;
6297
6298  if (is_bare_re)
6299   *is_bare_re = FALSE;
6300
6301  if (expr && (expr->op_type == OP_LIST ||
6302     (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6303   /* allocate code_blocks if needed */
6304   OP *o;
6305   int ncode = 0;
6306
6307   for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o))
6308    if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6309     ncode++; /* count of DO blocks */
6310   if (ncode) {
6311    pRExC_state->num_code_blocks = ncode;
6312    Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6313   }
6314  }
6315
6316  if (!pat_count) {
6317   /* compile-time pattern with just OP_CONSTs and DO blocks */
6318
6319   int n;
6320   OP *o;
6321
6322   /* find how many CONSTs there are */
6323   assert(expr);
6324   n = 0;
6325   if (expr->op_type == OP_CONST)
6326    n = 1;
6327   else
6328    for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6329     if (o->op_type == OP_CONST)
6330      n++;
6331    }
6332
6333   /* fake up an SV array */
6334
6335   assert(!new_patternp);
6336   Newx(new_patternp, n, SV*);
6337   SAVEFREEPV(new_patternp);
6338   pat_count = n;
6339
6340   n = 0;
6341   if (expr->op_type == OP_CONST)
6342    new_patternp[n] = cSVOPx_sv(expr);
6343   else
6344    for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6345     if (o->op_type == OP_CONST)
6346      new_patternp[n++] = cSVOPo_sv;
6347    }
6348
6349  }
6350
6351  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6352   "Assembling pattern from %d elements%s\n", pat_count,
6353    orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6354
6355  /* set expr to the first arg op */
6356
6357  if (pRExC_state->num_code_blocks
6358   && expr->op_type != OP_CONST)
6359  {
6360    expr = cLISTOPx(expr)->op_first;
6361    assert(   expr->op_type == OP_PUSHMARK
6362     || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6363     || expr->op_type == OP_PADRANGE);
6364    expr = OP_SIBLING(expr);
6365  }
6366
6367  pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6368       expr, &recompile, NULL);
6369
6370  /* handle bare (possibly after overloading) regex: foo =~ $re */
6371  {
6372   SV *re = pat;
6373   if (SvROK(re))
6374    re = SvRV(re);
6375   if (SvTYPE(re) == SVt_REGEXP) {
6376    if (is_bare_re)
6377     *is_bare_re = TRUE;
6378    SvREFCNT_inc(re);
6379    Safefree(pRExC_state->code_blocks);
6380    DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6381     "Precompiled pattern%s\n",
6382      orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6383
6384    return (REGEXP*)re;
6385   }
6386  }
6387
6388  exp = SvPV_nomg(pat, plen);
6389
6390  if (!eng->op_comp) {
6391   if ((SvUTF8(pat) && IN_BYTES)
6392     || SvGMAGICAL(pat) || SvAMAGIC(pat))
6393   {
6394    /* make a temporary copy; either to convert to bytes,
6395    * or to avoid repeating get-magic / overloaded stringify */
6396    pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6397           (IN_BYTES ? 0 : SvUTF8(pat)));
6398   }
6399   Safefree(pRExC_state->code_blocks);
6400   return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6401  }
6402
6403  /* ignore the utf8ness if the pattern is 0 length */
6404  RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6405  RExC_uni_semantics = 0;
6406  RExC_contains_locale = 0;
6407  RExC_contains_i = 0;
6408  pRExC_state->runtime_code_qr = NULL;
6409
6410  DEBUG_COMPILE_r({
6411    SV *dsv= sv_newmortal();
6412    RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6413    PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6414       PL_colors[4],PL_colors[5],s);
6415   });
6416
6417   redo_first_pass:
6418  /* we jump here if we upgrade the pattern to utf8 and have to
6419  * recompile */
6420
6421  if ((pm_flags & PMf_USE_RE_EVAL)
6422     /* this second condition covers the non-regex literal case,
6423     * i.e.  $foo =~ '(?{})'. */
6424     || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6425  )
6426   runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6427
6428  /* return old regex if pattern hasn't changed */
6429  /* XXX: note in the below we have to check the flags as well as the
6430  * pattern.
6431  *
6432  * Things get a touch tricky as we have to compare the utf8 flag
6433  * independently from the compile flags.  */
6434
6435  if (   old_re
6436   && !recompile
6437   && !!RX_UTF8(old_re) == !!RExC_utf8
6438   && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6439   && RX_PRECOMP(old_re)
6440   && RX_PRELEN(old_re) == plen
6441   && memEQ(RX_PRECOMP(old_re), exp, plen)
6442   && !runtime_code /* with runtime code, always recompile */ )
6443  {
6444   Safefree(pRExC_state->code_blocks);
6445   return old_re;
6446  }
6447
6448  rx_flags = orig_rx_flags;
6449
6450  if (rx_flags & PMf_FOLD) {
6451   RExC_contains_i = 1;
6452  }
6453  if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6454
6455   /* Set to use unicode semantics if the pattern is in utf8 and has the
6456   * 'depends' charset specified, as it means unicode when utf8  */
6457   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6458  }
6459
6460  RExC_precomp = exp;
6461  RExC_flags = rx_flags;
6462  RExC_pm_flags = pm_flags;
6463
6464  if (runtime_code) {
6465   if (TAINTING_get && TAINT_get)
6466    Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6467
6468   if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6469    /* whoops, we have a non-utf8 pattern, whilst run-time code
6470    * got compiled as utf8. Try again with a utf8 pattern */
6471    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6472          pRExC_state->num_code_blocks);
6473    goto redo_first_pass;
6474   }
6475  }
6476  assert(!pRExC_state->runtime_code_qr);
6477
6478  RExC_sawback = 0;
6479
6480  RExC_seen = 0;
6481  RExC_maxlen = 0;
6482  RExC_in_lookbehind = 0;
6483  RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6484  RExC_extralen = 0;
6485  RExC_override_recoding = 0;
6486  RExC_in_multi_char_class = 0;
6487
6488  /* First pass: determine size, legality. */
6489  RExC_parse = exp;
6490  RExC_start = exp;
6491  RExC_end = exp + plen;
6492  RExC_naughty = 0;
6493  RExC_npar = 1;
6494  RExC_nestroot = 0;
6495  RExC_size = 0L;
6496  RExC_emit = (regnode *) &RExC_emit_dummy;
6497  RExC_whilem_seen = 0;
6498  RExC_open_parens = NULL;
6499  RExC_close_parens = NULL;
6500  RExC_opend = NULL;
6501  RExC_paren_names = NULL;
6502 #ifdef DEBUGGING
6503  RExC_paren_name_list = NULL;
6504 #endif
6505  RExC_recurse = NULL;
6506  RExC_study_chunk_recursed = NULL;
6507  RExC_study_chunk_recursed_bytes= 0;
6508  RExC_recurse_count = 0;
6509  pRExC_state->code_index = 0;
6510
6511 #if 0 /* REGC() is (currently) a NOP at the first pass.
6512  * Clever compilers notice this and complain. --jhi */
6513  REGC((U8)REG_MAGIC, (char*)RExC_emit);
6514 #endif
6515  DEBUG_PARSE_r(
6516   PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6517   RExC_lastnum=0;
6518   RExC_lastparse=NULL;
6519  );
6520  /* reg may croak on us, not giving us a chance to free
6521  pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6522  need it to survive as long as the regexp (qr/(?{})/).
6523  We must check that code_blocksv is not already set, because we may
6524  have jumped back to restart the sizing pass. */
6525  if (pRExC_state->code_blocks && !code_blocksv) {
6526   code_blocksv = newSV_type(SVt_PV);
6527   SAVEFREESV(code_blocksv);
6528   SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6529   SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6530  }
6531  if (reg(pRExC_state, 0, &flags,1) == NULL) {
6532   /* It's possible to write a regexp in ascii that represents Unicode
6533   codepoints outside of the byte range, such as via \x{100}. If we
6534   detect such a sequence we have to convert the entire pattern to utf8
6535   and then recompile, as our sizing calculation will have been based
6536   on 1 byte == 1 character, but we will need to use utf8 to encode
6537   at least some part of the pattern, and therefore must convert the whole
6538   thing.
6539   -- dmq */
6540   if (flags & RESTART_UTF8) {
6541    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6542          pRExC_state->num_code_blocks);
6543    goto redo_first_pass;
6544   }
6545   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6546  }
6547  if (code_blocksv)
6548   SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6549
6550  DEBUG_PARSE_r({
6551   PerlIO_printf(Perl_debug_log,
6552    "Required size %"IVdf" nodes\n"
6553    "Starting second pass (creation)\n",
6554    (IV)RExC_size);
6555   RExC_lastnum=0;
6556   RExC_lastparse=NULL;
6557  });
6558
6559  /* The first pass could have found things that force Unicode semantics */
6560  if ((RExC_utf8 || RExC_uni_semantics)
6561   && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6562  {
6563   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6564  }
6565
6566  /* Small enough for pointer-storage convention?
6567  If extralen==0, this means that we will not need long jumps. */
6568  if (RExC_size >= 0x10000L && RExC_extralen)
6569   RExC_size += RExC_extralen;
6570  else
6571   RExC_extralen = 0;
6572  if (RExC_whilem_seen > 15)
6573   RExC_whilem_seen = 15;
6574
6575  /* Allocate space and zero-initialize. Note, the two step process
6576  of zeroing when in debug mode, thus anything assigned has to
6577  happen after that */
6578  rx = (REGEXP*) newSV_type(SVt_REGEXP);
6579  r = ReANY(rx);
6580  Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6581   char, regexp_internal);
6582  if ( r == NULL || ri == NULL )
6583   FAIL("Regexp out of space");
6584 #ifdef DEBUGGING
6585  /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6586  Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6587   char);
6588 #else
6589  /* bulk initialize base fields with 0. */
6590  Zero(ri, sizeof(regexp_internal), char);
6591 #endif
6592
6593  /* non-zero initialization begins here */
6594  RXi_SET( r, ri );
6595  r->engine= eng;
6596  r->extflags = rx_flags;
6597  RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6598
6599  if (pm_flags & PMf_IS_QR) {
6600   ri->code_blocks = pRExC_state->code_blocks;
6601   ri->num_code_blocks = pRExC_state->num_code_blocks;
6602  }
6603  else
6604  {
6605   int n;
6606   for (n = 0; n < pRExC_state->num_code_blocks; n++)
6607    if (pRExC_state->code_blocks[n].src_regex)
6608     SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6609   SAVEFREEPV(pRExC_state->code_blocks);
6610  }
6611
6612  {
6613   bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6614   bool has_charset = (get_regex_charset(r->extflags)
6615              != REGEX_DEPENDS_CHARSET);
6616
6617   /* The caret is output if there are any defaults: if not all the STD
6618   * flags are set, or if no character set specifier is needed */
6619   bool has_default =
6620      (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6621      || ! has_charset);
6622   bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6623             == REG_RUN_ON_COMMENT_SEEN);
6624   U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6625        >> RXf_PMf_STD_PMMOD_SHIFT);
6626   const char *fptr = STD_PAT_MODS;        /*"msix"*/
6627   char *p;
6628   /* Allocate for the worst case, which is all the std flags are turned
6629   * on.  If more precision is desired, we could do a population count of
6630   * the flags set.  This could be done with a small lookup table, or by
6631   * shifting, masking and adding, or even, when available, assembly
6632   * language for a machine-language population count.
6633   * We never output a minus, as all those are defaults, so are
6634   * covered by the caret */
6635   const STRLEN wraplen = plen + has_p + has_runon
6636    + has_default       /* If needs a caret */
6637
6638     /* If needs a character set specifier */
6639    + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6640    + (sizeof(STD_PAT_MODS) - 1)
6641    + (sizeof("(?:)") - 1);
6642
6643   Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6644   r->xpv_len_u.xpvlenu_pv = p;
6645   if (RExC_utf8)
6646    SvFLAGS(rx) |= SVf_UTF8;
6647   *p++='('; *p++='?';
6648
6649   /* If a default, cover it using the caret */
6650   if (has_default) {
6651    *p++= DEFAULT_PAT_MOD;
6652   }
6653   if (has_charset) {
6654    STRLEN len;
6655    const char* const name = get_regex_charset_name(r->extflags, &len);
6656    Copy(name, p, len, char);
6657    p += len;
6658   }
6659   if (has_p)
6660    *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6661   {
6662    char ch;
6663    while((ch = *fptr++)) {
6664     if(reganch & 1)
6665      *p++ = ch;
6666     reganch >>= 1;
6667    }
6668   }
6669
6670   *p++ = ':';
6671   Copy(RExC_precomp, p, plen, char);
6672   assert ((RX_WRAPPED(rx) - p) < 16);
6673   r->pre_prefix = p - RX_WRAPPED(rx);
6674   p += plen;
6675   if (has_runon)
6676    *p++ = '\n';
6677   *p++ = ')';
6678   *p = 0;
6679   SvCUR_set(rx, p - RX_WRAPPED(rx));
6680  }
6681
6682  r->intflags = 0;
6683  r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6684
6685  /* setup various meta data about recursion, this all requires
6686  * RExC_npar to be correctly set, and a bit later on we clear it */
6687  if (RExC_seen & REG_RECURSE_SEEN) {
6688   Newxz(RExC_open_parens, RExC_npar,regnode *);
6689   SAVEFREEPV(RExC_open_parens);
6690   Newxz(RExC_close_parens,RExC_npar,regnode *);
6691   SAVEFREEPV(RExC_close_parens);
6692  }
6693  if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6694   /* Note, RExC_npar is 1 + the number of parens in a pattern.
6695   * So its 1 if there are no parens. */
6696   RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6697           ((RExC_npar & 0x07) != 0);
6698   Newx(RExC_study_chunk_recursed,
6699    RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6700   SAVEFREEPV(RExC_study_chunk_recursed);
6701  }
6702
6703  /* Useful during FAIL. */
6704 #ifdef RE_TRACK_PATTERN_OFFSETS
6705  Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6706  DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6707       "%s %"UVuf" bytes for offset annotations.\n",
6708       ri->u.offsets ? "Got" : "Couldn't get",
6709       (UV)((2*RExC_size+1) * sizeof(U32))));
6710 #endif
6711  SetProgLen(ri,RExC_size);
6712  RExC_rx_sv = rx;
6713  RExC_rx = r;
6714  RExC_rxi = ri;
6715  REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
6716
6717  /* Second pass: emit code. */
6718  RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6719  RExC_pm_flags = pm_flags;
6720  RExC_parse = exp;
6721  RExC_end = exp + plen;
6722  RExC_naughty = 0;
6723  RExC_npar = 1;
6724  RExC_emit_start = ri->program;
6725  RExC_emit = ri->program;
6726  RExC_emit_bound = ri->program + RExC_size + 1;
6727  pRExC_state->code_index = 0;
6728
6729  REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6730  if (reg(pRExC_state, 0, &flags,1) == NULL) {
6731   ReREFCNT_dec(rx);
6732   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6733  }
6734  /* XXXX To minimize changes to RE engine we always allocate
6735  3-units-long substrs field. */
6736  Newx(r->substrs, 1, struct reg_substr_data);
6737  if (RExC_recurse_count) {
6738   Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6739   SAVEFREEPV(RExC_recurse);
6740  }
6741
6742 reStudy:
6743  r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6744  Zero(r->substrs, 1, struct reg_substr_data);
6745  if (RExC_study_chunk_recursed)
6746   Zero(RExC_study_chunk_recursed,
6747    RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6748
6749 #ifdef TRIE_STUDY_OPT
6750  if (!restudied) {
6751   StructCopy(&zero_scan_data, &data, scan_data_t);
6752   copyRExC_state = RExC_state;
6753  } else {
6754   U32 seen=RExC_seen;
6755   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6756
6757   RExC_state = copyRExC_state;
6758   if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6759    RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6760   else
6761    RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6762   StructCopy(&zero_scan_data, &data, scan_data_t);
6763  }
6764 #else
6765  StructCopy(&zero_scan_data, &data, scan_data_t);
6766 #endif
6767
6768  /* Dig out information for optimizations. */
6769  r->extflags = RExC_flags; /* was pm_op */
6770  /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6771
6772  if (UTF)
6773   SvUTF8_on(rx); /* Unicode in it? */
6774  ri->regstclass = NULL;
6775  if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6776   r->intflags |= PREGf_NAUGHTY;
6777  scan = ri->program + 1;  /* First BRANCH. */
6778
6779  /* testing for BRANCH here tells us whether there is "must appear"
6780  data in the pattern. If there is then we can use it for optimisations */
6781  if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
6782             */
6783   SSize_t fake;
6784   STRLEN longest_float_length, longest_fixed_length;
6785   regnode_ssc ch_class; /* pointed to by data */
6786   int stclass_flag;
6787   SSize_t last_close = 0; /* pointed to by data */
6788   regnode *first= scan;
6789   regnode *first_next= regnext(first);
6790   /*
6791   * Skip introductions and multiplicators >= 1
6792   * so that we can extract the 'meat' of the pattern that must
6793   * match in the large if() sequence following.
6794   * NOTE that EXACT is NOT covered here, as it is normally
6795   * picked up by the optimiser separately.
6796   *
6797   * This is unfortunate as the optimiser isnt handling lookahead
6798   * properly currently.
6799   *
6800   */
6801   while ((OP(first) == OPEN && (sawopen = 1)) ||
6802    /* An OR of *one* alternative - should not happen now. */
6803    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6804    /* for now we can't handle lookbehind IFMATCH*/
6805    (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6806    (OP(first) == PLUS) ||
6807    (OP(first) == MINMOD) ||
6808    /* An {n,m} with n>0 */
6809    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6810    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6811   {
6812     /*
6813     * the only op that could be a regnode is PLUS, all the rest
6814     * will be regnode_1 or regnode_2.
6815     *
6816     * (yves doesn't think this is true)
6817     */
6818     if (OP(first) == PLUS)
6819      sawplus = 1;
6820     else {
6821      if (OP(first) == MINMOD)
6822       sawminmod = 1;
6823      first += regarglen[OP(first)];
6824     }
6825     first = NEXTOPER(first);
6826     first_next= regnext(first);
6827   }
6828
6829   /* Starting-point info. */
6830  again:
6831   DEBUG_PEEP("first:",first,0);
6832   /* Ignore EXACT as we deal with it later. */
6833   if (PL_regkind[OP(first)] == EXACT) {
6834    if (OP(first) == EXACT)
6835     NOOP; /* Empty, get anchored substr later. */
6836    else
6837     ri->regstclass = first;
6838   }
6839 #ifdef TRIE_STCLASS
6840   else if (PL_regkind[OP(first)] == TRIE &&
6841     ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6842   {
6843    /* this can happen only on restudy */
6844    ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
6845   }
6846 #endif
6847   else if (REGNODE_SIMPLE(OP(first)))
6848    ri->regstclass = first;
6849   else if (PL_regkind[OP(first)] == BOUND ||
6850     PL_regkind[OP(first)] == NBOUND)
6851    ri->regstclass = first;
6852   else if (PL_regkind[OP(first)] == BOL) {
6853    r->intflags |= (OP(first) == MBOL
6854       ? PREGf_ANCH_MBOL
6855       : (OP(first) == SBOL
6856        ? PREGf_ANCH_SBOL
6857        : PREGf_ANCH_BOL));
6858    first = NEXTOPER(first);
6859    goto again;
6860   }
6861   else if (OP(first) == GPOS) {
6862    r->intflags |= PREGf_ANCH_GPOS;
6863    first = NEXTOPER(first);
6864    goto again;
6865   }
6866   else if ((!sawopen || !RExC_sawback) &&
6867    !sawlookahead &&
6868    (OP(first) == STAR &&
6869    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6870    !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
6871   {
6872    /* turn .* into ^.* with an implied $*=1 */
6873    const int type =
6874     (OP(NEXTOPER(first)) == REG_ANY)
6875      ? PREGf_ANCH_MBOL
6876      : PREGf_ANCH_SBOL;
6877    r->intflags |= (type | PREGf_IMPLICIT);
6878    first = NEXTOPER(first);
6879    goto again;
6880   }
6881   if (sawplus && !sawminmod && !sawlookahead
6882    && (!sawopen || !RExC_sawback)
6883    && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6884    /* x+ must match at the 1st pos of run of x's */
6885    r->intflags |= PREGf_SKIP;
6886
6887   /* Scan is after the zeroth branch, first is atomic matcher. */
6888 #ifdef TRIE_STUDY_OPT
6889   DEBUG_PARSE_r(
6890    if (!restudied)
6891     PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6892        (IV)(first - scan + 1))
6893   );
6894 #else
6895   DEBUG_PARSE_r(
6896    PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6897     (IV)(first - scan + 1))
6898   );
6899 #endif
6900
6901
6902   /*
6903   * If there's something expensive in the r.e., find the
6904   * longest literal string that must appear and make it the
6905   * regmust.  Resolve ties in favor of later strings, since
6906   * the regstart check works with the beginning of the r.e.
6907   * and avoiding duplication strengthens checking.  Not a
6908   * strong reason, but sufficient in the absence of others.
6909   * [Now we resolve ties in favor of the earlier string if
6910   * it happens that c_offset_min has been invalidated, since the
6911   * earlier string may buy us something the later one won't.]
6912   */
6913
6914   data.longest_fixed = newSVpvs("");
6915   data.longest_float = newSVpvs("");
6916   data.last_found = newSVpvs("");
6917   data.longest = &(data.longest_fixed);
6918   ENTER_with_name("study_chunk");
6919   SAVEFREESV(data.longest_fixed);
6920   SAVEFREESV(data.longest_float);
6921   SAVEFREESV(data.last_found);
6922   first = scan;
6923   if (!ri->regstclass) {
6924    ssc_init(pRExC_state, &ch_class);
6925    data.start_class = &ch_class;
6926    stclass_flag = SCF_DO_STCLASS_AND;
6927   } else    /* XXXX Check for BOUND? */
6928    stclass_flag = 0;
6929   data.last_closep = &last_close;
6930
6931   DEBUG_RExC_seen();
6932   minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6933        scan + RExC_size, /* Up to end */
6934    &data, -1, 0, NULL,
6935    SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6936       | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6937    0);
6938
6939
6940   CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6941
6942
6943   if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6944    && data.last_start_min == 0 && data.last_end > 0
6945    && !RExC_seen_zerolen
6946    && !(RExC_seen & REG_VERBARG_SEEN)
6947    && !(RExC_seen & REG_GPOS_SEEN)
6948   ){
6949    r->extflags |= RXf_CHECK_ALL;
6950   }
6951   scan_commit(pRExC_state, &data,&minlen,0);
6952
6953   longest_float_length = CHR_SVLEN(data.longest_float);
6954
6955   if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6956     && data.offset_fixed == data.offset_float_min
6957     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6958    && S_setup_longest (aTHX_ pRExC_state,
6959          data.longest_float,
6960          &(r->float_utf8),
6961          &(r->float_substr),
6962          &(r->float_end_shift),
6963          data.lookbehind_float,
6964          data.offset_float_min,
6965          data.minlen_float,
6966          longest_float_length,
6967          cBOOL(data.flags & SF_FL_BEFORE_EOL),
6968          cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6969   {
6970    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6971    r->float_max_offset = data.offset_float_max;
6972    if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6973     r->float_max_offset -= data.lookbehind_float;
6974    SvREFCNT_inc_simple_void_NN(data.longest_float);
6975   }
6976   else {
6977    r->float_substr = r->float_utf8 = NULL;
6978    longest_float_length = 0;
6979   }
6980
6981   longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6982
6983   if (S_setup_longest (aTHX_ pRExC_state,
6984         data.longest_fixed,
6985         &(r->anchored_utf8),
6986         &(r->anchored_substr),
6987         &(r->anchored_end_shift),
6988         data.lookbehind_fixed,
6989         data.offset_fixed,
6990         data.minlen_fixed,
6991         longest_fixed_length,
6992         cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6993         cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6994   {
6995    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6996    SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6997   }
6998   else {
6999    r->anchored_substr = r->anchored_utf8 = NULL;
7000    longest_fixed_length = 0;
7001   }
7002   LEAVE_with_name("study_chunk");
7003
7004   if (ri->regstclass
7005    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7006    ri->regstclass = NULL;
7007
7008   if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7009    && stclass_flag
7010    && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7011    && !ssc_is_anything(data.start_class))
7012   {
7013    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7014
7015    ssc_finalize(pRExC_state, data.start_class);
7016
7017    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7018    StructCopy(data.start_class,
7019      (regnode_ssc*)RExC_rxi->data->data[n],
7020      regnode_ssc);
7021    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7022    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7023    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7024      regprop(r, sv, (regnode*)data.start_class, NULL);
7025      PerlIO_printf(Perl_debug_log,
7026          "synthetic stclass \"%s\".\n",
7027          SvPVX_const(sv));});
7028    data.start_class = NULL;
7029   }
7030
7031   /* A temporary algorithm prefers floated substr to fixed one to dig
7032   * more info. */
7033   if (longest_fixed_length > longest_float_length) {
7034    r->substrs->check_ix = 0;
7035    r->check_end_shift = r->anchored_end_shift;
7036    r->check_substr = r->anchored_substr;
7037    r->check_utf8 = r->anchored_utf8;
7038    r->check_offset_min = r->check_offset_max = r->anchored_offset;
7039    if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7040     r->intflags |= PREGf_NOSCAN;
7041   }
7042   else {
7043    r->substrs->check_ix = 1;
7044    r->check_end_shift = r->float_end_shift;
7045    r->check_substr = r->float_substr;
7046    r->check_utf8 = r->float_utf8;
7047    r->check_offset_min = r->float_min_offset;
7048    r->check_offset_max = r->float_max_offset;
7049   }
7050   if ((r->check_substr || r->check_utf8) ) {
7051    r->extflags |= RXf_USE_INTUIT;
7052    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7053     r->extflags |= RXf_INTUIT_TAIL;
7054   }
7055   r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7056
7057   /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7058   if ( (STRLEN)minlen < longest_float_length )
7059    minlen= longest_float_length;
7060   if ( (STRLEN)minlen < longest_fixed_length )
7061    minlen= longest_fixed_length;
7062   */
7063  }
7064  else {
7065   /* Several toplevels. Best we can is to set minlen. */
7066   SSize_t fake;
7067   regnode_ssc ch_class;
7068   SSize_t last_close = 0;
7069
7070   DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7071
7072   scan = ri->program + 1;
7073   ssc_init(pRExC_state, &ch_class);
7074   data.start_class = &ch_class;
7075   data.last_closep = &last_close;
7076
7077   DEBUG_RExC_seen();
7078   minlen = study_chunk(pRExC_state,
7079    &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7080    SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7081              ? SCF_TRIE_DOING_RESTUDY
7082              : 0),
7083    0);
7084
7085   CHECK_RESTUDY_GOTO_butfirst(NOOP);
7086
7087   r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7088     = r->float_substr = r->float_utf8 = NULL;
7089
7090   if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7091    && ! ssc_is_anything(data.start_class))
7092   {
7093    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7094
7095    ssc_finalize(pRExC_state, data.start_class);
7096
7097    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7098    StructCopy(data.start_class,
7099      (regnode_ssc*)RExC_rxi->data->data[n],
7100      regnode_ssc);
7101    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7102    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7103    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7104      regprop(r, sv, (regnode*)data.start_class, NULL);
7105      PerlIO_printf(Perl_debug_log,
7106          "synthetic stclass \"%s\".\n",
7107          SvPVX_const(sv));});
7108    data.start_class = NULL;
7109   }
7110  }
7111
7112  if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7113   r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7114   r->maxlen = REG_INFTY;
7115  }
7116  else {
7117   r->maxlen = RExC_maxlen;
7118  }
7119
7120  /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7121  the "real" pattern. */
7122  DEBUG_OPTIMISE_r({
7123   PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
7124      (IV)minlen, (IV)r->minlen, RExC_maxlen);
7125  });
7126  r->minlenret = minlen;
7127  if (r->minlen < minlen)
7128   r->minlen = minlen;
7129
7130  if (RExC_seen & REG_GPOS_SEEN)
7131   r->intflags |= PREGf_GPOS_SEEN;
7132  if (RExC_seen & REG_LOOKBEHIND_SEEN)
7133   r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7134             lookbehind */
7135  if (pRExC_state->num_code_blocks)
7136   r->extflags |= RXf_EVAL_SEEN;
7137  if (RExC_seen & REG_CANY_SEEN)
7138   r->intflags |= PREGf_CANY_SEEN;
7139  if (RExC_seen & REG_VERBARG_SEEN)
7140  {
7141   r->intflags |= PREGf_VERBARG_SEEN;
7142   r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7143  }
7144  if (RExC_seen & REG_CUTGROUP_SEEN)
7145   r->intflags |= PREGf_CUTGROUP_SEEN;
7146  if (pm_flags & PMf_USE_RE_EVAL)
7147   r->intflags |= PREGf_USE_RE_EVAL;
7148  if (RExC_paren_names)
7149   RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7150  else
7151   RXp_PAREN_NAMES(r) = NULL;
7152
7153  /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7154  * so it can be used in pp.c */
7155  if (r->intflags & PREGf_ANCH)
7156   r->extflags |= RXf_IS_ANCHORED;
7157
7158
7159  {
7160   /* this is used to identify "special" patterns that might result
7161   * in Perl NOT calling the regex engine and instead doing the match "itself",
7162   * particularly special cases in split//. By having the regex compiler
7163   * do this pattern matching at a regop level (instead of by inspecting the pattern)
7164   * we avoid weird issues with equivalent patterns resulting in different behavior,
7165   * AND we allow non Perl engines to get the same optimizations by the setting the
7166   * flags appropriately - Yves */
7167   regnode *first = ri->program + 1;
7168   U8 fop = OP(first);
7169   regnode *next = NEXTOPER(first);
7170   U8 nop = OP(next);
7171
7172   if (PL_regkind[fop] == NOTHING && nop == END)
7173    r->extflags |= RXf_NULL;
7174   else if (PL_regkind[fop] == BOL && nop == END)
7175    r->extflags |= RXf_START_ONLY;
7176   else if (fop == PLUS
7177     && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7178     && OP(regnext(first)) == END)
7179    r->extflags |= RXf_WHITE;
7180   else if ( r->extflags & RXf_SPLIT
7181     && fop == EXACT
7182     && STR_LEN(first) == 1
7183     && *(STRING(first)) == ' '
7184     && OP(regnext(first)) == END )
7185    r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7186
7187  }
7188
7189  if (RExC_contains_locale) {
7190   RXp_EXTFLAGS(r) |= RXf_TAINTED;
7191  }
7192
7193 #ifdef DEBUGGING
7194  if (RExC_paren_names) {
7195   ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7196   ri->data->data[ri->name_list_idx]
7197         = (void*)SvREFCNT_inc(RExC_paren_name_list);
7198  } else
7199 #endif
7200   ri->name_list_idx = 0;
7201
7202  if (RExC_recurse_count) {
7203   for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7204    const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7205    ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7206   }
7207  }
7208  Newxz(r->offs, RExC_npar, regexp_paren_pair);
7209  /* assume we don't need to swap parens around before we match */
7210
7211  DEBUG_DUMP_r({
7212   DEBUG_RExC_seen();
7213   PerlIO_printf(Perl_debug_log,"Final program:\n");
7214   regdump(r);
7215  });
7216 #ifdef RE_TRACK_PATTERN_OFFSETS
7217  DEBUG_OFFSETS_r(if (ri->u.offsets) {
7218   const STRLEN len = ri->u.offsets[0];
7219   STRLEN i;
7220   GET_RE_DEBUG_FLAGS_DECL;
7221   PerlIO_printf(Perl_debug_log,
7222      "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7223   for (i = 1; i <= len; i++) {
7224    if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7225     PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7226     (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7227    }
7228   PerlIO_printf(Perl_debug_log, "\n");
7229  });
7230 #endif
7231
7232 #ifdef USE_ITHREADS
7233  /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7234  * by setting the regexp SV to readonly-only instead. If the
7235  * pattern's been recompiled, the USEDness should remain. */
7236  if (old_re && SvREADONLY(old_re))
7237   SvREADONLY_on(rx);
7238 #endif
7239  return rx;
7240 }
7241
7242
7243 SV*
7244 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7245      const U32 flags)
7246 {
7247  PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7248
7249  PERL_UNUSED_ARG(value);
7250
7251  if (flags & RXapif_FETCH) {
7252   return reg_named_buff_fetch(rx, key, flags);
7253  } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7254   Perl_croak_no_modify();
7255   return NULL;
7256  } else if (flags & RXapif_EXISTS) {
7257   return reg_named_buff_exists(rx, key, flags)
7258    ? &PL_sv_yes
7259    : &PL_sv_no;
7260  } else if (flags & RXapif_REGNAMES) {
7261   return reg_named_buff_all(rx, flags);
7262  } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7263   return reg_named_buff_scalar(rx, flags);
7264  } else {
7265   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7266   return NULL;
7267  }
7268 }
7269
7270 SV*
7271 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7272       const U32 flags)
7273 {
7274  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7275  PERL_UNUSED_ARG(lastkey);
7276
7277  if (flags & RXapif_FIRSTKEY)
7278   return reg_named_buff_firstkey(rx, flags);
7279  else if (flags & RXapif_NEXTKEY)
7280   return reg_named_buff_nextkey(rx, flags);
7281  else {
7282   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7283            (int)flags);
7284   return NULL;
7285  }
7286 }
7287
7288 SV*
7289 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7290       const U32 flags)
7291 {
7292  AV *retarray = NULL;
7293  SV *ret;
7294  struct regexp *const rx = ReANY(r);
7295
7296  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7297
7298  if (flags & RXapif_ALL)
7299   retarray=newAV();
7300
7301  if (rx && RXp_PAREN_NAMES(rx)) {
7302   HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7303   if (he_str) {
7304    IV i;
7305    SV* sv_dat=HeVAL(he_str);
7306    I32 *nums=(I32*)SvPVX(sv_dat);
7307    for ( i=0; i<SvIVX(sv_dat); i++ ) {
7308     if ((I32)(rx->nparens) >= nums[i]
7309      && rx->offs[nums[i]].start != -1
7310      && rx->offs[nums[i]].end != -1)
7311     {
7312      ret = newSVpvs("");
7313      CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7314      if (!retarray)
7315       return ret;
7316     } else {
7317      if (retarray)
7318       ret = newSVsv(&PL_sv_undef);
7319     }
7320     if (retarray)
7321      av_push(retarray, ret);
7322    }
7323    if (retarray)
7324     return newRV_noinc(MUTABLE_SV(retarray));
7325   }
7326  }
7327  return NULL;
7328 }
7329
7330 bool
7331 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7332       const U32 flags)
7333 {
7334  struct regexp *const rx = ReANY(r);
7335
7336  PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7337
7338  if (rx && RXp_PAREN_NAMES(rx)) {
7339   if (flags & RXapif_ALL) {
7340    return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7341   } else {
7342    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7343    if (sv) {
7344     SvREFCNT_dec_NN(sv);
7345     return TRUE;
7346    } else {
7347     return FALSE;
7348    }
7349   }
7350  } else {
7351   return FALSE;
7352  }
7353 }
7354
7355 SV*
7356 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7357 {
7358  struct regexp *const rx = ReANY(r);
7359
7360  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7361
7362  if ( rx && RXp_PAREN_NAMES(rx) ) {
7363   (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7364
7365   return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7366  } else {
7367   return FALSE;
7368  }
7369 }
7370
7371 SV*
7372 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7373 {
7374  struct regexp *const rx = ReANY(r);
7375  GET_RE_DEBUG_FLAGS_DECL;
7376
7377  PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7378
7379  if (rx && RXp_PAREN_NAMES(rx)) {
7380   HV *hv = RXp_PAREN_NAMES(rx);
7381   HE *temphe;
7382   while ( (temphe = hv_iternext_flags(hv,0)) ) {
7383    IV i;
7384    IV parno = 0;
7385    SV* sv_dat = HeVAL(temphe);
7386    I32 *nums = (I32*)SvPVX(sv_dat);
7387    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7388     if ((I32)(rx->lastparen) >= nums[i] &&
7389      rx->offs[nums[i]].start != -1 &&
7390      rx->offs[nums[i]].end != -1)
7391     {
7392      parno = nums[i];
7393      break;
7394     }
7395    }
7396    if (parno || flags & RXapif_ALL) {
7397     return newSVhek(HeKEY_hek(temphe));
7398    }
7399   }
7400  }
7401  return NULL;
7402 }
7403
7404 SV*
7405 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7406 {
7407  SV *ret;
7408  AV *av;
7409  SSize_t length;
7410  struct regexp *const rx = ReANY(r);
7411
7412  PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7413
7414  if (rx && RXp_PAREN_NAMES(rx)) {
7415   if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7416    return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7417   } else if (flags & RXapif_ONE) {
7418    ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7419    av = MUTABLE_AV(SvRV(ret));
7420    length = av_tindex(av);
7421    SvREFCNT_dec_NN(ret);
7422    return newSViv(length + 1);
7423   } else {
7424    Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7425             (int)flags);
7426    return NULL;
7427   }
7428  }
7429  return &PL_sv_undef;
7430 }
7431
7432 SV*
7433 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7434 {
7435  struct regexp *const rx = ReANY(r);
7436  AV *av = newAV();
7437
7438  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7439
7440  if (rx && RXp_PAREN_NAMES(rx)) {
7441   HV *hv= RXp_PAREN_NAMES(rx);
7442   HE *temphe;
7443   (void)hv_iterinit(hv);
7444   while ( (temphe = hv_iternext_flags(hv,0)) ) {
7445    IV i;
7446    IV parno = 0;
7447    SV* sv_dat = HeVAL(temphe);
7448    I32 *nums = (I32*)SvPVX(sv_dat);
7449    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7450     if ((I32)(rx->lastparen) >= nums[i] &&
7451      rx->offs[nums[i]].start != -1 &&
7452      rx->offs[nums[i]].end != -1)
7453     {
7454      parno = nums[i];
7455      break;
7456     }
7457    }
7458    if (parno || flags & RXapif_ALL) {
7459     av_push(av, newSVhek(HeKEY_hek(temphe)));
7460    }
7461   }
7462  }
7463
7464  return newRV_noinc(MUTABLE_SV(av));
7465 }
7466
7467 void
7468 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7469        SV * const sv)
7470 {
7471  struct regexp *const rx = ReANY(r);
7472  char *s = NULL;
7473  SSize_t i = 0;
7474  SSize_t s1, t1;
7475  I32 n = paren;
7476
7477  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7478
7479  if (      n == RX_BUFF_IDX_CARET_PREMATCH
7480   || n == RX_BUFF_IDX_CARET_FULLMATCH
7481   || n == RX_BUFF_IDX_CARET_POSTMATCH
7482  )
7483  {
7484   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7485   if (!keepcopy) {
7486    /* on something like
7487    *    $r = qr/.../;
7488    *    /$qr/p;
7489    * the KEEPCOPY is set on the PMOP rather than the regex */
7490    if (PL_curpm && r == PM_GETRE(PL_curpm))
7491     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7492   }
7493   if (!keepcopy)
7494    goto ret_undef;
7495  }
7496
7497  if (!rx->subbeg)
7498   goto ret_undef;
7499
7500  if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7501   /* no need to distinguish between them any more */
7502   n = RX_BUFF_IDX_FULLMATCH;
7503
7504  if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7505   && rx->offs[0].start != -1)
7506  {
7507   /* $`, ${^PREMATCH} */
7508   i = rx->offs[0].start;
7509   s = rx->subbeg;
7510  }
7511  else
7512  if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7513   && rx->offs[0].end != -1)
7514  {
7515   /* $', ${^POSTMATCH} */
7516   s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7517   i = rx->sublen + rx->suboffset - rx->offs[0].end;
7518  }
7519  else
7520  if ( 0 <= n && n <= (I32)rx->nparens &&
7521   (s1 = rx->offs[n].start) != -1 &&
7522   (t1 = rx->offs[n].end) != -1)
7523  {
7524   /* $&, ${^MATCH},  $1 ... */
7525   i = t1 - s1;
7526   s = rx->subbeg + s1 - rx->suboffset;
7527  } else {
7528   goto ret_undef;
7529  }
7530
7531  assert(s >= rx->subbeg);
7532  assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7533  if (i >= 0) {
7534 #ifdef NO_TAINT_SUPPORT
7535   sv_setpvn(sv, s, i);
7536 #else
7537   const int oldtainted = TAINT_get;
7538   TAINT_NOT;
7539   sv_setpvn(sv, s, i);
7540   TAINT_set(oldtainted);
7541 #endif
7542   if ( (rx->intflags & PREGf_CANY_SEEN)
7543    ? (RXp_MATCH_UTF8(rx)
7544       && (!i || is_utf8_string((U8*)s, i)))
7545    : (RXp_MATCH_UTF8(rx)) )
7546   {
7547    SvUTF8_on(sv);
7548   }
7549   else
7550    SvUTF8_off(sv);
7551   if (TAINTING_get) {
7552    if (RXp_MATCH_TAINTED(rx)) {
7553     if (SvTYPE(sv) >= SVt_PVMG) {
7554      MAGIC* const mg = SvMAGIC(sv);
7555      MAGIC* mgt;
7556      TAINT;
7557      SvMAGIC_set(sv, mg->mg_moremagic);
7558      SvTAINT(sv);
7559      if ((mgt = SvMAGIC(sv))) {
7560       mg->mg_moremagic = mgt;
7561       SvMAGIC_set(sv, mg);
7562      }
7563     } else {
7564      TAINT;
7565      SvTAINT(sv);
7566     }
7567    } else
7568     SvTAINTED_off(sv);
7569   }
7570  } else {
7571  ret_undef:
7572   sv_setsv(sv,&PL_sv_undef);
7573   return;
7574  }
7575 }
7576
7577 void
7578 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7579               SV const * const value)
7580 {
7581  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7582
7583  PERL_UNUSED_ARG(rx);
7584  PERL_UNUSED_ARG(paren);
7585  PERL_UNUSED_ARG(value);
7586
7587  if (!PL_localizing)
7588   Perl_croak_no_modify();
7589 }
7590
7591 I32
7592 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7593        const I32 paren)
7594 {
7595  struct regexp *const rx = ReANY(r);
7596  I32 i;
7597  I32 s1, t1;
7598
7599  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7600
7601  if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7602   || paren == RX_BUFF_IDX_CARET_FULLMATCH
7603   || paren == RX_BUFF_IDX_CARET_POSTMATCH
7604  )
7605  {
7606   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7607   if (!keepcopy) {
7608    /* on something like
7609    *    $r = qr/.../;
7610    *    /$qr/p;
7611    * the KEEPCOPY is set on the PMOP rather than the regex */
7612    if (PL_curpm && r == PM_GETRE(PL_curpm))
7613     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7614   }
7615   if (!keepcopy)
7616    goto warn_undef;
7617  }
7618
7619  /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7620  switch (paren) {
7621  case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7622  case RX_BUFF_IDX_PREMATCH:       /* $` */
7623   if (rx->offs[0].start != -1) {
7624       i = rx->offs[0].start;
7625       if (i > 0) {
7626         s1 = 0;
7627         t1 = i;
7628         goto getlen;
7629       }
7630    }
7631   return 0;
7632
7633  case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7634  case RX_BUFF_IDX_POSTMATCH:       /* $' */
7635    if (rx->offs[0].end != -1) {
7636       i = rx->sublen - rx->offs[0].end;
7637       if (i > 0) {
7638         s1 = rx->offs[0].end;
7639         t1 = rx->sublen;
7640         goto getlen;
7641       }
7642    }
7643   return 0;
7644
7645  default: /* $& / ${^MATCH}, $1, $2, ... */
7646    if (paren <= (I32)rx->nparens &&
7647    (s1 = rx->offs[paren].start) != -1 &&
7648    (t1 = rx->offs[paren].end) != -1)
7649    {
7650    i = t1 - s1;
7651    goto getlen;
7652   } else {
7653   warn_undef:
7654    if (ckWARN(WARN_UNINITIALIZED))
7655     report_uninit((const SV *)sv);
7656    return 0;
7657   }
7658  }
7659   getlen:
7660  if (i > 0 && RXp_MATCH_UTF8(rx)) {
7661   const char * const s = rx->subbeg - rx->suboffset + s1;
7662   const U8 *ep;
7663   STRLEN el;
7664
7665   i = t1 - s1;
7666   if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7667       i = el;
7668  }
7669  return i;
7670 }
7671
7672 SV*
7673 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7674 {
7675  PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7676   PERL_UNUSED_ARG(rx);
7677   if (0)
7678    return NULL;
7679   else
7680    return newSVpvs("Regexp");
7681 }
7682
7683 /* Scans the name of a named buffer from the pattern.
7684  * If flags is REG_RSN_RETURN_NULL returns null.
7685  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7686  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7687  * to the parsed name as looked up in the RExC_paren_names hash.
7688  * If there is an error throws a vFAIL().. type exception.
7689  */
7690
7691 #define REG_RSN_RETURN_NULL    0
7692 #define REG_RSN_RETURN_NAME    1
7693 #define REG_RSN_RETURN_DATA    2
7694
7695 STATIC SV*
7696 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7697 {
7698  char *name_start = RExC_parse;
7699
7700  PERL_ARGS_ASSERT_REG_SCAN_NAME;
7701
7702  assert (RExC_parse <= RExC_end);
7703  if (RExC_parse == RExC_end) NOOP;
7704  else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7705   /* skip IDFIRST by using do...while */
7706   if (UTF)
7707    do {
7708     RExC_parse += UTF8SKIP(RExC_parse);
7709    } while (isWORDCHAR_utf8((U8*)RExC_parse));
7710   else
7711    do {
7712     RExC_parse++;
7713    } while (isWORDCHAR(*RExC_parse));
7714  } else {
7715   RExC_parse++; /* so the <- from the vFAIL is after the offending
7716       character */
7717   vFAIL("Group name must start with a non-digit word character");
7718  }
7719  if ( flags ) {
7720   SV* sv_name
7721    = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7722        SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7723   if ( flags == REG_RSN_RETURN_NAME)
7724    return sv_name;
7725   else if (flags==REG_RSN_RETURN_DATA) {
7726    HE *he_str = NULL;
7727    SV *sv_dat = NULL;
7728    if ( ! sv_name )      /* should not happen*/
7729     Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7730    if (RExC_paren_names)
7731     he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7732    if ( he_str )
7733     sv_dat = HeVAL(he_str);
7734    if ( ! sv_dat )
7735     vFAIL("Reference to nonexistent named group");
7736    return sv_dat;
7737   }
7738   else {
7739    Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7740      (unsigned long) flags);
7741   }
7742   assert(0); /* NOT REACHED */
7743  }
7744  return NULL;
7745 }
7746
7747 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7748  int rem=(int)(RExC_end - RExC_parse);                       \
7749  int cut;                                                    \
7750  int num;                                                    \
7751  int iscut=0;                                                \
7752  if (rem>10) {                                               \
7753   rem=10;                                                 \
7754   iscut=1;                                                \
7755  }                                                           \
7756  cut=10-rem;                                                 \
7757  if (RExC_lastparse!=RExC_parse)                             \
7758   PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
7759    rem, RExC_parse,                                    \
7760    cut + 4,                                            \
7761    iscut ? "..." : "<"                                 \
7762   );                                                      \
7763  else                                                        \
7764   PerlIO_printf(Perl_debug_log,"%16s","");                \
7765                 \
7766  if (SIZE_ONLY)                                              \
7767  num = RExC_size + 1;                                     \
7768  else                                                        \
7769  num=REG_NODE_NUM(RExC_emit);                             \
7770  if (RExC_lastnum!=num)                                      \
7771  PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7772  else                                                        \
7773  PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7774  PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7775   (int)((depth*2)), "",                                   \
7776   (funcname)                                              \
7777  );                                                          \
7778  RExC_lastnum=num;                                           \
7779  RExC_lastparse=RExC_parse;                                  \
7780 })
7781
7782
7783
7784 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7785  DEBUG_PARSE_MSG((funcname));                            \
7786  PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7787 })
7788 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7789  DEBUG_PARSE_MSG((funcname));                            \
7790  PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7791 })
7792
7793 /* This section of code defines the inversion list object and its methods.  The
7794  * interfaces are highly subject to change, so as much as possible is static to
7795  * this file.  An inversion list is here implemented as a malloc'd C UV array
7796  * as an SVt_INVLIST scalar.
7797  *
7798  * An inversion list for Unicode is an array of code points, sorted by ordinal
7799  * number.  The zeroth element is the first code point in the list.  The 1th
7800  * element is the first element beyond that not in the list.  In other words,
7801  * the first range is
7802  *  invlist[0]..(invlist[1]-1)
7803  * The other ranges follow.  Thus every element whose index is divisible by two
7804  * marks the beginning of a range that is in the list, and every element not
7805  * divisible by two marks the beginning of a range not in the list.  A single
7806  * element inversion list that contains the single code point N generally
7807  * consists of two elements
7808  *  invlist[0] == N
7809  *  invlist[1] == N+1
7810  * (The exception is when N is the highest representable value on the
7811  * machine, in which case the list containing just it would be a single
7812  * element, itself.  By extension, if the last range in the list extends to
7813  * infinity, then the first element of that range will be in the inversion list
7814  * at a position that is divisible by two, and is the final element in the
7815  * list.)
7816  * Taking the complement (inverting) an inversion list is quite simple, if the
7817  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7818  * This implementation reserves an element at the beginning of each inversion
7819  * list to always contain 0; there is an additional flag in the header which
7820  * indicates if the list begins at the 0, or is offset to begin at the next
7821  * element.
7822  *
7823  * More about inversion lists can be found in "Unicode Demystified"
7824  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7825  * More will be coming when functionality is added later.
7826  *
7827  * The inversion list data structure is currently implemented as an SV pointing
7828  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7829  * array of UV whose memory management is automatically handled by the existing
7830  * facilities for SV's.
7831  *
7832  * Some of the methods should always be private to the implementation, and some
7833  * should eventually be made public */
7834
7835 /* The header definitions are in F<inline_invlist.c> */
7836
7837 PERL_STATIC_INLINE UV*
7838 S__invlist_array_init(SV* const invlist, const bool will_have_0)
7839 {
7840  /* Returns a pointer to the first element in the inversion list's array.
7841  * This is called upon initialization of an inversion list.  Where the
7842  * array begins depends on whether the list has the code point U+0000 in it
7843  * or not.  The other parameter tells it whether the code that follows this
7844  * call is about to put a 0 in the inversion list or not.  The first
7845  * element is either the element reserved for 0, if TRUE, or the element
7846  * after it, if FALSE */
7847
7848  bool* offset = get_invlist_offset_addr(invlist);
7849  UV* zero_addr = (UV *) SvPVX(invlist);
7850
7851  PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7852
7853  /* Must be empty */
7854  assert(! _invlist_len(invlist));
7855
7856  *zero_addr = 0;
7857
7858  /* 1^1 = 0; 1^0 = 1 */
7859  *offset = 1 ^ will_have_0;
7860  return zero_addr + *offset;
7861 }
7862
7863 PERL_STATIC_INLINE UV*
7864 S_invlist_array(SV* const invlist)
7865 {
7866  /* Returns the pointer to the inversion list's array.  Every time the
7867  * length changes, this needs to be called in case malloc or realloc moved
7868  * it */
7869
7870  PERL_ARGS_ASSERT_INVLIST_ARRAY;
7871
7872  /* Must not be empty.  If these fail, you probably didn't check for <len>
7873  * being non-zero before trying to get the array */
7874  assert(_invlist_len(invlist));
7875
7876  /* The very first element always contains zero, The array begins either
7877  * there, or if the inversion list is offset, at the element after it.
7878  * The offset header field determines which; it contains 0 or 1 to indicate
7879  * how much additionally to add */
7880  assert(0 == *(SvPVX(invlist)));
7881  return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7882 }
7883
7884 PERL_STATIC_INLINE void
7885 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7886 {
7887  /* Sets the current number of elements stored in the inversion list.
7888  * Updates SvCUR correspondingly */
7889  PERL_UNUSED_CONTEXT;
7890  PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7891
7892  assert(SvTYPE(invlist) == SVt_INVLIST);
7893
7894  SvCUR_set(invlist,
7895    (len == 0)
7896    ? 0
7897    : TO_INTERNAL_SIZE(len + offset));
7898  assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7899 }
7900
7901 PERL_STATIC_INLINE IV*
7902 S_get_invlist_previous_index_addr(SV* invlist)
7903 {
7904  /* Return the address of the IV that is reserved to hold the cached index
7905  * */
7906  PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7907
7908  assert(SvTYPE(invlist) == SVt_INVLIST);
7909
7910  return &(((XINVLIST*) SvANY(invlist))->prev_index);
7911 }
7912
7913 PERL_STATIC_INLINE IV
7914 S_invlist_previous_index(SV* const invlist)
7915 {
7916  /* Returns cached index of previous search */
7917
7918  PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7919
7920  return *get_invlist_previous_index_addr(invlist);
7921 }
7922
7923 PERL_STATIC_INLINE void
7924 S_invlist_set_previous_index(SV* const invlist, const IV index)
7925 {
7926  /* Caches <index> for later retrieval */
7927
7928  PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7929
7930  assert(index == 0 || index < (int) _invlist_len(invlist));
7931
7932  *get_invlist_previous_index_addr(invlist) = index;
7933 }
7934
7935 PERL_STATIC_INLINE UV
7936 S_invlist_max(SV* const invlist)
7937 {
7938  /* Returns the maximum number of elements storable in the inversion list's
7939  * array, without having to realloc() */
7940
7941  PERL_ARGS_ASSERT_INVLIST_MAX;
7942
7943  assert(SvTYPE(invlist) == SVt_INVLIST);
7944
7945  /* Assumes worst case, in which the 0 element is not counted in the
7946  * inversion list, so subtracts 1 for that */
7947  return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7948   ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7949   : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7950 }
7951
7952 #ifndef PERL_IN_XSUB_RE
7953 SV*
7954 Perl__new_invlist(pTHX_ IV initial_size)
7955 {
7956
7957  /* Return a pointer to a newly constructed inversion list, with enough
7958  * space to store 'initial_size' elements.  If that number is negative, a
7959  * system default is used instead */
7960
7961  SV* new_list;
7962
7963  if (initial_size < 0) {
7964   initial_size = 10;
7965  }
7966
7967  /* Allocate the initial space */
7968  new_list = newSV_type(SVt_INVLIST);
7969
7970  /* First 1 is in case the zero element isn't in the list; second 1 is for
7971  * trailing NUL */
7972  SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7973  invlist_set_len(new_list, 0, 0);
7974
7975  /* Force iterinit() to be used to get iteration to work */
7976  *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7977
7978  *get_invlist_previous_index_addr(new_list) = 0;
7979
7980  return new_list;
7981 }
7982
7983 SV*
7984 Perl__new_invlist_C_array(pTHX_ const UV* const list)
7985 {
7986  /* Return a pointer to a newly constructed inversion list, initialized to
7987  * point to <list>, which has to be in the exact correct inversion list
7988  * form, including internal fields.  Thus this is a dangerous routine that
7989  * should not be used in the wrong hands.  The passed in 'list' contains
7990  * several header fields at the beginning that are not part of the
7991  * inversion list body proper */
7992
7993  const STRLEN length = (STRLEN) list[0];
7994  const UV version_id =          list[1];
7995  const bool offset   =    cBOOL(list[2]);
7996 #define HEADER_LENGTH 3
7997  /* If any of the above changes in any way, you must change HEADER_LENGTH
7998  * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7999  *      perl -E 'say int(rand 2**31-1)'
8000  */
8001 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8002           data structure type, so that one being
8003           passed in can be validated to be an
8004           inversion list of the correct vintage.
8005          */
8006
8007  SV* invlist = newSV_type(SVt_INVLIST);
8008
8009  PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8010
8011  if (version_id != INVLIST_VERSION_ID) {
8012   Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8013  }
8014
8015  /* The generated array passed in includes header elements that aren't part
8016  * of the list proper, so start it just after them */
8017  SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8018
8019  SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
8020        shouldn't touch it */
8021
8022  *(get_invlist_offset_addr(invlist)) = offset;
8023
8024  /* The 'length' passed to us is the physical number of elements in the
8025  * inversion list.  But if there is an offset the logical number is one
8026  * less than that */
8027  invlist_set_len(invlist, length  - offset, offset);
8028
8029  invlist_set_previous_index(invlist, 0);
8030
8031  /* Initialize the iteration pointer. */
8032  invlist_iterfinish(invlist);
8033
8034  SvREADONLY_on(invlist);
8035
8036  return invlist;
8037 }
8038 #endif /* ifndef PERL_IN_XSUB_RE */
8039
8040 STATIC void
8041 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8042 {
8043  /* Grow the maximum size of an inversion list */
8044
8045  PERL_ARGS_ASSERT_INVLIST_EXTEND;
8046
8047  assert(SvTYPE(invlist) == SVt_INVLIST);
8048
8049  /* Add one to account for the zero element at the beginning which may not
8050  * be counted by the calling parameters */
8051  SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8052 }
8053
8054 PERL_STATIC_INLINE void
8055 S_invlist_trim(SV* const invlist)
8056 {
8057  PERL_ARGS_ASSERT_INVLIST_TRIM;
8058
8059  assert(SvTYPE(invlist) == SVt_INVLIST);
8060
8061  /* Change the length of the inversion list to how many entries it currently
8062  * has */
8063  SvPV_shrink_to_cur((SV *) invlist);
8064 }
8065
8066 STATIC void
8067 S__append_range_to_invlist(pTHX_ SV* const invlist,
8068         const UV start, const UV end)
8069 {
8070    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8071  * the end of the inversion list.  The range must be above any existing
8072  * ones. */
8073
8074  UV* array;
8075  UV max = invlist_max(invlist);
8076  UV len = _invlist_len(invlist);
8077  bool offset;
8078
8079  PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8080
8081  if (len == 0) { /* Empty lists must be initialized */
8082   offset = start != 0;
8083   array = _invlist_array_init(invlist, ! offset);
8084  }
8085  else {
8086   /* Here, the existing list is non-empty. The current max entry in the
8087   * list is generally the first value not in the set, except when the
8088   * set extends to the end of permissible values, in which case it is
8089   * the first entry in that final set, and so this call is an attempt to
8090   * append out-of-order */
8091
8092   UV final_element = len - 1;
8093   array = invlist_array(invlist);
8094   if (array[final_element] > start
8095    || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8096   {
8097    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8098      array[final_element], start,
8099      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8100   }
8101
8102   /* Here, it is a legal append.  If the new range begins with the first
8103   * value not in the set, it is extending the set, so the new first
8104   * value not in the set is one greater than the newly extended range.
8105   * */
8106   offset = *get_invlist_offset_addr(invlist);
8107   if (array[final_element] == start) {
8108    if (end != UV_MAX) {
8109     array[final_element] = end + 1;
8110    }
8111    else {
8112     /* But if the end is the maximum representable on the machine,
8113     * just let the range that this would extend to have no end */
8114     invlist_set_len(invlist, len - 1, offset);
8115    }
8116    return;
8117   }
8118  }
8119
8120  /* Here the new range doesn't extend any existing set.  Add it */
8121
8122  len += 2; /* Includes an element each for the start and end of range */
8123
8124  /* If wll overflow the existing space, extend, which may cause the array to
8125  * be moved */
8126  if (max < len) {
8127   invlist_extend(invlist, len);
8128
8129   /* Have to set len here to avoid assert failure in invlist_array() */
8130   invlist_set_len(invlist, len, offset);
8131
8132   array = invlist_array(invlist);
8133  }
8134  else {
8135   invlist_set_len(invlist, len, offset);
8136  }
8137
8138  /* The next item on the list starts the range, the one after that is
8139  * one past the new range.  */
8140  array[len - 2] = start;
8141  if (end != UV_MAX) {
8142   array[len - 1] = end + 1;
8143  }
8144  else {
8145   /* But if the end is the maximum representable on the machine, just let
8146   * the range have no end */
8147   invlist_set_len(invlist, len - 1, offset);
8148  }
8149 }
8150
8151 #ifndef PERL_IN_XSUB_RE
8152
8153 IV
8154 Perl__invlist_search(SV* const invlist, const UV cp)
8155 {
8156  /* Searches the inversion list for the entry that contains the input code
8157  * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8158  * return value is the index into the list's array of the range that
8159  * contains <cp> */
8160
8161  IV low = 0;
8162  IV mid;
8163  IV high = _invlist_len(invlist);
8164  const IV highest_element = high - 1;
8165  const UV* array;
8166
8167  PERL_ARGS_ASSERT__INVLIST_SEARCH;
8168
8169  /* If list is empty, return failure. */
8170  if (high == 0) {
8171   return -1;
8172  }
8173
8174  /* (We can't get the array unless we know the list is non-empty) */
8175  array = invlist_array(invlist);
8176
8177  mid = invlist_previous_index(invlist);
8178  assert(mid >=0 && mid <= highest_element);
8179
8180  /* <mid> contains the cache of the result of the previous call to this
8181  * function (0 the first time).  See if this call is for the same result,
8182  * or if it is for mid-1.  This is under the theory that calls to this
8183  * function will often be for related code points that are near each other.
8184  * And benchmarks show that caching gives better results.  We also test
8185  * here if the code point is within the bounds of the list.  These tests
8186  * replace others that would have had to be made anyway to make sure that
8187  * the array bounds were not exceeded, and these give us extra information
8188  * at the same time */
8189  if (cp >= array[mid]) {
8190   if (cp >= array[highest_element]) {
8191    return highest_element;
8192   }
8193
8194   /* Here, array[mid] <= cp < array[highest_element].  This means that
8195   * the final element is not the answer, so can exclude it; it also
8196   * means that <mid> is not the final element, so can refer to 'mid + 1'
8197   * safely */
8198   if (cp < array[mid + 1]) {
8199    return mid;
8200   }
8201   high--;
8202   low = mid + 1;
8203  }
8204  else { /* cp < aray[mid] */
8205   if (cp < array[0]) { /* Fail if outside the array */
8206    return -1;
8207   }
8208   high = mid;
8209   if (cp >= array[mid - 1]) {
8210    goto found_entry;
8211   }
8212  }
8213
8214  /* Binary search.  What we are looking for is <i> such that
8215  * array[i] <= cp < array[i+1]
8216  * The loop below converges on the i+1.  Note that there may not be an
8217  * (i+1)th element in the array, and things work nonetheless */
8218  while (low < high) {
8219   mid = (low + high) / 2;
8220   assert(mid <= highest_element);
8221   if (array[mid] <= cp) { /* cp >= array[mid] */
8222    low = mid + 1;
8223
8224    /* We could do this extra test to exit the loop early.
8225    if (cp < array[low]) {
8226     return mid;
8227    }
8228    */
8229   }
8230   else { /* cp < array[mid] */
8231    high = mid;
8232   }
8233  }
8234
8235   found_entry:
8236  high--;
8237  invlist_set_previous_index(invlist, high);
8238  return high;
8239 }
8240
8241 void
8242 Perl__invlist_populate_swatch(SV* const invlist,
8243        const UV start, const UV end, U8* swatch)
8244 {
8245  /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8246  * but is used when the swash has an inversion list.  This makes this much
8247  * faster, as it uses a binary search instead of a linear one.  This is
8248  * intimately tied to that function, and perhaps should be in utf8.c,
8249  * except it is intimately tied to inversion lists as well.  It assumes
8250  * that <swatch> is all 0's on input */
8251
8252  UV current = start;
8253  const IV len = _invlist_len(invlist);
8254  IV i;
8255  const UV * array;
8256
8257  PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8258
8259  if (len == 0) { /* Empty inversion list */
8260   return;
8261  }
8262
8263  array = invlist_array(invlist);
8264
8265  /* Find which element it is */
8266  i = _invlist_search(invlist, start);
8267
8268  /* We populate from <start> to <end> */
8269  while (current < end) {
8270   UV upper;
8271
8272   /* The inversion list gives the results for every possible code point
8273   * after the first one in the list.  Only those ranges whose index is
8274   * even are ones that the inversion list matches.  For the odd ones,
8275   * and if the initial code point is not in the list, we have to skip
8276   * forward to the next element */
8277   if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8278    i++;
8279    if (i >= len) { /* Finished if beyond the end of the array */
8280     return;
8281    }
8282    current = array[i];
8283    if (current >= end) {   /* Finished if beyond the end of what we
8284          are populating */
8285     if (LIKELY(end < UV_MAX)) {
8286      return;
8287     }
8288
8289     /* We get here when the upper bound is the maximum
8290     * representable on the machine, and we are looking for just
8291     * that code point.  Have to special case it */
8292     i = len;
8293     goto join_end_of_list;
8294    }
8295   }
8296   assert(current >= start);
8297
8298   /* The current range ends one below the next one, except don't go past
8299   * <end> */
8300   i++;
8301   upper = (i < len && array[i] < end) ? array[i] : end;
8302
8303   /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8304   * for each code point in it */
8305   for (; current < upper; current++) {
8306    const STRLEN offset = (STRLEN)(current - start);
8307    swatch[offset >> 3] |= 1 << (offset & 7);
8308   }
8309
8310  join_end_of_list:
8311
8312   /* Quit if at the end of the list */
8313   if (i >= len) {
8314
8315    /* But first, have to deal with the highest possible code point on
8316    * the platform.  The previous code assumes that <end> is one
8317    * beyond where we want to populate, but that is impossible at the
8318    * platform's infinity, so have to handle it specially */
8319    if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8320    {
8321     const STRLEN offset = (STRLEN)(end - start);
8322     swatch[offset >> 3] |= 1 << (offset & 7);
8323    }
8324    return;
8325   }
8326
8327   /* Advance to the next range, which will be for code points not in the
8328   * inversion list */
8329   current = array[i];
8330  }
8331
8332  return;
8333 }
8334
8335 void
8336 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8337           const bool complement_b, SV** output)
8338 {
8339  /* Take the union of two inversion lists and point <output> to it.  *output
8340  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8341  * the reference count to that list will be decremented if not already a
8342  * temporary (mortal); otherwise *output will be made correspondingly
8343  * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8344  * second list is returned.  If <complement_b> is TRUE, the union is taken
8345  * of the complement (inversion) of <b> instead of b itself.
8346  *
8347  * The basis for this comes from "Unicode Demystified" Chapter 13 by
8348  * Richard Gillam, published by Addison-Wesley, and explained at some
8349  * length there.  The preface says to incorporate its examples into your
8350  * code at your own risk.
8351  *
8352  * The algorithm is like a merge sort.
8353  *
8354  * XXX A potential performance improvement is to keep track as we go along
8355  * if only one of the inputs contributes to the result, meaning the other
8356  * is a subset of that one.  In that case, we can skip the final copy and
8357  * return the larger of the input lists, but then outside code might need
8358  * to keep track of whether to free the input list or not */
8359
8360  const UV* array_a;    /* a's array */
8361  const UV* array_b;
8362  UV len_a;     /* length of a's array */
8363  UV len_b;
8364
8365  SV* u;   /* the resulting union */
8366  UV* array_u;
8367  UV len_u;
8368
8369  UV i_a = 0;      /* current index into a's array */
8370  UV i_b = 0;
8371  UV i_u = 0;
8372
8373  /* running count, as explained in the algorithm source book; items are
8374  * stopped accumulating and are output when the count changes to/from 0.
8375  * The count is incremented when we start a range that's in the set, and
8376  * decremented when we start a range that's not in the set.  So its range
8377  * is 0 to 2.  Only when the count is zero is something not in the set.
8378  */
8379  UV count = 0;
8380
8381  PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8382  assert(a != b);
8383
8384  /* If either one is empty, the union is the other one */
8385  if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8386   bool make_temp = FALSE; /* Should we mortalize the result? */
8387
8388   if (*output == a) {
8389    if (a != NULL) {
8390     if (! (make_temp = cBOOL(SvTEMP(a)))) {
8391      SvREFCNT_dec_NN(a);
8392     }
8393    }
8394   }
8395   if (*output != b) {
8396    *output = invlist_clone(b);
8397    if (complement_b) {
8398     _invlist_invert(*output);
8399    }
8400   } /* else *output already = b; */
8401
8402   if (make_temp) {
8403    sv_2mortal(*output);
8404   }
8405   return;
8406  }
8407  else if ((len_b = _invlist_len(b)) == 0) {
8408   bool make_temp = FALSE;
8409   if (*output == b) {
8410    if (! (make_temp = cBOOL(SvTEMP(b)))) {
8411     SvREFCNT_dec_NN(b);
8412    }
8413   }
8414
8415   /* The complement of an empty list is a list that has everything in it,
8416   * so the union with <a> includes everything too */
8417   if (complement_b) {
8418    if (a == *output) {
8419     if (! (make_temp = cBOOL(SvTEMP(a)))) {
8420      SvREFCNT_dec_NN(a);
8421     }
8422    }
8423    *output = _new_invlist(1);
8424    _append_range_to_invlist(*output, 0, UV_MAX);
8425   }
8426   else if (*output != a) {
8427    *output = invlist_clone(a);
8428   }
8429   /* else *output already = a; */
8430
8431   if (make_temp) {
8432    sv_2mortal(*output);
8433   }
8434   return;
8435  }
8436
8437  /* Here both lists exist and are non-empty */
8438  array_a = invlist_array(a);
8439  array_b = invlist_array(b);
8440
8441  /* If are to take the union of 'a' with the complement of b, set it
8442  * up so are looking at b's complement. */
8443  if (complement_b) {
8444
8445   /* To complement, we invert: if the first element is 0, remove it.  To
8446   * do this, we just pretend the array starts one later */
8447   if (array_b[0] == 0) {
8448    array_b++;
8449    len_b--;
8450   }
8451   else {
8452
8453    /* But if the first element is not zero, we pretend the list starts
8454    * at the 0 that is always stored immediately before the array. */
8455    array_b--;
8456    len_b++;
8457   }
8458  }
8459
8460  /* Size the union for the worst case: that the sets are completely
8461  * disjoint */
8462  u = _new_invlist(len_a + len_b);
8463
8464  /* Will contain U+0000 if either component does */
8465  array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8466          || (len_b > 0 && array_b[0] == 0));
8467
8468  /* Go through each list item by item, stopping when exhausted one of
8469  * them */
8470  while (i_a < len_a && i_b < len_b) {
8471   UV cp;     /* The element to potentially add to the union's array */
8472   bool cp_in_set;   /* is it in the the input list's set or not */
8473
8474   /* We need to take one or the other of the two inputs for the union.
8475   * Since we are merging two sorted lists, we take the smaller of the
8476   * next items.  In case of a tie, we take the one that is in its set
8477   * first.  If we took one not in the set first, it would decrement the
8478   * count, possibly to 0 which would cause it to be output as ending the
8479   * range, and the next time through we would take the same number, and
8480   * output it again as beginning the next range.  By doing it the
8481   * opposite way, there is no possibility that the count will be
8482   * momentarily decremented to 0, and thus the two adjoining ranges will
8483   * be seamlessly merged.  (In a tie and both are in the set or both not
8484   * in the set, it doesn't matter which we take first.) */
8485   if (array_a[i_a] < array_b[i_b]
8486    || (array_a[i_a] == array_b[i_b]
8487     && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8488   {
8489    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8490    cp= array_a[i_a++];
8491   }
8492   else {
8493    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8494    cp = array_b[i_b++];
8495   }
8496
8497   /* Here, have chosen which of the two inputs to look at.  Only output
8498   * if the running count changes to/from 0, which marks the
8499   * beginning/end of a range in that's in the set */
8500   if (cp_in_set) {
8501    if (count == 0) {
8502     array_u[i_u++] = cp;
8503    }
8504    count++;
8505   }
8506   else {
8507    count--;
8508    if (count == 0) {
8509     array_u[i_u++] = cp;
8510    }
8511   }
8512  }
8513
8514  /* Here, we are finished going through at least one of the lists, which
8515  * means there is something remaining in at most one.  We check if the list
8516  * that hasn't been exhausted is positioned such that we are in the middle
8517  * of a range in its set or not.  (i_a and i_b point to the element beyond
8518  * the one we care about.) If in the set, we decrement 'count'; if 0, there
8519  * is potentially more to output.
8520  * There are four cases:
8521  * 1) Both weren't in their sets, count is 0, and remains 0.  What's left
8522  *    in the union is entirely from the non-exhausted set.
8523  * 2) Both were in their sets, count is 2.  Nothing further should
8524  *    be output, as everything that remains will be in the exhausted
8525  *    list's set, hence in the union; decrementing to 1 but not 0 insures
8526  *    that
8527  * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8528  *    Nothing further should be output because the union includes
8529  *    everything from the exhausted set.  Not decrementing ensures that.
8530  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8531  *    decrementing to 0 insures that we look at the remainder of the
8532  *    non-exhausted set */
8533  if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8534   || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8535  {
8536   count--;
8537  }
8538
8539  /* The final length is what we've output so far, plus what else is about to
8540  * be output.  (If 'count' is non-zero, then the input list we exhausted
8541  * has everything remaining up to the machine's limit in its set, and hence
8542  * in the union, so there will be no further output. */
8543  len_u = i_u;
8544  if (count == 0) {
8545   /* At most one of the subexpressions will be non-zero */
8546   len_u += (len_a - i_a) + (len_b - i_b);
8547  }
8548
8549  /* Set result to final length, which can change the pointer to array_u, so
8550  * re-find it */
8551  if (len_u != _invlist_len(u)) {
8552   invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8553   invlist_trim(u);
8554   array_u = invlist_array(u);
8555  }
8556
8557  /* When 'count' is 0, the list that was exhausted (if one was shorter than
8558  * the other) ended with everything above it not in its set.  That means
8559  * that the remaining part of the union is precisely the same as the
8560  * non-exhausted list, so can just copy it unchanged.  (If both list were
8561  * exhausted at the same time, then the operations below will be both 0.)
8562  */
8563  if (count == 0) {
8564   IV copy_count; /* At most one will have a non-zero copy count */
8565   if ((copy_count = len_a - i_a) > 0) {
8566    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8567   }
8568   else if ((copy_count = len_b - i_b) > 0) {
8569    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8570   }
8571  }
8572
8573  /*  We may be removing a reference to one of the inputs.  If so, the output
8574  *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8575  *  count decremented) */
8576  if (a == *output || b == *output) {
8577   assert(! invlist_is_iterating(*output));
8578   if ((SvTEMP(*output))) {
8579    sv_2mortal(u);
8580   }
8581   else {
8582    SvREFCNT_dec_NN(*output);
8583   }
8584  }
8585
8586  *output = u;
8587
8588  return;
8589 }
8590
8591 void
8592 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8593            const bool complement_b, SV** i)
8594 {
8595  /* Take the intersection of two inversion lists and point <i> to it.  *i
8596  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8597  * the reference count to that list will be decremented if not already a
8598  * temporary (mortal); otherwise *i will be made correspondingly mortal.
8599  * The first list, <a>, may be NULL, in which case an empty list is
8600  * returned.  If <complement_b> is TRUE, the result will be the
8601  * intersection of <a> and the complement (or inversion) of <b> instead of
8602  * <b> directly.
8603  *
8604  * The basis for this comes from "Unicode Demystified" Chapter 13 by
8605  * Richard Gillam, published by Addison-Wesley, and explained at some
8606  * length there.  The preface says to incorporate its examples into your
8607  * code at your own risk.  In fact, it had bugs
8608  *
8609  * The algorithm is like a merge sort, and is essentially the same as the
8610  * union above
8611  */
8612
8613  const UV* array_a;  /* a's array */
8614  const UV* array_b;
8615  UV len_a; /* length of a's array */
8616  UV len_b;
8617
8618  SV* r;       /* the resulting intersection */
8619  UV* array_r;
8620  UV len_r;
8621
8622  UV i_a = 0;      /* current index into a's array */
8623  UV i_b = 0;
8624  UV i_r = 0;
8625
8626  /* running count, as explained in the algorithm source book; items are
8627  * stopped accumulating and are output when the count changes to/from 2.
8628  * The count is incremented when we start a range that's in the set, and
8629  * decremented when we start a range that's not in the set.  So its range
8630  * is 0 to 2.  Only when the count is 2 is something in the intersection.
8631  */
8632  UV count = 0;
8633
8634  PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8635  assert(a != b);
8636
8637  /* Special case if either one is empty */
8638  len_a = (a == NULL) ? 0 : _invlist_len(a);
8639  if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8640   bool make_temp = FALSE;
8641
8642   if (len_a != 0 && complement_b) {
8643
8644    /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8645    * be empty.  Here, also we are using 'b's complement, which hence
8646    * must be every possible code point.  Thus the intersection is
8647    * simply 'a'. */
8648    if (*i != a) {
8649     if (*i == b) {
8650      if (! (make_temp = cBOOL(SvTEMP(b)))) {
8651       SvREFCNT_dec_NN(b);
8652      }
8653     }
8654
8655     *i = invlist_clone(a);
8656    }
8657    /* else *i is already 'a' */
8658
8659    if (make_temp) {
8660     sv_2mortal(*i);
8661    }
8662    return;
8663   }
8664
8665   /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8666   * intersection must be empty */
8667   if (*i == a) {
8668    if (! (make_temp = cBOOL(SvTEMP(a)))) {
8669     SvREFCNT_dec_NN(a);
8670    }
8671   }
8672   else if (*i == b) {
8673    if (! (make_temp = cBOOL(SvTEMP(b)))) {
8674     SvREFCNT_dec_NN(b);
8675    }
8676   }
8677   *i = _new_invlist(0);
8678   if (make_temp) {
8679    sv_2mortal(*i);
8680   }
8681
8682   return;
8683  }
8684
8685  /* Here both lists exist and are non-empty */
8686  array_a = invlist_array(a);
8687  array_b = invlist_array(b);
8688
8689  /* If are to take the intersection of 'a' with the complement of b, set it
8690  * up so are looking at b's complement. */
8691  if (complement_b) {
8692
8693   /* To complement, we invert: if the first element is 0, remove it.  To
8694   * do this, we just pretend the array starts one later */
8695   if (array_b[0] == 0) {
8696    array_b++;
8697    len_b--;
8698   }
8699   else {
8700
8701    /* But if the first element is not zero, we pretend the list starts
8702    * at the 0 that is always stored immediately before the array. */
8703    array_b--;
8704    len_b++;
8705   }
8706  }
8707
8708  /* Size the intersection for the worst case: that the intersection ends up
8709  * fragmenting everything to be completely disjoint */
8710  r= _new_invlist(len_a + len_b);
8711
8712  /* Will contain U+0000 iff both components do */
8713  array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8714          && len_b > 0 && array_b[0] == 0);
8715
8716  /* Go through each list item by item, stopping when exhausted one of
8717  * them */
8718  while (i_a < len_a && i_b < len_b) {
8719   UV cp;     /* The element to potentially add to the intersection's
8720      array */
8721   bool cp_in_set; /* Is it in the input list's set or not */
8722
8723   /* We need to take one or the other of the two inputs for the
8724   * intersection.  Since we are merging two sorted lists, we take the
8725   * smaller of the next items.  In case of a tie, we take the one that
8726   * is not in its set first (a difference from the union algorithm).  If
8727   * we took one in the set first, it would increment the count, possibly
8728   * to 2 which would cause it to be output as starting a range in the
8729   * intersection, and the next time through we would take that same
8730   * number, and output it again as ending the set.  By doing it the
8731   * opposite of this, there is no possibility that the count will be
8732   * momentarily incremented to 2.  (In a tie and both are in the set or
8733   * both not in the set, it doesn't matter which we take first.) */
8734   if (array_a[i_a] < array_b[i_b]
8735    || (array_a[i_a] == array_b[i_b]
8736     && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8737   {
8738    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8739    cp= array_a[i_a++];
8740   }
8741   else {
8742    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8743    cp= array_b[i_b++];
8744   }
8745
8746   /* Here, have chosen which of the two inputs to look at.  Only output
8747   * if the running count changes to/from 2, which marks the
8748   * beginning/end of a range that's in the intersection */
8749   if (cp_in_set) {
8750    count++;
8751    if (count == 2) {
8752     array_r[i_r++] = cp;
8753    }
8754   }
8755   else {
8756    if (count == 2) {
8757     array_r[i_r++] = cp;
8758    }
8759    count--;
8760   }
8761  }
8762
8763  /* Here, we are finished going through at least one of the lists, which
8764  * means there is something remaining in at most one.  We check if the list
8765  * that has been exhausted is positioned such that we are in the middle
8766  * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8767  * the ones we care about.)  There are four cases:
8768  * 1) Both weren't in their sets, count is 0, and remains 0.  There's
8769  *    nothing left in the intersection.
8770  * 2) Both were in their sets, count is 2 and perhaps is incremented to
8771  *    above 2.  What should be output is exactly that which is in the
8772  *    non-exhausted set, as everything it has is also in the intersection
8773  *    set, and everything it doesn't have can't be in the intersection
8774  * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8775  *    gets incremented to 2.  Like the previous case, the intersection is
8776  *    everything that remains in the non-exhausted set.
8777  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8778  *    remains 1.  And the intersection has nothing more. */
8779  if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8780   || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8781  {
8782   count++;
8783  }
8784
8785  /* The final length is what we've output so far plus what else is in the
8786  * intersection.  At most one of the subexpressions below will be non-zero
8787  * */
8788  len_r = i_r;
8789  if (count >= 2) {
8790   len_r += (len_a - i_a) + (len_b - i_b);
8791  }
8792
8793  /* Set result to final length, which can change the pointer to array_r, so
8794  * re-find it */
8795  if (len_r != _invlist_len(r)) {
8796   invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8797   invlist_trim(r);
8798   array_r = invlist_array(r);
8799  }
8800
8801  /* Finish outputting any remaining */
8802  if (count >= 2) { /* At most one will have a non-zero copy count */
8803   IV copy_count;
8804   if ((copy_count = len_a - i_a) > 0) {
8805    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8806   }
8807   else if ((copy_count = len_b - i_b) > 0) {
8808    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8809   }
8810  }
8811
8812  /*  We may be removing a reference to one of the inputs.  If so, the output
8813  *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8814  *  count decremented) */
8815  if (a == *i || b == *i) {
8816   assert(! invlist_is_iterating(*i));
8817   if (SvTEMP(*i)) {
8818    sv_2mortal(r);
8819   }
8820   else {
8821    SvREFCNT_dec_NN(*i);
8822   }
8823  }
8824
8825  *i = r;
8826
8827  return;
8828 }
8829
8830 SV*
8831 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8832 {
8833  /* Add the range from 'start' to 'end' inclusive to the inversion list's
8834  * set.  A pointer to the inversion list is returned.  This may actually be
8835  * a new list, in which case the passed in one has been destroyed.  The
8836  * passed in inversion list can be NULL, in which case a new one is created
8837  * with just the one range in it */
8838
8839  SV* range_invlist;
8840  UV len;
8841
8842  if (invlist == NULL) {
8843   invlist = _new_invlist(2);
8844   len = 0;
8845  }
8846  else {
8847   len = _invlist_len(invlist);
8848  }
8849
8850  /* If comes after the final entry actually in the list, can just append it
8851  * to the end, */
8852  if (len == 0
8853   || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8854    && start >= invlist_array(invlist)[len - 1]))
8855  {
8856   _append_range_to_invlist(invlist, start, end);
8857   return invlist;
8858  }
8859
8860  /* Here, can't just append things, create and return a new inversion list
8861  * which is the union of this range and the existing inversion list */
8862  range_invlist = _new_invlist(2);
8863  _append_range_to_invlist(range_invlist, start, end);
8864
8865  _invlist_union(invlist, range_invlist, &invlist);
8866
8867  /* The temporary can be freed */
8868  SvREFCNT_dec_NN(range_invlist);
8869
8870  return invlist;
8871 }
8872
8873 SV*
8874 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8875         UV** other_elements_ptr)
8876 {
8877  /* Create and return an inversion list whose contents are to be populated
8878  * by the caller.  The caller gives the number of elements (in 'size') and
8879  * the very first element ('element0').  This function will set
8880  * '*other_elements_ptr' to an array of UVs, where the remaining elements
8881  * are to be placed.
8882  *
8883  * Obviously there is some trust involved that the caller will properly
8884  * fill in the other elements of the array.
8885  *
8886  * (The first element needs to be passed in, as the underlying code does
8887  * things differently depending on whether it is zero or non-zero) */
8888
8889  SV* invlist = _new_invlist(size);
8890  bool offset;
8891
8892  PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8893
8894  _append_range_to_invlist(invlist, element0, element0);
8895  offset = *get_invlist_offset_addr(invlist);
8896
8897  invlist_set_len(invlist, size, offset);
8898  *other_elements_ptr = invlist_array(invlist) + 1;
8899  return invlist;
8900 }
8901
8902 #endif
8903
8904 PERL_STATIC_INLINE SV*
8905 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8906  return _add_range_to_invlist(invlist, cp, cp);
8907 }
8908
8909 #ifndef PERL_IN_XSUB_RE
8910 void
8911 Perl__invlist_invert(pTHX_ SV* const invlist)
8912 {
8913  /* Complement the input inversion list.  This adds a 0 if the list didn't
8914  * have a zero; removes it otherwise.  As described above, the data
8915  * structure is set up so that this is very efficient */
8916
8917  PERL_ARGS_ASSERT__INVLIST_INVERT;
8918
8919  assert(! invlist_is_iterating(invlist));
8920
8921  /* The inverse of matching nothing is matching everything */
8922  if (_invlist_len(invlist) == 0) {
8923   _append_range_to_invlist(invlist, 0, UV_MAX);
8924   return;
8925  }
8926
8927  *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8928 }
8929
8930 #endif
8931
8932 PERL_STATIC_INLINE SV*
8933 S_invlist_clone(pTHX_ SV* const invlist)
8934 {
8935
8936  /* Return a new inversion list that is a copy of the input one, which is
8937  * unchanged.  The new list will not be mortal even if the old one was. */
8938
8939  /* Need to allocate extra space to accommodate Perl's addition of a
8940  * trailing NUL to SvPV's, since it thinks they are always strings */
8941  SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8942  STRLEN physical_length = SvCUR(invlist);
8943  bool offset = *(get_invlist_offset_addr(invlist));
8944
8945  PERL_ARGS_ASSERT_INVLIST_CLONE;
8946
8947  *(get_invlist_offset_addr(new_invlist)) = offset;
8948  invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8949  Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8950
8951  return new_invlist;
8952 }
8953
8954 PERL_STATIC_INLINE STRLEN*
8955 S_get_invlist_iter_addr(SV* invlist)
8956 {
8957  /* Return the address of the UV that contains the current iteration
8958  * position */
8959
8960  PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8961
8962  assert(SvTYPE(invlist) == SVt_INVLIST);
8963
8964  return &(((XINVLIST*) SvANY(invlist))->iterator);
8965 }
8966
8967 PERL_STATIC_INLINE void
8968 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
8969 {
8970  PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8971
8972  *get_invlist_iter_addr(invlist) = 0;
8973 }
8974
8975 PERL_STATIC_INLINE void
8976 S_invlist_iterfinish(SV* invlist)
8977 {
8978  /* Terminate iterator for invlist.  This is to catch development errors.
8979  * Any iteration that is interrupted before completed should call this
8980  * function.  Functions that add code points anywhere else but to the end
8981  * of an inversion list assert that they are not in the middle of an
8982  * iteration.  If they were, the addition would make the iteration
8983  * problematical: if the iteration hadn't reached the place where things
8984  * were being added, it would be ok */
8985
8986  PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8987
8988  *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8989 }
8990
8991 STATIC bool
8992 S_invlist_iternext(SV* invlist, UV* start, UV* end)
8993 {
8994  /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8995  * This call sets in <*start> and <*end>, the next range in <invlist>.
8996  * Returns <TRUE> if successful and the next call will return the next
8997  * range; <FALSE> if was already at the end of the list.  If the latter,
8998  * <*start> and <*end> are unchanged, and the next call to this function
8999  * will start over at the beginning of the list */
9000
9001  STRLEN* pos = get_invlist_iter_addr(invlist);
9002  UV len = _invlist_len(invlist);
9003  UV *array;
9004
9005  PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9006
9007  if (*pos >= len) {
9008   *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9009   return FALSE;
9010  }
9011
9012  array = invlist_array(invlist);
9013
9014  *start = array[(*pos)++];
9015
9016  if (*pos >= len) {
9017   *end = UV_MAX;
9018  }
9019  else {
9020   *end = array[(*pos)++] - 1;
9021  }
9022
9023  return TRUE;
9024 }
9025
9026 PERL_STATIC_INLINE bool
9027 S_invlist_is_iterating(SV* const invlist)
9028 {
9029  PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9030
9031  return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9032 }
9033
9034 PERL_STATIC_INLINE UV
9035 S_invlist_highest(SV* const invlist)
9036 {
9037  /* Returns the highest code point that matches an inversion list.  This API
9038  * has an ambiguity, as it returns 0 under either the highest is actually
9039  * 0, or if the list is empty.  If this distinction matters to you, check
9040  * for emptiness before calling this function */
9041
9042  UV len = _invlist_len(invlist);
9043  UV *array;
9044
9045  PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9046
9047  if (len == 0) {
9048   return 0;
9049  }
9050
9051  array = invlist_array(invlist);
9052
9053  /* The last element in the array in the inversion list always starts a
9054  * range that goes to infinity.  That range may be for code points that are
9055  * matched in the inversion list, or it may be for ones that aren't
9056  * matched.  In the latter case, the highest code point in the set is one
9057  * less than the beginning of this range; otherwise it is the final element
9058  * of this range: infinity */
9059  return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9060   ? UV_MAX
9061   : array[len - 1] - 1;
9062 }
9063
9064 #ifndef PERL_IN_XSUB_RE
9065 SV *
9066 Perl__invlist_contents(pTHX_ SV* const invlist)
9067 {
9068  /* Get the contents of an inversion list into a string SV so that they can
9069  * be printed out.  It uses the format traditionally done for debug tracing
9070  */
9071
9072  UV start, end;
9073  SV* output = newSVpvs("\n");
9074
9075  PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9076
9077  assert(! invlist_is_iterating(invlist));
9078
9079  invlist_iterinit(invlist);
9080  while (invlist_iternext(invlist, &start, &end)) {
9081   if (end == UV_MAX) {
9082    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9083   }
9084   else if (end != start) {
9085    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9086      start,       end);
9087   }
9088   else {
9089    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9090   }
9091  }
9092
9093  return output;
9094 }
9095 #endif
9096
9097 #ifndef PERL_IN_XSUB_RE
9098 void
9099 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9100       const char * const indent, SV* const invlist)
9101 {
9102  /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9103  * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9104  * the string 'indent'.  The output looks like this:
9105   [0] 0x000A .. 0x000D
9106   [2] 0x0085
9107   [4] 0x2028 .. 0x2029
9108   [6] 0x3104 .. INFINITY
9109  * This means that the first range of code points matched by the list are
9110  * 0xA through 0xD; the second range contains only the single code point
9111  * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9112  * are used to define each range (except if the final range extends to
9113  * infinity, only a single element is needed).  The array index of the
9114  * first element for the corresponding range is given in brackets. */
9115
9116  UV start, end;
9117  STRLEN count = 0;
9118
9119  PERL_ARGS_ASSERT__INVLIST_DUMP;
9120
9121  if (invlist_is_iterating(invlist)) {
9122   Perl_dump_indent(aTHX_ level, file,
9123    "%sCan't dump inversion list because is in middle of iterating\n",
9124    indent);
9125   return;
9126  }
9127
9128  invlist_iterinit(invlist);
9129  while (invlist_iternext(invlist, &start, &end)) {
9130   if (end == UV_MAX) {
9131    Perl_dump_indent(aTHX_ level, file,
9132          "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9133         indent, (UV)count, start);
9134   }
9135   else if (end != start) {
9136    Perl_dump_indent(aTHX_ level, file,
9137          "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9138         indent, (UV)count, start,         end);
9139   }
9140   else {
9141    Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9142            indent, (UV)count, start);
9143   }
9144   count += 2;
9145  }
9146 }
9147
9148 void
9149 Perl__load_PL_utf8_foldclosures (pTHX)
9150 {
9151  assert(! PL_utf8_foldclosures);
9152
9153  /* If the folds haven't been read in, call a fold function
9154  * to force that */
9155  if (! PL_utf8_tofold) {
9156   U8 dummy[UTF8_MAXBYTES_CASE+1];
9157
9158   /* This string is just a short named one above \xff */
9159   to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9160   assert(PL_utf8_tofold); /* Verify that worked */
9161  }
9162  PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9163 }
9164 #endif
9165
9166 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9167 bool
9168 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9169 {
9170  /* Return a boolean as to if the two passed in inversion lists are
9171  * identical.  The final argument, if TRUE, says to take the complement of
9172  * the second inversion list before doing the comparison */
9173
9174  const UV* array_a = invlist_array(a);
9175  const UV* array_b = invlist_array(b);
9176  UV len_a = _invlist_len(a);
9177  UV len_b = _invlist_len(b);
9178
9179  UV i = 0;      /* current index into the arrays */
9180  bool retval = TRUE;     /* Assume are identical until proven otherwise */
9181
9182  PERL_ARGS_ASSERT__INVLISTEQ;
9183
9184  /* If are to compare 'a' with the complement of b, set it
9185  * up so are looking at b's complement. */
9186  if (complement_b) {
9187
9188   /* The complement of nothing is everything, so <a> would have to have
9189   * just one element, starting at zero (ending at infinity) */
9190   if (len_b == 0) {
9191    return (len_a == 1 && array_a[0] == 0);
9192   }
9193   else if (array_b[0] == 0) {
9194
9195    /* Otherwise, to complement, we invert.  Here, the first element is
9196    * 0, just remove it.  To do this, we just pretend the array starts
9197    * one later */
9198
9199    array_b++;
9200    len_b--;
9201   }
9202   else {
9203
9204    /* But if the first element is not zero, we pretend the list starts
9205    * at the 0 that is always stored immediately before the array. */
9206    array_b--;
9207    len_b++;
9208   }
9209  }
9210
9211  /* Make sure that the lengths are the same, as well as the final element
9212  * before looping through the remainder.  (Thus we test the length, final,
9213  * and first elements right off the bat) */
9214  if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9215   retval = FALSE;
9216  }
9217  else for (i = 0; i < len_a - 1; i++) {
9218   if (array_a[i] != array_b[i]) {
9219    retval = FALSE;
9220    break;
9221   }
9222  }
9223
9224  return retval;
9225 }
9226 #endif
9227
9228 #undef HEADER_LENGTH
9229 #undef TO_INTERNAL_SIZE
9230 #undef FROM_INTERNAL_SIZE
9231 #undef INVLIST_VERSION_ID
9232
9233 /* End of inversion list object */
9234
9235 STATIC void
9236 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9237 {
9238  /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9239  * constructs, and updates RExC_flags with them.  On input, RExC_parse
9240  * should point to the first flag; it is updated on output to point to the
9241  * final ')' or ':'.  There needs to be at least one flag, or this will
9242  * abort */
9243
9244  /* for (?g), (?gc), and (?o) warnings; warning
9245  about (?c) will warn about (?g) -- japhy    */
9246
9247 #define WASTED_O  0x01
9248 #define WASTED_G  0x02
9249 #define WASTED_C  0x04
9250 #define WASTED_GC (WASTED_G|WASTED_C)
9251  I32 wastedflags = 0x00;
9252  U32 posflags = 0, negflags = 0;
9253  U32 *flagsp = &posflags;
9254  char has_charset_modifier = '\0';
9255  regex_charset cs;
9256  bool has_use_defaults = FALSE;
9257  const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9258
9259  PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9260
9261  /* '^' as an initial flag sets certain defaults */
9262  if (UCHARAT(RExC_parse) == '^') {
9263   RExC_parse++;
9264   has_use_defaults = TRUE;
9265   STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9266   set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9267           ? REGEX_UNICODE_CHARSET
9268           : REGEX_DEPENDS_CHARSET);
9269  }
9270
9271  cs = get_regex_charset(RExC_flags);
9272  if (cs == REGEX_DEPENDS_CHARSET
9273   && (RExC_utf8 || RExC_uni_semantics))
9274  {
9275   cs = REGEX_UNICODE_CHARSET;
9276  }
9277
9278  while (*RExC_parse) {
9279   /* && strchr("iogcmsx", *RExC_parse) */
9280   /* (?g), (?gc) and (?o) are useless here
9281   and must be globally applied -- japhy */
9282   switch (*RExC_parse) {
9283
9284    /* Code for the imsx flags */
9285    CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9286
9287    case LOCALE_PAT_MOD:
9288     if (has_charset_modifier) {
9289      goto excess_modifier;
9290     }
9291     else if (flagsp == &negflags) {
9292      goto neg_modifier;
9293     }
9294     cs = REGEX_LOCALE_CHARSET;
9295     has_charset_modifier = LOCALE_PAT_MOD;
9296     break;
9297    case UNICODE_PAT_MOD:
9298     if (has_charset_modifier) {
9299      goto excess_modifier;
9300     }
9301     else if (flagsp == &negflags) {
9302      goto neg_modifier;
9303     }
9304     cs = REGEX_UNICODE_CHARSET;
9305     has_charset_modifier = UNICODE_PAT_MOD;
9306     break;
9307    case ASCII_RESTRICT_PAT_MOD:
9308     if (flagsp == &negflags) {
9309      goto neg_modifier;
9310     }
9311     if (has_charset_modifier) {
9312      if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9313       goto excess_modifier;
9314      }
9315      /* Doubled modifier implies more restricted */
9316      cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9317     }
9318     else {
9319      cs = REGEX_ASCII_RESTRICTED_CHARSET;
9320     }
9321     has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9322     break;
9323    case DEPENDS_PAT_MOD:
9324     if (has_use_defaults) {
9325      goto fail_modifiers;
9326     }
9327     else if (flagsp == &negflags) {
9328      goto neg_modifier;
9329     }
9330     else if (has_charset_modifier) {
9331      goto excess_modifier;
9332     }
9333
9334     /* The dual charset means unicode semantics if the
9335     * pattern (or target, not known until runtime) are
9336     * utf8, or something in the pattern indicates unicode
9337     * semantics */
9338     cs = (RExC_utf8 || RExC_uni_semantics)
9339      ? REGEX_UNICODE_CHARSET
9340      : REGEX_DEPENDS_CHARSET;
9341     has_charset_modifier = DEPENDS_PAT_MOD;
9342     break;
9343    excess_modifier:
9344     RExC_parse++;
9345     if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9346      vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9347     }
9348     else if (has_charset_modifier == *(RExC_parse - 1)) {
9349      vFAIL2("Regexp modifier \"%c\" may not appear twice",
9350           *(RExC_parse - 1));
9351     }
9352     else {
9353      vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9354     }
9355     /*NOTREACHED*/
9356    neg_modifier:
9357     RExC_parse++;
9358     vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9359          *(RExC_parse - 1));
9360     /*NOTREACHED*/
9361    case ONCE_PAT_MOD: /* 'o' */
9362    case GLOBAL_PAT_MOD: /* 'g' */
9363     if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9364      const I32 wflagbit = *RExC_parse == 'o'
9365           ? WASTED_O
9366           : WASTED_G;
9367      if (! (wastedflags & wflagbit) ) {
9368       wastedflags |= wflagbit;
9369       /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9370       vWARN5(
9371        RExC_parse + 1,
9372        "Useless (%s%c) - %suse /%c modifier",
9373        flagsp == &negflags ? "?-" : "?",
9374        *RExC_parse,
9375        flagsp == &negflags ? "don't " : "",
9376        *RExC_parse
9377       );
9378      }
9379     }
9380     break;
9381
9382    case CONTINUE_PAT_MOD: /* 'c' */
9383     if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9384      if (! (wastedflags & WASTED_C) ) {
9385       wastedflags |= WASTED_GC;
9386       /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9387       vWARN3(
9388        RExC_parse + 1,
9389        "Useless (%sc) - %suse /gc modifier",
9390        flagsp == &negflags ? "?-" : "?",
9391        flagsp == &negflags ? "don't " : ""
9392       );
9393      }
9394     }
9395     break;
9396    case KEEPCOPY_PAT_MOD: /* 'p' */
9397     if (flagsp == &negflags) {
9398      if (SIZE_ONLY)
9399       ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9400     } else {
9401      *flagsp |= RXf_PMf_KEEPCOPY;
9402     }
9403     break;
9404    case '-':
9405     /* A flag is a default iff it is following a minus, so
9406     * if there is a minus, it means will be trying to
9407     * re-specify a default which is an error */
9408     if (has_use_defaults || flagsp == &negflags) {
9409      goto fail_modifiers;
9410     }
9411     flagsp = &negflags;
9412     wastedflags = 0;  /* reset so (?g-c) warns twice */
9413     break;
9414    case ':':
9415    case ')':
9416     RExC_flags |= posflags;
9417     RExC_flags &= ~negflags;
9418     set_regex_charset(&RExC_flags, cs);
9419     if (RExC_flags & RXf_PMf_FOLD) {
9420      RExC_contains_i = 1;
9421     }
9422     return;
9423     /*NOTREACHED*/
9424    default:
9425    fail_modifiers:
9426     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9427     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9428     vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9429      UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9430     /*NOTREACHED*/
9431   }
9432
9433   ++RExC_parse;
9434  }
9435 }
9436
9437 /*
9438  - reg - regular expression, i.e. main body or parenthesized thing
9439  *
9440  * Caller must absorb opening parenthesis.
9441  *
9442  * Combining parenthesis handling with the base level of regular expression
9443  * is a trifle forced, but the need to tie the tails of the branches to what
9444  * follows makes it hard to avoid.
9445  */
9446 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9447 #ifdef DEBUGGING
9448 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9449 #else
9450 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9451 #endif
9452
9453 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9454    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9455    needs to be restarted.
9456    Otherwise would only return NULL if regbranch() returns NULL, which
9457    cannot happen.  */
9458 STATIC regnode *
9459 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9460  /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9461  * 2 is like 1, but indicates that nextchar() has been called to advance
9462  * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9463  * this flag alerts us to the need to check for that */
9464 {
9465  regnode *ret;  /* Will be the head of the group. */
9466  regnode *br;
9467  regnode *lastbr;
9468  regnode *ender = NULL;
9469  I32 parno = 0;
9470  I32 flags;
9471  U32 oregflags = RExC_flags;
9472  bool have_branch = 0;
9473  bool is_open = 0;
9474  I32 freeze_paren = 0;
9475  I32 after_freeze = 0;
9476  I32 num; /* numeric backreferences */
9477
9478  char * parse_start = RExC_parse; /* MJD */
9479  char * const oregcomp_parse = RExC_parse;
9480
9481  GET_RE_DEBUG_FLAGS_DECL;
9482
9483  PERL_ARGS_ASSERT_REG;
9484  DEBUG_PARSE("reg ");
9485
9486  *flagp = 0;    /* Tentatively. */
9487
9488
9489  /* Make an OPEN node, if parenthesized. */
9490  if (paren) {
9491
9492   /* Under /x, space and comments can be gobbled up between the '(' and
9493   * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9494   * intervening space, as the sequence is a token, and a token should be
9495   * indivisible */
9496   bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9497
9498   if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9499    char *start_verb = RExC_parse;
9500    STRLEN verb_len = 0;
9501    char *start_arg = NULL;
9502    unsigned char op = 0;
9503    int argok = 1;
9504    int internal_argval = 0; /* internal_argval is only useful if
9505           !argok */
9506
9507    if (has_intervening_patws) {
9508     RExC_parse++;
9509     vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9510    }
9511    while ( *RExC_parse && *RExC_parse != ')' ) {
9512     if ( *RExC_parse == ':' ) {
9513      start_arg = RExC_parse + 1;
9514      break;
9515     }
9516     RExC_parse++;
9517    }
9518    ++start_verb;
9519    verb_len = RExC_parse - start_verb;
9520    if ( start_arg ) {
9521     RExC_parse++;
9522     while ( *RExC_parse && *RExC_parse != ')' )
9523      RExC_parse++;
9524     if ( *RExC_parse != ')' )
9525      vFAIL("Unterminated verb pattern argument");
9526     if ( RExC_parse == start_arg )
9527      start_arg = NULL;
9528    } else {
9529     if ( *RExC_parse != ')' )
9530      vFAIL("Unterminated verb pattern");
9531    }
9532
9533    switch ( *start_verb ) {
9534    case 'A':  /* (*ACCEPT) */
9535     if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9536      op = ACCEPT;
9537      internal_argval = RExC_nestroot;
9538     }
9539     break;
9540    case 'C':  /* (*COMMIT) */
9541     if ( memEQs(start_verb,verb_len,"COMMIT") )
9542      op = COMMIT;
9543     break;
9544    case 'F':  /* (*FAIL) */
9545     if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9546      op = OPFAIL;
9547      argok = 0;
9548     }
9549     break;
9550    case ':':  /* (*:NAME) */
9551    case 'M':  /* (*MARK:NAME) */
9552     if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9553      op = MARKPOINT;
9554      argok = -1;
9555     }
9556     break;
9557    case 'P':  /* (*PRUNE) */
9558     if ( memEQs(start_verb,verb_len,"PRUNE") )
9559      op = PRUNE;
9560     break;
9561    case 'S':   /* (*SKIP) */
9562     if ( memEQs(start_verb,verb_len,"SKIP") )
9563      op = SKIP;
9564     break;
9565    case 'T':  /* (*THEN) */
9566     /* [19:06] <TimToady> :: is then */
9567     if ( memEQs(start_verb,verb_len,"THEN") ) {
9568      op = CUTGROUP;
9569      RExC_seen |= REG_CUTGROUP_SEEN;
9570     }
9571     break;
9572    }
9573    if ( ! op ) {
9574     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9575     vFAIL2utf8f(
9576      "Unknown verb pattern '%"UTF8f"'",
9577      UTF8fARG(UTF, verb_len, start_verb));
9578    }
9579    if ( argok ) {
9580     if ( start_arg && internal_argval ) {
9581      vFAIL3("Verb pattern '%.*s' may not have an argument",
9582       verb_len, start_verb);
9583     } else if ( argok < 0 && !start_arg ) {
9584      vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9585       verb_len, start_verb);
9586     } else {
9587      ret = reganode(pRExC_state, op, internal_argval);
9588      if ( ! internal_argval && ! SIZE_ONLY ) {
9589       if (start_arg) {
9590        SV *sv = newSVpvn( start_arg,
9591            RExC_parse - start_arg);
9592        ARG(ret) = add_data( pRExC_state,
9593             STR_WITH_LEN("S"));
9594        RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9595        ret->flags = 0;
9596       } else {
9597        ret->flags = 1;
9598       }
9599      }
9600     }
9601     if (!internal_argval)
9602      RExC_seen |= REG_VERBARG_SEEN;
9603    } else if ( start_arg ) {
9604     vFAIL3("Verb pattern '%.*s' may not have an argument",
9605       verb_len, start_verb);
9606    } else {
9607     ret = reg_node(pRExC_state, op);
9608    }
9609    nextchar(pRExC_state);
9610    return ret;
9611   }
9612   else if (*RExC_parse == '?') { /* (?...) */
9613    bool is_logical = 0;
9614    const char * const seqstart = RExC_parse;
9615    const char * endptr;
9616    if (has_intervening_patws) {
9617     RExC_parse++;
9618     vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9619    }
9620
9621    RExC_parse++;
9622    paren = *RExC_parse++;
9623    ret = NULL;   /* For look-ahead/behind. */
9624    switch (paren) {
9625
9626    case 'P': /* (?P...) variants for those used to PCRE/Python */
9627     paren = *RExC_parse++;
9628     if ( paren == '<')         /* (?P<...>) named capture */
9629      goto named_capture;
9630     else if (paren == '>') {   /* (?P>name) named recursion */
9631      goto named_recursion;
9632     }
9633     else if (paren == '=') {   /* (?P=...)  named backref */
9634      /* this pretty much dupes the code for \k<NAME> in
9635      * regatom(), if you change this make sure you change that
9636      * */
9637      char* name_start = RExC_parse;
9638      U32 num = 0;
9639      SV *sv_dat = reg_scan_name(pRExC_state,
9640       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9641      if (RExC_parse == name_start || *RExC_parse != ')')
9642       /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9643       vFAIL2("Sequence %.3s... not terminated",parse_start);
9644
9645      if (!SIZE_ONLY) {
9646       num = add_data( pRExC_state, STR_WITH_LEN("S"));
9647       RExC_rxi->data->data[num]=(void*)sv_dat;
9648       SvREFCNT_inc_simple_void(sv_dat);
9649      }
9650      RExC_sawback = 1;
9651      ret = reganode(pRExC_state,
9652         ((! FOLD)
9653          ? NREF
9654          : (ASCII_FOLD_RESTRICTED)
9655          ? NREFFA
9656          : (AT_LEAST_UNI_SEMANTICS)
9657           ? NREFFU
9658           : (LOC)
9659           ? NREFFL
9660           : NREFF),
9661          num);
9662      *flagp |= HASWIDTH;
9663
9664      Set_Node_Offset(ret, parse_start+1);
9665      Set_Node_Cur_Length(ret, parse_start);
9666
9667      nextchar(pRExC_state);
9668      return ret;
9669     }
9670     RExC_parse++;
9671     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9672     vFAIL3("Sequence (%.*s...) not recognized",
9673         RExC_parse-seqstart, seqstart);
9674     /*NOTREACHED*/
9675    case '<':           /* (?<...) */
9676     if (*RExC_parse == '!')
9677      paren = ',';
9678     else if (*RExC_parse != '=')
9679    named_capture:
9680     {               /* (?<...>) */
9681      char *name_start;
9682      SV *svname;
9683      paren= '>';
9684    case '\'':          /* (?'...') */
9685       name_start= RExC_parse;
9686       svname = reg_scan_name(pRExC_state,
9687       SIZE_ONLY    /* reverse test from the others */
9688       ? REG_RSN_RETURN_NAME
9689       : REG_RSN_RETURN_NULL);
9690      if (RExC_parse == name_start || *RExC_parse != paren)
9691       vFAIL2("Sequence (?%c... not terminated",
9692        paren=='>' ? '<' : paren);
9693      if (SIZE_ONLY) {
9694       HE *he_str;
9695       SV *sv_dat = NULL;
9696       if (!svname) /* shouldn't happen */
9697        Perl_croak(aTHX_
9698         "panic: reg_scan_name returned NULL");
9699       if (!RExC_paren_names) {
9700        RExC_paren_names= newHV();
9701        sv_2mortal(MUTABLE_SV(RExC_paren_names));
9702 #ifdef DEBUGGING
9703        RExC_paren_name_list= newAV();
9704        sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9705 #endif
9706       }
9707       he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9708       if ( he_str )
9709        sv_dat = HeVAL(he_str);
9710       if ( ! sv_dat ) {
9711        /* croak baby croak */
9712        Perl_croak(aTHX_
9713         "panic: paren_name hash element allocation failed");
9714       } else if ( SvPOK(sv_dat) ) {
9715        /* (?|...) can mean we have dupes so scan to check
9716        its already been stored. Maybe a flag indicating
9717        we are inside such a construct would be useful,
9718        but the arrays are likely to be quite small, so
9719        for now we punt -- dmq */
9720        IV count = SvIV(sv_dat);
9721        I32 *pv = (I32*)SvPVX(sv_dat);
9722        IV i;
9723        for ( i = 0 ; i < count ; i++ ) {
9724         if ( pv[i] == RExC_npar ) {
9725          count = 0;
9726          break;
9727         }
9728        }
9729        if ( count ) {
9730         pv = (I32*)SvGROW(sv_dat,
9731             SvCUR(sv_dat) + sizeof(I32)+1);
9732         SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9733         pv[count] = RExC_npar;
9734         SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9735        }
9736       } else {
9737        (void)SvUPGRADE(sv_dat,SVt_PVNV);
9738        sv_setpvn(sv_dat, (char *)&(RExC_npar),
9739                 sizeof(I32));
9740        SvIOK_on(sv_dat);
9741        SvIV_set(sv_dat, 1);
9742       }
9743 #ifdef DEBUGGING
9744       /* Yes this does cause a memory leak in debugging Perls
9745       * */
9746       if (!av_store(RExC_paren_name_list,
9747          RExC_npar, SvREFCNT_inc(svname)))
9748        SvREFCNT_dec_NN(svname);
9749 #endif
9750
9751       /*sv_dump(sv_dat);*/
9752      }
9753      nextchar(pRExC_state);
9754      paren = 1;
9755      goto capturing_parens;
9756     }
9757     RExC_seen |= REG_LOOKBEHIND_SEEN;
9758     RExC_in_lookbehind++;
9759     RExC_parse++;
9760     /* FALLTHROUGH */
9761    case '=':           /* (?=...) */
9762     RExC_seen_zerolen++;
9763     break;
9764    case '!':           /* (?!...) */
9765     RExC_seen_zerolen++;
9766     if (*RExC_parse == ')') {
9767      ret=reg_node(pRExC_state, OPFAIL);
9768      nextchar(pRExC_state);
9769      return ret;
9770     }
9771     break;
9772    case '|':           /* (?|...) */
9773     /* branch reset, behave like a (?:...) except that
9774     buffers in alternations share the same numbers */
9775     paren = ':';
9776     after_freeze = freeze_paren = RExC_npar;
9777     break;
9778    case ':':           /* (?:...) */
9779    case '>':           /* (?>...) */
9780     break;
9781    case '$':           /* (?$...) */
9782    case '@':           /* (?@...) */
9783     vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9784     break;
9785    case '0' :           /* (?0) */
9786    case 'R' :           /* (?R) */
9787     if (*RExC_parse != ')')
9788      FAIL("Sequence (?R) not terminated");
9789     ret = reg_node(pRExC_state, GOSTART);
9790      RExC_seen |= REG_GOSTART_SEEN;
9791     *flagp |= POSTPONED;
9792     nextchar(pRExC_state);
9793     return ret;
9794     /*notreached*/
9795    /* named and numeric backreferences */
9796    case '&':            /* (?&NAME) */
9797     parse_start = RExC_parse - 1;
9798    named_recursion:
9799     {
9800       SV *sv_dat = reg_scan_name(pRExC_state,
9801        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9802       num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9803     }
9804     if (RExC_parse == RExC_end || *RExC_parse != ')')
9805      vFAIL("Sequence (?&... not terminated");
9806     goto gen_recurse_regop;
9807     assert(0); /* NOT REACHED */
9808    case '+':
9809     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9810      RExC_parse++;
9811      vFAIL("Illegal pattern");
9812     }
9813     goto parse_recursion;
9814     /* NOT REACHED*/
9815    case '-': /* (?-1) */
9816     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9817      RExC_parse--; /* rewind to let it be handled later */
9818      goto parse_flags;
9819     }
9820     /* FALLTHROUGH */
9821    case '1': case '2': case '3': case '4': /* (?1) */
9822    case '5': case '6': case '7': case '8': case '9':
9823     RExC_parse--;
9824    parse_recursion:
9825     {
9826      bool is_neg = FALSE;
9827      parse_start = RExC_parse - 1; /* MJD */
9828      if (*RExC_parse == '-') {
9829       RExC_parse++;
9830       is_neg = TRUE;
9831      }
9832      num = grok_atou(RExC_parse, &endptr);
9833      if (endptr)
9834       RExC_parse = (char*)endptr;
9835      if (is_neg) {
9836       /* Some limit for num? */
9837       num = -num;
9838      }
9839     }
9840     if (*RExC_parse!=')')
9841      vFAIL("Expecting close bracket");
9842
9843    gen_recurse_regop:
9844     if ( paren == '-' ) {
9845      /*
9846      Diagram of capture buffer numbering.
9847      Top line is the normal capture buffer numbers
9848      Bottom line is the negative indexing as from
9849      the X (the (?-2))
9850
9851      +   1 2    3 4 5 X          6 7
9852      /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9853      -   5 4    3 2 1 X          x x
9854
9855      */
9856      num = RExC_npar + num;
9857      if (num < 1)  {
9858       RExC_parse++;
9859       vFAIL("Reference to nonexistent group");
9860      }
9861     } else if ( paren == '+' ) {
9862      num = RExC_npar + num - 1;
9863     }
9864
9865     ret = reganode(pRExC_state, GOSUB, num);
9866     if (!SIZE_ONLY) {
9867      if (num > (I32)RExC_rx->nparens) {
9868       RExC_parse++;
9869       vFAIL("Reference to nonexistent group");
9870      }
9871      ARG2L_SET( ret, RExC_recurse_count++);
9872      RExC_emit++;
9873      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9874       "Recurse #%"UVuf" to %"IVdf"\n",
9875        (UV)ARG(ret), (IV)ARG2L(ret)));
9876     } else {
9877      RExC_size++;
9878      }
9879      RExC_seen |= REG_RECURSE_SEEN;
9880     Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9881     Set_Node_Offset(ret, parse_start); /* MJD */
9882
9883     *flagp |= POSTPONED;
9884     nextchar(pRExC_state);
9885     return ret;
9886
9887    assert(0); /* NOT REACHED */
9888
9889    case '?':           /* (??...) */
9890     is_logical = 1;
9891     if (*RExC_parse != '{') {
9892      RExC_parse++;
9893      /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9894      vFAIL2utf8f(
9895       "Sequence (%"UTF8f"...) not recognized",
9896       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9897      /*NOTREACHED*/
9898     }
9899     *flagp |= POSTPONED;
9900     paren = *RExC_parse++;
9901     /* FALLTHROUGH */
9902    case '{':           /* (?{...}) */
9903    {
9904     U32 n = 0;
9905     struct reg_code_block *cb;
9906
9907     RExC_seen_zerolen++;
9908
9909     if (   !pRExC_state->num_code_blocks
9910      || pRExC_state->code_index >= pRExC_state->num_code_blocks
9911      || pRExC_state->code_blocks[pRExC_state->code_index].start
9912       != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9913        - RExC_start)
9914     ) {
9915      if (RExC_pm_flags & PMf_USE_RE_EVAL)
9916       FAIL("panic: Sequence (?{...}): no code block found\n");
9917      FAIL("Eval-group not allowed at runtime, use re 'eval'");
9918     }
9919     /* this is a pre-compiled code block (?{...}) */
9920     cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9921     RExC_parse = RExC_start + cb->end;
9922     if (!SIZE_ONLY) {
9923      OP *o = cb->block;
9924      if (cb->src_regex) {
9925       n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9926       RExC_rxi->data->data[n] =
9927        (void*)SvREFCNT_inc((SV*)cb->src_regex);
9928       RExC_rxi->data->data[n+1] = (void*)o;
9929      }
9930      else {
9931       n = add_data(pRExC_state,
9932        (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9933       RExC_rxi->data->data[n] = (void*)o;
9934      }
9935     }
9936     pRExC_state->code_index++;
9937     nextchar(pRExC_state);
9938
9939     if (is_logical) {
9940      regnode *eval;
9941      ret = reg_node(pRExC_state, LOGICAL);
9942      eval = reganode(pRExC_state, EVAL, n);
9943      if (!SIZE_ONLY) {
9944       ret->flags = 2;
9945       /* for later propagation into (??{}) return value */
9946       eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9947      }
9948      REGTAIL(pRExC_state, ret, eval);
9949      /* deal with the length of this later - MJD */
9950      return ret;
9951     }
9952     ret = reganode(pRExC_state, EVAL, n);
9953     Set_Node_Length(ret, RExC_parse - parse_start + 1);
9954     Set_Node_Offset(ret, parse_start);
9955     return ret;
9956    }
9957    case '(':           /* (?(?{...})...) and (?(?=...)...) */
9958    {
9959     int is_define= 0;
9960     if (RExC_parse[0] == '?') {        /* (?(?...)) */
9961      if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9962       || RExC_parse[1] == '<'
9963       || RExC_parse[1] == '{') { /* Lookahead or eval. */
9964       I32 flag;
9965       regnode *tail;
9966
9967       ret = reg_node(pRExC_state, LOGICAL);
9968       if (!SIZE_ONLY)
9969        ret->flags = 1;
9970
9971       tail = reg(pRExC_state, 1, &flag, depth+1);
9972       if (flag & RESTART_UTF8) {
9973        *flagp = RESTART_UTF8;
9974        return NULL;
9975       }
9976       REGTAIL(pRExC_state, ret, tail);
9977       goto insert_if;
9978      }
9979      /* Fall through to ‘Unknown switch condition’ at the
9980      end of the if/else chain. */
9981     }
9982     else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9983       || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9984     {
9985      char ch = RExC_parse[0] == '<' ? '>' : '\'';
9986      char *name_start= RExC_parse++;
9987      U32 num = 0;
9988      SV *sv_dat=reg_scan_name(pRExC_state,
9989       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9990      if (RExC_parse == name_start || *RExC_parse != ch)
9991       vFAIL2("Sequence (?(%c... not terminated",
9992        (ch == '>' ? '<' : ch));
9993      RExC_parse++;
9994      if (!SIZE_ONLY) {
9995       num = add_data( pRExC_state, STR_WITH_LEN("S"));
9996       RExC_rxi->data->data[num]=(void*)sv_dat;
9997       SvREFCNT_inc_simple_void(sv_dat);
9998      }
9999      ret = reganode(pRExC_state,NGROUPP,num);
10000      goto insert_if_check_paren;
10001     }
10002     else if (RExC_parse[0] == 'D' &&
10003       RExC_parse[1] == 'E' &&
10004       RExC_parse[2] == 'F' &&
10005       RExC_parse[3] == 'I' &&
10006       RExC_parse[4] == 'N' &&
10007       RExC_parse[5] == 'E')
10008     {
10009      ret = reganode(pRExC_state,DEFINEP,0);
10010      RExC_parse +=6 ;
10011      is_define = 1;
10012      goto insert_if_check_paren;
10013     }
10014     else if (RExC_parse[0] == 'R') {
10015      RExC_parse++;
10016      parno = 0;
10017      if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10018       parno = grok_atou(RExC_parse, &endptr);
10019       if (endptr)
10020        RExC_parse = (char*)endptr;
10021      } else if (RExC_parse[0] == '&') {
10022       SV *sv_dat;
10023       RExC_parse++;
10024       sv_dat = reg_scan_name(pRExC_state,
10025        SIZE_ONLY
10026        ? REG_RSN_RETURN_NULL
10027        : REG_RSN_RETURN_DATA);
10028        parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10029      }
10030      ret = reganode(pRExC_state,INSUBP,parno);
10031      goto insert_if_check_paren;
10032     }
10033     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10034      /* (?(1)...) */
10035      char c;
10036      char *tmp;
10037      parno = grok_atou(RExC_parse, &endptr);
10038      if (endptr)
10039       RExC_parse = (char*)endptr;
10040      ret = reganode(pRExC_state, GROUPP, parno);
10041
10042     insert_if_check_paren:
10043      if (*(tmp = nextchar(pRExC_state)) != ')') {
10044       /* nextchar also skips comments, so undo its work
10045       * and skip over the the next character.
10046       */
10047       RExC_parse = tmp;
10048       RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10049       vFAIL("Switch condition not recognized");
10050      }
10051     insert_if:
10052      REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10053      br = regbranch(pRExC_state, &flags, 1,depth+1);
10054      if (br == NULL) {
10055       if (flags & RESTART_UTF8) {
10056        *flagp = RESTART_UTF8;
10057        return NULL;
10058       }
10059       FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10060        (UV) flags);
10061      } else
10062       REGTAIL(pRExC_state, br, reganode(pRExC_state,
10063               LONGJMP, 0));
10064      c = *nextchar(pRExC_state);
10065      if (flags&HASWIDTH)
10066       *flagp |= HASWIDTH;
10067      if (c == '|') {
10068       if (is_define)
10069        vFAIL("(?(DEFINE)....) does not allow branches");
10070
10071       /* Fake one for optimizer.  */
10072       lastbr = reganode(pRExC_state, IFTHEN, 0);
10073
10074       if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10075        if (flags & RESTART_UTF8) {
10076         *flagp = RESTART_UTF8;
10077         return NULL;
10078        }
10079        FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10080         (UV) flags);
10081       }
10082       REGTAIL(pRExC_state, ret, lastbr);
10083       if (flags&HASWIDTH)
10084        *flagp |= HASWIDTH;
10085       c = *nextchar(pRExC_state);
10086      }
10087      else
10088       lastbr = NULL;
10089      if (c != ')')
10090       vFAIL("Switch (?(condition)... contains too many branches");
10091      ender = reg_node(pRExC_state, TAIL);
10092      REGTAIL(pRExC_state, br, ender);
10093      if (lastbr) {
10094       REGTAIL(pRExC_state, lastbr, ender);
10095       REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10096      }
10097      else
10098       REGTAIL(pRExC_state, ret, ender);
10099      RExC_size++; /* XXX WHY do we need this?!!
10100          For large programs it seems to be required
10101          but I can't figure out why. -- dmq*/
10102      return ret;
10103     }
10104     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10105     vFAIL("Unknown switch condition (?(...))");
10106    }
10107    case '[':           /* (?[ ... ]) */
10108     return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10109           oregcomp_parse);
10110    case 0:
10111     RExC_parse--; /* for vFAIL to print correctly */
10112     vFAIL("Sequence (? incomplete");
10113     break;
10114    default: /* e.g., (?i) */
10115     --RExC_parse;
10116    parse_flags:
10117     parse_lparen_question_flags(pRExC_state);
10118     if (UCHARAT(RExC_parse) != ':') {
10119      nextchar(pRExC_state);
10120      *flagp = TRYAGAIN;
10121      return NULL;
10122     }
10123     paren = ':';
10124     nextchar(pRExC_state);
10125     ret = NULL;
10126     goto parse_rest;
10127    } /* end switch */
10128   }
10129   else {                  /* (...) */
10130   capturing_parens:
10131    parno = RExC_npar;
10132    RExC_npar++;
10133
10134    ret = reganode(pRExC_state, OPEN, parno);
10135    if (!SIZE_ONLY ){
10136     if (!RExC_nestroot)
10137      RExC_nestroot = parno;
10138     if (RExC_seen & REG_RECURSE_SEEN
10139      && !RExC_open_parens[parno-1])
10140     {
10141      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10142       "Setting open paren #%"IVdf" to %d\n",
10143       (IV)parno, REG_NODE_NUM(ret)));
10144      RExC_open_parens[parno-1]= ret;
10145     }
10146    }
10147    Set_Node_Length(ret, 1); /* MJD */
10148    Set_Node_Offset(ret, RExC_parse); /* MJD */
10149    is_open = 1;
10150   }
10151  }
10152  else                        /* ! paren */
10153   ret = NULL;
10154
10155    parse_rest:
10156  /* Pick up the branches, linking them together. */
10157  parse_start = RExC_parse;   /* MJD */
10158  br = regbranch(pRExC_state, &flags, 1,depth+1);
10159
10160  /*     branch_len = (paren != 0); */
10161
10162  if (br == NULL) {
10163   if (flags & RESTART_UTF8) {
10164    *flagp = RESTART_UTF8;
10165    return NULL;
10166   }
10167   FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10168  }
10169  if (*RExC_parse == '|') {
10170   if (!SIZE_ONLY && RExC_extralen) {
10171    reginsert(pRExC_state, BRANCHJ, br, depth+1);
10172   }
10173   else {                  /* MJD */
10174    reginsert(pRExC_state, BRANCH, br, depth+1);
10175    Set_Node_Length(br, paren != 0);
10176    Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10177   }
10178   have_branch = 1;
10179   if (SIZE_ONLY)
10180    RExC_extralen += 1;  /* For BRANCHJ-BRANCH. */
10181  }
10182  else if (paren == ':') {
10183   *flagp |= flags&SIMPLE;
10184  }
10185  if (is_open) {    /* Starts with OPEN. */
10186   REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10187  }
10188  else if (paren != '?')  /* Not Conditional */
10189   ret = br;
10190  *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10191  lastbr = br;
10192  while (*RExC_parse == '|') {
10193   if (!SIZE_ONLY && RExC_extralen) {
10194    ender = reganode(pRExC_state, LONGJMP,0);
10195
10196    /* Append to the previous. */
10197    REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10198   }
10199   if (SIZE_ONLY)
10200    RExC_extralen += 2;  /* Account for LONGJMP. */
10201   nextchar(pRExC_state);
10202   if (freeze_paren) {
10203    if (RExC_npar > after_freeze)
10204     after_freeze = RExC_npar;
10205    RExC_npar = freeze_paren;
10206   }
10207   br = regbranch(pRExC_state, &flags, 0, depth+1);
10208
10209   if (br == NULL) {
10210    if (flags & RESTART_UTF8) {
10211     *flagp = RESTART_UTF8;
10212     return NULL;
10213    }
10214    FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10215   }
10216   REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10217   lastbr = br;
10218   *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10219  }
10220
10221  if (have_branch || paren != ':') {
10222   /* Make a closing node, and hook it on the end. */
10223   switch (paren) {
10224   case ':':
10225    ender = reg_node(pRExC_state, TAIL);
10226    break;
10227   case 1: case 2:
10228    ender = reganode(pRExC_state, CLOSE, parno);
10229    if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10230     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10231       "Setting close paren #%"IVdf" to %d\n",
10232       (IV)parno, REG_NODE_NUM(ender)));
10233     RExC_close_parens[parno-1]= ender;
10234     if (RExC_nestroot == parno)
10235      RExC_nestroot = 0;
10236    }
10237    Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10238    Set_Node_Length(ender,1); /* MJD */
10239    break;
10240   case '<':
10241   case ',':
10242   case '=':
10243   case '!':
10244    *flagp &= ~HASWIDTH;
10245    /* FALLTHROUGH */
10246   case '>':
10247    ender = reg_node(pRExC_state, SUCCEED);
10248    break;
10249   case 0:
10250    ender = reg_node(pRExC_state, END);
10251    if (!SIZE_ONLY) {
10252     assert(!RExC_opend); /* there can only be one! */
10253     RExC_opend = ender;
10254    }
10255    break;
10256   }
10257   DEBUG_PARSE_r(if (!SIZE_ONLY) {
10258    SV * const mysv_val1=sv_newmortal();
10259    SV * const mysv_val2=sv_newmortal();
10260    DEBUG_PARSE_MSG("lsbr");
10261    regprop(RExC_rx, mysv_val1, lastbr, NULL);
10262    regprop(RExC_rx, mysv_val2, ender, NULL);
10263    PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10264       SvPV_nolen_const(mysv_val1),
10265       (IV)REG_NODE_NUM(lastbr),
10266       SvPV_nolen_const(mysv_val2),
10267       (IV)REG_NODE_NUM(ender),
10268       (IV)(ender - lastbr)
10269    );
10270   });
10271   REGTAIL(pRExC_state, lastbr, ender);
10272
10273   if (have_branch && !SIZE_ONLY) {
10274    char is_nothing= 1;
10275    if (depth==1)
10276     RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10277
10278    /* Hook the tails of the branches to the closing node. */
10279    for (br = ret; br; br = regnext(br)) {
10280     const U8 op = PL_regkind[OP(br)];
10281     if (op == BRANCH) {
10282      REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10283      if ( OP(NEXTOPER(br)) != NOTHING
10284       || regnext(NEXTOPER(br)) != ender)
10285       is_nothing= 0;
10286     }
10287     else if (op == BRANCHJ) {
10288      REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10289      /* for now we always disable this optimisation * /
10290      if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10291       || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10292      */
10293       is_nothing= 0;
10294     }
10295    }
10296    if (is_nothing) {
10297     br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10298     DEBUG_PARSE_r(if (!SIZE_ONLY) {
10299      SV * const mysv_val1=sv_newmortal();
10300      SV * const mysv_val2=sv_newmortal();
10301      DEBUG_PARSE_MSG("NADA");
10302      regprop(RExC_rx, mysv_val1, ret, NULL);
10303      regprop(RExC_rx, mysv_val2, ender, NULL);
10304      PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10305         SvPV_nolen_const(mysv_val1),
10306         (IV)REG_NODE_NUM(ret),
10307         SvPV_nolen_const(mysv_val2),
10308         (IV)REG_NODE_NUM(ender),
10309         (IV)(ender - ret)
10310      );
10311     });
10312     OP(br)= NOTHING;
10313     if (OP(ender) == TAIL) {
10314      NEXT_OFF(br)= 0;
10315      RExC_emit= br + 1;
10316     } else {
10317      regnode *opt;
10318      for ( opt= br + 1; opt < ender ; opt++ )
10319       OP(opt)= OPTIMIZED;
10320      NEXT_OFF(br)= ender - br;
10321     }
10322    }
10323   }
10324  }
10325
10326  {
10327   const char *p;
10328   static const char parens[] = "=!<,>";
10329
10330   if (paren && (p = strchr(parens, paren))) {
10331    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10332    int flag = (p - parens) > 1;
10333
10334    if (paren == '>')
10335     node = SUSPEND, flag = 0;
10336    reginsert(pRExC_state, node,ret, depth+1);
10337    Set_Node_Cur_Length(ret, parse_start);
10338    Set_Node_Offset(ret, parse_start + 1);
10339    ret->flags = flag;
10340    REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10341   }
10342  }
10343
10344  /* Check for proper termination. */
10345  if (paren) {
10346   /* restore original flags, but keep (?p) */
10347   RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10348   if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10349    RExC_parse = oregcomp_parse;
10350    vFAIL("Unmatched (");
10351   }
10352  }
10353  else if (!paren && RExC_parse < RExC_end) {
10354   if (*RExC_parse == ')') {
10355    RExC_parse++;
10356    vFAIL("Unmatched )");
10357   }
10358   else
10359    FAIL("Junk on end of regexp"); /* "Can't happen". */
10360   assert(0); /* NOTREACHED */
10361  }
10362
10363  if (RExC_in_lookbehind) {
10364   RExC_in_lookbehind--;
10365  }
10366  if (after_freeze > RExC_npar)
10367   RExC_npar = after_freeze;
10368  return(ret);
10369 }
10370
10371 /*
10372  - regbranch - one alternative of an | operator
10373  *
10374  * Implements the concatenation operator.
10375  *
10376  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10377  * restarted.
10378  */
10379 STATIC regnode *
10380 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10381 {
10382  regnode *ret;
10383  regnode *chain = NULL;
10384  regnode *latest;
10385  I32 flags = 0, c = 0;
10386  GET_RE_DEBUG_FLAGS_DECL;
10387
10388  PERL_ARGS_ASSERT_REGBRANCH;
10389
10390  DEBUG_PARSE("brnc");
10391
10392  if (first)
10393   ret = NULL;
10394  else {
10395   if (!SIZE_ONLY && RExC_extralen)
10396    ret = reganode(pRExC_state, BRANCHJ,0);
10397   else {
10398    ret = reg_node(pRExC_state, BRANCH);
10399    Set_Node_Length(ret, 1);
10400   }
10401  }
10402
10403  if (!first && SIZE_ONLY)
10404   RExC_extralen += 1;   /* BRANCHJ */
10405
10406  *flagp = WORST;   /* Tentatively. */
10407
10408  RExC_parse--;
10409  nextchar(pRExC_state);
10410  while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10411   flags &= ~TRYAGAIN;
10412   latest = regpiece(pRExC_state, &flags,depth+1);
10413   if (latest == NULL) {
10414    if (flags & TRYAGAIN)
10415     continue;
10416    if (flags & RESTART_UTF8) {
10417     *flagp = RESTART_UTF8;
10418     return NULL;
10419    }
10420    FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10421   }
10422   else if (ret == NULL)
10423    ret = latest;
10424   *flagp |= flags&(HASWIDTH|POSTPONED);
10425   if (chain == NULL)  /* First piece. */
10426    *flagp |= flags&SPSTART;
10427   else {
10428    RExC_naughty++;
10429    REGTAIL(pRExC_state, chain, latest);
10430   }
10431   chain = latest;
10432   c++;
10433  }
10434  if (chain == NULL) { /* Loop ran zero times. */
10435   chain = reg_node(pRExC_state, NOTHING);
10436   if (ret == NULL)
10437    ret = chain;
10438  }
10439  if (c == 1) {
10440   *flagp |= flags&SIMPLE;
10441  }
10442
10443  return ret;
10444 }
10445
10446 /*
10447  - regpiece - something followed by possible [*+?]
10448  *
10449  * Note that the branching code sequences used for ? and the general cases
10450  * of * and + are somewhat optimized:  they use the same NOTHING node as
10451  * both the endmarker for their branch list and the body of the last branch.
10452  * It might seem that this node could be dispensed with entirely, but the
10453  * endmarker role is not redundant.
10454  *
10455  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10456  * TRYAGAIN.
10457  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10458  * restarted.
10459  */
10460 STATIC regnode *
10461 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10462 {
10463  regnode *ret;
10464  char op;
10465  char *next;
10466  I32 flags;
10467  const char * const origparse = RExC_parse;
10468  I32 min;
10469  I32 max = REG_INFTY;
10470 #ifdef RE_TRACK_PATTERN_OFFSETS
10471  char *parse_start;
10472 #endif
10473  const char *maxpos = NULL;
10474
10475  /* Save the original in case we change the emitted regop to a FAIL. */
10476  regnode * const orig_emit = RExC_emit;
10477
10478  GET_RE_DEBUG_FLAGS_DECL;
10479
10480  PERL_ARGS_ASSERT_REGPIECE;
10481
10482  DEBUG_PARSE("piec");
10483
10484  ret = regatom(pRExC_state, &flags,depth+1);
10485  if (ret == NULL) {
10486   if (flags & (TRYAGAIN|RESTART_UTF8))
10487    *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10488   else
10489    FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10490   return(NULL);
10491  }
10492
10493  op = *RExC_parse;
10494
10495  if (op == '{' && regcurly(RExC_parse)) {
10496   maxpos = NULL;
10497 #ifdef RE_TRACK_PATTERN_OFFSETS
10498   parse_start = RExC_parse; /* MJD */
10499 #endif
10500   next = RExC_parse + 1;
10501   while (isDIGIT(*next) || *next == ',') {
10502    if (*next == ',') {
10503     if (maxpos)
10504      break;
10505     else
10506      maxpos = next;
10507    }
10508    next++;
10509   }
10510   if (*next == '}') {  /* got one */
10511    const char* endptr;
10512    if (!maxpos)
10513     maxpos = next;
10514    RExC_parse++;
10515    min = grok_atou(RExC_parse, &endptr);
10516    if (*maxpos == ',')
10517     maxpos++;
10518    else
10519     maxpos = RExC_parse;
10520    max = grok_atou(maxpos, &endptr);
10521    if (!max && *maxpos != '0')
10522     max = REG_INFTY;  /* meaning "infinity" */
10523    else if (max >= REG_INFTY)
10524     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10525    RExC_parse = next;
10526    nextchar(pRExC_state);
10527    if (max < min) {    /* If can't match, warn and optimize to fail
10528         unconditionally */
10529     if (SIZE_ONLY) {
10530      ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10531
10532      /* We can't back off the size because we have to reserve
10533      * enough space for all the things we are about to throw
10534      * away, but we can shrink it by the ammount we are about
10535      * to re-use here */
10536      RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10537     }
10538     else {
10539      RExC_emit = orig_emit;
10540     }
10541     ret = reg_node(pRExC_state, OPFAIL);
10542     return ret;
10543    }
10544    else if (min == max
10545      && RExC_parse < RExC_end
10546      && (*RExC_parse == '?' || *RExC_parse == '+'))
10547    {
10548     if (SIZE_ONLY) {
10549      ckWARN2reg(RExC_parse + 1,
10550        "Useless use of greediness modifier '%c'",
10551        *RExC_parse);
10552     }
10553     /* Absorb the modifier, so later code doesn't see nor use
10554      * it */
10555     nextchar(pRExC_state);
10556    }
10557
10558   do_curly:
10559    if ((flags&SIMPLE)) {
10560     RExC_naughty += 2 + RExC_naughty / 2;
10561     reginsert(pRExC_state, CURLY, ret, depth+1);
10562     Set_Node_Offset(ret, parse_start+1); /* MJD */
10563     Set_Node_Cur_Length(ret, parse_start);
10564    }
10565    else {
10566     regnode * const w = reg_node(pRExC_state, WHILEM);
10567
10568     w->flags = 0;
10569     REGTAIL(pRExC_state, ret, w);
10570     if (!SIZE_ONLY && RExC_extralen) {
10571      reginsert(pRExC_state, LONGJMP,ret, depth+1);
10572      reginsert(pRExC_state, NOTHING,ret, depth+1);
10573      NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10574     }
10575     reginsert(pRExC_state, CURLYX,ret, depth+1);
10576         /* MJD hk */
10577     Set_Node_Offset(ret, parse_start+1);
10578     Set_Node_Length(ret,
10579         op == '{' ? (RExC_parse - parse_start) : 1);
10580
10581     if (!SIZE_ONLY && RExC_extralen)
10582      NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10583     REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10584     if (SIZE_ONLY)
10585      RExC_whilem_seen++, RExC_extralen += 3;
10586     RExC_naughty += 4 + RExC_naughty; /* compound interest */
10587    }
10588    ret->flags = 0;
10589
10590    if (min > 0)
10591     *flagp = WORST;
10592    if (max > 0)
10593     *flagp |= HASWIDTH;
10594    if (!SIZE_ONLY) {
10595     ARG1_SET(ret, (U16)min);
10596     ARG2_SET(ret, (U16)max);
10597    }
10598    if (max == REG_INFTY)
10599     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10600
10601    goto nest_check;
10602   }
10603  }
10604
10605  if (!ISMULT1(op)) {
10606   *flagp = flags;
10607   return(ret);
10608  }
10609
10610 #if 0    /* Now runtime fix should be reliable. */
10611
10612  /* if this is reinstated, don't forget to put this back into perldiag:
10613
10614    =item Regexp *+ operand could be empty at {#} in regex m/%s/
10615
10616   (F) The part of the regexp subject to either the * or + quantifier
10617   could match an empty string. The {#} shows in the regular
10618   expression about where the problem was discovered.
10619
10620  */
10621
10622  if (!(flags&HASWIDTH) && op != '?')
10623  vFAIL("Regexp *+ operand could be empty");
10624 #endif
10625
10626 #ifdef RE_TRACK_PATTERN_OFFSETS
10627  parse_start = RExC_parse;
10628 #endif
10629  nextchar(pRExC_state);
10630
10631  *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10632
10633  if (op == '*' && (flags&SIMPLE)) {
10634   reginsert(pRExC_state, STAR, ret, depth+1);
10635   ret->flags = 0;
10636   RExC_naughty += 4;
10637   RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10638  }
10639  else if (op == '*') {
10640   min = 0;
10641   goto do_curly;
10642  }
10643  else if (op == '+' && (flags&SIMPLE)) {
10644   reginsert(pRExC_state, PLUS, ret, depth+1);
10645   ret->flags = 0;
10646   RExC_naughty += 3;
10647   RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10648  }
10649  else if (op == '+') {
10650   min = 1;
10651   goto do_curly;
10652  }
10653  else if (op == '?') {
10654   min = 0; max = 1;
10655   goto do_curly;
10656  }
10657   nest_check:
10658  if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10659   SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10660   ckWARN2reg(RExC_parse,
10661     "%"UTF8f" matches null string many times",
10662     UTF8fARG(UTF, (RExC_parse >= origparse
10663         ? RExC_parse - origparse
10664         : 0),
10665     origparse));
10666   (void)ReREFCNT_inc(RExC_rx_sv);
10667  }
10668
10669  if (RExC_parse < RExC_end && *RExC_parse == '?') {
10670   nextchar(pRExC_state);
10671   reginsert(pRExC_state, MINMOD, ret, depth+1);
10672   REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10673  }
10674  else
10675  if (RExC_parse < RExC_end && *RExC_parse == '+') {
10676   regnode *ender;
10677   nextchar(pRExC_state);
10678   ender = reg_node(pRExC_state, SUCCEED);
10679   REGTAIL(pRExC_state, ret, ender);
10680   reginsert(pRExC_state, SUSPEND, ret, depth+1);
10681   ret->flags = 0;
10682   ender = reg_node(pRExC_state, TAIL);
10683   REGTAIL(pRExC_state, ret, ender);
10684  }
10685
10686  if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10687   RExC_parse++;
10688   vFAIL("Nested quantifiers");
10689  }
10690
10691  return(ret);
10692 }
10693
10694 STATIC bool
10695 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10696      UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10697      const bool strict   /* Apply stricter parsing rules? */
10698  )
10699 {
10700
10701  /* This is expected to be called by a parser routine that has recognized '\N'
10702    and needs to handle the rest. RExC_parse is expected to point at the first
10703    char following the N at the time of the call.  On successful return,
10704    RExC_parse has been updated to point to just after the sequence identified
10705    by this routine, and <*flagp> has been updated.
10706
10707    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10708    character class.
10709
10710    \N may begin either a named sequence, or if outside a character class, mean
10711    to match a non-newline.  For non single-quoted regexes, the tokenizer has
10712    attempted to decide which, and in the case of a named sequence, converted it
10713    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10714    where c1... are the characters in the sequence.  For single-quoted regexes,
10715    the tokenizer passes the \N sequence through unchanged; this code will not
10716    attempt to determine this nor expand those, instead raising a syntax error.
10717    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10718    or there is no '}', it signals that this \N occurrence means to match a
10719    non-newline.
10720
10721    Only the \N{U+...} form should occur in a character class, for the same
10722    reason that '.' inside a character class means to just match a period: it
10723    just doesn't make sense.
10724
10725    The function raises an error (via vFAIL), and doesn't return for various
10726    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
10727    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10728    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10729    only possible if node_p is non-NULL.
10730
10731
10732    If <valuep> is non-null, it means the caller can accept an input sequence
10733    consisting of a just a single code point; <*valuep> is set to that value
10734    if the input is such.
10735
10736    If <node_p> is non-null it signifies that the caller can accept any other
10737    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
10738    is set as follows:
10739  1) \N means not-a-NL: points to a newly created REG_ANY node;
10740  2) \N{}:              points to a new NOTHING node;
10741  3) otherwise:         points to a new EXACT node containing the resolved
10742       string.
10743    Note that FALSE is returned for single code point sequences if <valuep> is
10744    null.
10745  */
10746
10747  char * endbrace;    /* '}' following the name */
10748  char* p;
10749  char *endchar; /* Points to '.' or '}' ending cur char in the input
10750       stream */
10751  bool has_multiple_chars; /* true if the input stream contains a sequence of
10752         more than one character */
10753
10754  GET_RE_DEBUG_FLAGS_DECL;
10755
10756  PERL_ARGS_ASSERT_GROK_BSLASH_N;
10757
10758  GET_RE_DEBUG_FLAGS;
10759
10760  assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
10761
10762  /* The [^\n] meaning of \N ignores spaces and comments under the /x
10763  * modifier.  The other meaning does not, so use a temporary until we find
10764  * out which we are being called with */
10765  p = (RExC_flags & RXf_PMf_EXTENDED)
10766   ? regpatws(pRExC_state, RExC_parse,
10767         TRUE) /* means recognize comments */
10768   : RExC_parse;
10769
10770  /* Disambiguate between \N meaning a named character versus \N meaning
10771  * [^\n].  The former is assumed when it can't be the latter. */
10772  if (*p != '{' || regcurly(p)) {
10773   RExC_parse = p;
10774   if (! node_p) {
10775    /* no bare \N allowed in a charclass */
10776    if (in_char_class) {
10777     vFAIL("\\N in a character class must be a named character: \\N{...}");
10778    }
10779    return FALSE;
10780   }
10781   RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
10782       current char */
10783   nextchar(pRExC_state);
10784   *node_p = reg_node(pRExC_state, REG_ANY);
10785   *flagp |= HASWIDTH|SIMPLE;
10786   RExC_naughty++;
10787   Set_Node_Length(*node_p, 1); /* MJD */
10788   return TRUE;
10789  }
10790
10791  /* Here, we have decided it should be a named character or sequence */
10792
10793  /* The test above made sure that the next real character is a '{', but
10794  * under the /x modifier, it could be separated by space (or a comment and
10795  * \n) and this is not allowed (for consistency with \x{...} and the
10796  * tokenizer handling of \N{NAME}). */
10797  if (*RExC_parse != '{') {
10798   vFAIL("Missing braces on \\N{}");
10799  }
10800
10801  RExC_parse++; /* Skip past the '{' */
10802
10803  if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10804   || ! (endbrace == RExC_parse  /* nothing between the {} */
10805    || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below
10806             */
10807     && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10808              */
10809  {
10810   if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
10811   vFAIL("\\N{NAME} must be resolved by the lexer");
10812  }
10813
10814  if (endbrace == RExC_parse) {   /* empty: \N{} */
10815   bool ret = TRUE;
10816   if (node_p) {
10817    *node_p = reg_node(pRExC_state,NOTHING);
10818   }
10819   else if (in_char_class) {
10820    if (SIZE_ONLY && in_char_class) {
10821     if (strict) {
10822      RExC_parse++;   /* Position after the "}" */
10823      vFAIL("Zero length \\N{}");
10824     }
10825     else {
10826      ckWARNreg(RExC_parse,
10827        "Ignoring zero length \\N{} in character class");
10828     }
10829    }
10830    ret = FALSE;
10831   }
10832   else {
10833    return FALSE;
10834   }
10835   nextchar(pRExC_state);
10836   return ret;
10837  }
10838
10839  RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10840  RExC_parse += 2; /* Skip past the 'U+' */
10841
10842  endchar = RExC_parse + strcspn(RExC_parse, ".}");
10843
10844  /* Code points are separated by dots.  If none, there is only one code
10845  * point, and is terminated by the brace */
10846  has_multiple_chars = (endchar < endbrace);
10847
10848  if (valuep && (! has_multiple_chars || in_char_class)) {
10849   /* We only pay attention to the first char of
10850   multichar strings being returned in char classes. I kinda wonder
10851   if this makes sense as it does change the behaviour
10852   from earlier versions, OTOH that behaviour was broken
10853   as well. XXX Solution is to recharacterize as
10854   [rest-of-class]|multi1|multi2... */
10855
10856   STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10857   I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10858    | PERL_SCAN_DISALLOW_PREFIX
10859    | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10860
10861   *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10862
10863   /* The tokenizer should have guaranteed validity, but it's possible to
10864   * bypass it by using single quoting, so check */
10865   if (length_of_hex == 0
10866    || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10867   {
10868    RExC_parse += length_of_hex; /* Includes all the valid */
10869    RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
10870        ? UTF8SKIP(RExC_parse)
10871        : 1;
10872    /* Guard against malformed utf8 */
10873    if (RExC_parse >= endchar) {
10874     RExC_parse = endchar;
10875    }
10876    vFAIL("Invalid hexadecimal number in \\N{U+...}");
10877   }
10878
10879   if (in_char_class && has_multiple_chars) {
10880    if (strict) {
10881     RExC_parse = endbrace;
10882     vFAIL("\\N{} in character class restricted to one character");
10883    }
10884    else {
10885     ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10886    }
10887   }
10888
10889   RExC_parse = endbrace + 1;
10890  }
10891  else if (! node_p || ! has_multiple_chars) {
10892
10893   /* Here, the input is legal, but not according to the caller's
10894   * options.  We fail without advancing the parse, so that the
10895   * caller can try again */
10896   RExC_parse = p;
10897   return FALSE;
10898  }
10899  else {
10900
10901   /* What is done here is to convert this to a sub-pattern of the form
10902   * (?:\x{char1}\x{char2}...)
10903   * and then call reg recursively.  That way, it retains its atomicness,
10904   * while not having to worry about special handling that some code
10905   * points may have.  toke.c has converted the original Unicode values
10906   * to native, so that we can just pass on the hex values unchanged.  We
10907   * do have to set a flag to keep recoding from happening in the
10908   * recursion */
10909
10910   SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10911   STRLEN len;
10912   char *orig_end = RExC_end;
10913   I32 flags;
10914
10915   while (RExC_parse < endbrace) {
10916
10917    /* Convert to notation the rest of the code understands */
10918    sv_catpv(substitute_parse, "\\x{");
10919    sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10920    sv_catpv(substitute_parse, "}");
10921
10922    /* Point to the beginning of the next character in the sequence. */
10923    RExC_parse = endchar + 1;
10924    endchar = RExC_parse + strcspn(RExC_parse, ".}");
10925   }
10926   sv_catpv(substitute_parse, ")");
10927
10928   RExC_parse = SvPV(substitute_parse, len);
10929
10930   /* Don't allow empty number */
10931   if (len < 8) {
10932    vFAIL("Invalid hexadecimal number in \\N{U+...}");
10933   }
10934   RExC_end = RExC_parse + len;
10935
10936   /* The values are Unicode, and therefore not subject to recoding */
10937   RExC_override_recoding = 1;
10938
10939   if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10940    if (flags & RESTART_UTF8) {
10941     *flagp = RESTART_UTF8;
10942     return FALSE;
10943    }
10944    FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10945     (UV) flags);
10946   }
10947   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10948
10949   RExC_parse = endbrace;
10950   RExC_end = orig_end;
10951   RExC_override_recoding = 0;
10952
10953   nextchar(pRExC_state);
10954  }
10955
10956  return TRUE;
10957 }
10958
10959
10960 /*
10961  * reg_recode
10962  *
10963  * It returns the code point in utf8 for the value in *encp.
10964  *    value: a code value in the source encoding
10965  *    encp:  a pointer to an Encode object
10966  *
10967  * If the result from Encode is not a single character,
10968  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10969  */
10970 STATIC UV
10971 S_reg_recode(pTHX_ const char value, SV **encp)
10972 {
10973  STRLEN numlen = 1;
10974  SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10975  const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10976  const STRLEN newlen = SvCUR(sv);
10977  UV uv = UNICODE_REPLACEMENT;
10978
10979  PERL_ARGS_ASSERT_REG_RECODE;
10980
10981  if (newlen)
10982   uv = SvUTF8(sv)
10983    ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10984    : *(U8*)s;
10985
10986  if (!newlen || numlen != newlen) {
10987   uv = UNICODE_REPLACEMENT;
10988   *encp = NULL;
10989  }
10990  return uv;
10991 }
10992
10993 PERL_STATIC_INLINE U8
10994 S_compute_EXACTish(RExC_state_t *pRExC_state)
10995 {
10996  U8 op;
10997
10998  PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10999
11000  if (! FOLD) {
11001   return EXACT;
11002  }
11003
11004  op = get_regex_charset(RExC_flags);
11005  if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11006   op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11007     been, so there is no hole */
11008  }
11009
11010  return op + EXACTF;
11011 }
11012
11013 PERL_STATIC_INLINE void
11014 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11015       regnode *node, I32* flagp, STRLEN len, UV code_point,
11016       bool downgradable)
11017 {
11018  /* This knows the details about sizing an EXACTish node, setting flags for
11019  * it (by setting <*flagp>, and potentially populating it with a single
11020  * character.
11021  *
11022  * If <len> (the length in bytes) is non-zero, this function assumes that
11023  * the node has already been populated, and just does the sizing.  In this
11024  * case <code_point> should be the final code point that has already been
11025  * placed into the node.  This value will be ignored except that under some
11026  * circumstances <*flagp> is set based on it.
11027  *
11028  * If <len> is zero, the function assumes that the node is to contain only
11029  * the single character given by <code_point> and calculates what <len>
11030  * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
11031  * additionally will populate the node's STRING with <code_point> or its
11032  * fold if folding.
11033  *
11034  * In both cases <*flagp> is appropriately set
11035  *
11036  * It knows that under FOLD, the Latin Sharp S and UTF characters above
11037  * 255, must be folded (the former only when the rules indicate it can
11038  * match 'ss')
11039  *
11040  * When it does the populating, it looks at the flag 'downgradable'.  If
11041  * true with a node that folds, it checks if the single code point
11042  * participates in a fold, and if not downgrades the node to an EXACT.
11043  * This helps the optimizer */
11044
11045  bool len_passed_in = cBOOL(len != 0);
11046  U8 character[UTF8_MAXBYTES_CASE+1];
11047
11048  PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11049
11050  /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11051  * sizing difference, and is extra work that is thrown away */
11052  if (downgradable && ! PASS2) {
11053   downgradable = FALSE;
11054  }
11055
11056  if (! len_passed_in) {
11057   if (UTF) {
11058    if (UNI_IS_INVARIANT(code_point)) {
11059     if (LOC || ! FOLD) {    /* /l defers folding until runtime */
11060      *character = (U8) code_point;
11061     }
11062     else { /* Here is /i and not /l (toFOLD() is defined on just
11063       ASCII, which isn't the same thing as INVARIANT on
11064       EBCDIC, but it works there, as the extra invariants
11065       fold to themselves) */
11066      *character = toFOLD((U8) code_point);
11067      if (downgradable
11068       && *character == code_point
11069       && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point))
11070      {
11071       OP(node) = EXACT;
11072      }
11073     }
11074     len = 1;
11075    }
11076    else if (FOLD && (! LOC
11077        || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11078    {   /* Folding, and ok to do so now */
11079     UV folded = _to_uni_fold_flags(
11080         code_point,
11081         character,
11082         &len,
11083         FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11084              ? FOLD_FLAGS_NOMIX_ASCII
11085              : 0));
11086     if (downgradable
11087      && folded == code_point
11088      && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11089     {
11090      OP(node) = EXACT;
11091     }
11092    }
11093    else if (code_point <= MAX_UTF8_TWO_BYTE) {
11094
11095     /* Not folding this cp, and can output it directly */
11096     *character = UTF8_TWO_BYTE_HI(code_point);
11097     *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11098     len = 2;
11099    }
11100    else {
11101     uvchr_to_utf8( character, code_point);
11102     len = UTF8SKIP(character);
11103    }
11104   } /* Else pattern isn't UTF8.  */
11105   else if (! FOLD) {
11106    *character = (U8) code_point;
11107    len = 1;
11108   } /* Else is folded non-UTF8 */
11109   else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11110
11111    /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11112    * comments at join_exact()); */
11113    *character = (U8) code_point;
11114    len = 1;
11115
11116    /* Can turn into an EXACT node if we know the fold at compile time,
11117    * and it folds to itself and doesn't particpate in other folds */
11118    if (downgradable
11119     && ! LOC
11120     && PL_fold_latin1[code_point] == code_point
11121     && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11122      || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11123    {
11124     OP(node) = EXACT;
11125    }
11126   } /* else is Sharp s.  May need to fold it */
11127   else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11128    *character = 's';
11129    *(character + 1) = 's';
11130    len = 2;
11131   }
11132   else {
11133    *character = LATIN_SMALL_LETTER_SHARP_S;
11134    len = 1;
11135   }
11136  }
11137
11138  if (SIZE_ONLY) {
11139   RExC_size += STR_SZ(len);
11140  }
11141  else {
11142   RExC_emit += STR_SZ(len);
11143   STR_LEN(node) = len;
11144   if (! len_passed_in) {
11145    Copy((char *) character, STRING(node), len, char);
11146   }
11147  }
11148
11149  *flagp |= HASWIDTH;
11150
11151  /* A single character node is SIMPLE, except for the special-cased SHARP S
11152  * under /di. */
11153  if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11154   && (code_point != LATIN_SMALL_LETTER_SHARP_S
11155    || ! FOLD || ! DEPENDS_SEMANTICS))
11156  {
11157   *flagp |= SIMPLE;
11158  }
11159
11160  /* The OP may not be well defined in PASS1 */
11161  if (PASS2 && OP(node) == EXACTFL) {
11162   RExC_contains_locale = 1;
11163  }
11164 }
11165
11166
11167 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11168  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11169
11170 static I32
11171 S_backref_value(char *p)
11172 {
11173  const char* endptr;
11174  UV val = grok_atou(p, &endptr);
11175  if (endptr == p || endptr == NULL || val > I32_MAX)
11176   return I32_MAX;
11177  return (I32)val;
11178 }
11179
11180
11181 /*
11182  - regatom - the lowest level
11183
11184    Try to identify anything special at the start of the pattern. If there
11185    is, then handle it as required. This may involve generating a single regop,
11186    such as for an assertion; or it may involve recursing, such as to
11187    handle a () structure.
11188
11189    If the string doesn't start with something special then we gobble up
11190    as much literal text as we can.
11191
11192    Once we have been able to handle whatever type of thing started the
11193    sequence, we return.
11194
11195    Note: we have to be careful with escapes, as they can be both literal
11196    and special, and in the case of \10 and friends, context determines which.
11197
11198    A summary of the code structure is:
11199
11200    switch (first_byte) {
11201   cases for each special:
11202    handle this special;
11203    break;
11204   case '\\':
11205    switch (2nd byte) {
11206     cases for each unambiguous special:
11207      handle this special;
11208      break;
11209     cases for each ambigous special/literal:
11210      disambiguate;
11211      if (special)  handle here
11212      else goto defchar;
11213     default: // unambiguously literal:
11214      goto defchar;
11215    }
11216   default:  // is a literal char
11217    // FALL THROUGH
11218   defchar:
11219    create EXACTish node for literal;
11220    while (more input and node isn't full) {
11221     switch (input_byte) {
11222     cases for each special;
11223      make sure parse pointer is set so that the next call to
11224       regatom will see this special first
11225      goto loopdone; // EXACTish node terminated by prev. char
11226     default:
11227      append char to EXACTISH node;
11228     }
11229     get next input byte;
11230    }
11231   loopdone:
11232    }
11233    return the generated node;
11234
11235    Specifically there are two separate switches for handling
11236    escape sequences, with the one for handling literal escapes requiring
11237    a dummy entry for all of the special escapes that are actually handled
11238    by the other.
11239
11240    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11241    TRYAGAIN.
11242    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11243    restarted.
11244    Otherwise does not return NULL.
11245 */
11246
11247 STATIC regnode *
11248 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11249 {
11250  regnode *ret = NULL;
11251  I32 flags = 0;
11252  char *parse_start = RExC_parse;
11253  U8 op;
11254  int invert = 0;
11255  U8 arg;
11256
11257  GET_RE_DEBUG_FLAGS_DECL;
11258
11259  *flagp = WORST;  /* Tentatively. */
11260
11261  DEBUG_PARSE("atom");
11262
11263  PERL_ARGS_ASSERT_REGATOM;
11264
11265 tryagain:
11266  switch ((U8)*RExC_parse) {
11267  case '^':
11268   RExC_seen_zerolen++;
11269   nextchar(pRExC_state);
11270   if (RExC_flags & RXf_PMf_MULTILINE)
11271    ret = reg_node(pRExC_state, MBOL);
11272   else if (RExC_flags & RXf_PMf_SINGLELINE)
11273    ret = reg_node(pRExC_state, SBOL);
11274   else
11275    ret = reg_node(pRExC_state, BOL);
11276   Set_Node_Length(ret, 1); /* MJD */
11277   break;
11278  case '$':
11279   nextchar(pRExC_state);
11280   if (*RExC_parse)
11281    RExC_seen_zerolen++;
11282   if (RExC_flags & RXf_PMf_MULTILINE)
11283    ret = reg_node(pRExC_state, MEOL);
11284   else if (RExC_flags & RXf_PMf_SINGLELINE)
11285    ret = reg_node(pRExC_state, SEOL);
11286   else
11287    ret = reg_node(pRExC_state, EOL);
11288   Set_Node_Length(ret, 1); /* MJD */
11289   break;
11290  case '.':
11291   nextchar(pRExC_state);
11292   if (RExC_flags & RXf_PMf_SINGLELINE)
11293    ret = reg_node(pRExC_state, SANY);
11294   else
11295    ret = reg_node(pRExC_state, REG_ANY);
11296   *flagp |= HASWIDTH|SIMPLE;
11297   RExC_naughty++;
11298   Set_Node_Length(ret, 1); /* MJD */
11299   break;
11300  case '[':
11301  {
11302   char * const oregcomp_parse = ++RExC_parse;
11303   ret = regclass(pRExC_state, flagp,depth+1,
11304      FALSE, /* means parse the whole char class */
11305      TRUE, /* allow multi-char folds */
11306      FALSE, /* don't silence non-portable warnings. */
11307      NULL);
11308   if (*RExC_parse != ']') {
11309    RExC_parse = oregcomp_parse;
11310    vFAIL("Unmatched [");
11311   }
11312   if (ret == NULL) {
11313    if (*flagp & RESTART_UTF8)
11314     return NULL;
11315    FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11316     (UV) *flagp);
11317   }
11318   nextchar(pRExC_state);
11319   Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11320   break;
11321  }
11322  case '(':
11323   nextchar(pRExC_state);
11324   ret = reg(pRExC_state, 2, &flags,depth+1);
11325   if (ret == NULL) {
11326     if (flags & TRYAGAIN) {
11327      if (RExC_parse == RExC_end) {
11328       /* Make parent create an empty node if needed. */
11329       *flagp |= TRYAGAIN;
11330       return(NULL);
11331      }
11332      goto tryagain;
11333     }
11334     if (flags & RESTART_UTF8) {
11335      *flagp = RESTART_UTF8;
11336      return NULL;
11337     }
11338     FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11339                 (UV) flags);
11340   }
11341   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11342   break;
11343  case '|':
11344  case ')':
11345   if (flags & TRYAGAIN) {
11346    *flagp |= TRYAGAIN;
11347    return NULL;
11348   }
11349   vFAIL("Internal urp");
11350         /* Supposed to be caught earlier. */
11351   break;
11352  case '?':
11353  case '+':
11354  case '*':
11355   RExC_parse++;
11356   vFAIL("Quantifier follows nothing");
11357   break;
11358  case '\\':
11359   /* Special Escapes
11360
11361   This switch handles escape sequences that resolve to some kind
11362   of special regop and not to literal text. Escape sequnces that
11363   resolve to literal text are handled below in the switch marked
11364   "Literal Escapes".
11365
11366   Every entry in this switch *must* have a corresponding entry
11367   in the literal escape switch. However, the opposite is not
11368   required, as the default for this switch is to jump to the
11369   literal text handling code.
11370   */
11371   switch ((U8)*++RExC_parse) {
11372   /* Special Escapes */
11373   case 'A':
11374    RExC_seen_zerolen++;
11375    ret = reg_node(pRExC_state, SBOL);
11376    *flagp |= SIMPLE;
11377    goto finish_meta_pat;
11378   case 'G':
11379    ret = reg_node(pRExC_state, GPOS);
11380    RExC_seen |= REG_GPOS_SEEN;
11381    *flagp |= SIMPLE;
11382    goto finish_meta_pat;
11383   case 'K':
11384    RExC_seen_zerolen++;
11385    ret = reg_node(pRExC_state, KEEPS);
11386    *flagp |= SIMPLE;
11387    /* XXX:dmq : disabling in-place substitution seems to
11388    * be necessary here to avoid cases of memory corruption, as
11389    * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11390    */
11391    RExC_seen |= REG_LOOKBEHIND_SEEN;
11392    goto finish_meta_pat;
11393   case 'Z':
11394    ret = reg_node(pRExC_state, SEOL);
11395    *flagp |= SIMPLE;
11396    RExC_seen_zerolen++;  /* Do not optimize RE away */
11397    goto finish_meta_pat;
11398   case 'z':
11399    ret = reg_node(pRExC_state, EOS);
11400    *flagp |= SIMPLE;
11401    RExC_seen_zerolen++;  /* Do not optimize RE away */
11402    goto finish_meta_pat;
11403   case 'C':
11404    ret = reg_node(pRExC_state, CANY);
11405    RExC_seen |= REG_CANY_SEEN;
11406    *flagp |= HASWIDTH|SIMPLE;
11407    if (SIZE_ONLY) {
11408     ckWARNdep(RExC_parse+1, "\\C is deprecated");
11409    }
11410    goto finish_meta_pat;
11411   case 'X':
11412    ret = reg_node(pRExC_state, CLUMP);
11413    *flagp |= HASWIDTH;
11414    goto finish_meta_pat;
11415
11416   case 'W':
11417    invert = 1;
11418    /* FALLTHROUGH */
11419   case 'w':
11420    arg = ANYOF_WORDCHAR;
11421    goto join_posix;
11422
11423   case 'b':
11424    RExC_seen_zerolen++;
11425    RExC_seen |= REG_LOOKBEHIND_SEEN;
11426    op = BOUND + get_regex_charset(RExC_flags);
11427    if (op > BOUNDA) {  /* /aa is same as /a */
11428     op = BOUNDA;
11429    }
11430    else if (op == BOUNDL) {
11431     RExC_contains_locale = 1;
11432    }
11433    ret = reg_node(pRExC_state, op);
11434    FLAGS(ret) = get_regex_charset(RExC_flags);
11435    *flagp |= SIMPLE;
11436    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11437     /* diag_listed_as: Use "%s" instead of "%s" */
11438     vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11439    }
11440    goto finish_meta_pat;
11441   case 'B':
11442    RExC_seen_zerolen++;
11443    RExC_seen |= REG_LOOKBEHIND_SEEN;
11444    op = NBOUND + get_regex_charset(RExC_flags);
11445    if (op > NBOUNDA) { /* /aa is same as /a */
11446     op = NBOUNDA;
11447    }
11448    else if (op == NBOUNDL) {
11449     RExC_contains_locale = 1;
11450    }
11451    ret = reg_node(pRExC_state, op);
11452    FLAGS(ret) = get_regex_charset(RExC_flags);
11453    *flagp |= SIMPLE;
11454    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11455     /* diag_listed_as: Use "%s" instead of "%s" */
11456     vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11457    }
11458    goto finish_meta_pat;
11459
11460   case 'D':
11461    invert = 1;
11462    /* FALLTHROUGH */
11463   case 'd':
11464    arg = ANYOF_DIGIT;
11465    goto join_posix;
11466
11467   case 'R':
11468    ret = reg_node(pRExC_state, LNBREAK);
11469    *flagp |= HASWIDTH|SIMPLE;
11470    goto finish_meta_pat;
11471
11472   case 'H':
11473    invert = 1;
11474    /* FALLTHROUGH */
11475   case 'h':
11476    arg = ANYOF_BLANK;
11477    op = POSIXU;
11478    goto join_posix_op_known;
11479
11480   case 'V':
11481    invert = 1;
11482    /* FALLTHROUGH */
11483   case 'v':
11484    arg = ANYOF_VERTWS;
11485    op = POSIXU;
11486    goto join_posix_op_known;
11487
11488   case 'S':
11489    invert = 1;
11490    /* FALLTHROUGH */
11491   case 's':
11492    arg = ANYOF_SPACE;
11493
11494   join_posix:
11495
11496    op = POSIXD + get_regex_charset(RExC_flags);
11497    if (op > POSIXA) {  /* /aa is same as /a */
11498     op = POSIXA;
11499    }
11500    else if (op == POSIXL) {
11501     RExC_contains_locale = 1;
11502    }
11503
11504   join_posix_op_known:
11505
11506    if (invert) {
11507     op += NPOSIXD - POSIXD;
11508    }
11509
11510    ret = reg_node(pRExC_state, op);
11511    if (! SIZE_ONLY) {
11512     FLAGS(ret) = namedclass_to_classnum(arg);
11513    }
11514
11515    *flagp |= HASWIDTH|SIMPLE;
11516    /* FALLTHROUGH */
11517
11518   finish_meta_pat:
11519    nextchar(pRExC_state);
11520    Set_Node_Length(ret, 2); /* MJD */
11521    break;
11522   case 'p':
11523   case 'P':
11524    {
11525 #ifdef DEBUGGING
11526     char* parse_start = RExC_parse - 2;
11527 #endif
11528
11529     RExC_parse--;
11530
11531     ret = regclass(pRExC_state, flagp,depth+1,
11532        TRUE, /* means just parse this element */
11533        FALSE, /* don't allow multi-char folds */
11534        FALSE, /* don't silence non-portable warnings.
11535           It would be a bug if these returned
11536           non-portables */
11537        NULL);
11538     /* regclass() can only return RESTART_UTF8 if multi-char folds
11539     are allowed.  */
11540     if (!ret)
11541      FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11542       (UV) *flagp);
11543
11544     RExC_parse--;
11545
11546     Set_Node_Offset(ret, parse_start + 2);
11547     Set_Node_Cur_Length(ret, parse_start);
11548     nextchar(pRExC_state);
11549    }
11550    break;
11551   case 'N':
11552    /* Handle \N and \N{NAME} with multiple code points here and not
11553    * below because it can be multicharacter. join_exact() will join
11554    * them up later on.  Also this makes sure that things like
11555    * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11556    * The options to the grok function call causes it to fail if the
11557    * sequence is just a single code point.  We then go treat it as
11558    * just another character in the current EXACT node, and hence it
11559    * gets uniform treatment with all the other characters.  The
11560    * special treatment for quantifiers is not needed for such single
11561    * character sequences */
11562    ++RExC_parse;
11563    if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11564         FALSE /* not strict */ )) {
11565     if (*flagp & RESTART_UTF8)
11566      return NULL;
11567     RExC_parse--;
11568     goto defchar;
11569    }
11570    break;
11571   case 'k':    /* Handle \k<NAME> and \k'NAME' */
11572   parse_named_seq:
11573   {
11574    char ch= RExC_parse[1];
11575    if (ch != '<' && ch != '\'' && ch != '{') {
11576     RExC_parse++;
11577     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11578     vFAIL2("Sequence %.2s... not terminated",parse_start);
11579    } else {
11580     /* this pretty much dupes the code for (?P=...) in reg(), if
11581     you change this make sure you change that */
11582     char* name_start = (RExC_parse += 2);
11583     U32 num = 0;
11584     SV *sv_dat = reg_scan_name(pRExC_state,
11585      SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11586     ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11587     if (RExC_parse == name_start || *RExC_parse != ch)
11588      /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11589      vFAIL2("Sequence %.3s... not terminated",parse_start);
11590
11591     if (!SIZE_ONLY) {
11592      num = add_data( pRExC_state, STR_WITH_LEN("S"));
11593      RExC_rxi->data->data[num]=(void*)sv_dat;
11594      SvREFCNT_inc_simple_void(sv_dat);
11595     }
11596
11597     RExC_sawback = 1;
11598     ret = reganode(pRExC_state,
11599        ((! FOLD)
11600         ? NREF
11601         : (ASCII_FOLD_RESTRICTED)
11602         ? NREFFA
11603         : (AT_LEAST_UNI_SEMANTICS)
11604          ? NREFFU
11605          : (LOC)
11606          ? NREFFL
11607          : NREFF),
11608         num);
11609     *flagp |= HASWIDTH;
11610
11611     /* override incorrect value set in reganode MJD */
11612     Set_Node_Offset(ret, parse_start+1);
11613     Set_Node_Cur_Length(ret, parse_start);
11614     nextchar(pRExC_state);
11615
11616    }
11617    break;
11618   }
11619   case 'g':
11620   case '1': case '2': case '3': case '4':
11621   case '5': case '6': case '7': case '8': case '9':
11622    {
11623     I32 num;
11624     bool hasbrace = 0;
11625
11626     if (*RExC_parse == 'g') {
11627      bool isrel = 0;
11628
11629      RExC_parse++;
11630      if (*RExC_parse == '{') {
11631       RExC_parse++;
11632       hasbrace = 1;
11633      }
11634      if (*RExC_parse == '-') {
11635       RExC_parse++;
11636       isrel = 1;
11637      }
11638      if (hasbrace && !isDIGIT(*RExC_parse)) {
11639       if (isrel) RExC_parse--;
11640       RExC_parse -= 2;
11641       goto parse_named_seq;
11642      }
11643
11644      num = S_backref_value(RExC_parse);
11645      if (num == 0)
11646       vFAIL("Reference to invalid group 0");
11647      else if (num == I32_MAX) {
11648       if (isDIGIT(*RExC_parse))
11649        vFAIL("Reference to nonexistent group");
11650       else
11651        vFAIL("Unterminated \\g... pattern");
11652      }
11653
11654      if (isrel) {
11655       num = RExC_npar - num;
11656       if (num < 1)
11657        vFAIL("Reference to nonexistent or unclosed group");
11658      }
11659     }
11660     else {
11661      num = S_backref_value(RExC_parse);
11662      /* bare \NNN might be backref or octal - if it is larger than or equal
11663      * RExC_npar then it is assumed to be and octal escape.
11664      * Note RExC_npar is +1 from the actual number of parens*/
11665      if (num == I32_MAX || (num > 9 && num >= RExC_npar
11666        && *RExC_parse != '8' && *RExC_parse != '9'))
11667      {
11668       /* Probably a character specified in octal, e.g. \35 */
11669       goto defchar;
11670      }
11671     }
11672
11673     /* at this point RExC_parse definitely points to a backref
11674     * number */
11675     {
11676 #ifdef RE_TRACK_PATTERN_OFFSETS
11677      char * const parse_start = RExC_parse - 1; /* MJD */
11678 #endif
11679      while (isDIGIT(*RExC_parse))
11680       RExC_parse++;
11681      if (hasbrace) {
11682       if (*RExC_parse != '}')
11683        vFAIL("Unterminated \\g{...} pattern");
11684       RExC_parse++;
11685      }
11686      if (!SIZE_ONLY) {
11687       if (num > (I32)RExC_rx->nparens)
11688        vFAIL("Reference to nonexistent group");
11689      }
11690      RExC_sawback = 1;
11691      ret = reganode(pRExC_state,
11692         ((! FOLD)
11693          ? REF
11694          : (ASCII_FOLD_RESTRICTED)
11695          ? REFFA
11696          : (AT_LEAST_UNI_SEMANTICS)
11697           ? REFFU
11698           : (LOC)
11699           ? REFFL
11700           : REFF),
11701          num);
11702      *flagp |= HASWIDTH;
11703
11704      /* override incorrect value set in reganode MJD */
11705      Set_Node_Offset(ret, parse_start+1);
11706      Set_Node_Cur_Length(ret, parse_start);
11707      RExC_parse--;
11708      nextchar(pRExC_state);
11709     }
11710    }
11711    break;
11712   case '\0':
11713    if (RExC_parse >= RExC_end)
11714     FAIL("Trailing \\");
11715    /* FALLTHROUGH */
11716   default:
11717    /* Do not generate "unrecognized" warnings here, we fall
11718    back into the quick-grab loop below */
11719    parse_start--;
11720    goto defchar;
11721   }
11722   break;
11723
11724  case '#':
11725   if (RExC_flags & RXf_PMf_EXTENDED) {
11726    RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
11727    if (RExC_parse < RExC_end)
11728     goto tryagain;
11729   }
11730   /* FALLTHROUGH */
11731
11732  default:
11733
11734    parse_start = RExC_parse - 1;
11735
11736    RExC_parse++;
11737
11738   defchar: {
11739    STRLEN len = 0;
11740    UV ender = 0;
11741    char *p;
11742    char *s;
11743 #define MAX_NODE_STRING_SIZE 127
11744    char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11745    char *s0;
11746    U8 upper_parse = MAX_NODE_STRING_SIZE;
11747    U8 node_type = compute_EXACTish(pRExC_state);
11748    bool next_is_quantifier;
11749    char * oldp = NULL;
11750
11751    /* We can convert EXACTF nodes to EXACTFU if they contain only
11752    * characters that match identically regardless of the target
11753    * string's UTF8ness.  The reason to do this is that EXACTF is not
11754    * trie-able, EXACTFU is.
11755    *
11756    * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11757    * contain only above-Latin1 characters (hence must be in UTF8),
11758    * which don't participate in folds with Latin1-range characters,
11759    * as the latter's folds aren't known until runtime.  (We don't
11760    * need to figure this out until pass 2) */
11761    bool maybe_exactfu = PASS2
11762        && (node_type == EXACTF || node_type == EXACTFL);
11763
11764    /* If a folding node contains only code points that don't
11765    * participate in folds, it can be changed into an EXACT node,
11766    * which allows the optimizer more things to look for */
11767    bool maybe_exact;
11768
11769    ret = reg_node(pRExC_state, node_type);
11770
11771    /* In pass1, folded, we use a temporary buffer instead of the
11772    * actual node, as the node doesn't exist yet */
11773    s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11774
11775    s0 = s;
11776
11777   reparse:
11778
11779    /* We do the EXACTFish to EXACT node only if folding.  (And we
11780    * don't need to figure this out until pass 2) */
11781    maybe_exact = FOLD && PASS2;
11782
11783    /* XXX The node can hold up to 255 bytes, yet this only goes to
11784    * 127.  I (khw) do not know why.  Keeping it somewhat less than
11785    * 255 allows us to not have to worry about overflow due to
11786    * converting to utf8 and fold expansion, but that value is
11787    * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
11788    * split up by this limit into a single one using the real max of
11789    * 255.  Even at 127, this breaks under rare circumstances.  If
11790    * folding, we do not want to split a node at a character that is a
11791    * non-final in a multi-char fold, as an input string could just
11792    * happen to want to match across the node boundary.  The join
11793    * would solve that problem if the join actually happens.  But a
11794    * series of more than two nodes in a row each of 127 would cause
11795    * the first join to succeed to get to 254, but then there wouldn't
11796    * be room for the next one, which could at be one of those split
11797    * multi-char folds.  I don't know of any fool-proof solution.  One
11798    * could back off to end with only a code point that isn't such a
11799    * non-final, but it is possible for there not to be any in the
11800    * entire node. */
11801    for (p = RExC_parse - 1;
11802     len < upper_parse && p < RExC_end;
11803     len++)
11804    {
11805     oldp = p;
11806
11807     if (RExC_flags & RXf_PMf_EXTENDED)
11808      p = regpatws(pRExC_state, p,
11809           TRUE); /* means recognize comments */
11810     switch ((U8)*p) {
11811     case '^':
11812     case '$':
11813     case '.':
11814     case '[':
11815     case '(':
11816     case ')':
11817     case '|':
11818      goto loopdone;
11819     case '\\':
11820      /* Literal Escapes Switch
11821
11822      This switch is meant to handle escape sequences that
11823      resolve to a literal character.
11824
11825      Every escape sequence that represents something
11826      else, like an assertion or a char class, is handled
11827      in the switch marked 'Special Escapes' above in this
11828      routine, but also has an entry here as anything that
11829      isn't explicitly mentioned here will be treated as
11830      an unescaped equivalent literal.
11831      */
11832
11833      switch ((U8)*++p) {
11834      /* These are all the special escapes. */
11835      case 'A':             /* Start assertion */
11836      case 'b': case 'B':   /* Word-boundary assertion*/
11837      case 'C':             /* Single char !DANGEROUS! */
11838      case 'd': case 'D':   /* digit class */
11839      case 'g': case 'G':   /* generic-backref, pos assertion */
11840      case 'h': case 'H':   /* HORIZWS */
11841      case 'k': case 'K':   /* named backref, keep marker */
11842      case 'p': case 'P':   /* Unicode property */
11843        case 'R':   /* LNBREAK */
11844      case 's': case 'S':   /* space class */
11845      case 'v': case 'V':   /* VERTWS */
11846      case 'w': case 'W':   /* word class */
11847      case 'X':             /* eXtended Unicode "combining
11848            character sequence" */
11849      case 'z': case 'Z':   /* End of line/string assertion */
11850       --p;
11851       goto loopdone;
11852
11853      /* Anything after here is an escape that resolves to a
11854      literal. (Except digits, which may or may not)
11855      */
11856      case 'n':
11857       ender = '\n';
11858       p++;
11859       break;
11860      case 'N': /* Handle a single-code point named character. */
11861       /* The options cause it to fail if a multiple code
11862       * point sequence.  Handle those in the switch() above
11863       * */
11864       RExC_parse = p + 1;
11865       if (! grok_bslash_N(pRExC_state, NULL, &ender,
11866            flagp, depth, FALSE,
11867            FALSE /* not strict */ ))
11868       {
11869        if (*flagp & RESTART_UTF8)
11870         FAIL("panic: grok_bslash_N set RESTART_UTF8");
11871        RExC_parse = p = oldp;
11872        goto loopdone;
11873       }
11874       p = RExC_parse;
11875       if (ender > 0xff) {
11876        REQUIRE_UTF8;
11877       }
11878       break;
11879      case 'r':
11880       ender = '\r';
11881       p++;
11882       break;
11883      case 't':
11884       ender = '\t';
11885       p++;
11886       break;
11887      case 'f':
11888       ender = '\f';
11889       p++;
11890       break;
11891      case 'e':
11892       ender = ASCII_TO_NATIVE('\033');
11893       p++;
11894       break;
11895      case 'a':
11896       ender = '\a';
11897       p++;
11898       break;
11899      case 'o':
11900       {
11901        UV result;
11902        const char* error_msg;
11903
11904        bool valid = grok_bslash_o(&p,
11905              &result,
11906              &error_msg,
11907              TRUE, /* out warnings */
11908              FALSE, /* not strict */
11909              TRUE, /* Output warnings
11910                 for non-
11911                 portables */
11912              UTF);
11913        if (! valid) {
11914         RExC_parse = p; /* going to die anyway; point
11915             to exact spot of failure */
11916         vFAIL(error_msg);
11917        }
11918        ender = result;
11919        if (PL_encoding && ender < 0x100) {
11920         goto recode_encoding;
11921        }
11922        if (ender > 0xff) {
11923         REQUIRE_UTF8;
11924        }
11925        break;
11926       }
11927      case 'x':
11928       {
11929        UV result = UV_MAX; /* initialize to erroneous
11930             value */
11931        const char* error_msg;
11932
11933        bool valid = grok_bslash_x(&p,
11934              &result,
11935              &error_msg,
11936              TRUE, /* out warnings */
11937              FALSE, /* not strict */
11938              TRUE, /* Output warnings
11939                 for non-
11940                 portables */
11941              UTF);
11942        if (! valid) {
11943         RExC_parse = p; /* going to die anyway; point
11944             to exact spot of failure */
11945         vFAIL(error_msg);
11946        }
11947        ender = result;
11948
11949        if (PL_encoding && ender < 0x100) {
11950         goto recode_encoding;
11951        }
11952        if (ender > 0xff) {
11953         REQUIRE_UTF8;
11954        }
11955        break;
11956       }
11957      case 'c':
11958       p++;
11959       ender = grok_bslash_c(*p++, SIZE_ONLY);
11960       break;
11961      case '8': case '9': /* must be a backreference */
11962       --p;
11963       goto loopdone;
11964      case '1': case '2': case '3':case '4':
11965      case '5': case '6': case '7':
11966       /* When we parse backslash escapes there is ambiguity
11967       * between backreferences and octal escapes. Any escape
11968       * from \1 - \9 is a backreference, any multi-digit
11969       * escape which does not start with 0 and which when
11970       * evaluated as decimal could refer to an already
11971       * parsed capture buffer is a backslash. Anything else
11972       * is octal.
11973       *
11974       * Note this implies that \118 could be interpreted as
11975       * 118 OR as "\11" . "8" depending on whether there
11976       * were 118 capture buffers defined already in the
11977       * pattern.  */
11978
11979       /* NOTE, RExC_npar is 1 more than the actual number of
11980       * parens we have seen so far, hence the < RExC_npar below. */
11981
11982       if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
11983       {  /* Not to be treated as an octal constant, go
11984         find backref */
11985        --p;
11986        goto loopdone;
11987       }
11988       /* FALLTHROUGH */
11989      case '0':
11990       {
11991        I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11992        STRLEN numlen = 3;
11993        ender = grok_oct(p, &numlen, &flags, NULL);
11994        if (ender > 0xff) {
11995         REQUIRE_UTF8;
11996        }
11997        p += numlen;
11998        if (SIZE_ONLY   /* like \08, \178 */
11999         && numlen < 3
12000         && p < RExC_end
12001         && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12002        {
12003         reg_warn_non_literal_string(
12004           p + 1,
12005           form_short_octal_warning(p, numlen));
12006        }
12007       }
12008       if (PL_encoding && ender < 0x100)
12009        goto recode_encoding;
12010       break;
12011      recode_encoding:
12012       if (! RExC_override_recoding) {
12013        SV* enc = PL_encoding;
12014        ender = reg_recode((const char)(U8)ender, &enc);
12015        if (!enc && SIZE_ONLY)
12016         ckWARNreg(p, "Invalid escape in the specified encoding");
12017        REQUIRE_UTF8;
12018       }
12019       break;
12020      case '\0':
12021       if (p >= RExC_end)
12022        FAIL("Trailing \\");
12023       /* FALLTHROUGH */
12024      default:
12025       if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12026        /* Include any { following the alpha to emphasize
12027        * that it could be part of an escape at some point
12028        * in the future */
12029        int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12030        ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12031       }
12032       goto normal_default;
12033      } /* End of switch on '\' */
12034      break;
12035     case '{':
12036      /* Currently we don't warn when the lbrace is at the start
12037      * of a construct.  This catches it in the middle of a
12038      * literal string, or when its the first thing after
12039      * something like "\b" */
12040      if (! SIZE_ONLY
12041       && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12042      {
12043       ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12044      }
12045      /*FALLTHROUGH*/
12046     default:    /* A literal character */
12047     normal_default:
12048      if (UTF8_IS_START(*p) && UTF) {
12049       STRLEN numlen;
12050       ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12051            &numlen, UTF8_ALLOW_DEFAULT);
12052       p += numlen;
12053      }
12054      else
12055       ender = (U8) *p++;
12056      break;
12057     } /* End of switch on the literal */
12058
12059     /* Here, have looked at the literal character and <ender>
12060     * contains its ordinal, <p> points to the character after it
12061     */
12062
12063     if ( RExC_flags & RXf_PMf_EXTENDED)
12064      p = regpatws(pRExC_state, p,
12065           TRUE); /* means recognize comments */
12066
12067     /* If the next thing is a quantifier, it applies to this
12068     * character only, which means that this character has to be in
12069     * its own node and can't just be appended to the string in an
12070     * existing node, so if there are already other characters in
12071     * the node, close the node with just them, and set up to do
12072     * this character again next time through, when it will be the
12073     * only thing in its new node */
12074     if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12075     {
12076      p = oldp;
12077      goto loopdone;
12078     }
12079
12080     if (! FOLD   /* The simple case, just append the literal */
12081      || (LOC  /* Also don't fold for tricky chars under /l */
12082       && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12083     {
12084      if (UTF) {
12085       const STRLEN unilen = reguni(pRExC_state, ender, s);
12086       if (unilen > 0) {
12087       s   += unilen;
12088       len += unilen;
12089       }
12090
12091       /* The loop increments <len> each time, as all but this
12092       * path (and one other) through it add a single byte to
12093       * the EXACTish node.  But this one has changed len to
12094       * be the correct final value, so subtract one to
12095       * cancel out the increment that follows */
12096       len--;
12097      }
12098      else {
12099       REGC((char)ender, s++);
12100      }
12101
12102      /* Can get here if folding only if is one of the /l
12103      * characters whose fold depends on the locale.  The
12104      * occurrence of any of these indicate that we can't
12105      * simplify things */
12106      if (FOLD) {
12107       maybe_exact = FALSE;
12108       maybe_exactfu = FALSE;
12109      }
12110     }
12111     else             /* FOLD */
12112      if (! ( UTF
12113       /* See comments for join_exact() as to why we fold this
12114       * non-UTF at compile time */
12115       || (node_type == EXACTFU
12116        && ender == LATIN_SMALL_LETTER_SHARP_S)))
12117     {
12118      /* Here, are folding and are not UTF-8 encoded; therefore
12119      * the character must be in the range 0-255, and is not /l
12120      * (Not /l because we already handled these under /l in
12121      * is_PROBLEMATIC_LOCALE_FOLD_cp */
12122      if (IS_IN_SOME_FOLD_L1(ender)) {
12123       maybe_exact = FALSE;
12124
12125       /* See if the character's fold differs between /d and
12126       * /u.  This includes the multi-char fold SHARP S to
12127       * 'ss' */
12128       if (maybe_exactfu
12129        && (PL_fold[ender] != PL_fold_latin1[ender]
12130         || ender == LATIN_SMALL_LETTER_SHARP_S
12131         || (len > 0
12132         && isARG2_lower_or_UPPER_ARG1('s', ender)
12133         && isARG2_lower_or_UPPER_ARG1('s',
12134                 *(s-1)))))
12135       {
12136        maybe_exactfu = FALSE;
12137       }
12138      }
12139
12140      /* Even when folding, we store just the input character, as
12141      * we have an array that finds its fold quickly */
12142      *(s++) = (char) ender;
12143     }
12144     else {  /* FOLD and UTF */
12145      /* Unlike the non-fold case, we do actually have to
12146      * calculate the results here in pass 1.  This is for two
12147      * reasons, the folded length may be longer than the
12148      * unfolded, and we have to calculate how many EXACTish
12149      * nodes it will take; and we may run out of room in a node
12150      * in the middle of a potential multi-char fold, and have
12151      * to back off accordingly.  (Hence we can't use REGC for
12152      * the simple case just below.) */
12153
12154      UV folded;
12155      if (isASCII(ender)) {
12156       folded = toFOLD(ender);
12157       *(s)++ = (U8) folded;
12158      }
12159      else {
12160       STRLEN foldlen;
12161
12162       folded = _to_uni_fold_flags(
12163          ender,
12164          (U8 *) s,
12165          &foldlen,
12166          FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12167               ? FOLD_FLAGS_NOMIX_ASCII
12168               : 0));
12169       s += foldlen;
12170
12171       /* The loop increments <len> each time, as all but this
12172       * path (and one other) through it add a single byte to
12173       * the EXACTish node.  But this one has changed len to
12174       * be the correct final value, so subtract one to
12175       * cancel out the increment that follows */
12176       len += foldlen - 1;
12177      }
12178      /* If this node only contains non-folding code points so
12179      * far, see if this new one is also non-folding */
12180      if (maybe_exact) {
12181       if (folded != ender) {
12182        maybe_exact = FALSE;
12183       }
12184       else {
12185        /* Here the fold is the original; we have to check
12186        * further to see if anything folds to it */
12187        if (_invlist_contains_cp(PL_utf8_foldable,
12188               ender))
12189        {
12190         maybe_exact = FALSE;
12191        }
12192       }
12193      }
12194      ender = folded;
12195     }
12196
12197     if (next_is_quantifier) {
12198
12199      /* Here, the next input is a quantifier, and to get here,
12200      * the current character is the only one in the node.
12201      * Also, here <len> doesn't include the final byte for this
12202      * character */
12203      len++;
12204      goto loopdone;
12205     }
12206
12207    } /* End of loop through literal characters */
12208
12209    /* Here we have either exhausted the input or ran out of room in
12210    * the node.  (If we encountered a character that can't be in the
12211    * node, transfer is made directly to <loopdone>, and so we
12212    * wouldn't have fallen off the end of the loop.)  In the latter
12213    * case, we artificially have to split the node into two, because
12214    * we just don't have enough space to hold everything.  This
12215    * creates a problem if the final character participates in a
12216    * multi-character fold in the non-final position, as a match that
12217    * should have occurred won't, due to the way nodes are matched,
12218    * and our artificial boundary.  So back off until we find a non-
12219    * problematic character -- one that isn't at the beginning or
12220    * middle of such a fold.  (Either it doesn't participate in any
12221    * folds, or appears only in the final position of all the folds it
12222    * does participate in.)  A better solution with far fewer false
12223    * positives, and that would fill the nodes more completely, would
12224    * be to actually have available all the multi-character folds to
12225    * test against, and to back-off only far enough to be sure that
12226    * this node isn't ending with a partial one.  <upper_parse> is set
12227    * further below (if we need to reparse the node) to include just
12228    * up through that final non-problematic character that this code
12229    * identifies, so when it is set to less than the full node, we can
12230    * skip the rest of this */
12231    if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12232
12233     const STRLEN full_len = len;
12234
12235     assert(len >= MAX_NODE_STRING_SIZE);
12236
12237     /* Here, <s> points to the final byte of the final character.
12238     * Look backwards through the string until find a non-
12239     * problematic character */
12240
12241     if (! UTF) {
12242
12243      /* This has no multi-char folds to non-UTF characters */
12244      if (ASCII_FOLD_RESTRICTED) {
12245       goto loopdone;
12246      }
12247
12248      while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12249      len = s - s0 + 1;
12250     }
12251     else {
12252      if (!  PL_NonL1NonFinalFold) {
12253       PL_NonL1NonFinalFold = _new_invlist_C_array(
12254           NonL1_Perl_Non_Final_Folds_invlist);
12255      }
12256
12257      /* Point to the first byte of the final character */
12258      s = (char *) utf8_hop((U8 *) s, -1);
12259
12260      while (s >= s0) {   /* Search backwards until find
12261           non-problematic char */
12262       if (UTF8_IS_INVARIANT(*s)) {
12263
12264        /* There are no ascii characters that participate
12265        * in multi-char folds under /aa.  In EBCDIC, the
12266        * non-ascii invariants are all control characters,
12267        * so don't ever participate in any folds. */
12268        if (ASCII_FOLD_RESTRICTED
12269         || ! IS_NON_FINAL_FOLD(*s))
12270        {
12271         break;
12272        }
12273       }
12274       else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12275        if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12276                 *s, *(s+1))))
12277        {
12278         break;
12279        }
12280       }
12281       else if (! _invlist_contains_cp(
12282           PL_NonL1NonFinalFold,
12283           valid_utf8_to_uvchr((U8 *) s, NULL)))
12284       {
12285        break;
12286       }
12287
12288       /* Here, the current character is problematic in that
12289       * it does occur in the non-final position of some
12290       * fold, so try the character before it, but have to
12291       * special case the very first byte in the string, so
12292       * we don't read outside the string */
12293       s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12294      } /* End of loop backwards through the string */
12295
12296      /* If there were only problematic characters in the string,
12297      * <s> will point to before s0, in which case the length
12298      * should be 0, otherwise include the length of the
12299      * non-problematic character just found */
12300      len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12301     }
12302
12303     /* Here, have found the final character, if any, that is
12304     * non-problematic as far as ending the node without splitting
12305     * it across a potential multi-char fold.  <len> contains the
12306     * number of bytes in the node up-to and including that
12307     * character, or is 0 if there is no such character, meaning
12308     * the whole node contains only problematic characters.  In
12309     * this case, give up and just take the node as-is.  We can't
12310     * do any better */
12311     if (len == 0) {
12312      len = full_len;
12313
12314      /* If the node ends in an 's' we make sure it stays EXACTF,
12315      * as if it turns into an EXACTFU, it could later get
12316      * joined with another 's' that would then wrongly match
12317      * the sharp s */
12318      if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
12319      {
12320       maybe_exactfu = FALSE;
12321      }
12322     } else {
12323
12324      /* Here, the node does contain some characters that aren't
12325      * problematic.  If one such is the final character in the
12326      * node, we are done */
12327      if (len == full_len) {
12328       goto loopdone;
12329      }
12330      else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12331
12332       /* If the final character is problematic, but the
12333       * penultimate is not, back-off that last character to
12334       * later start a new node with it */
12335       p = oldp;
12336       goto loopdone;
12337      }
12338
12339      /* Here, the final non-problematic character is earlier
12340      * in the input than the penultimate character.  What we do
12341      * is reparse from the beginning, going up only as far as
12342      * this final ok one, thus guaranteeing that the node ends
12343      * in an acceptable character.  The reason we reparse is
12344      * that we know how far in the character is, but we don't
12345      * know how to correlate its position with the input parse.
12346      * An alternate implementation would be to build that
12347      * correlation as we go along during the original parse,
12348      * but that would entail extra work for every node, whereas
12349      * this code gets executed only when the string is too
12350      * large for the node, and the final two characters are
12351      * problematic, an infrequent occurrence.  Yet another
12352      * possible strategy would be to save the tail of the
12353      * string, and the next time regatom is called, initialize
12354      * with that.  The problem with this is that unless you
12355      * back off one more character, you won't be guaranteed
12356      * regatom will get called again, unless regbranch,
12357      * regpiece ... are also changed.  If you do back off that
12358      * extra character, so that there is input guaranteed to
12359      * force calling regatom, you can't handle the case where
12360      * just the first character in the node is acceptable.  I
12361      * (khw) decided to try this method which doesn't have that
12362      * pitfall; if performance issues are found, we can do a
12363      * combination of the current approach plus that one */
12364      upper_parse = len;
12365      len = 0;
12366      s = s0;
12367      goto reparse;
12368     }
12369    }   /* End of verifying node ends with an appropriate char */
12370
12371   loopdone:   /* Jumped to when encounters something that shouldn't be in
12372      the node */
12373
12374    /* I (khw) don't know if you can get here with zero length, but the
12375    * old code handled this situation by creating a zero-length EXACT
12376    * node.  Might as well be NOTHING instead */
12377    if (len == 0) {
12378     OP(ret) = NOTHING;
12379    }
12380    else {
12381     if (FOLD) {
12382      /* If 'maybe_exact' is still set here, means there are no
12383      * code points in the node that participate in folds;
12384      * similarly for 'maybe_exactfu' and code points that match
12385      * differently depending on UTF8ness of the target string
12386      * (for /u), or depending on locale for /l */
12387      if (maybe_exact) {
12388       OP(ret) = EXACT;
12389      }
12390      else if (maybe_exactfu) {
12391       OP(ret) = EXACTFU;
12392      }
12393     }
12394     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12395           FALSE /* Don't look to see if could
12396              be turned into an EXACT
12397              node, as we have already
12398              computed that */
12399           );
12400    }
12401
12402    RExC_parse = p - 1;
12403    Set_Node_Cur_Length(ret, parse_start);
12404    nextchar(pRExC_state);
12405    {
12406     /* len is STRLEN which is unsigned, need to copy to signed */
12407     IV iv = len;
12408     if (iv < 0)
12409      vFAIL("Internal disaster");
12410    }
12411
12412   } /* End of label 'defchar:' */
12413   break;
12414  } /* End of giant switch on input character */
12415
12416  return(ret);
12417 }
12418
12419 STATIC char *
12420 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12421 {
12422  /* Returns the next non-pattern-white space, non-comment character (the
12423  * latter only if 'recognize_comment is true) in the string p, which is
12424  * ended by RExC_end.  See also reg_skipcomment */
12425  const char *e = RExC_end;
12426
12427  PERL_ARGS_ASSERT_REGPATWS;
12428
12429  while (p < e) {
12430   STRLEN len;
12431   if ((len = is_PATWS_safe(p, e, UTF))) {
12432    p += len;
12433   }
12434   else if (recognize_comment && *p == '#') {
12435    p = reg_skipcomment(pRExC_state, p);
12436   }
12437   else
12438    break;
12439  }
12440  return p;
12441 }
12442
12443 STATIC void
12444 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12445 {
12446  /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12447  * sets up the bitmap and any flags, removing those code points from the
12448  * inversion list, setting it to NULL should it become completely empty */
12449
12450  PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12451  assert(PL_regkind[OP(node)] == ANYOF);
12452
12453  ANYOF_BITMAP_ZERO(node);
12454  if (*invlist_ptr) {
12455
12456   /* This gets set if we actually need to modify things */
12457   bool change_invlist = FALSE;
12458
12459   UV start, end;
12460
12461   /* Start looking through *invlist_ptr */
12462   invlist_iterinit(*invlist_ptr);
12463   while (invlist_iternext(*invlist_ptr, &start, &end)) {
12464    UV high;
12465    int i;
12466
12467    if (end == UV_MAX && start <= 256) {
12468     ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12469    }
12470    else if (end >= 256) {
12471     ANYOF_FLAGS(node) |= ANYOF_UTF8;
12472    }
12473
12474    /* Quit if are above what we should change */
12475    if (start > 255) {
12476     break;
12477    }
12478
12479    change_invlist = TRUE;
12480
12481    /* Set all the bits in the range, up to the max that we are doing */
12482    high = (end < 255) ? end : 255;
12483    for (i = start; i <= (int) high; i++) {
12484     if (! ANYOF_BITMAP_TEST(node, i)) {
12485      ANYOF_BITMAP_SET(node, i);
12486     }
12487    }
12488   }
12489   invlist_iterfinish(*invlist_ptr);
12490
12491   /* Done with loop; remove any code points that are in the bitmap from
12492   * *invlist_ptr; similarly for code points above latin1 if we have a
12493   * flag to match all of them anyways */
12494   if (change_invlist) {
12495    _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12496   }
12497   if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12498    _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12499   }
12500
12501   /* If have completely emptied it, remove it completely */
12502   if (_invlist_len(*invlist_ptr) == 0) {
12503    SvREFCNT_dec_NN(*invlist_ptr);
12504    *invlist_ptr = NULL;
12505   }
12506  }
12507 }
12508
12509 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12510    Character classes ([:foo:]) can also be negated ([:^foo:]).
12511    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12512    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12513    but trigger failures because they are currently unimplemented. */
12514
12515 #define POSIXCC_DONE(c)   ((c) == ':')
12516 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12517 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12518
12519 PERL_STATIC_INLINE I32
12520 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12521 {
12522  I32 namedclass = OOB_NAMEDCLASS;
12523
12524  PERL_ARGS_ASSERT_REGPPOSIXCC;
12525
12526  if (value == '[' && RExC_parse + 1 < RExC_end &&
12527   /* I smell either [: or [= or [. -- POSIX has been here, right? */
12528   POSIXCC(UCHARAT(RExC_parse)))
12529  {
12530   const char c = UCHARAT(RExC_parse);
12531   char* const s = RExC_parse++;
12532
12533   while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12534    RExC_parse++;
12535   if (RExC_parse == RExC_end) {
12536    if (strict) {
12537
12538     /* Try to give a better location for the error (than the end of
12539     * the string) by looking for the matching ']' */
12540     RExC_parse = s;
12541     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12542      RExC_parse++;
12543     }
12544     vFAIL2("Unmatched '%c' in POSIX class", c);
12545    }
12546    /* Grandfather lone [:, [=, [. */
12547    RExC_parse = s;
12548   }
12549   else {
12550    const char* const t = RExC_parse++; /* skip over the c */
12551    assert(*t == c);
12552
12553    if (UCHARAT(RExC_parse) == ']') {
12554     const char *posixcc = s + 1;
12555     RExC_parse++; /* skip over the ending ] */
12556
12557     if (*s == ':') {
12558      const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12559      const I32 skip = t - posixcc;
12560
12561      /* Initially switch on the length of the name.  */
12562      switch (skip) {
12563      case 4:
12564       if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12565               this is the Perl \w
12566               */
12567        namedclass = ANYOF_WORDCHAR;
12568       break;
12569      case 5:
12570       /* Names all of length 5.  */
12571       /* alnum alpha ascii blank cntrl digit graph lower
12572       print punct space upper  */
12573       /* Offset 4 gives the best switch position.  */
12574       switch (posixcc[4]) {
12575       case 'a':
12576        if (memEQ(posixcc, "alph", 4)) /* alpha */
12577         namedclass = ANYOF_ALPHA;
12578        break;
12579       case 'e':
12580        if (memEQ(posixcc, "spac", 4)) /* space */
12581         namedclass = ANYOF_PSXSPC;
12582        break;
12583       case 'h':
12584        if (memEQ(posixcc, "grap", 4)) /* graph */
12585         namedclass = ANYOF_GRAPH;
12586        break;
12587       case 'i':
12588        if (memEQ(posixcc, "asci", 4)) /* ascii */
12589         namedclass = ANYOF_ASCII;
12590        break;
12591       case 'k':
12592        if (memEQ(posixcc, "blan", 4)) /* blank */
12593         namedclass = ANYOF_BLANK;
12594        break;
12595       case 'l':
12596        if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12597         namedclass = ANYOF_CNTRL;
12598        break;
12599       case 'm':
12600        if (memEQ(posixcc, "alnu", 4)) /* alnum */
12601         namedclass = ANYOF_ALPHANUMERIC;
12602        break;
12603       case 'r':
12604        if (memEQ(posixcc, "lowe", 4)) /* lower */
12605         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12606        else if (memEQ(posixcc, "uppe", 4)) /* upper */
12607         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12608        break;
12609       case 't':
12610        if (memEQ(posixcc, "digi", 4)) /* digit */
12611         namedclass = ANYOF_DIGIT;
12612        else if (memEQ(posixcc, "prin", 4)) /* print */
12613         namedclass = ANYOF_PRINT;
12614        else if (memEQ(posixcc, "punc", 4)) /* punct */
12615         namedclass = ANYOF_PUNCT;
12616        break;
12617       }
12618       break;
12619      case 6:
12620       if (memEQ(posixcc, "xdigit", 6))
12621        namedclass = ANYOF_XDIGIT;
12622       break;
12623      }
12624
12625      if (namedclass == OOB_NAMEDCLASS)
12626       vFAIL2utf8f(
12627        "POSIX class [:%"UTF8f":] unknown",
12628        UTF8fARG(UTF, t - s - 1, s + 1));
12629
12630      /* The #defines are structured so each complement is +1 to
12631      * the normal one */
12632      if (complement) {
12633       namedclass++;
12634      }
12635      assert (posixcc[skip] == ':');
12636      assert (posixcc[skip+1] == ']');
12637     } else if (!SIZE_ONLY) {
12638      /* [[=foo=]] and [[.foo.]] are still future. */
12639
12640      /* adjust RExC_parse so the warning shows after
12641      the class closes */
12642      while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12643       RExC_parse++;
12644      vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12645     }
12646    } else {
12647     /* Maternal grandfather:
12648     * "[:" ending in ":" but not in ":]" */
12649     if (strict) {
12650      vFAIL("Unmatched '[' in POSIX class");
12651     }
12652
12653     /* Grandfather lone [:, [=, [. */
12654     RExC_parse = s;
12655    }
12656   }
12657  }
12658
12659  return namedclass;
12660 }
12661
12662 STATIC bool
12663 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
12664 {
12665  /* This applies some heuristics at the current parse position (which should
12666  * be at a '[') to see if what follows might be intended to be a [:posix:]
12667  * class.  It returns true if it really is a posix class, of course, but it
12668  * also can return true if it thinks that what was intended was a posix
12669  * class that didn't quite make it.
12670  *
12671  * It will return true for
12672  *      [:alphanumerics:
12673  *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
12674  *                         ')' indicating the end of the (?[
12675  *      [:any garbage including %^&$ punctuation:]
12676  *
12677  * This is designed to be called only from S_handle_regex_sets; it could be
12678  * easily adapted to be called from the spot at the beginning of regclass()
12679  * that checks to see in a normal bracketed class if the surrounding []
12680  * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
12681  * change long-standing behavior, so I (khw) didn't do that */
12682  char* p = RExC_parse + 1;
12683  char first_char = *p;
12684
12685  PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12686
12687  assert(*(p - 1) == '[');
12688
12689  if (! POSIXCC(first_char)) {
12690   return FALSE;
12691  }
12692
12693  p++;
12694  while (p < RExC_end && isWORDCHAR(*p)) p++;
12695
12696  if (p >= RExC_end) {
12697   return FALSE;
12698  }
12699
12700  if (p - RExC_parse > 2    /* Got at least 1 word character */
12701   && (*p == first_char
12702    || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12703  {
12704   return TRUE;
12705  }
12706
12707  p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12708
12709  return (p
12710    && p - RExC_parse > 2 /* [:] evaluates to colon;
12711          [::] is a bad posix class. */
12712    && first_char == *(p - 1));
12713 }
12714
12715 STATIC regnode *
12716 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12717      I32 *flagp, U32 depth,
12718      char * const oregcomp_parse)
12719 {
12720  /* Handle the (?[...]) construct to do set operations */
12721
12722  U8 curchar;
12723  UV start, end; /* End points of code point ranges */
12724  SV* result_string;
12725  char *save_end, *save_parse;
12726  SV* final;
12727  STRLEN len;
12728  regnode* node;
12729  AV* stack;
12730  const bool save_fold = FOLD;
12731
12732  GET_RE_DEBUG_FLAGS_DECL;
12733
12734  PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12735
12736  if (LOC) {
12737   vFAIL("(?[...]) not valid in locale");
12738  }
12739  RExC_uni_semantics = 1;
12740
12741  /* This will return only an ANYOF regnode, or (unlikely) something smaller
12742  * (such as EXACT).  Thus we can skip most everything if just sizing.  We
12743  * call regclass to handle '[]' so as to not have to reinvent its parsing
12744  * rules here (throwing away the size it computes each time).  And, we exit
12745  * upon an unescaped ']' that isn't one ending a regclass.  To do both
12746  * these things, we need to realize that something preceded by a backslash
12747  * is escaped, so we have to keep track of backslashes */
12748  if (SIZE_ONLY) {
12749   UV depth = 0; /* how many nested (?[...]) constructs */
12750
12751   Perl_ck_warner_d(aTHX_
12752    packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12753    "The regex_sets feature is experimental" REPORT_LOCATION,
12754     UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12755     UTF8fARG(UTF,
12756       RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12757       RExC_precomp + (RExC_parse - RExC_precomp)));
12758
12759   while (RExC_parse < RExC_end) {
12760    SV* current = NULL;
12761    RExC_parse = regpatws(pRExC_state, RExC_parse,
12762           TRUE); /* means recognize comments */
12763    switch (*RExC_parse) {
12764     case '?':
12765      if (RExC_parse[1] == '[') depth++, RExC_parse++;
12766      /* FALLTHROUGH */
12767     default:
12768      break;
12769     case '\\':
12770      /* Skip the next byte (which could cause us to end up in
12771      * the middle of a UTF-8 character, but since none of those
12772      * are confusable with anything we currently handle in this
12773      * switch (invariants all), it's safe.  We'll just hit the
12774      * default: case next time and keep on incrementing until
12775      * we find one of the invariants we do handle. */
12776      RExC_parse++;
12777      break;
12778     case '[':
12779     {
12780      /* If this looks like it is a [:posix:] class, leave the
12781      * parse pointer at the '[' to fool regclass() into
12782      * thinking it is part of a '[[:posix:]]'.  That function
12783      * will use strict checking to force a syntax error if it
12784      * doesn't work out to a legitimate class */
12785      bool is_posix_class
12786          = could_it_be_a_POSIX_class(pRExC_state);
12787      if (! is_posix_class) {
12788       RExC_parse++;
12789      }
12790
12791      /* regclass() can only return RESTART_UTF8 if multi-char
12792      folds are allowed.  */
12793      if (!regclass(pRExC_state, flagp,depth+1,
12794         is_posix_class, /* parse the whole char
12795              class only if not a
12796              posix class */
12797         FALSE, /* don't allow multi-char folds */
12798         TRUE, /* silence non-portable warnings. */
12799         &current))
12800       FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12801        (UV) *flagp);
12802
12803      /* function call leaves parse pointing to the ']', except
12804      * if we faked it */
12805      if (is_posix_class) {
12806       RExC_parse--;
12807      }
12808
12809      SvREFCNT_dec(current);   /* In case it returned something */
12810      break;
12811     }
12812
12813     case ']':
12814      if (depth--) break;
12815      RExC_parse++;
12816      if (RExC_parse < RExC_end
12817       && *RExC_parse == ')')
12818      {
12819       node = reganode(pRExC_state, ANYOF, 0);
12820       RExC_size += ANYOF_SKIP;
12821       nextchar(pRExC_state);
12822       Set_Node_Length(node,
12823         RExC_parse - oregcomp_parse + 1); /* MJD */
12824       return node;
12825      }
12826      goto no_close;
12827    }
12828    RExC_parse++;
12829   }
12830
12831   no_close:
12832   FAIL("Syntax error in (?[...])");
12833  }
12834
12835  /* Pass 2 only after this.  Everything in this construct is a
12836  * metacharacter.  Operands begin with either a '\' (for an escape
12837  * sequence), or a '[' for a bracketed character class.  Any other
12838  * character should be an operator, or parenthesis for grouping.  Both
12839  * types of operands are handled by calling regclass() to parse them.  It
12840  * is called with a parameter to indicate to return the computed inversion
12841  * list.  The parsing here is implemented via a stack.  Each entry on the
12842  * stack is a single character representing one of the operators, or the
12843  * '('; or else a pointer to an operand inversion list. */
12844
12845 #define IS_OPERAND(a)  (! SvIOK(a))
12846
12847  /* The stack starts empty.  It is a syntax error if the first thing parsed
12848  * is a binary operator; everything else is pushed on the stack.  When an
12849  * operand is parsed, the top of the stack is examined.  If it is a binary
12850  * operator, the item before it should be an operand, and both are replaced
12851  * by the result of doing that operation on the new operand and the one on
12852  * the stack.   Thus a sequence of binary operands is reduced to a single
12853  * one before the next one is parsed.
12854  *
12855  * A unary operator may immediately follow a binary in the input, for
12856  * example
12857  *      [a] + ! [b]
12858  * When an operand is parsed and the top of the stack is a unary operator,
12859  * the operation is performed, and then the stack is rechecked to see if
12860  * this new operand is part of a binary operation; if so, it is handled as
12861  * above.
12862  *
12863  * A '(' is simply pushed on the stack; it is valid only if the stack is
12864  * empty, or the top element of the stack is an operator or another '('
12865  * (for which the parenthesized expression will become an operand).  By the
12866  * time the corresponding ')' is parsed everything in between should have
12867  * been parsed and evaluated to a single operand (or else is a syntax
12868  * error), and is handled as a regular operand */
12869
12870  sv_2mortal((SV *)(stack = newAV()));
12871
12872  while (RExC_parse < RExC_end) {
12873   I32 top_index = av_tindex(stack);
12874   SV** top_ptr;
12875   SV* current = NULL;
12876
12877   /* Skip white space */
12878   RExC_parse = regpatws(pRExC_state, RExC_parse,
12879           TRUE /* means recognize comments */ );
12880   if (RExC_parse >= RExC_end) {
12881    Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12882   }
12883   if ((curchar = UCHARAT(RExC_parse)) == ']') {
12884    break;
12885   }
12886
12887   switch (curchar) {
12888
12889    case '?':
12890     if (av_tindex(stack) >= 0   /* This makes sure that we can
12891            safely subtract 1 from
12892            RExC_parse in the next clause.
12893            If we have something on the
12894            stack, we have parsed something
12895            */
12896      && UCHARAT(RExC_parse - 1) == '('
12897      && RExC_parse < RExC_end)
12898     {
12899      /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12900      * This happens when we have some thing like
12901      *
12902      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12903      *   ...
12904      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
12905      *
12906      * Here we would be handling the interpolated
12907      * '$thai_or_lao'.  We handle this by a recursive call to
12908      * ourselves which returns the inversion list the
12909      * interpolated expression evaluates to.  We use the flags
12910      * from the interpolated pattern. */
12911      U32 save_flags = RExC_flags;
12912      const char * const save_parse = ++RExC_parse;
12913
12914      parse_lparen_question_flags(pRExC_state);
12915
12916      if (RExC_parse == save_parse  /* Makes sure there was at
12917              least one flag (or this
12918              embedding wasn't compiled)
12919             */
12920       || RExC_parse >= RExC_end - 4
12921       || UCHARAT(RExC_parse) != ':'
12922       || UCHARAT(++RExC_parse) != '('
12923       || UCHARAT(++RExC_parse) != '?'
12924       || UCHARAT(++RExC_parse) != '[')
12925      {
12926
12927       /* In combination with the above, this moves the
12928       * pointer to the point just after the first erroneous
12929       * character (or if there are no flags, to where they
12930       * should have been) */
12931       if (RExC_parse >= RExC_end - 4) {
12932        RExC_parse = RExC_end;
12933       }
12934       else if (RExC_parse != save_parse) {
12935        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12936       }
12937       vFAIL("Expecting '(?flags:(?[...'");
12938      }
12939      RExC_parse++;
12940      (void) handle_regex_sets(pRExC_state, &current, flagp,
12941              depth+1, oregcomp_parse);
12942
12943      /* Here, 'current' contains the embedded expression's
12944      * inversion list, and RExC_parse points to the trailing
12945      * ']'; the next character should be the ')' which will be
12946      * paired with the '(' that has been put on the stack, so
12947      * the whole embedded expression reduces to '(operand)' */
12948      RExC_parse++;
12949
12950      RExC_flags = save_flags;
12951      goto handle_operand;
12952     }
12953     /* FALLTHROUGH */
12954
12955    default:
12956     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12957     vFAIL("Unexpected character");
12958
12959    case '\\':
12960     /* regclass() can only return RESTART_UTF8 if multi-char
12961     folds are allowed.  */
12962     if (!regclass(pRExC_state, flagp,depth+1,
12963        TRUE, /* means parse just the next thing */
12964        FALSE, /* don't allow multi-char folds */
12965        FALSE, /* don't silence non-portable warnings.  */
12966        &current))
12967      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12968       (UV) *flagp);
12969     /* regclass() will return with parsing just the \ sequence,
12970     * leaving the parse pointer at the next thing to parse */
12971     RExC_parse--;
12972     goto handle_operand;
12973
12974    case '[':   /* Is a bracketed character class */
12975    {
12976     bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12977
12978     if (! is_posix_class) {
12979      RExC_parse++;
12980     }
12981
12982     /* regclass() can only return RESTART_UTF8 if multi-char
12983     folds are allowed.  */
12984     if(!regclass(pRExC_state, flagp,depth+1,
12985        is_posix_class, /* parse the whole char class
12986             only if not a posix class */
12987        FALSE, /* don't allow multi-char folds */
12988        FALSE, /* don't silence non-portable warnings.  */
12989        &current))
12990      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12991       (UV) *flagp);
12992     /* function call leaves parse pointing to the ']', except if we
12993     * faked it */
12994     if (is_posix_class) {
12995      RExC_parse--;
12996     }
12997
12998     goto handle_operand;
12999    }
13000
13001    case '&':
13002    case '|':
13003    case '+':
13004    case '-':
13005    case '^':
13006     if (top_index < 0
13007      || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13008      || ! IS_OPERAND(*top_ptr))
13009     {
13010      RExC_parse++;
13011      vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
13012     }
13013     av_push(stack, newSVuv(curchar));
13014     break;
13015
13016    case '!':
13017     av_push(stack, newSVuv(curchar));
13018     break;
13019
13020    case '(':
13021     if (top_index >= 0) {
13022      top_ptr = av_fetch(stack, top_index, FALSE);
13023      assert(top_ptr);
13024      if (IS_OPERAND(*top_ptr)) {
13025       RExC_parse++;
13026       vFAIL("Unexpected '(' with no preceding operator");
13027      }
13028     }
13029     av_push(stack, newSVuv(curchar));
13030     break;
13031
13032    case ')':
13033    {
13034     SV* lparen;
13035     if (top_index < 1
13036      || ! (current = av_pop(stack))
13037      || ! IS_OPERAND(current)
13038      || ! (lparen = av_pop(stack))
13039      || IS_OPERAND(lparen)
13040      || SvUV(lparen) != '(')
13041     {
13042      SvREFCNT_dec(current);
13043      RExC_parse++;
13044      vFAIL("Unexpected ')'");
13045     }
13046     top_index -= 2;
13047     SvREFCNT_dec_NN(lparen);
13048
13049     /* FALLTHROUGH */
13050    }
13051
13052    handle_operand:
13053
13054     /* Here, we have an operand to process, in 'current' */
13055
13056     if (top_index < 0) {    /* Just push if stack is empty */
13057      av_push(stack, current);
13058     }
13059     else {
13060      SV* top = av_pop(stack);
13061      SV *prev = NULL;
13062      char current_operator;
13063
13064      if (IS_OPERAND(top)) {
13065       SvREFCNT_dec_NN(top);
13066       SvREFCNT_dec_NN(current);
13067       vFAIL("Operand with no preceding operator");
13068      }
13069      current_operator = (char) SvUV(top);
13070      switch (current_operator) {
13071       case '(':   /* Push the '(' back on followed by the new
13072          operand */
13073        av_push(stack, top);
13074        av_push(stack, current);
13075        SvREFCNT_inc(top);  /* Counters the '_dec' done
13076             just after the 'break', so
13077             it doesn't get wrongly freed
13078             */
13079        break;
13080
13081       case '!':
13082        _invlist_invert(current);
13083
13084        /* Unlike binary operators, the top of the stack,
13085        * now that this unary one has been popped off, may
13086        * legally be an operator, and we now have operand
13087        * for it. */
13088        top_index--;
13089        SvREFCNT_dec_NN(top);
13090        goto handle_operand;
13091
13092       case '&':
13093        prev = av_pop(stack);
13094        _invlist_intersection(prev,
13095             current,
13096             &current);
13097        av_push(stack, current);
13098        break;
13099
13100       case '|':
13101       case '+':
13102        prev = av_pop(stack);
13103        _invlist_union(prev, current, &current);
13104        av_push(stack, current);
13105        break;
13106
13107       case '-':
13108        prev = av_pop(stack);;
13109        _invlist_subtract(prev, current, &current);
13110        av_push(stack, current);
13111        break;
13112
13113       case '^':   /* The union minus the intersection */
13114       {
13115        SV* i = NULL;
13116        SV* u = NULL;
13117        SV* element;
13118
13119        prev = av_pop(stack);
13120        _invlist_union(prev, current, &u);
13121        _invlist_intersection(prev, current, &i);
13122        /* _invlist_subtract will overwrite current
13123         without freeing what it already contains */
13124        element = current;
13125        _invlist_subtract(u, i, &current);
13126        av_push(stack, current);
13127        SvREFCNT_dec_NN(i);
13128        SvREFCNT_dec_NN(u);
13129        SvREFCNT_dec_NN(element);
13130        break;
13131       }
13132
13133       default:
13134        Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13135     }
13136     SvREFCNT_dec_NN(top);
13137     SvREFCNT_dec(prev);
13138    }
13139   }
13140
13141   RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13142  }
13143
13144  if (av_tindex(stack) < 0   /* Was empty */
13145   || ((final = av_pop(stack)) == NULL)
13146   || ! IS_OPERAND(final)
13147   || av_tindex(stack) >= 0)  /* More left on stack */
13148  {
13149   vFAIL("Incomplete expression within '(?[ ])'");
13150  }
13151
13152  /* Here, 'final' is the resultant inversion list from evaluating the
13153  * expression.  Return it if so requested */
13154  if (return_invlist) {
13155   *return_invlist = final;
13156   return END;
13157  }
13158
13159  /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13160  * expecting a string of ranges and individual code points */
13161  invlist_iterinit(final);
13162  result_string = newSVpvs("");
13163  while (invlist_iternext(final, &start, &end)) {
13164   if (start == end) {
13165    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13166   }
13167   else {
13168    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13169              start,          end);
13170   }
13171  }
13172
13173  save_parse = RExC_parse;
13174  RExC_parse = SvPV(result_string, len);
13175  save_end = RExC_end;
13176  RExC_end = RExC_parse + len;
13177
13178  /* We turn off folding around the call, as the class we have constructed
13179  * already has all folding taken into consideration, and we don't want
13180  * regclass() to add to that */
13181  RExC_flags &= ~RXf_PMf_FOLD;
13182  /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13183  */
13184  node = regclass(pRExC_state, flagp,depth+1,
13185      FALSE, /* means parse the whole char class */
13186      FALSE, /* don't allow multi-char folds */
13187      TRUE, /* silence non-portable warnings.  The above may very
13188        well have generated non-portable code points, but
13189        they're valid on this machine */
13190      NULL);
13191  if (!node)
13192   FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13193      PTR2UV(flagp));
13194  if (save_fold) {
13195   RExC_flags |= RXf_PMf_FOLD;
13196  }
13197  RExC_parse = save_parse + 1;
13198  RExC_end = save_end;
13199  SvREFCNT_dec_NN(final);
13200  SvREFCNT_dec_NN(result_string);
13201
13202  nextchar(pRExC_state);
13203  Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13204  return node;
13205 }
13206 #undef IS_OPERAND
13207
13208 STATIC void
13209 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13210 {
13211  /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13212  * innocent-looking character class, like /[ks]/i won't have to go out to
13213  * disk to find the possible matches.
13214  *
13215  * This should be called only for a Latin1-range code points, cp, which is
13216  * known to be involved in a simple fold with other code points above
13217  * Latin1.  It would give false results if /aa has been specified.
13218  * Multi-char folds are outside the scope of this, and must be handled
13219  * specially.
13220  *
13221  * XXX It would be better to generate these via regen, in case a new
13222  * version of the Unicode standard adds new mappings, though that is not
13223  * really likely, and may be caught by the default: case of the switch
13224  * below. */
13225
13226  PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13227
13228  assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
13229
13230  switch (cp) {
13231   case 'k':
13232   case 'K':
13233   *invlist =
13234    add_cp_to_invlist(*invlist, KELVIN_SIGN);
13235    break;
13236   case 's':
13237   case 'S':
13238   *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13239    break;
13240   case MICRO_SIGN:
13241   *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13242   *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13243    break;
13244   case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13245   case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13246   *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13247    break;
13248   case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13249   *invlist = add_cp_to_invlist(*invlist,
13250           LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13251    break;
13252   case LATIN_SMALL_LETTER_SHARP_S:
13253   *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13254    break;
13255   default:
13256    /* Use deprecated warning to increase the chances of this being
13257    * output */
13258    ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13259    break;
13260  }
13261 }
13262
13263 /* The names of properties whose definitions are not known at compile time are
13264  * stored in this SV, after a constant heading.  So if the length has been
13265  * changed since initialization, then there is a run-time definition. */
13266 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13267           (SvCUR(listsv) != initial_listsv_len)
13268
13269 STATIC regnode *
13270 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13271     const bool stop_at_1,  /* Just parse the next thing, don't
13272           look for a full character class */
13273     bool allow_multi_folds,
13274     const bool silence_non_portable,   /* Don't output warnings
13275              about too large
13276              characters */
13277     SV** ret_invlist)  /* Return an inversion list, not a node */
13278 {
13279  /* parse a bracketed class specification.  Most of these will produce an
13280  * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13281  * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13282  * under /i with multi-character folds: it will be rewritten following the
13283  * paradigm of this example, where the <multi-fold>s are characters which
13284  * fold to multiple character sequences:
13285  *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13286  * gets effectively rewritten as:
13287  *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13288  * reg() gets called (recursively) on the rewritten version, and this
13289  * function will return what it constructs.  (Actually the <multi-fold>s
13290  * aren't physically removed from the [abcdefghi], it's just that they are
13291  * ignored in the recursion by means of a flag:
13292  * <RExC_in_multi_char_class>.)
13293  *
13294  * ANYOF nodes contain a bit map for the first 256 characters, with the
13295  * corresponding bit set if that character is in the list.  For characters
13296  * above 255, a range list or swash is used.  There are extra bits for \w,
13297  * etc. in locale ANYOFs, as what these match is not determinable at
13298  * compile time
13299  *
13300  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13301  * to be restarted.  This can only happen if ret_invlist is non-NULL.
13302  */
13303
13304  UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13305  IV range = 0;
13306  UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13307  regnode *ret;
13308  STRLEN numlen;
13309  IV namedclass = OOB_NAMEDCLASS;
13310  char *rangebegin = NULL;
13311  bool need_class = 0;
13312  SV *listsv = NULL;
13313  STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13314          than just initialized.  */
13315  SV* properties = NULL;    /* Code points that match \p{} \P{} */
13316  SV* posixes = NULL;     /* Code points that match classes like [:word:],
13317        extended beyond the Latin1 range.  These have to
13318        be kept separate from other code points for much
13319        of this function because their handling  is
13320        different under /i, and for most classes under
13321        /d as well */
13322  SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13323        separate for a while from the non-complemented
13324        versions because of complications with /d
13325        matching */
13326  UV element_count = 0;   /* Number of distinct elements in the class.
13327        Optimizations may be possible if this is tiny */
13328  AV * multi_char_matches = NULL; /* Code points that fold to more than one
13329          character; used under /i */
13330  UV n;
13331  char * stop_ptr = RExC_end;    /* where to stop parsing */
13332  const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13333             space? */
13334  const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13335
13336  /* Unicode properties are stored in a swash; this holds the current one
13337  * being parsed.  If this swash is the only above-latin1 component of the
13338  * character class, an optimization is to pass it directly on to the
13339  * execution engine.  Otherwise, it is set to NULL to indicate that there
13340  * are other things in the class that have to be dealt with at execution
13341  * time */
13342  SV* swash = NULL;  /* Code points that match \p{} \P{} */
13343
13344  /* Set if a component of this character class is user-defined; just passed
13345  * on to the engine */
13346  bool has_user_defined_property = FALSE;
13347
13348  /* inversion list of code points this node matches only when the target
13349  * string is in UTF-8.  (Because is under /d) */
13350  SV* depends_list = NULL;
13351
13352  /* Inversion list of code points this node matches regardless of things
13353  * like locale, folding, utf8ness of the target string */
13354  SV* cp_list = NULL;
13355
13356  /* Like cp_list, but code points on this list need to be checked for things
13357  * that fold to/from them under /i */
13358  SV* cp_foldable_list = NULL;
13359
13360  /* Like cp_list, but code points on this list are valid only when the
13361  * runtime locale is UTF-8 */
13362  SV* only_utf8_locale_list = NULL;
13363
13364 #ifdef EBCDIC
13365  /* In a range, counts how many 0-2 of the ends of it came from literals,
13366  * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13367  UV literal_endpoint = 0;
13368 #endif
13369  bool invert = FALSE;    /* Is this class to be complemented */
13370
13371  bool warn_super = ALWAYS_WARN_SUPER;
13372
13373  regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13374   case we need to change the emitted regop to an EXACT. */
13375  const char * orig_parse = RExC_parse;
13376  const SSize_t orig_size = RExC_size;
13377  bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13378  GET_RE_DEBUG_FLAGS_DECL;
13379
13380  PERL_ARGS_ASSERT_REGCLASS;
13381 #ifndef DEBUGGING
13382  PERL_UNUSED_ARG(depth);
13383 #endif
13384
13385  DEBUG_PARSE("clas");
13386
13387  /* Assume we are going to generate an ANYOF node. */
13388  ret = reganode(pRExC_state, ANYOF, 0);
13389
13390  if (SIZE_ONLY) {
13391   RExC_size += ANYOF_SKIP;
13392   listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13393  }
13394  else {
13395   ANYOF_FLAGS(ret) = 0;
13396
13397   RExC_emit += ANYOF_SKIP;
13398   listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13399   initial_listsv_len = SvCUR(listsv);
13400   SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13401  }
13402
13403  if (skip_white) {
13404   RExC_parse = regpatws(pRExC_state, RExC_parse,
13405        FALSE /* means don't recognize comments */ );
13406  }
13407
13408  if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
13409   RExC_parse++;
13410   invert = TRUE;
13411   allow_multi_folds = FALSE;
13412   RExC_naughty++;
13413   if (skip_white) {
13414    RExC_parse = regpatws(pRExC_state, RExC_parse,
13415         FALSE /* means don't recognize comments */ );
13416   }
13417  }
13418
13419  /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13420  if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13421   const char *s = RExC_parse;
13422   const char  c = *s++;
13423
13424   while (isWORDCHAR(*s))
13425    s++;
13426   if (*s && c == *s && s[1] == ']') {
13427    SAVEFREESV(RExC_rx_sv);
13428    ckWARN3reg(s+2,
13429      "POSIX syntax [%c %c] belongs inside character classes",
13430      c, c);
13431    (void)ReREFCNT_inc(RExC_rx_sv);
13432   }
13433  }
13434
13435  /* If the caller wants us to just parse a single element, accomplish this
13436  * by faking the loop ending condition */
13437  if (stop_at_1 && RExC_end > RExC_parse) {
13438   stop_ptr = RExC_parse + 1;
13439  }
13440
13441  /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13442  if (UCHARAT(RExC_parse) == ']')
13443   goto charclassloop;
13444
13445 parseit:
13446  while (1) {
13447   if  (RExC_parse >= stop_ptr) {
13448    break;
13449   }
13450
13451   if (skip_white) {
13452    RExC_parse = regpatws(pRExC_state, RExC_parse,
13453         FALSE /* means don't recognize comments */ );
13454   }
13455
13456   if  (UCHARAT(RExC_parse) == ']') {
13457    break;
13458   }
13459
13460  charclassloop:
13461
13462   namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13463   save_value = value;
13464   save_prevvalue = prevvalue;
13465
13466   if (!range) {
13467    rangebegin = RExC_parse;
13468    element_count++;
13469   }
13470   if (UTF) {
13471    value = utf8n_to_uvchr((U8*)RExC_parse,
13472         RExC_end - RExC_parse,
13473         &numlen, UTF8_ALLOW_DEFAULT);
13474    RExC_parse += numlen;
13475   }
13476   else
13477    value = UCHARAT(RExC_parse++);
13478
13479   if (value == '['
13480    && RExC_parse < RExC_end
13481    && POSIXCC(UCHARAT(RExC_parse)))
13482   {
13483    namedclass = regpposixcc(pRExC_state, value, strict);
13484   }
13485   else if (value == '\\') {
13486    if (UTF) {
13487     value = utf8n_to_uvchr((U8*)RExC_parse,
13488         RExC_end - RExC_parse,
13489         &numlen, UTF8_ALLOW_DEFAULT);
13490     RExC_parse += numlen;
13491    }
13492    else
13493     value = UCHARAT(RExC_parse++);
13494
13495    /* Some compilers cannot handle switching on 64-bit integer
13496    * values, therefore value cannot be an UV.  Yes, this will
13497    * be a problem later if we want switch on Unicode.
13498    * A similar issue a little bit later when switching on
13499    * namedclass. --jhi */
13500
13501    /* If the \ is escaping white space when white space is being
13502    * skipped, it means that that white space is wanted literally, and
13503    * is already in 'value'.  Otherwise, need to translate the escape
13504    * into what it signifies. */
13505    if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13506
13507    case 'w': namedclass = ANYOF_WORDCHAR; break;
13508    case 'W': namedclass = ANYOF_NWORDCHAR; break;
13509    case 's': namedclass = ANYOF_SPACE; break;
13510    case 'S': namedclass = ANYOF_NSPACE; break;
13511    case 'd': namedclass = ANYOF_DIGIT; break;
13512    case 'D': namedclass = ANYOF_NDIGIT; break;
13513    case 'v': namedclass = ANYOF_VERTWS; break;
13514    case 'V': namedclass = ANYOF_NVERTWS; break;
13515    case 'h': namedclass = ANYOF_HORIZWS; break;
13516    case 'H': namedclass = ANYOF_NHORIZWS; break;
13517    case 'N':  /* Handle \N{NAME} in class */
13518     {
13519      /* We only pay attention to the first char of
13520      multichar strings being returned. I kinda wonder
13521      if this makes sense as it does change the behaviour
13522      from earlier versions, OTOH that behaviour was broken
13523      as well. */
13524      if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13525          TRUE, /* => charclass */
13526          strict))
13527      {
13528       if (*flagp & RESTART_UTF8)
13529        FAIL("panic: grok_bslash_N set RESTART_UTF8");
13530       goto parseit;
13531      }
13532     }
13533     break;
13534    case 'p':
13535    case 'P':
13536     {
13537     char *e;
13538
13539     /* We will handle any undefined properties ourselves */
13540     U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13541          /* And we actually would prefer to get
13542           * the straight inversion list of the
13543           * swash, since we will be accessing it
13544           * anyway, to save a little time */
13545          |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13546
13547     if (RExC_parse >= RExC_end)
13548      vFAIL2("Empty \\%c{}", (U8)value);
13549     if (*RExC_parse == '{') {
13550      const U8 c = (U8)value;
13551      e = strchr(RExC_parse++, '}');
13552      if (!e)
13553       vFAIL2("Missing right brace on \\%c{}", c);
13554      while (isSPACE(*RExC_parse))
13555       RExC_parse++;
13556      if (e == RExC_parse)
13557       vFAIL2("Empty \\%c{}", c);
13558      n = e - RExC_parse;
13559      while (isSPACE(*(RExC_parse + n - 1)))
13560       n--;
13561     }
13562     else {
13563      e = RExC_parse;
13564      n = 1;
13565     }
13566     if (!SIZE_ONLY) {
13567      SV* invlist;
13568      char* name;
13569
13570      if (UCHARAT(RExC_parse) == '^') {
13571       RExC_parse++;
13572       n--;
13573       /* toggle.  (The rhs xor gets the single bit that
13574       * differs between P and p; the other xor inverts just
13575       * that bit) */
13576       value ^= 'P' ^ 'p';
13577
13578       while (isSPACE(*RExC_parse)) {
13579        RExC_parse++;
13580        n--;
13581       }
13582      }
13583      /* Try to get the definition of the property into
13584      * <invlist>.  If /i is in effect, the effective property
13585      * will have its name be <__NAME_i>.  The design is
13586      * discussed in commit
13587      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13588      name = savepv(Perl_form(aTHX_
13589           "%s%.*s%s\n",
13590           (FOLD) ? "__" : "",
13591           (int)n,
13592           RExC_parse,
13593           (FOLD) ? "_i" : ""
13594         ));
13595
13596      /* Look up the property name, and get its swash and
13597      * inversion list, if the property is found  */
13598      if (swash) {
13599       SvREFCNT_dec_NN(swash);
13600      }
13601      swash = _core_swash_init("utf8", name, &PL_sv_undef,
13602            1, /* binary */
13603            0, /* not tr/// */
13604            NULL, /* No inversion list */
13605            &swash_init_flags
13606            );
13607      if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13608       HV* curpkg = (IN_PERL_COMPILETIME)
13609          ? PL_curstash
13610          : CopSTASH(PL_curcop);
13611       if (swash) {
13612        SvREFCNT_dec_NN(swash);
13613        swash = NULL;
13614       }
13615
13616       /* Here didn't find it.  It could be a user-defined
13617       * property that will be available at run-time.  If we
13618       * accept only compile-time properties, is an error;
13619       * otherwise add it to the list for run-time look up */
13620       if (ret_invlist) {
13621        RExC_parse = e + 1;
13622        vFAIL2utf8f(
13623         "Property '%"UTF8f"' is unknown",
13624         UTF8fARG(UTF, n, name));
13625       }
13626
13627       /* If the property name doesn't already have a package
13628       * name, add the current one to it so that it can be
13629       * referred to outside it. [perl #121777] */
13630       if (curpkg && ! instr(name, "::")) {
13631        char* pkgname = HvNAME(curpkg);
13632        if (strNE(pkgname, "main")) {
13633         char* full_name = Perl_form(aTHX_
13634                "%s::%s",
13635                pkgname,
13636                name);
13637         n = strlen(full_name);
13638         Safefree(name);
13639         name = savepvn(full_name, n);
13640        }
13641       }
13642       Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13643           (value == 'p' ? '+' : '!'),
13644           UTF8fARG(UTF, n, name));
13645       has_user_defined_property = TRUE;
13646
13647       /* We don't know yet, so have to assume that the
13648       * property could match something in the Latin1 range,
13649       * hence something that isn't utf8.  Note that this
13650       * would cause things in <depends_list> to match
13651       * inappropriately, except that any \p{}, including
13652       * this one forces Unicode semantics, which means there
13653       * is no <depends_list> */
13654       ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13655      }
13656      else {
13657
13658       /* Here, did get the swash and its inversion list.  If
13659       * the swash is from a user-defined property, then this
13660       * whole character class should be regarded as such */
13661       if (swash_init_flags
13662        & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13663       {
13664        has_user_defined_property = TRUE;
13665       }
13666       else if
13667        /* We warn on matching an above-Unicode code point
13668        * if the match would return true, except don't
13669        * warn for \p{All}, which has exactly one element
13670        * = 0 */
13671        (_invlist_contains_cp(invlist, 0x110000)
13672         && (! (_invlist_len(invlist) == 1
13673          && *invlist_array(invlist) == 0)))
13674       {
13675        warn_super = TRUE;
13676       }
13677
13678
13679       /* Invert if asking for the complement */
13680       if (value == 'P') {
13681        _invlist_union_complement_2nd(properties,
13682               invlist,
13683               &properties);
13684
13685        /* The swash can't be used as-is, because we've
13686        * inverted things; delay removing it to here after
13687        * have copied its invlist above */
13688        SvREFCNT_dec_NN(swash);
13689        swash = NULL;
13690       }
13691       else {
13692        _invlist_union(properties, invlist, &properties);
13693       }
13694      }
13695      Safefree(name);
13696     }
13697     RExC_parse = e + 1;
13698     namedclass = ANYOF_UNIPROP;  /* no official name, but it's
13699             named */
13700
13701     /* \p means they want Unicode semantics */
13702     RExC_uni_semantics = 1;
13703     }
13704     break;
13705    case 'n': value = '\n';   break;
13706    case 'r': value = '\r';   break;
13707    case 't': value = '\t';   break;
13708    case 'f': value = '\f';   break;
13709    case 'b': value = '\b';   break;
13710    case 'e': value = ASCII_TO_NATIVE('\033');break;
13711    case 'a': value = '\a';                   break;
13712    case 'o':
13713     RExC_parse--; /* function expects to be pointed at the 'o' */
13714     {
13715      const char* error_msg;
13716      bool valid = grok_bslash_o(&RExC_parse,
13717            &value,
13718            &error_msg,
13719            SIZE_ONLY,   /* warnings in pass
13720                1 only */
13721            strict,
13722            silence_non_portable,
13723            UTF);
13724      if (! valid) {
13725       vFAIL(error_msg);
13726      }
13727     }
13728     if (PL_encoding && value < 0x100) {
13729      goto recode_encoding;
13730     }
13731     break;
13732    case 'x':
13733     RExC_parse--; /* function expects to be pointed at the 'x' */
13734     {
13735      const char* error_msg;
13736      bool valid = grok_bslash_x(&RExC_parse,
13737            &value,
13738            &error_msg,
13739            TRUE, /* Output warnings */
13740            strict,
13741            silence_non_portable,
13742            UTF);
13743      if (! valid) {
13744       vFAIL(error_msg);
13745      }
13746     }
13747     if (PL_encoding && value < 0x100)
13748      goto recode_encoding;
13749     break;
13750    case 'c':
13751     value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
13752     break;
13753    case '0': case '1': case '2': case '3': case '4':
13754    case '5': case '6': case '7':
13755     {
13756      /* Take 1-3 octal digits */
13757      I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13758      numlen = (strict) ? 4 : 3;
13759      value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13760      RExC_parse += numlen;
13761      if (numlen != 3) {
13762       if (strict) {
13763        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13764        vFAIL("Need exactly 3 octal digits");
13765       }
13766       else if (! SIZE_ONLY /* like \08, \178 */
13767         && numlen < 3
13768         && RExC_parse < RExC_end
13769         && isDIGIT(*RExC_parse)
13770         && ckWARN(WARN_REGEXP))
13771       {
13772        SAVEFREESV(RExC_rx_sv);
13773        reg_warn_non_literal_string(
13774         RExC_parse + 1,
13775         form_short_octal_warning(RExC_parse, numlen));
13776        (void)ReREFCNT_inc(RExC_rx_sv);
13777       }
13778      }
13779      if (PL_encoding && value < 0x100)
13780       goto recode_encoding;
13781      break;
13782     }
13783    recode_encoding:
13784     if (! RExC_override_recoding) {
13785      SV* enc = PL_encoding;
13786      value = reg_recode((const char)(U8)value, &enc);
13787      if (!enc) {
13788       if (strict) {
13789        vFAIL("Invalid escape in the specified encoding");
13790       }
13791       else if (SIZE_ONLY) {
13792        ckWARNreg(RExC_parse,
13793         "Invalid escape in the specified encoding");
13794       }
13795      }
13796      break;
13797     }
13798    default:
13799     /* Allow \_ to not give an error */
13800     if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13801      if (strict) {
13802       vFAIL2("Unrecognized escape \\%c in character class",
13803        (int)value);
13804      }
13805      else {
13806       SAVEFREESV(RExC_rx_sv);
13807       ckWARN2reg(RExC_parse,
13808        "Unrecognized escape \\%c in character class passed through",
13809        (int)value);
13810       (void)ReREFCNT_inc(RExC_rx_sv);
13811      }
13812     }
13813     break;
13814    }   /* End of switch on char following backslash */
13815   } /* end of handling backslash escape sequences */
13816 #ifdef EBCDIC
13817   else
13818    literal_endpoint++;
13819 #endif
13820
13821   /* Here, we have the current token in 'value' */
13822
13823   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13824    U8 classnum;
13825
13826    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
13827    * literal, as is the character that began the false range, i.e.
13828    * the 'a' in the examples */
13829    if (range) {
13830     if (!SIZE_ONLY) {
13831      const int w = (RExC_parse >= rangebegin)
13832         ? RExC_parse - rangebegin
13833         : 0;
13834      if (strict) {
13835       vFAIL2utf8f(
13836        "False [] range \"%"UTF8f"\"",
13837        UTF8fARG(UTF, w, rangebegin));
13838      }
13839      else {
13840       SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13841       ckWARN2reg(RExC_parse,
13842        "False [] range \"%"UTF8f"\"",
13843        UTF8fARG(UTF, w, rangebegin));
13844       (void)ReREFCNT_inc(RExC_rx_sv);
13845       cp_list = add_cp_to_invlist(cp_list, '-');
13846       cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
13847                prevvalue);
13848      }
13849     }
13850
13851     range = 0; /* this was not a true range */
13852     element_count += 2; /* So counts for three values */
13853    }
13854
13855    classnum = namedclass_to_classnum(namedclass);
13856
13857    if (LOC && namedclass < ANYOF_POSIXL_MAX
13858 #ifndef HAS_ISASCII
13859     && classnum != _CC_ASCII
13860 #endif
13861    ) {
13862     /* What the Posix classes (like \w, [:space:]) match in locale
13863     * isn't knowable under locale until actual match time.  Room
13864     * must be reserved (one time per outer bracketed class) to
13865     * store such classes.  The space will contain a bit for each
13866     * named class that is to be matched against.  This isn't
13867     * needed for \p{} and pseudo-classes, as they are not affected
13868     * by locale, and hence are dealt with separately */
13869     if (! need_class) {
13870      need_class = 1;
13871      if (SIZE_ONLY) {
13872       RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13873      }
13874      else {
13875       RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13876      }
13877      ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13878      ANYOF_POSIXL_ZERO(ret);
13879     }
13880
13881     /* Coverity thinks it is possible for this to be negative; both
13882     * jhi and khw think it's not, but be safer */
13883     assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13884      || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
13885
13886     /* See if it already matches the complement of this POSIX
13887     * class */
13888     if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13889      && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13890                ? -1
13891                : 1)))
13892     {
13893      posixl_matches_all = TRUE;
13894      break;  /* No need to continue.  Since it matches both
13895        e.g., \w and \W, it matches everything, and the
13896        bracketed class can be optimized into qr/./s */
13897     }
13898
13899     /* Add this class to those that should be checked at runtime */
13900     ANYOF_POSIXL_SET(ret, namedclass);
13901
13902     /* The above-Latin1 characters are not subject to locale rules.
13903     * Just add them, in the second pass, to the
13904     * unconditionally-matched list */
13905     if (! SIZE_ONLY) {
13906      SV* scratch_list = NULL;
13907
13908      /* Get the list of the above-Latin1 code points this
13909      * matches */
13910      _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
13911           PL_XPosix_ptrs[classnum],
13912
13913           /* Odd numbers are complements, like
13914           * NDIGIT, NASCII, ... */
13915           namedclass % 2 != 0,
13916           &scratch_list);
13917      /* Checking if 'cp_list' is NULL first saves an extra
13918      * clone.  Its reference count will be decremented at the
13919      * next union, etc, or if this is the only instance, at the
13920      * end of the routine */
13921      if (! cp_list) {
13922       cp_list = scratch_list;
13923      }
13924      else {
13925       _invlist_union(cp_list, scratch_list, &cp_list);
13926       SvREFCNT_dec_NN(scratch_list);
13927      }
13928      continue;   /* Go get next character */
13929     }
13930    }
13931    else if (! SIZE_ONLY) {
13932
13933     /* Here, not in pass1 (in that pass we skip calculating the
13934     * contents of this class), and is /l, or is a POSIX class for
13935     * which /l doesn't matter (or is a Unicode property, which is
13936     * skipped here). */
13937     if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
13938      if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13939
13940       /* Here, should be \h, \H, \v, or \V.  None of /d, /i
13941       * nor /l make a difference in what these match,
13942       * therefore we just add what they match to cp_list. */
13943       if (classnum != _CC_VERTSPACE) {
13944        assert(   namedclass == ANYOF_HORIZWS
13945         || namedclass == ANYOF_NHORIZWS);
13946
13947        /* It turns out that \h is just a synonym for
13948        * XPosixBlank */
13949        classnum = _CC_BLANK;
13950       }
13951
13952       _invlist_union_maybe_complement_2nd(
13953         cp_list,
13954         PL_XPosix_ptrs[classnum],
13955         namedclass % 2 != 0,    /* Complement if odd
13956               (NHORIZWS, NVERTWS)
13957               */
13958         &cp_list);
13959      }
13960     }
13961     else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
13962       complement and use nposixes */
13963      SV** posixes_ptr = namedclass % 2 == 0
13964          ? &posixes
13965          : &nposixes;
13966      SV** source_ptr = &PL_XPosix_ptrs[classnum];
13967      _invlist_union_maybe_complement_2nd(
13968              *posixes_ptr,
13969              *source_ptr,
13970              namedclass % 2 != 0,
13971              posixes_ptr);
13972     }
13973     continue;   /* Go get next character */
13974    }
13975   } /* end of namedclass \blah */
13976
13977   /* Here, we have a single value.  If 'range' is set, it is the ending
13978   * of a range--check its validity.  Later, we will handle each
13979   * individual code point in the range.  If 'range' isn't set, this
13980   * could be the beginning of a range, so check for that by looking
13981   * ahead to see if the next real character to be processed is the range
13982   * indicator--the minus sign */
13983
13984   if (skip_white) {
13985    RExC_parse = regpatws(pRExC_state, RExC_parse,
13986         FALSE /* means don't recognize comments */ );
13987   }
13988
13989   if (range) {
13990    if (prevvalue > value) /* b-a */ {
13991     const int w = RExC_parse - rangebegin;
13992     vFAIL2utf8f(
13993      "Invalid [] range \"%"UTF8f"\"",
13994      UTF8fARG(UTF, w, rangebegin));
13995     range = 0; /* not a valid range */
13996    }
13997   }
13998   else {
13999    prevvalue = value; /* save the beginning of the potential range */
14000    if (! stop_at_1     /* Can't be a range if parsing just one thing */
14001     && *RExC_parse == '-')
14002    {
14003     char* next_char_ptr = RExC_parse + 1;
14004     if (skip_white) {   /* Get the next real char after the '-' */
14005      next_char_ptr = regpatws(pRExC_state,
14006            RExC_parse + 1,
14007            FALSE); /* means don't recognize
14008               comments */
14009     }
14010
14011     /* If the '-' is at the end of the class (just before the ']',
14012     * it is a literal minus; otherwise it is a range */
14013     if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14014      RExC_parse = next_char_ptr;
14015
14016      /* a bad range like \w-, [:word:]- ? */
14017      if (namedclass > OOB_NAMEDCLASS) {
14018       if (strict || ckWARN(WARN_REGEXP)) {
14019        const int w =
14020         RExC_parse >= rangebegin ?
14021         RExC_parse - rangebegin : 0;
14022        if (strict) {
14023         vFAIL4("False [] range \"%*.*s\"",
14024          w, w, rangebegin);
14025        }
14026        else {
14027         vWARN4(RExC_parse,
14028          "False [] range \"%*.*s\"",
14029          w, w, rangebegin);
14030        }
14031       }
14032       if (!SIZE_ONLY) {
14033        cp_list = add_cp_to_invlist(cp_list, '-');
14034       }
14035       element_count++;
14036      } else
14037       range = 1; /* yeah, it's a range! */
14038      continue; /* but do it the next time */
14039     }
14040    }
14041   }
14042
14043   /* Here, <prevvalue> is the beginning of the range, if any; or <value>
14044   * if not */
14045
14046   /* non-Latin1 code point implies unicode semantics.  Must be set in
14047   * pass1 so is there for the whole of pass 2 */
14048   if (value > 255) {
14049    RExC_uni_semantics = 1;
14050   }
14051
14052   /* Ready to process either the single value, or the completed range.
14053   * For single-valued non-inverted ranges, we consider the possibility
14054   * of multi-char folds.  (We made a conscious decision to not do this
14055   * for the other cases because it can often lead to non-intuitive
14056   * results.  For example, you have the peculiar case that:
14057   *  "s s" =~ /^[^\xDF]+$/i => Y
14058   *  "ss"  =~ /^[^\xDF]+$/i => N
14059   *
14060   * See [perl #89750] */
14061   if (FOLD && allow_multi_folds && value == prevvalue) {
14062    if (value == LATIN_SMALL_LETTER_SHARP_S
14063     || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14064               value)))
14065    {
14066     /* Here <value> is indeed a multi-char fold.  Get what it is */
14067
14068     U8 foldbuf[UTF8_MAXBYTES_CASE];
14069     STRLEN foldlen;
14070
14071     UV folded = _to_uni_fold_flags(
14072         value,
14073         foldbuf,
14074         &foldlen,
14075         FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14076             ? FOLD_FLAGS_NOMIX_ASCII
14077             : 0)
14078         );
14079
14080     /* Here, <folded> should be the first character of the
14081     * multi-char fold of <value>, with <foldbuf> containing the
14082     * whole thing.  But, if this fold is not allowed (because of
14083     * the flags), <fold> will be the same as <value>, and should
14084     * be processed like any other character, so skip the special
14085     * handling */
14086     if (folded != value) {
14087
14088      /* Skip if we are recursed, currently parsing the class
14089      * again.  Otherwise add this character to the list of
14090      * multi-char folds. */
14091      if (! RExC_in_multi_char_class) {
14092       AV** this_array_ptr;
14093       AV* this_array;
14094       STRLEN cp_count = utf8_length(foldbuf,
14095              foldbuf + foldlen);
14096       SV* multi_fold = sv_2mortal(newSVpvs(""));
14097
14098       Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14099
14100
14101       if (! multi_char_matches) {
14102        multi_char_matches = newAV();
14103       }
14104
14105       /* <multi_char_matches> is actually an array of arrays.
14106       * There will be one or two top-level elements: [2],
14107       * and/or [3].  The [2] element is an array, each
14108       * element thereof is a character which folds to TWO
14109       * characters; [3] is for folds to THREE characters.
14110       * (Unicode guarantees a maximum of 3 characters in any
14111       * fold.)  When we rewrite the character class below,
14112       * we will do so such that the longest folds are
14113       * written first, so that it prefers the longest
14114       * matching strings first.  This is done even if it
14115       * turns out that any quantifier is non-greedy, out of
14116       * programmer laziness.  Tom Christiansen has agreed
14117       * that this is ok.  This makes the test for the
14118       * ligature 'ffi' come before the test for 'ff' */
14119       if (av_exists(multi_char_matches, cp_count)) {
14120        this_array_ptr = (AV**) av_fetch(multi_char_matches,
14121                cp_count, FALSE);
14122        this_array = *this_array_ptr;
14123       }
14124       else {
14125        this_array = newAV();
14126        av_store(multi_char_matches, cp_count,
14127          (SV*) this_array);
14128       }
14129       av_push(this_array, multi_fold);
14130      }
14131
14132      /* This element should not be processed further in this
14133      * class */
14134      element_count--;
14135      value = save_value;
14136      prevvalue = save_prevvalue;
14137      continue;
14138     }
14139    }
14140   }
14141
14142   /* Deal with this element of the class */
14143   if (! SIZE_ONLY) {
14144 #ifndef EBCDIC
14145    cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14146              prevvalue, value);
14147 #else
14148    SV* this_range = _new_invlist(1);
14149    _append_range_to_invlist(this_range, prevvalue, value);
14150
14151    /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14152    * If this range was specified using something like 'i-j', we want
14153    * to include only the 'i' and the 'j', and not anything in
14154    * between, so exclude non-ASCII, non-alphabetics from it.
14155    * However, if the range was specified with something like
14156    * [\x89-\x91] or [\x89-j], all code points within it should be
14157    * included.  literal_endpoint==2 means both ends of the range used
14158    * a literal character, not \x{foo} */
14159    if (literal_endpoint == 2
14160     && ((prevvalue >= 'a' && value <= 'z')
14161      || (prevvalue >= 'A' && value <= 'Z')))
14162    {
14163     _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
14164          &this_range);
14165
14166     /* Since this above only contains ascii, the intersection of it
14167     * with anything will still yield only ascii */
14168     _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14169          &this_range);
14170    }
14171    _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14172    literal_endpoint = 0;
14173 #endif
14174   }
14175
14176   range = 0; /* this range (if it was one) is done now */
14177  } /* End of loop through all the text within the brackets */
14178
14179  /* If anything in the class expands to more than one character, we have to
14180  * deal with them by building up a substitute parse string, and recursively
14181  * calling reg() on it, instead of proceeding */
14182  if (multi_char_matches) {
14183   SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14184   I32 cp_count;
14185   STRLEN len;
14186   char *save_end = RExC_end;
14187   char *save_parse = RExC_parse;
14188   bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14189          a "|" */
14190   I32 reg_flags;
14191
14192   assert(! invert);
14193 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14194   because too confusing */
14195   if (invert) {
14196    sv_catpv(substitute_parse, "(?:");
14197   }
14198 #endif
14199
14200   /* Look at the longest folds first */
14201   for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14202
14203    if (av_exists(multi_char_matches, cp_count)) {
14204     AV** this_array_ptr;
14205     SV* this_sequence;
14206
14207     this_array_ptr = (AV**) av_fetch(multi_char_matches,
14208             cp_count, FALSE);
14209     while ((this_sequence = av_pop(*this_array_ptr)) !=
14210                 &PL_sv_undef)
14211     {
14212      if (! first_time) {
14213       sv_catpv(substitute_parse, "|");
14214      }
14215      first_time = FALSE;
14216
14217      sv_catpv(substitute_parse, SvPVX(this_sequence));
14218     }
14219    }
14220   }
14221
14222   /* If the character class contains anything else besides these
14223   * multi-character folds, have to include it in recursive parsing */
14224   if (element_count) {
14225    sv_catpv(substitute_parse, "|[");
14226    sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14227    sv_catpv(substitute_parse, "]");
14228   }
14229
14230   sv_catpv(substitute_parse, ")");
14231 #if 0
14232   if (invert) {
14233    /* This is a way to get the parse to skip forward a whole named
14234    * sequence instead of matching the 2nd character when it fails the
14235    * first */
14236    sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14237   }
14238 #endif
14239
14240   RExC_parse = SvPV(substitute_parse, len);
14241   RExC_end = RExC_parse + len;
14242   RExC_in_multi_char_class = 1;
14243   RExC_emit = (regnode *)orig_emit;
14244
14245   ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14246
14247   *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14248
14249   RExC_parse = save_parse;
14250   RExC_end = save_end;
14251   RExC_in_multi_char_class = 0;
14252   SvREFCNT_dec_NN(multi_char_matches);
14253   return ret;
14254  }
14255
14256  /* Here, we've gone through the entire class and dealt with multi-char
14257  * folds.  We are now in a position that we can do some checks to see if we
14258  * can optimize this ANYOF node into a simpler one, even in Pass 1.
14259  * Currently we only do two checks:
14260  * 1) is in the unlikely event that the user has specified both, eg. \w and
14261  *    \W under /l, then the class matches everything.  (This optimization
14262  *    is done only to make the optimizer code run later work.)
14263  * 2) if the character class contains only a single element (including a
14264  *    single range), we see if there is an equivalent node for it.
14265  * Other checks are possible */
14266  if (! ret_invlist   /* Can't optimize if returning the constructed
14267       inversion list */
14268   && (UNLIKELY(posixl_matches_all) || element_count == 1))
14269  {
14270   U8 op = END;
14271   U8 arg = 0;
14272
14273   if (UNLIKELY(posixl_matches_all)) {
14274    op = SANY;
14275   }
14276   else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14277             \w or [:digit:] or \p{foo}
14278             */
14279
14280    /* All named classes are mapped into POSIXish nodes, with its FLAG
14281    * argument giving which class it is */
14282    switch ((I32)namedclass) {
14283     case ANYOF_UNIPROP:
14284      break;
14285
14286     /* These don't depend on the charset modifiers.  They always
14287     * match under /u rules */
14288     case ANYOF_NHORIZWS:
14289     case ANYOF_HORIZWS:
14290      namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14291      /* FALLTHROUGH */
14292
14293     case ANYOF_NVERTWS:
14294     case ANYOF_VERTWS:
14295      op = POSIXU;
14296      goto join_posix;
14297
14298     /* The actual POSIXish node for all the rest depends on the
14299     * charset modifier.  The ones in the first set depend only on
14300     * ASCII or, if available on this platform, locale */
14301     case ANYOF_ASCII:
14302     case ANYOF_NASCII:
14303 #ifdef HAS_ISASCII
14304      op = (LOC) ? POSIXL : POSIXA;
14305 #else
14306      op = POSIXA;
14307 #endif
14308      goto join_posix;
14309
14310     case ANYOF_NCASED:
14311     case ANYOF_LOWER:
14312     case ANYOF_NLOWER:
14313     case ANYOF_UPPER:
14314     case ANYOF_NUPPER:
14315      /* under /a could be alpha */
14316      if (FOLD) {
14317       if (ASCII_RESTRICTED) {
14318        namedclass = ANYOF_ALPHA + (namedclass % 2);
14319       }
14320       else if (! LOC) {
14321        break;
14322       }
14323      }
14324      /* FALLTHROUGH */
14325
14326     /* The rest have more possibilities depending on the charset.
14327     * We take advantage of the enum ordering of the charset
14328     * modifiers to get the exact node type, */
14329     default:
14330      op = POSIXD + get_regex_charset(RExC_flags);
14331      if (op > POSIXA) { /* /aa is same as /a */
14332       op = POSIXA;
14333      }
14334
14335     join_posix:
14336      /* The odd numbered ones are the complements of the
14337      * next-lower even number one */
14338      if (namedclass % 2 == 1) {
14339       invert = ! invert;
14340       namedclass--;
14341      }
14342      arg = namedclass_to_classnum(namedclass);
14343      break;
14344    }
14345   }
14346   else if (value == prevvalue) {
14347
14348    /* Here, the class consists of just a single code point */
14349
14350    if (invert) {
14351     if (! LOC && value == '\n') {
14352      op = REG_ANY; /* Optimize [^\n] */
14353      *flagp |= HASWIDTH|SIMPLE;
14354      RExC_naughty++;
14355     }
14356    }
14357    else if (value < 256 || UTF) {
14358
14359     /* Optimize a single value into an EXACTish node, but not if it
14360     * would require converting the pattern to UTF-8. */
14361     op = compute_EXACTish(pRExC_state);
14362    }
14363   } /* Otherwise is a range */
14364   else if (! LOC) {   /* locale could vary these */
14365    if (prevvalue == '0') {
14366     if (value == '9') {
14367      arg = _CC_DIGIT;
14368      op = POSIXA;
14369     }
14370    }
14371    else if (prevvalue == 'A') {
14372     if (value == 'Z'
14373 #ifdef EBCDIC
14374      && literal_endpoint == 2
14375 #endif
14376     ) {
14377      arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14378      op = POSIXA;
14379     }
14380    }
14381    else if (prevvalue == 'a') {
14382     if (value == 'z'
14383 #ifdef EBCDIC
14384      && literal_endpoint == 2
14385 #endif
14386     ) {
14387      arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14388      op = POSIXA;
14389     }
14390    }
14391   }
14392
14393   /* Here, we have changed <op> away from its initial value iff we found
14394   * an optimization */
14395   if (op != END) {
14396
14397    /* Throw away this ANYOF regnode, and emit the calculated one,
14398    * which should correspond to the beginning, not current, state of
14399    * the parse */
14400    const char * cur_parse = RExC_parse;
14401    RExC_parse = (char *)orig_parse;
14402    if ( SIZE_ONLY) {
14403     if (! LOC) {
14404
14405      /* To get locale nodes to not use the full ANYOF size would
14406      * require moving the code above that writes the portions
14407      * of it that aren't in other nodes to after this point.
14408      * e.g.  ANYOF_POSIXL_SET */
14409      RExC_size = orig_size;
14410     }
14411    }
14412    else {
14413     RExC_emit = (regnode *)orig_emit;
14414     if (PL_regkind[op] == POSIXD) {
14415      if (op == POSIXL) {
14416       RExC_contains_locale = 1;
14417      }
14418      if (invert) {
14419       op += NPOSIXD - POSIXD;
14420      }
14421     }
14422    }
14423
14424    ret = reg_node(pRExC_state, op);
14425
14426    if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14427     if (! SIZE_ONLY) {
14428      FLAGS(ret) = arg;
14429     }
14430     *flagp |= HASWIDTH|SIMPLE;
14431    }
14432    else if (PL_regkind[op] == EXACT) {
14433     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14434           TRUE /* downgradable to EXACT */
14435           );
14436    }
14437
14438    RExC_parse = (char *) cur_parse;
14439
14440    SvREFCNT_dec(posixes);
14441    SvREFCNT_dec(nposixes);
14442    SvREFCNT_dec(cp_list);
14443    SvREFCNT_dec(cp_foldable_list);
14444    return ret;
14445   }
14446  }
14447
14448  if (SIZE_ONLY)
14449   return ret;
14450  /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14451
14452  /* If folding, we calculate all characters that could fold to or from the
14453  * ones already on the list */
14454  if (cp_foldable_list) {
14455   if (FOLD) {
14456    UV start, end; /* End points of code point ranges */
14457
14458    SV* fold_intersection = NULL;
14459    SV** use_list;
14460
14461    /* Our calculated list will be for Unicode rules.  For locale
14462    * matching, we have to keep a separate list that is consulted at
14463    * runtime only when the locale indicates Unicode rules.  For
14464    * non-locale, we just use to the general list */
14465    if (LOC) {
14466     use_list = &only_utf8_locale_list;
14467    }
14468    else {
14469     use_list = &cp_list;
14470    }
14471
14472    /* Only the characters in this class that participate in folds need
14473    * be checked.  Get the intersection of this class and all the
14474    * possible characters that are foldable.  This can quickly narrow
14475    * down a large class */
14476    _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14477         &fold_intersection);
14478
14479    /* The folds for all the Latin1 characters are hard-coded into this
14480    * program, but we have to go out to disk to get the others. */
14481    if (invlist_highest(cp_foldable_list) >= 256) {
14482
14483     /* This is a hash that for a particular fold gives all
14484     * characters that are involved in it */
14485     if (! PL_utf8_foldclosures) {
14486      _load_PL_utf8_foldclosures();
14487     }
14488    }
14489
14490    /* Now look at the foldable characters in this class individually */
14491    invlist_iterinit(fold_intersection);
14492    while (invlist_iternext(fold_intersection, &start, &end)) {
14493     UV j;
14494
14495     /* Look at every character in the range */
14496     for (j = start; j <= end; j++) {
14497      U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14498      STRLEN foldlen;
14499      SV** listp;
14500
14501      if (j < 256) {
14502
14503       if (IS_IN_SOME_FOLD_L1(j)) {
14504
14505        /* ASCII is always matched; non-ASCII is matched
14506        * only under Unicode rules (which could happen
14507        * under /l if the locale is a UTF-8 one */
14508        if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14509         *use_list = add_cp_to_invlist(*use_list,
14510                PL_fold_latin1[j]);
14511        }
14512        else {
14513         depends_list =
14514         add_cp_to_invlist(depends_list,
14515             PL_fold_latin1[j]);
14516        }
14517       }
14518
14519       if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14520        && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14521       {
14522        add_above_Latin1_folds(pRExC_state,
14523             (U8) j,
14524             use_list);
14525       }
14526       continue;
14527      }
14528
14529      /* Here is an above Latin1 character.  We don't have the
14530      * rules hard-coded for it.  First, get its fold.  This is
14531      * the simple fold, as the multi-character folds have been
14532      * handled earlier and separated out */
14533      _to_uni_fold_flags(j, foldbuf, &foldlen,
14534               (ASCII_FOLD_RESTRICTED)
14535               ? FOLD_FLAGS_NOMIX_ASCII
14536               : 0);
14537
14538      /* Single character fold of above Latin1.  Add everything in
14539      * its fold closure to the list that this node should match.
14540      * The fold closures data structure is a hash with the keys
14541      * being the UTF-8 of every character that is folded to, like
14542      * 'k', and the values each an array of all code points that
14543      * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14544      * Multi-character folds are not included */
14545      if ((listp = hv_fetch(PL_utf8_foldclosures,
14546           (char *) foldbuf, foldlen, FALSE)))
14547      {
14548       AV* list = (AV*) *listp;
14549       IV k;
14550       for (k = 0; k <= av_tindex(list); k++) {
14551        SV** c_p = av_fetch(list, k, FALSE);
14552        UV c;
14553        assert(c_p);
14554
14555        c = SvUV(*c_p);
14556
14557        /* /aa doesn't allow folds between ASCII and non- */
14558        if ((ASCII_FOLD_RESTRICTED
14559         && (isASCII(c) != isASCII(j))))
14560        {
14561         continue;
14562        }
14563
14564        /* Folds under /l which cross the 255/256 boundary
14565        * are added to a separate list.  (These are valid
14566        * only when the locale is UTF-8.) */
14567        if (c < 256 && LOC) {
14568         *use_list = add_cp_to_invlist(*use_list, c);
14569         continue;
14570        }
14571
14572        if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14573        {
14574         cp_list = add_cp_to_invlist(cp_list, c);
14575        }
14576        else {
14577         /* Similarly folds involving non-ascii Latin1
14578         * characters under /d are added to their list */
14579         depends_list = add_cp_to_invlist(depends_list,
14580                 c);
14581        }
14582       }
14583      }
14584     }
14585    }
14586    SvREFCNT_dec_NN(fold_intersection);
14587   }
14588
14589   /* Now that we have finished adding all the folds, there is no reason
14590   * to keep the foldable list separate */
14591   _invlist_union(cp_list, cp_foldable_list, &cp_list);
14592   SvREFCNT_dec_NN(cp_foldable_list);
14593  }
14594
14595  /* And combine the result (if any) with any inversion list from posix
14596  * classes.  The lists are kept separate up to now because we don't want to
14597  * fold the classes (folding of those is automatically handled by the swash
14598  * fetching code) */
14599  if (posixes || nposixes) {
14600   if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14601    /* Under /a and /aa, nothing above ASCII matches these */
14602    _invlist_intersection(posixes,
14603         PL_XPosix_ptrs[_CC_ASCII],
14604         &posixes);
14605   }
14606   if (nposixes) {
14607    if (DEPENDS_SEMANTICS) {
14608     /* Under /d, everything in the upper half of the Latin1 range
14609     * matches these complements */
14610     ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
14611    }
14612    else if (AT_LEAST_ASCII_RESTRICTED) {
14613     /* Under /a and /aa, everything above ASCII matches these
14614     * complements */
14615     _invlist_union_complement_2nd(nposixes,
14616            PL_XPosix_ptrs[_CC_ASCII],
14617            &nposixes);
14618    }
14619    if (posixes) {
14620     _invlist_union(posixes, nposixes, &posixes);
14621     SvREFCNT_dec_NN(nposixes);
14622    }
14623    else {
14624     posixes = nposixes;
14625    }
14626   }
14627   if (! DEPENDS_SEMANTICS) {
14628    if (cp_list) {
14629     _invlist_union(cp_list, posixes, &cp_list);
14630     SvREFCNT_dec_NN(posixes);
14631    }
14632    else {
14633     cp_list = posixes;
14634    }
14635   }
14636   else {
14637    /* Under /d, we put into a separate list the Latin1 things that
14638    * match only when the target string is utf8 */
14639    SV* nonascii_but_latin1_properties = NULL;
14640    _invlist_intersection(posixes, PL_UpperLatin1,
14641         &nonascii_but_latin1_properties);
14642    _invlist_subtract(posixes, nonascii_but_latin1_properties,
14643        &posixes);
14644    if (cp_list) {
14645     _invlist_union(cp_list, posixes, &cp_list);
14646     SvREFCNT_dec_NN(posixes);
14647    }
14648    else {
14649     cp_list = posixes;
14650    }
14651
14652    if (depends_list) {
14653     _invlist_union(depends_list, nonascii_but_latin1_properties,
14654        &depends_list);
14655     SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14656    }
14657    else {
14658     depends_list = nonascii_but_latin1_properties;
14659    }
14660   }
14661  }
14662
14663  /* And combine the result (if any) with any inversion list from properties.
14664  * The lists are kept separate up to now so that we can distinguish the two
14665  * in regards to matching above-Unicode.  A run-time warning is generated
14666  * if a Unicode property is matched against a non-Unicode code point. But,
14667  * we allow user-defined properties to match anything, without any warning,
14668  * and we also suppress the warning if there is a portion of the character
14669  * class that isn't a Unicode property, and which matches above Unicode, \W
14670  * or [\x{110000}] for example.
14671  * (Note that in this case, unlike the Posix one above, there is no
14672  * <depends_list>, because having a Unicode property forces Unicode
14673  * semantics */
14674  if (properties) {
14675   if (cp_list) {
14676
14677    /* If it matters to the final outcome, see if a non-property
14678    * component of the class matches above Unicode.  If so, the
14679    * warning gets suppressed.  This is true even if just a single
14680    * such code point is specified, as though not strictly correct if
14681    * another such code point is matched against, the fact that they
14682    * are using above-Unicode code points indicates they should know
14683    * the issues involved */
14684    if (warn_super) {
14685     warn_super = ! (invert
14686        ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14687    }
14688
14689    _invlist_union(properties, cp_list, &cp_list);
14690    SvREFCNT_dec_NN(properties);
14691   }
14692   else {
14693    cp_list = properties;
14694   }
14695
14696   if (warn_super) {
14697    ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14698   }
14699  }
14700
14701  /* Here, we have calculated what code points should be in the character
14702  * class.
14703  *
14704  * Now we can see about various optimizations.  Fold calculation (which we
14705  * did above) needs to take place before inversion.  Otherwise /[^k]/i
14706  * would invert to include K, which under /i would match k, which it
14707  * shouldn't.  Therefore we can't invert folded locale now, as it won't be
14708  * folded until runtime */
14709
14710  /* If we didn't do folding, it's because some information isn't available
14711  * until runtime; set the run-time fold flag for these.  (We don't have to
14712  * worry about properties folding, as that is taken care of by the swash
14713  * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
14714  * locales, or the class matches at least one 0-255 range code point */
14715  if (LOC && FOLD) {
14716   if (only_utf8_locale_list) {
14717    ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14718   }
14719   else if (cp_list) { /* Look to see if there a 0-255 code point is in
14720        the list */
14721    UV start, end;
14722    invlist_iterinit(cp_list);
14723    if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14724     ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14725    }
14726    invlist_iterfinish(cp_list);
14727   }
14728  }
14729
14730  /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14731  * at compile time.  Besides not inverting folded locale now, we can't
14732  * invert if there are things such as \w, which aren't known until runtime
14733  * */
14734  if (cp_list
14735   && invert
14736   && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14737   && ! depends_list
14738   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14739  {
14740   _invlist_invert(cp_list);
14741
14742   /* Any swash can't be used as-is, because we've inverted things */
14743   if (swash) {
14744    SvREFCNT_dec_NN(swash);
14745    swash = NULL;
14746   }
14747
14748   /* Clear the invert flag since have just done it here */
14749   invert = FALSE;
14750  }
14751
14752  if (ret_invlist) {
14753   *ret_invlist = cp_list;
14754   SvREFCNT_dec(swash);
14755
14756   /* Discard the generated node */
14757   if (SIZE_ONLY) {
14758    RExC_size = orig_size;
14759   }
14760   else {
14761    RExC_emit = orig_emit;
14762   }
14763   return orig_emit;
14764  }
14765
14766  /* Some character classes are equivalent to other nodes.  Such nodes take
14767  * up less room and generally fewer operations to execute than ANYOF nodes.
14768  * Above, we checked for and optimized into some such equivalents for
14769  * certain common classes that are easy to test.  Getting to this point in
14770  * the code means that the class didn't get optimized there.  Since this
14771  * code is only executed in Pass 2, it is too late to save space--it has
14772  * been allocated in Pass 1, and currently isn't given back.  But turning
14773  * things into an EXACTish node can allow the optimizer to join it to any
14774  * adjacent such nodes.  And if the class is equivalent to things like /./,
14775  * expensive run-time swashes can be avoided.  Now that we have more
14776  * complete information, we can find things necessarily missed by the
14777  * earlier code.  I (khw) am not sure how much to look for here.  It would
14778  * be easy, but perhaps too slow, to check any candidates against all the
14779  * node types they could possibly match using _invlistEQ(). */
14780
14781  if (cp_list
14782   && ! invert
14783   && ! depends_list
14784   && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14785   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14786
14787   /* We don't optimize if we are supposed to make sure all non-Unicode
14788    * code points raise a warning, as only ANYOF nodes have this check.
14789    * */
14790   && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14791  {
14792   UV start, end;
14793   U8 op = END;  /* The optimzation node-type */
14794   const char * cur_parse= RExC_parse;
14795
14796   invlist_iterinit(cp_list);
14797   if (! invlist_iternext(cp_list, &start, &end)) {
14798
14799    /* Here, the list is empty.  This happens, for example, when a
14800    * Unicode property is the only thing in the character class, and
14801    * it doesn't match anything.  (perluniprops.pod notes such
14802    * properties) */
14803    op = OPFAIL;
14804    *flagp |= HASWIDTH|SIMPLE;
14805   }
14806   else if (start == end) {    /* The range is a single code point */
14807    if (! invlist_iternext(cp_list, &start, &end)
14808
14809      /* Don't do this optimization if it would require changing
14810      * the pattern to UTF-8 */
14811     && (start < 256 || UTF))
14812    {
14813     /* Here, the list contains a single code point.  Can optimize
14814     * into an EXACTish node */
14815
14816     value = start;
14817
14818     if (! FOLD) {
14819      op = EXACT;
14820     }
14821     else if (LOC) {
14822
14823      /* A locale node under folding with one code point can be
14824      * an EXACTFL, as its fold won't be calculated until
14825      * runtime */
14826      op = EXACTFL;
14827     }
14828     else {
14829
14830      /* Here, we are generally folding, but there is only one
14831      * code point to match.  If we have to, we use an EXACT
14832      * node, but it would be better for joining with adjacent
14833      * nodes in the optimization pass if we used the same
14834      * EXACTFish node that any such are likely to be.  We can
14835      * do this iff the code point doesn't participate in any
14836      * folds.  For example, an EXACTF of a colon is the same as
14837      * an EXACT one, since nothing folds to or from a colon. */
14838      if (value < 256) {
14839       if (IS_IN_SOME_FOLD_L1(value)) {
14840        op = EXACT;
14841       }
14842      }
14843      else {
14844       if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14845        op = EXACT;
14846       }
14847      }
14848
14849      /* If we haven't found the node type, above, it means we
14850      * can use the prevailing one */
14851      if (op == END) {
14852       op = compute_EXACTish(pRExC_state);
14853      }
14854     }
14855    }
14856   }
14857   else if (start == 0) {
14858    if (end == UV_MAX) {
14859     op = SANY;
14860     *flagp |= HASWIDTH|SIMPLE;
14861     RExC_naughty++;
14862    }
14863    else if (end == '\n' - 1
14864      && invlist_iternext(cp_list, &start, &end)
14865      && start == '\n' + 1 && end == UV_MAX)
14866    {
14867     op = REG_ANY;
14868     *flagp |= HASWIDTH|SIMPLE;
14869     RExC_naughty++;
14870    }
14871   }
14872   invlist_iterfinish(cp_list);
14873
14874   if (op != END) {
14875    RExC_parse = (char *)orig_parse;
14876    RExC_emit = (regnode *)orig_emit;
14877
14878    ret = reg_node(pRExC_state, op);
14879
14880    RExC_parse = (char *)cur_parse;
14881
14882    if (PL_regkind[op] == EXACT) {
14883     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14884           TRUE /* downgradable to EXACT */
14885           );
14886    }
14887
14888    SvREFCNT_dec_NN(cp_list);
14889    return ret;
14890   }
14891  }
14892
14893  /* Here, <cp_list> contains all the code points we can determine at
14894  * compile time that match under all conditions.  Go through it, and
14895  * for things that belong in the bitmap, put them there, and delete from
14896  * <cp_list>.  While we are at it, see if everything above 255 is in the
14897  * list, and if so, set a flag to speed up execution */
14898
14899  populate_ANYOF_from_invlist(ret, &cp_list);
14900
14901  if (invert) {
14902   ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14903  }
14904
14905  /* Here, the bitmap has been populated with all the Latin1 code points that
14906  * always match.  Can now add to the overall list those that match only
14907  * when the target string is UTF-8 (<depends_list>). */
14908  if (depends_list) {
14909   if (cp_list) {
14910    _invlist_union(cp_list, depends_list, &cp_list);
14911    SvREFCNT_dec_NN(depends_list);
14912   }
14913   else {
14914    cp_list = depends_list;
14915   }
14916   ANYOF_FLAGS(ret) |= ANYOF_UTF8;
14917  }
14918
14919  /* If there is a swash and more than one element, we can't use the swash in
14920  * the optimization below. */
14921  if (swash && element_count > 1) {
14922   SvREFCNT_dec_NN(swash);
14923   swash = NULL;
14924  }
14925
14926  set_ANYOF_arg(pRExC_state, ret, cp_list,
14927     (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14928     ? listsv : NULL,
14929     only_utf8_locale_list,
14930     swash, has_user_defined_property);
14931
14932  *flagp |= HASWIDTH|SIMPLE;
14933
14934  if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
14935   RExC_contains_locale = 1;
14936  }
14937
14938  return ret;
14939 }
14940
14941 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14942
14943 STATIC void
14944 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14945     regnode* const node,
14946     SV* const cp_list,
14947     SV* const runtime_defns,
14948     SV* const only_utf8_locale_list,
14949     SV* const swash,
14950     const bool has_user_defined_property)
14951 {
14952  /* Sets the arg field of an ANYOF-type node 'node', using information about
14953  * the node passed-in.  If there is nothing outside the node's bitmap, the
14954  * arg is set to ANYOF_NONBITMAP_EMPTY.  Otherwise, it sets the argument to
14955  * the count returned by add_data(), having allocated and stored an array,
14956  * av, that that count references, as follows:
14957  *  av[0] stores the character class description in its textual form.
14958  *        This is used later (regexec.c:Perl_regclass_swash()) to
14959  *        initialize the appropriate swash, and is also useful for dumping
14960  *        the regnode.  This is set to &PL_sv_undef if the textual
14961  *        description is not needed at run-time (as happens if the other
14962  *        elements completely define the class)
14963  *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14964  *        computed from av[0].  But if no further computation need be done,
14965  *        the swash is stored here now (and av[0] is &PL_sv_undef).
14966  *  av[2] stores the inversion list of code points that match only if the
14967  *        current locale is UTF-8
14968  *  av[3] stores the cp_list inversion list for use in addition or instead
14969  *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14970  *        (Otherwise everything needed is already in av[0] and av[1])
14971  *  av[4] is set if any component of the class is from a user-defined
14972  *        property; used only if av[3] exists */
14973
14974  UV n;
14975
14976  PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14977
14978  if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
14979   assert(! (ANYOF_FLAGS(node)
14980      & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
14981   ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14982  }
14983  else {
14984   AV * const av = newAV();
14985   SV *rv;
14986
14987   assert(ANYOF_FLAGS(node)
14988      & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
14989
14990   av_store(av, 0, (runtime_defns)
14991       ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14992   if (swash) {
14993    assert(cp_list);
14994    av_store(av, 1, swash);
14995    SvREFCNT_dec_NN(cp_list);
14996   }
14997   else {
14998    av_store(av, 1, &PL_sv_undef);
14999    if (cp_list) {
15000     av_store(av, 3, cp_list);
15001     av_store(av, 4, newSVuv(has_user_defined_property));
15002    }
15003   }
15004
15005   if (only_utf8_locale_list) {
15006    av_store(av, 2, only_utf8_locale_list);
15007   }
15008   else {
15009    av_store(av, 2, &PL_sv_undef);
15010   }
15011
15012   rv = newRV_noinc(MUTABLE_SV(av));
15013   n = add_data(pRExC_state, STR_WITH_LEN("s"));
15014   RExC_rxi->data->data[n] = (void*)rv;
15015   ARG_SET(node, n);
15016  }
15017 }
15018
15019
15020 /* reg_skipcomment()
15021
15022    Absorbs an /x style # comment from the input stream,
15023    returning a pointer to the first character beyond the comment, or if the
15024    comment terminates the pattern without anything following it, this returns
15025    one past the final character of the pattern (in other words, RExC_end) and
15026    sets the REG_RUN_ON_COMMENT_SEEN flag.
15027
15028    Note it's the callers responsibility to ensure that we are
15029    actually in /x mode
15030
15031 */
15032
15033 PERL_STATIC_INLINE char*
15034 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15035 {
15036  PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15037
15038  assert(*p == '#');
15039
15040  while (p < RExC_end) {
15041   if (*(++p) == '\n') {
15042    return p+1;
15043   }
15044  }
15045
15046  /* we ran off the end of the pattern without ending the comment, so we have
15047  * to add an \n when wrapping */
15048  RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15049  return p;
15050 }
15051
15052 /* nextchar()
15053
15054    Advances the parse position, and optionally absorbs
15055    "whitespace" from the inputstream.
15056
15057    Without /x "whitespace" means (?#...) style comments only,
15058    with /x this means (?#...) and # comments and whitespace proper.
15059
15060    Returns the RExC_parse point from BEFORE the scan occurs.
15061
15062    This is the /x friendly way of saying RExC_parse++.
15063 */
15064
15065 STATIC char*
15066 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15067 {
15068  char* const retval = RExC_parse++;
15069
15070  PERL_ARGS_ASSERT_NEXTCHAR;
15071
15072  for (;;) {
15073   if (RExC_end - RExC_parse >= 3
15074    && *RExC_parse == '('
15075    && RExC_parse[1] == '?'
15076    && RExC_parse[2] == '#')
15077   {
15078    while (*RExC_parse != ')') {
15079     if (RExC_parse == RExC_end)
15080      FAIL("Sequence (?#... not terminated");
15081     RExC_parse++;
15082    }
15083    RExC_parse++;
15084    continue;
15085   }
15086   if (RExC_flags & RXf_PMf_EXTENDED) {
15087    char * p = regpatws(pRExC_state, RExC_parse,
15088           TRUE); /* means recognize comments */
15089    if (p != RExC_parse) {
15090     RExC_parse = p;
15091     continue;
15092    }
15093   }
15094   return retval;
15095  }
15096 }
15097
15098 /*
15099 - reg_node - emit a node
15100 */
15101 STATIC regnode *   /* Location. */
15102 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15103 {
15104  regnode *ptr;
15105  regnode * const ret = RExC_emit;
15106  GET_RE_DEBUG_FLAGS_DECL;
15107
15108  PERL_ARGS_ASSERT_REG_NODE;
15109
15110  if (SIZE_ONLY) {
15111   SIZE_ALIGN(RExC_size);
15112   RExC_size += 1;
15113   return(ret);
15114  }
15115  if (RExC_emit >= RExC_emit_bound)
15116   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15117     op, (void*)RExC_emit, (void*)RExC_emit_bound);
15118
15119  NODE_ALIGN_FILL(ret);
15120  ptr = ret;
15121  FILL_ADVANCE_NODE(ptr, op);
15122  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
15123 #ifdef RE_TRACK_PATTERN_OFFSETS
15124  if (RExC_offsets) {         /* MJD */
15125   MJD_OFFSET_DEBUG(
15126    ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15127    "reg_node", __LINE__,
15128    PL_reg_name[op],
15129    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15130     ? "Overwriting end of array!\n" : "OK",
15131    (UV)(RExC_emit - RExC_emit_start),
15132    (UV)(RExC_parse - RExC_start),
15133    (UV)RExC_offsets[0]));
15134   Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15135  }
15136 #endif
15137  RExC_emit = ptr;
15138  return(ret);
15139 }
15140
15141 /*
15142 - reganode - emit a node with an argument
15143 */
15144 STATIC regnode *   /* Location. */
15145 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15146 {
15147  regnode *ptr;
15148  regnode * const ret = RExC_emit;
15149  GET_RE_DEBUG_FLAGS_DECL;
15150
15151  PERL_ARGS_ASSERT_REGANODE;
15152
15153  if (SIZE_ONLY) {
15154   SIZE_ALIGN(RExC_size);
15155   RExC_size += 2;
15156   /*
15157   We can't do this:
15158
15159   assert(2==regarglen[op]+1);
15160
15161   Anything larger than this has to allocate the extra amount.
15162   If we changed this to be:
15163
15164   RExC_size += (1 + regarglen[op]);
15165
15166   then it wouldn't matter. Its not clear what side effect
15167   might come from that so its not done so far.
15168   -- dmq
15169   */
15170   return(ret);
15171  }
15172  if (RExC_emit >= RExC_emit_bound)
15173   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15174     op, (void*)RExC_emit, (void*)RExC_emit_bound);
15175
15176  NODE_ALIGN_FILL(ret);
15177  ptr = ret;
15178  FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15179  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
15180 #ifdef RE_TRACK_PATTERN_OFFSETS
15181  if (RExC_offsets) {         /* MJD */
15182   MJD_OFFSET_DEBUG(
15183    ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15184    "reganode",
15185    __LINE__,
15186    PL_reg_name[op],
15187    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15188    "Overwriting end of array!\n" : "OK",
15189    (UV)(RExC_emit - RExC_emit_start),
15190    (UV)(RExC_parse - RExC_start),
15191    (UV)RExC_offsets[0]));
15192   Set_Cur_Node_Offset;
15193  }
15194 #endif
15195  RExC_emit = ptr;
15196  return(ret);
15197 }
15198
15199 /*
15200 - reguni - emit (if appropriate) a Unicode character
15201 */
15202 PERL_STATIC_INLINE STRLEN
15203 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15204 {
15205  PERL_ARGS_ASSERT_REGUNI;
15206
15207  return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15208 }
15209
15210 /*
15211 - reginsert - insert an operator in front of already-emitted operand
15212 *
15213 * Means relocating the operand.
15214 */
15215 STATIC void
15216 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15217 {
15218  regnode *src;
15219  regnode *dst;
15220  regnode *place;
15221  const int offset = regarglen[(U8)op];
15222  const int size = NODE_STEP_REGNODE + offset;
15223  GET_RE_DEBUG_FLAGS_DECL;
15224
15225  PERL_ARGS_ASSERT_REGINSERT;
15226  PERL_UNUSED_CONTEXT;
15227  PERL_UNUSED_ARG(depth);
15228 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15229  DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15230  if (SIZE_ONLY) {
15231   RExC_size += size;
15232   return;
15233  }
15234
15235  src = RExC_emit;
15236  RExC_emit += size;
15237  dst = RExC_emit;
15238  if (RExC_open_parens) {
15239   int paren;
15240   /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15241   for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15242    if ( RExC_open_parens[paren] >= opnd ) {
15243     /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15244     RExC_open_parens[paren] += size;
15245    } else {
15246     /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15247    }
15248    if ( RExC_close_parens[paren] >= opnd ) {
15249     /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15250     RExC_close_parens[paren] += size;
15251    } else {
15252     /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15253    }
15254   }
15255  }
15256
15257  while (src > opnd) {
15258   StructCopy(--src, --dst, regnode);
15259 #ifdef RE_TRACK_PATTERN_OFFSETS
15260   if (RExC_offsets) {     /* MJD 20010112 */
15261    MJD_OFFSET_DEBUG(
15262     ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15263     "reg_insert",
15264     __LINE__,
15265     PL_reg_name[op],
15266     (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15267      ? "Overwriting end of array!\n" : "OK",
15268     (UV)(src - RExC_emit_start),
15269     (UV)(dst - RExC_emit_start),
15270     (UV)RExC_offsets[0]));
15271    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15272    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15273   }
15274 #endif
15275  }
15276
15277
15278  place = opnd;  /* Op node, where operand used to be. */
15279 #ifdef RE_TRACK_PATTERN_OFFSETS
15280  if (RExC_offsets) {         /* MJD */
15281   MJD_OFFSET_DEBUG(
15282    ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15283    "reginsert",
15284    __LINE__,
15285    PL_reg_name[op],
15286    (UV)(place - RExC_emit_start) > RExC_offsets[0]
15287    ? "Overwriting end of array!\n" : "OK",
15288    (UV)(place - RExC_emit_start),
15289    (UV)(RExC_parse - RExC_start),
15290    (UV)RExC_offsets[0]));
15291   Set_Node_Offset(place, RExC_parse);
15292   Set_Node_Length(place, 1);
15293  }
15294 #endif
15295  src = NEXTOPER(place);
15296  FILL_ADVANCE_NODE(place, op);
15297  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
15298  Zero(src, offset, regnode);
15299 }
15300
15301 /*
15302 - regtail - set the next-pointer at the end of a node chain of p to val.
15303 - SEE ALSO: regtail_study
15304 */
15305 /* TODO: All three parms should be const */
15306 STATIC void
15307 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15308     const regnode *val,U32 depth)
15309 {
15310  regnode *scan;
15311  GET_RE_DEBUG_FLAGS_DECL;
15312
15313  PERL_ARGS_ASSERT_REGTAIL;
15314 #ifndef DEBUGGING
15315  PERL_UNUSED_ARG(depth);
15316 #endif
15317
15318  if (SIZE_ONLY)
15319   return;
15320
15321  /* Find last node. */
15322  scan = p;
15323  for (;;) {
15324   regnode * const temp = regnext(scan);
15325   DEBUG_PARSE_r({
15326    SV * const mysv=sv_newmortal();
15327    DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15328    regprop(RExC_rx, mysv, scan, NULL);
15329    PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15330     SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15331      (temp == NULL ? "->" : ""),
15332      (temp == NULL ? PL_reg_name[OP(val)] : "")
15333    );
15334   });
15335   if (temp == NULL)
15336    break;
15337   scan = temp;
15338  }
15339
15340  if (reg_off_by_arg[OP(scan)]) {
15341   ARG_SET(scan, val - scan);
15342  }
15343  else {
15344   NEXT_OFF(scan) = val - scan;
15345  }
15346 }
15347
15348 #ifdef DEBUGGING
15349 /*
15350 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15351 - Look for optimizable sequences at the same time.
15352 - currently only looks for EXACT chains.
15353
15354 This is experimental code. The idea is to use this routine to perform
15355 in place optimizations on branches and groups as they are constructed,
15356 with the long term intention of removing optimization from study_chunk so
15357 that it is purely analytical.
15358
15359 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15360 to control which is which.
15361
15362 */
15363 /* TODO: All four parms should be const */
15364
15365 STATIC U8
15366 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15367      const regnode *val,U32 depth)
15368 {
15369  regnode *scan;
15370  U8 exact = PSEUDO;
15371 #ifdef EXPERIMENTAL_INPLACESCAN
15372  I32 min = 0;
15373 #endif
15374  GET_RE_DEBUG_FLAGS_DECL;
15375
15376  PERL_ARGS_ASSERT_REGTAIL_STUDY;
15377
15378
15379  if (SIZE_ONLY)
15380   return exact;
15381
15382  /* Find last node. */
15383
15384  scan = p;
15385  for (;;) {
15386   regnode * const temp = regnext(scan);
15387 #ifdef EXPERIMENTAL_INPLACESCAN
15388   if (PL_regkind[OP(scan)] == EXACT) {
15389    bool unfolded_multi_char; /* Unexamined in this routine */
15390    if (join_exact(pRExC_state, scan, &min,
15391       &unfolded_multi_char, 1, val, depth+1))
15392     return EXACT;
15393   }
15394 #endif
15395   if ( exact ) {
15396    switch (OP(scan)) {
15397     case EXACT:
15398     case EXACTF:
15399     case EXACTFA_NO_TRIE:
15400     case EXACTFA:
15401     case EXACTFU:
15402     case EXACTFU_SS:
15403     case EXACTFL:
15404       if( exact == PSEUDO )
15405        exact= OP(scan);
15406       else if ( exact != OP(scan) )
15407        exact= 0;
15408     case NOTHING:
15409      break;
15410     default:
15411      exact= 0;
15412    }
15413   }
15414   DEBUG_PARSE_r({
15415    SV * const mysv=sv_newmortal();
15416    DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15417    regprop(RExC_rx, mysv, scan, NULL);
15418    PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15419     SvPV_nolen_const(mysv),
15420     REG_NODE_NUM(scan),
15421     PL_reg_name[exact]);
15422   });
15423   if (temp == NULL)
15424    break;
15425   scan = temp;
15426  }
15427  DEBUG_PARSE_r({
15428   SV * const mysv_val=sv_newmortal();
15429   DEBUG_PARSE_MSG("");
15430   regprop(RExC_rx, mysv_val, val, NULL);
15431   PerlIO_printf(Perl_debug_log,
15432      "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15433      SvPV_nolen_const(mysv_val),
15434      (IV)REG_NODE_NUM(val),
15435      (IV)(val - scan)
15436   );
15437  });
15438  if (reg_off_by_arg[OP(scan)]) {
15439   ARG_SET(scan, val - scan);
15440  }
15441  else {
15442   NEXT_OFF(scan) = val - scan;
15443  }
15444
15445  return exact;
15446 }
15447 #endif
15448
15449 /*
15450  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15451  */
15452 #ifdef DEBUGGING
15453
15454 static void
15455 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15456 {
15457  int bit;
15458  int set=0;
15459
15460  ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15461
15462  for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15463   if (flags & (1<<bit)) {
15464    if (!set++ && lead)
15465     PerlIO_printf(Perl_debug_log, "%s",lead);
15466    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15467   }
15468  }
15469  if (lead)  {
15470   if (set)
15471    PerlIO_printf(Perl_debug_log, "\n");
15472   else
15473    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15474  }
15475 }
15476
15477 static void
15478 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15479 {
15480  int bit;
15481  int set=0;
15482  regex_charset cs;
15483
15484  ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15485
15486  for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15487   if (flags & (1<<bit)) {
15488    if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
15489     continue;
15490    }
15491    if (!set++ && lead)
15492     PerlIO_printf(Perl_debug_log, "%s",lead);
15493    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15494   }
15495  }
15496  if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15497    if (!set++ && lead) {
15498     PerlIO_printf(Perl_debug_log, "%s",lead);
15499    }
15500    switch (cs) {
15501     case REGEX_UNICODE_CHARSET:
15502      PerlIO_printf(Perl_debug_log, "UNICODE");
15503      break;
15504     case REGEX_LOCALE_CHARSET:
15505      PerlIO_printf(Perl_debug_log, "LOCALE");
15506      break;
15507     case REGEX_ASCII_RESTRICTED_CHARSET:
15508      PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15509      break;
15510     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15511      PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15512      break;
15513     default:
15514      PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15515      break;
15516    }
15517  }
15518  if (lead)  {
15519   if (set)
15520    PerlIO_printf(Perl_debug_log, "\n");
15521   else
15522    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15523  }
15524 }
15525 #endif
15526
15527 void
15528 Perl_regdump(pTHX_ const regexp *r)
15529 {
15530 #ifdef DEBUGGING
15531  SV * const sv = sv_newmortal();
15532  SV *dsv= sv_newmortal();
15533  RXi_GET_DECL(r,ri);
15534  GET_RE_DEBUG_FLAGS_DECL;
15535
15536  PERL_ARGS_ASSERT_REGDUMP;
15537
15538  (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15539
15540  /* Header fields of interest. */
15541  if (r->anchored_substr) {
15542   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15543    RE_SV_DUMPLEN(r->anchored_substr), 30);
15544   PerlIO_printf(Perl_debug_log,
15545      "anchored %s%s at %"IVdf" ",
15546      s, RE_SV_TAIL(r->anchored_substr),
15547      (IV)r->anchored_offset);
15548  } else if (r->anchored_utf8) {
15549   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15550    RE_SV_DUMPLEN(r->anchored_utf8), 30);
15551   PerlIO_printf(Perl_debug_log,
15552      "anchored utf8 %s%s at %"IVdf" ",
15553      s, RE_SV_TAIL(r->anchored_utf8),
15554      (IV)r->anchored_offset);
15555  }
15556  if (r->float_substr) {
15557   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15558    RE_SV_DUMPLEN(r->float_substr), 30);
15559   PerlIO_printf(Perl_debug_log,
15560      "floating %s%s at %"IVdf"..%"UVuf" ",
15561      s, RE_SV_TAIL(r->float_substr),
15562      (IV)r->float_min_offset, (UV)r->float_max_offset);
15563  } else if (r->float_utf8) {
15564   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15565    RE_SV_DUMPLEN(r->float_utf8), 30);
15566   PerlIO_printf(Perl_debug_log,
15567      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15568      s, RE_SV_TAIL(r->float_utf8),
15569      (IV)r->float_min_offset, (UV)r->float_max_offset);
15570  }
15571  if (r->check_substr || r->check_utf8)
15572   PerlIO_printf(Perl_debug_log,
15573      (const char *)
15574      (r->check_substr == r->float_substr
15575      && r->check_utf8 == r->float_utf8
15576      ? "(checking floating" : "(checking anchored"));
15577  if (r->intflags & PREGf_NOSCAN)
15578   PerlIO_printf(Perl_debug_log, " noscan");
15579  if (r->extflags & RXf_CHECK_ALL)
15580   PerlIO_printf(Perl_debug_log, " isall");
15581  if (r->check_substr || r->check_utf8)
15582   PerlIO_printf(Perl_debug_log, ") ");
15583
15584  if (ri->regstclass) {
15585   regprop(r, sv, ri->regstclass, NULL);
15586   PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15587  }
15588  if (r->intflags & PREGf_ANCH) {
15589   PerlIO_printf(Perl_debug_log, "anchored");
15590   if (r->intflags & PREGf_ANCH_BOL)
15591    PerlIO_printf(Perl_debug_log, "(BOL)");
15592   if (r->intflags & PREGf_ANCH_MBOL)
15593    PerlIO_printf(Perl_debug_log, "(MBOL)");
15594   if (r->intflags & PREGf_ANCH_SBOL)
15595    PerlIO_printf(Perl_debug_log, "(SBOL)");
15596   if (r->intflags & PREGf_ANCH_GPOS)
15597    PerlIO_printf(Perl_debug_log, "(GPOS)");
15598   PerlIO_putc(Perl_debug_log, ' ');
15599  }
15600  if (r->intflags & PREGf_GPOS_SEEN)
15601   PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15602  if (r->intflags & PREGf_SKIP)
15603   PerlIO_printf(Perl_debug_log, "plus ");
15604  if (r->intflags & PREGf_IMPLICIT)
15605   PerlIO_printf(Perl_debug_log, "implicit ");
15606  PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15607  if (r->extflags & RXf_EVAL_SEEN)
15608   PerlIO_printf(Perl_debug_log, "with eval ");
15609  PerlIO_printf(Perl_debug_log, "\n");
15610  DEBUG_FLAGS_r({
15611   regdump_extflags("r->extflags: ",r->extflags);
15612   regdump_intflags("r->intflags: ",r->intflags);
15613  });
15614 #else
15615  PERL_ARGS_ASSERT_REGDUMP;
15616  PERL_UNUSED_CONTEXT;
15617  PERL_UNUSED_ARG(r);
15618 #endif /* DEBUGGING */
15619 }
15620
15621 /*
15622 - regprop - printable representation of opcode, with run time support
15623 */
15624
15625 void
15626 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
15627 {
15628 #ifdef DEBUGGING
15629  int k;
15630
15631  /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15632  static const char * const anyofs[] = {
15633 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15634  || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
15635  || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
15636  || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
15637  || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
15638  || _CC_VERTSPACE != 16
15639   #error Need to adjust order of anyofs[]
15640 #endif
15641   "\\w",
15642   "\\W",
15643   "\\d",
15644   "\\D",
15645   "[:alpha:]",
15646   "[:^alpha:]",
15647   "[:lower:]",
15648   "[:^lower:]",
15649   "[:upper:]",
15650   "[:^upper:]",
15651   "[:punct:]",
15652   "[:^punct:]",
15653   "[:print:]",
15654   "[:^print:]",
15655   "[:alnum:]",
15656   "[:^alnum:]",
15657   "[:graph:]",
15658   "[:^graph:]",
15659   "[:cased:]",
15660   "[:^cased:]",
15661   "\\s",
15662   "\\S",
15663   "[:blank:]",
15664   "[:^blank:]",
15665   "[:xdigit:]",
15666   "[:^xdigit:]",
15667   "[:space:]",
15668   "[:^space:]",
15669   "[:cntrl:]",
15670   "[:^cntrl:]",
15671   "[:ascii:]",
15672   "[:^ascii:]",
15673   "\\v",
15674   "\\V"
15675  };
15676  RXi_GET_DECL(prog,progi);
15677  GET_RE_DEBUG_FLAGS_DECL;
15678
15679  PERL_ARGS_ASSERT_REGPROP;
15680
15681  sv_setpvs(sv, "");
15682
15683  if (OP(o) > REGNODE_MAX)  /* regnode.type is unsigned */
15684   /* It would be nice to FAIL() here, but this may be called from
15685   regexec.c, and it would be hard to supply pRExC_state. */
15686   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15687            (int)OP(o), (int)REGNODE_MAX);
15688  sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15689
15690  k = PL_regkind[OP(o)];
15691
15692  if (k == EXACT) {
15693   sv_catpvs(sv, " ");
15694   /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15695   * is a crude hack but it may be the best for now since
15696   * we have no flag "this EXACTish node was UTF-8"
15697   * --jhi */
15698   pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15699     PERL_PV_ESCAPE_UNI_DETECT |
15700     PERL_PV_ESCAPE_NONASCII   |
15701     PERL_PV_PRETTY_ELLIPSES   |
15702     PERL_PV_PRETTY_LTGT       |
15703     PERL_PV_PRETTY_NOCLEAR
15704     );
15705  } else if (k == TRIE) {
15706   /* print the details of the trie in dumpuntil instead, as
15707   * progi->data isn't available here */
15708   const char op = OP(o);
15709   const U32 n = ARG(o);
15710   const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15711    (reg_ac_data *)progi->data->data[n] :
15712    NULL;
15713   const reg_trie_data * const trie
15714    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15715
15716   Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15717   DEBUG_TRIE_COMPILE_r(
15718   Perl_sv_catpvf(aTHX_ sv,
15719    "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15720    (UV)trie->startstate,
15721    (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15722    (UV)trie->wordcount,
15723    (UV)trie->minlen,
15724    (UV)trie->maxlen,
15725    (UV)TRIE_CHARCOUNT(trie),
15726    (UV)trie->uniquecharcount
15727   );
15728   );
15729   if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15730    sv_catpvs(sv, "[");
15731    (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15732             ? ANYOF_BITMAP(o)
15733             : TRIE_BITMAP(trie));
15734    sv_catpvs(sv, "]");
15735   }
15736
15737  } else if (k == CURLY) {
15738   if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15739    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15740   Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15741  }
15742  else if (k == WHILEM && o->flags)   /* Ordinal/of */
15743   Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15744  else if (k == REF || k == OPEN || k == CLOSE
15745    || k == GROUPP || OP(o)==ACCEPT)
15746  {
15747   Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
15748   if ( RXp_PAREN_NAMES(prog) ) {
15749    if ( k != REF || (OP(o) < NREF)) {
15750     AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15751     SV **name= av_fetch(list, ARG(o), 0 );
15752     if (name)
15753      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15754    }
15755    else {
15756     AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15757     SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15758     I32 *nums=(I32*)SvPVX(sv_dat);
15759     SV **name= av_fetch(list, nums[0], 0 );
15760     I32 n;
15761     if (name) {
15762      for ( n=0; n<SvIVX(sv_dat); n++ ) {
15763       Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15764          (n ? "," : ""), (IV)nums[n]);
15765      }
15766      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15767     }
15768    }
15769   }
15770   if ( k == REF && reginfo) {
15771    U32 n = ARG(o);  /* which paren pair */
15772    I32 ln = prog->offs[n].start;
15773    if (prog->lastparen < n || ln == -1)
15774     Perl_sv_catpvf(aTHX_ sv, ": FAIL");
15775    else if (ln == prog->offs[n].end)
15776     Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
15777    else {
15778     const char *s = reginfo->strbeg + ln;
15779     Perl_sv_catpvf(aTHX_ sv, ": ");
15780     Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
15781      PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
15782    }
15783   }
15784  } else if (k == GOSUB)
15785   /* Paren and offset */
15786   Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
15787  else if (k == VERB) {
15788   if (!o->flags)
15789    Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15790       SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15791  } else if (k == LOGICAL)
15792   /* 2: embedded, otherwise 1 */
15793   Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
15794  else if (k == ANYOF) {
15795   const U8 flags = ANYOF_FLAGS(o);
15796   int do_sep = 0;
15797
15798
15799   if (flags & ANYOF_LOCALE_FLAGS)
15800    sv_catpvs(sv, "{loc}");
15801   if (flags & ANYOF_LOC_FOLD)
15802    sv_catpvs(sv, "{i}");
15803   Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15804   if (flags & ANYOF_INVERT)
15805    sv_catpvs(sv, "^");
15806
15807   /* output what the standard cp 0-255 bitmap matches */
15808   do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15809
15810   /* output any special charclass tests (used entirely under use
15811   * locale) * */
15812   if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15813    int i;
15814    for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15815     if (ANYOF_POSIXL_TEST(o,i)) {
15816      sv_catpv(sv, anyofs[i]);
15817      do_sep = 1;
15818     }
15819    }
15820   }
15821
15822   if ((flags & (ANYOF_ABOVE_LATIN1_ALL
15823      |ANYOF_UTF8
15824      |ANYOF_NONBITMAP_NON_UTF8
15825      |ANYOF_LOC_FOLD)))
15826   {
15827    if (do_sep) {
15828     Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15829     if (flags & ANYOF_INVERT)
15830      /*make sure the invert info is in each */
15831      sv_catpvs(sv, "^");
15832    }
15833
15834    if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
15835     sv_catpvs(sv, "{non-utf8-latin1-all}");
15836    }
15837
15838    /* output information about the unicode matching */
15839    if (flags & ANYOF_ABOVE_LATIN1_ALL)
15840     sv_catpvs(sv, "{unicode_all}");
15841    else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
15842     SV *lv; /* Set if there is something outside the bit map. */
15843     bool byte_output = FALSE;   /* If something in the bitmap has
15844            been output */
15845     SV *only_utf8_locale;
15846
15847     /* Get the stuff that wasn't in the bitmap */
15848     (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
15849              &lv, &only_utf8_locale);
15850     if (lv && lv != &PL_sv_undef) {
15851      char *s = savesvpv(lv);
15852      char * const origs = s;
15853
15854      while (*s && *s != '\n')
15855       s++;
15856
15857      if (*s == '\n') {
15858       const char * const t = ++s;
15859
15860       if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15861        sv_catpvs(sv, "{outside bitmap}");
15862       }
15863       else {
15864        sv_catpvs(sv, "{utf8}");
15865       }
15866
15867       if (byte_output) {
15868        sv_catpvs(sv, " ");
15869       }
15870
15871       while (*s) {
15872        if (*s == '\n') {
15873
15874         /* Truncate very long output */
15875         if (s - origs > 256) {
15876          Perl_sv_catpvf(aTHX_ sv,
15877             "%.*s...",
15878             (int) (s - origs - 1),
15879             t);
15880          goto out_dump;
15881         }
15882         *s = ' ';
15883        }
15884        else if (*s == '\t') {
15885         *s = '-';
15886        }
15887        s++;
15888       }
15889       if (s[-1] == ' ')
15890        s[-1] = 0;
15891
15892       sv_catpv(sv, t);
15893      }
15894
15895     out_dump:
15896
15897      Safefree(origs);
15898      SvREFCNT_dec_NN(lv);
15899     }
15900
15901     if ((flags & ANYOF_LOC_FOLD)
15902      && only_utf8_locale
15903      && only_utf8_locale != &PL_sv_undef)
15904     {
15905      UV start, end;
15906      int max_entries = 256;
15907
15908      sv_catpvs(sv, "{utf8 locale}");
15909      invlist_iterinit(only_utf8_locale);
15910      while (invlist_iternext(only_utf8_locale,
15911            &start, &end)) {
15912       put_range(sv, start, end);
15913       max_entries --;
15914       if (max_entries < 0) {
15915        sv_catpvs(sv, "...");
15916        break;
15917       }
15918      }
15919      invlist_iterfinish(only_utf8_locale);
15920     }
15921    }
15922   }
15923
15924   Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15925  }
15926  else if (k == POSIXD || k == NPOSIXD) {
15927   U8 index = FLAGS(o) * 2;
15928   if (index < C_ARRAY_LENGTH(anyofs)) {
15929    if (*anyofs[index] != '[')  {
15930     sv_catpv(sv, "[");
15931    }
15932    sv_catpv(sv, anyofs[index]);
15933    if (*anyofs[index] != '[')  {
15934     sv_catpv(sv, "]");
15935    }
15936   }
15937   else {
15938    Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15939   }
15940  }
15941  else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15942   Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15943 #else
15944  PERL_UNUSED_CONTEXT;
15945  PERL_UNUSED_ARG(sv);
15946  PERL_UNUSED_ARG(o);
15947  PERL_UNUSED_ARG(prog);
15948  PERL_UNUSED_ARG(reginfo);
15949 #endif /* DEBUGGING */
15950 }
15951
15952
15953
15954 SV *
15955 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15956 {    /* Assume that RE_INTUIT is set */
15957  struct regexp *const prog = ReANY(r);
15958  GET_RE_DEBUG_FLAGS_DECL;
15959
15960  PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15961  PERL_UNUSED_CONTEXT;
15962
15963  DEBUG_COMPILE_r(
15964   {
15965    const char * const s = SvPV_nolen_const(prog->check_substr
15966      ? prog->check_substr : prog->check_utf8);
15967
15968    if (!PL_colorset) reginitcolors();
15969    PerlIO_printf(Perl_debug_log,
15970      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15971      PL_colors[4],
15972      prog->check_substr ? "" : "utf8 ",
15973      PL_colors[5],PL_colors[0],
15974      s,
15975      PL_colors[1],
15976      (strlen(s) > 60 ? "..." : ""));
15977   } );
15978
15979  return prog->check_substr ? prog->check_substr : prog->check_utf8;
15980 }
15981
15982 /*
15983    pregfree()
15984
15985    handles refcounting and freeing the perl core regexp structure. When
15986    it is necessary to actually free the structure the first thing it
15987    does is call the 'free' method of the regexp_engine associated to
15988    the regexp, allowing the handling of the void *pprivate; member
15989    first. (This routine is not overridable by extensions, which is why
15990    the extensions free is called first.)
15991
15992    See regdupe and regdupe_internal if you change anything here.
15993 */
15994 #ifndef PERL_IN_XSUB_RE
15995 void
15996 Perl_pregfree(pTHX_ REGEXP *r)
15997 {
15998  SvREFCNT_dec(r);
15999 }
16000
16001 void
16002 Perl_pregfree2(pTHX_ REGEXP *rx)
16003 {
16004  struct regexp *const r = ReANY(rx);
16005  GET_RE_DEBUG_FLAGS_DECL;
16006
16007  PERL_ARGS_ASSERT_PREGFREE2;
16008
16009  if (r->mother_re) {
16010   ReREFCNT_dec(r->mother_re);
16011  } else {
16012   CALLREGFREE_PVT(rx); /* free the private data */
16013   SvREFCNT_dec(RXp_PAREN_NAMES(r));
16014   Safefree(r->xpv_len_u.xpvlenu_pv);
16015  }
16016  if (r->substrs) {
16017   SvREFCNT_dec(r->anchored_substr);
16018   SvREFCNT_dec(r->anchored_utf8);
16019   SvREFCNT_dec(r->float_substr);
16020   SvREFCNT_dec(r->float_utf8);
16021   Safefree(r->substrs);
16022  }
16023  RX_MATCH_COPY_FREE(rx);
16024 #ifdef PERL_ANY_COW
16025  SvREFCNT_dec(r->saved_copy);
16026 #endif
16027  Safefree(r->offs);
16028  SvREFCNT_dec(r->qr_anoncv);
16029  rx->sv_u.svu_rx = 0;
16030 }
16031
16032 /*  reg_temp_copy()
16033
16034  This is a hacky workaround to the structural issue of match results
16035  being stored in the regexp structure which is in turn stored in
16036  PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16037  could be PL_curpm in multiple contexts, and could require multiple
16038  result sets being associated with the pattern simultaneously, such
16039  as when doing a recursive match with (??{$qr})
16040
16041  The solution is to make a lightweight copy of the regexp structure
16042  when a qr// is returned from the code executed by (??{$qr}) this
16043  lightweight copy doesn't actually own any of its data except for
16044  the starp/end and the actual regexp structure itself.
16045
16046 */
16047
16048
16049 REGEXP *
16050 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16051 {
16052  struct regexp *ret;
16053  struct regexp *const r = ReANY(rx);
16054  const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16055
16056  PERL_ARGS_ASSERT_REG_TEMP_COPY;
16057
16058  if (!ret_x)
16059   ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16060  else {
16061   SvOK_off((SV *)ret_x);
16062   if (islv) {
16063    /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16064    to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
16065    made both spots point to the same regexp body.) */
16066    REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16067    assert(!SvPVX(ret_x));
16068    ret_x->sv_u.svu_rx = temp->sv_any;
16069    temp->sv_any = NULL;
16070    SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16071    SvREFCNT_dec_NN(temp);
16072    /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16073    ing below will not set it. */
16074    SvCUR_set(ret_x, SvCUR(rx));
16075   }
16076  }
16077  /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16078  sv_force_normal(sv) is called.  */
16079  SvFAKE_on(ret_x);
16080  ret = ReANY(ret_x);
16081
16082  SvFLAGS(ret_x) |= SvUTF8(rx);
16083  /* We share the same string buffer as the original regexp, on which we
16084  hold a reference count, incremented when mother_re is set below.
16085  The string pointer is copied here, being part of the regexp struct.
16086  */
16087  memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16088   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16089  if (r->offs) {
16090   const I32 npar = r->nparens+1;
16091   Newx(ret->offs, npar, regexp_paren_pair);
16092   Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16093  }
16094  if (r->substrs) {
16095   Newx(ret->substrs, 1, struct reg_substr_data);
16096   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16097
16098   SvREFCNT_inc_void(ret->anchored_substr);
16099   SvREFCNT_inc_void(ret->anchored_utf8);
16100   SvREFCNT_inc_void(ret->float_substr);
16101   SvREFCNT_inc_void(ret->float_utf8);
16102
16103   /* check_substr and check_utf8, if non-NULL, point to either their
16104   anchored or float namesakes, and don't hold a second reference.  */
16105  }
16106  RX_MATCH_COPIED_off(ret_x);
16107 #ifdef PERL_ANY_COW
16108  ret->saved_copy = NULL;
16109 #endif
16110  ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16111  SvREFCNT_inc_void(ret->qr_anoncv);
16112
16113  return ret_x;
16114 }
16115 #endif
16116
16117 /* regfree_internal()
16118
16119    Free the private data in a regexp. This is overloadable by
16120    extensions. Perl takes care of the regexp structure in pregfree(),
16121    this covers the *pprivate pointer which technically perl doesn't
16122    know about, however of course we have to handle the
16123    regexp_internal structure when no extension is in use.
16124
16125    Note this is called before freeing anything in the regexp
16126    structure.
16127  */
16128
16129 void
16130 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16131 {
16132  struct regexp *const r = ReANY(rx);
16133  RXi_GET_DECL(r,ri);
16134  GET_RE_DEBUG_FLAGS_DECL;
16135
16136  PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16137
16138  DEBUG_COMPILE_r({
16139   if (!PL_colorset)
16140    reginitcolors();
16141   {
16142    SV *dsv= sv_newmortal();
16143    RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16144     dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16145    PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16146     PL_colors[4],PL_colors[5],s);
16147   }
16148  });
16149 #ifdef RE_TRACK_PATTERN_OFFSETS
16150  if (ri->u.offsets)
16151   Safefree(ri->u.offsets);             /* 20010421 MJD */
16152 #endif
16153  if (ri->code_blocks) {
16154   int n;
16155   for (n = 0; n < ri->num_code_blocks; n++)
16156    SvREFCNT_dec(ri->code_blocks[n].src_regex);
16157   Safefree(ri->code_blocks);
16158  }
16159
16160  if (ri->data) {
16161   int n = ri->data->count;
16162
16163   while (--n >= 0) {
16164   /* If you add a ->what type here, update the comment in regcomp.h */
16165    switch (ri->data->what[n]) {
16166    case 'a':
16167    case 'r':
16168    case 's':
16169    case 'S':
16170    case 'u':
16171     SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16172     break;
16173    case 'f':
16174     Safefree(ri->data->data[n]);
16175     break;
16176    case 'l':
16177    case 'L':
16178     break;
16179    case 'T':
16180     { /* Aho Corasick add-on structure for a trie node.
16181      Used in stclass optimization only */
16182      U32 refcount;
16183      reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16184 #ifdef USE_ITHREADS
16185      dVAR;
16186 #endif
16187      OP_REFCNT_LOCK;
16188      refcount = --aho->refcount;
16189      OP_REFCNT_UNLOCK;
16190      if ( !refcount ) {
16191       PerlMemShared_free(aho->states);
16192       PerlMemShared_free(aho->fail);
16193       /* do this last!!!! */
16194       PerlMemShared_free(ri->data->data[n]);
16195       /* we should only ever get called once, so
16196       * assert as much, and also guard the free
16197       * which /might/ happen twice. At the least
16198       * it will make code anlyzers happy and it
16199       * doesn't cost much. - Yves */
16200       assert(ri->regstclass);
16201       if (ri->regstclass) {
16202        PerlMemShared_free(ri->regstclass);
16203        ri->regstclass = 0;
16204       }
16205      }
16206     }
16207     break;
16208    case 't':
16209     {
16210      /* trie structure. */
16211      U32 refcount;
16212      reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16213 #ifdef USE_ITHREADS
16214      dVAR;
16215 #endif
16216      OP_REFCNT_LOCK;
16217      refcount = --trie->refcount;
16218      OP_REFCNT_UNLOCK;
16219      if ( !refcount ) {
16220       PerlMemShared_free(trie->charmap);
16221       PerlMemShared_free(trie->states);
16222       PerlMemShared_free(trie->trans);
16223       if (trie->bitmap)
16224        PerlMemShared_free(trie->bitmap);
16225       if (trie->jump)
16226        PerlMemShared_free(trie->jump);
16227       PerlMemShared_free(trie->wordinfo);
16228       /* do this last!!!! */
16229       PerlMemShared_free(ri->data->data[n]);
16230      }
16231     }
16232     break;
16233    default:
16234     Perl_croak(aTHX_ "panic: regfree data code '%c'",
16235              ri->data->what[n]);
16236    }
16237   }
16238   Safefree(ri->data->what);
16239   Safefree(ri->data);
16240  }
16241
16242  Safefree(ri);
16243 }
16244
16245 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16246 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16247 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
16248
16249 /*
16250    re_dup - duplicate a regexp.
16251
16252    This routine is expected to clone a given regexp structure. It is only
16253    compiled under USE_ITHREADS.
16254
16255    After all of the core data stored in struct regexp is duplicated
16256    the regexp_engine.dupe method is used to copy any private data
16257    stored in the *pprivate pointer. This allows extensions to handle
16258    any duplication it needs to do.
16259
16260    See pregfree() and regfree_internal() if you change anything here.
16261 */
16262 #if defined(USE_ITHREADS)
16263 #ifndef PERL_IN_XSUB_RE
16264 void
16265 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16266 {
16267  dVAR;
16268  I32 npar;
16269  const struct regexp *r = ReANY(sstr);
16270  struct regexp *ret = ReANY(dstr);
16271
16272  PERL_ARGS_ASSERT_RE_DUP_GUTS;
16273
16274  npar = r->nparens+1;
16275  Newx(ret->offs, npar, regexp_paren_pair);
16276  Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16277
16278  if (ret->substrs) {
16279   /* Do it this way to avoid reading from *r after the StructCopy().
16280   That way, if any of the sv_dup_inc()s dislodge *r from the L1
16281   cache, it doesn't matter.  */
16282   const bool anchored = r->check_substr
16283    ? r->check_substr == r->anchored_substr
16284    : r->check_utf8 == r->anchored_utf8;
16285   Newx(ret->substrs, 1, struct reg_substr_data);
16286   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16287
16288   ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16289   ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16290   ret->float_substr = sv_dup_inc(ret->float_substr, param);
16291   ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16292
16293   /* check_substr and check_utf8, if non-NULL, point to either their
16294   anchored or float namesakes, and don't hold a second reference.  */
16295
16296   if (ret->check_substr) {
16297    if (anchored) {
16298     assert(r->check_utf8 == r->anchored_utf8);
16299     ret->check_substr = ret->anchored_substr;
16300     ret->check_utf8 = ret->anchored_utf8;
16301    } else {
16302     assert(r->check_substr == r->float_substr);
16303     assert(r->check_utf8 == r->float_utf8);
16304     ret->check_substr = ret->float_substr;
16305     ret->check_utf8 = ret->float_utf8;
16306    }
16307   } else if (ret->check_utf8) {
16308    if (anchored) {
16309     ret->check_utf8 = ret->anchored_utf8;
16310    } else {
16311     ret->check_utf8 = ret->float_utf8;
16312    }
16313   }
16314  }
16315
16316  RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16317  ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16318
16319  if (ret->pprivate)
16320   RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16321
16322  if (RX_MATCH_COPIED(dstr))
16323   ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
16324  else
16325   ret->subbeg = NULL;
16326 #ifdef PERL_ANY_COW
16327  ret->saved_copy = NULL;
16328 #endif
16329
16330  /* Whether mother_re be set or no, we need to copy the string.  We
16331  cannot refrain from copying it when the storage points directly to
16332  our mother regexp, because that's
16333    1: a buffer in a different thread
16334    2: something we no longer hold a reference on
16335    so we need to copy it locally.  */
16336  RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16337  ret->mother_re   = NULL;
16338 }
16339 #endif /* PERL_IN_XSUB_RE */
16340
16341 /*
16342    regdupe_internal()
16343
16344    This is the internal complement to regdupe() which is used to copy
16345    the structure pointed to by the *pprivate pointer in the regexp.
16346    This is the core version of the extension overridable cloning hook.
16347    The regexp structure being duplicated will be copied by perl prior
16348    to this and will be provided as the regexp *r argument, however
16349    with the /old/ structures pprivate pointer value. Thus this routine
16350    may override any copying normally done by perl.
16351
16352    It returns a pointer to the new regexp_internal structure.
16353 */
16354
16355 void *
16356 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16357 {
16358  dVAR;
16359  struct regexp *const r = ReANY(rx);
16360  regexp_internal *reti;
16361  int len;
16362  RXi_GET_DECL(r,ri);
16363
16364  PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16365
16366  len = ProgLen(ri);
16367
16368  Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16369   char, regexp_internal);
16370  Copy(ri->program, reti->program, len+1, regnode);
16371
16372  reti->num_code_blocks = ri->num_code_blocks;
16373  if (ri->code_blocks) {
16374   int n;
16375   Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16376     struct reg_code_block);
16377   Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16378     struct reg_code_block);
16379   for (n = 0; n < ri->num_code_blocks; n++)
16380    reti->code_blocks[n].src_regex = (REGEXP*)
16381      sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16382  }
16383  else
16384   reti->code_blocks = NULL;
16385
16386  reti->regstclass = NULL;
16387
16388  if (ri->data) {
16389   struct reg_data *d;
16390   const int count = ri->data->count;
16391   int i;
16392
16393   Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16394     char, struct reg_data);
16395   Newx(d->what, count, U8);
16396
16397   d->count = count;
16398   for (i = 0; i < count; i++) {
16399    d->what[i] = ri->data->what[i];
16400    switch (d->what[i]) {
16401     /* see also regcomp.h and regfree_internal() */
16402    case 'a': /* actually an AV, but the dup function is identical.  */
16403    case 'r':
16404    case 's':
16405    case 'S':
16406    case 'u': /* actually an HV, but the dup function is identical.  */
16407     d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16408     break;
16409    case 'f':
16410     /* This is cheating. */
16411     Newx(d->data[i], 1, regnode_ssc);
16412     StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16413     reti->regstclass = (regnode*)d->data[i];
16414     break;
16415    case 'T':
16416     /* Trie stclasses are readonly and can thus be shared
16417     * without duplication. We free the stclass in pregfree
16418     * when the corresponding reg_ac_data struct is freed.
16419     */
16420     reti->regstclass= ri->regstclass;
16421     /* FALLTHROUGH */
16422    case 't':
16423     OP_REFCNT_LOCK;
16424     ((reg_trie_data*)ri->data->data[i])->refcount++;
16425     OP_REFCNT_UNLOCK;
16426     /* FALLTHROUGH */
16427    case 'l':
16428    case 'L':
16429     d->data[i] = ri->data->data[i];
16430     break;
16431    default:
16432     Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16433               ri->data->what[i]);
16434    }
16435   }
16436
16437   reti->data = d;
16438  }
16439  else
16440   reti->data = NULL;
16441
16442  reti->name_list_idx = ri->name_list_idx;
16443
16444 #ifdef RE_TRACK_PATTERN_OFFSETS
16445  if (ri->u.offsets) {
16446   Newx(reti->u.offsets, 2*len+1, U32);
16447   Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16448  }
16449 #else
16450  SetProgLen(reti,len);
16451 #endif
16452
16453  return (void*)reti;
16454 }
16455
16456 #endif    /* USE_ITHREADS */
16457
16458 #ifndef PERL_IN_XSUB_RE
16459
16460 /*
16461  - regnext - dig the "next" pointer out of a node
16462  */
16463 regnode *
16464 Perl_regnext(pTHX_ regnode *p)
16465 {
16466  I32 offset;
16467
16468  if (!p)
16469   return(NULL);
16470
16471  if (OP(p) > REGNODE_MAX) {  /* regnode.type is unsigned */
16472   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16473             (int)OP(p), (int)REGNODE_MAX);
16474  }
16475
16476  offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16477  if (offset == 0)
16478   return(NULL);
16479
16480  return(p+offset);
16481 }
16482 #endif
16483
16484 STATIC void
16485 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16486 {
16487  va_list args;
16488  STRLEN l1 = strlen(pat1);
16489  STRLEN l2 = strlen(pat2);
16490  char buf[512];
16491  SV *msv;
16492  const char *message;
16493
16494  PERL_ARGS_ASSERT_RE_CROAK2;
16495
16496  if (l1 > 510)
16497   l1 = 510;
16498  if (l1 + l2 > 510)
16499   l2 = 510 - l1;
16500  Copy(pat1, buf, l1 , char);
16501  Copy(pat2, buf + l1, l2 , char);
16502  buf[l1 + l2] = '\n';
16503  buf[l1 + l2 + 1] = '\0';
16504  va_start(args, pat2);
16505  msv = vmess(buf, &args);
16506  va_end(args);
16507  message = SvPV_const(msv,l1);
16508  if (l1 > 512)
16509   l1 = 512;
16510  Copy(message, buf, l1 , char);
16511  /* l1-1 to avoid \n */
16512  Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16513 }
16514
16515 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
16516
16517 #ifndef PERL_IN_XSUB_RE
16518 void
16519 Perl_save_re_context(pTHX)
16520 {
16521  /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16522  if (PL_curpm) {
16523   const REGEXP * const rx = PM_GETRE(PL_curpm);
16524   if (rx) {
16525    U32 i;
16526    for (i = 1; i <= RX_NPARENS(rx); i++) {
16527     char digits[TYPE_CHARS(long)];
16528     const STRLEN len = my_snprintf(digits, sizeof(digits),
16529            "%lu", (long)i);
16530     GV *const *const gvp
16531      = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16532
16533     if (gvp) {
16534      GV * const gv = *gvp;
16535      if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16536       save_scalar(gv);
16537     }
16538    }
16539   }
16540  }
16541 }
16542 #endif
16543
16544 #ifdef DEBUGGING
16545
16546 STATIC void
16547 S_put_byte(pTHX_ SV *sv, int c)
16548 {
16549  PERL_ARGS_ASSERT_PUT_BYTE;
16550
16551  if (!isPRINT(c)) {
16552   switch (c) {
16553    case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16554    case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16555    case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16556    case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16557    case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16558
16559    default:
16560     Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16561     break;
16562   }
16563  }
16564  else {
16565   const char string = c;
16566   if (c == '-' || c == ']' || c == '\\' || c == '^')
16567    sv_catpvs(sv, "\\");
16568   sv_catpvn(sv, &string, 1);
16569  }
16570 }
16571
16572 STATIC void
16573 S_put_range(pTHX_ SV *sv, UV start, UV end)
16574 {
16575
16576  /* Appends to 'sv' a displayable version of the range of code points from
16577  * 'start' to 'end'.  It assumes that only ASCII printables are displayable
16578  * as-is (though some of these will be escaped by put_byte()).  For the
16579  * time being, this subroutine only works for latin1 (< 256) code points */
16580
16581  assert(start <= end);
16582
16583  PERL_ARGS_ASSERT_PUT_RANGE;
16584
16585  while (start <= end) {
16586   if (end - start < 3) {  /* Individual chars in short ranges */
16587    for (; start <= end; start++) {
16588     put_byte(sv, start);
16589    }
16590    break;
16591   }
16592
16593   /* For small ranges that include printable ASCII characters, it's more
16594   * legible to print those characters rather than hex values.  For
16595   * larger ranges that include more than printables, it's probably
16596   * clearer to just give the start and end points of the range in hex,
16597   * and that's all we can do if there aren't any printables within the
16598   * range
16599   *
16600   * On ASCII platforms the range of printables is contiguous.  If the
16601   * entire range is printable, we print each character as such.  If the
16602   * range is partially printable and partially not, it's less likely
16603   * that the individual printables are meaningful, especially if all or
16604   * almost all of them are in the range.  But we err on the side of the
16605   * individual printables being meaningful by using the hex only if the
16606   * range contains all but 2 of the printables.
16607   *
16608   * On EBCDIC platforms, the printables are scattered around so that the
16609   * maximum range length containing only them is about 10.  Anything
16610   * longer we treat as hex; otherwise we examine the range character by
16611   * character to see */
16612 #ifdef EBCDIC
16613   if (start < 256 && (((end < 255) ? end : 255) - start <= 10))
16614 #else
16615   if ((isPRINT_A(start) && isPRINT_A(end))
16616    || (end >= 0x7F && (isPRINT_A(start) && start > 0x21))
16617    || ((end < 0x7D && isPRINT_A(end)) && start < 0x20))
16618 #endif
16619   {
16620    /* If the range beginning isn't an ASCII printable, we find the
16621    * last such in the range, then split the output, so all the
16622    * non-printables are in one subrange; then process the remaining
16623    * portion as usual.  If the entire range isn't printables, we
16624    * don't split, but drop down to print as hex */
16625    if (! isPRINT_A(start)) {
16626     UV temp_end = start + 1;
16627     while (temp_end <= end && ! isPRINT_A(temp_end)) {
16628      temp_end++;
16629     }
16630     if (temp_end <= end) {
16631      put_range(sv, start, temp_end - 1);
16632      start = temp_end;
16633      continue;
16634     }
16635    }
16636
16637    /* If the range beginning is a digit, output a subrange of just the
16638    * digits, then process the remaining portion as usual */
16639    if (isDIGIT_A(start)) {
16640     put_byte(sv, start);
16641     sv_catpvs(sv, "-");
16642     while (start <= end && isDIGIT_A(start)) start++;
16643     put_byte(sv, start - 1);
16644     continue;
16645    }
16646
16647    /* Similarly for alphabetics.  Because in both ASCII and EBCDIC,
16648    * the code points for upper and lower A-Z and a-z aren't
16649    * intermixed, the resulting subrange will consist solely of either
16650    * upper- or lower- alphabetics */
16651    if (isALPHA_A(start)) {
16652     put_byte(sv, start);
16653     sv_catpvs(sv, "-");
16654     while (start <= end && isALPHA_A(start)) start++;
16655     put_byte(sv, start - 1);
16656     continue;
16657    }
16658
16659    /* We output any remaining printables as individual characters */
16660    if (isPUNCT_A(start) || isSPACE_A(start)) {
16661     while (start <= end && (isPUNCT_A(start) || isSPACE_A(start))) {
16662      put_byte(sv, start);
16663      start++;
16664     }
16665     continue;
16666    }
16667   }
16668
16669   /* Here is a control or non-ascii.  Output the range or subrange as
16670   * hex. */
16671   Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
16672      start,
16673      (end < 256) ? end : 255);
16674   break;
16675  }
16676 }
16677
16678 STATIC bool
16679 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16680 {
16681  /* Appends to 'sv' a displayable version of the innards of the bracketed
16682  * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
16683  * output anything */
16684
16685  int i;
16686  bool has_output_anything = FALSE;
16687
16688  PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16689
16690  for (i = 0; i < 256; i++) {
16691   if (BITMAP_TEST((U8 *) bitmap,i)) {
16692
16693    /* The character at index i should be output.  Find the next
16694    * character that should NOT be output */
16695    int j;
16696    for (j = i + 1; j < 256; j++) {
16697     if (! BITMAP_TEST((U8 *) bitmap, j)) {
16698      break;
16699     }
16700    }
16701
16702    /* Everything between them is a single range that should be output
16703    * */
16704    put_range(sv, i, j - 1);
16705    has_output_anything = TRUE;
16706    i = j;
16707   }
16708  }
16709
16710  return has_output_anything;
16711 }
16712
16713 #define CLEAR_OPTSTART \
16714  if (optstart) STMT_START {                                               \
16715   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
16716        " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16717   optstart=NULL;                                                       \
16718  } STMT_END
16719
16720 #define DUMPUNTIL(b,e)                                                       \
16721      CLEAR_OPTSTART;                                          \
16722      node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16723
16724 STATIC const regnode *
16725 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16726    const regnode *last, const regnode *plast,
16727    SV* sv, I32 indent, U32 depth)
16728 {
16729  U8 op = PSEUDO; /* Arbitrary non-END op. */
16730  const regnode *next;
16731  const regnode *optstart= NULL;
16732
16733  RXi_GET_DECL(r,ri);
16734  GET_RE_DEBUG_FLAGS_DECL;
16735
16736  PERL_ARGS_ASSERT_DUMPUNTIL;
16737
16738 #ifdef DEBUG_DUMPUNTIL
16739  PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16740   last ? last-start : 0,plast ? plast-start : 0);
16741 #endif
16742
16743  if (plast && plast < last)
16744   last= plast;
16745
16746  while (PL_regkind[op] != END && (!last || node < last)) {
16747   assert(node);
16748   /* While that wasn't END last time... */
16749   NODE_ALIGN(node);
16750   op = OP(node);
16751   if (op == CLOSE || op == WHILEM)
16752    indent--;
16753   next = regnext((regnode *)node);
16754
16755   /* Where, what. */
16756   if (OP(node) == OPTIMIZED) {
16757    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16758     optstart = node;
16759    else
16760     goto after_print;
16761   } else
16762    CLEAR_OPTSTART;
16763
16764   regprop(r, sv, node, NULL);
16765   PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16766      (int)(2*indent + 1), "", SvPVX_const(sv));
16767
16768   if (OP(node) != OPTIMIZED) {
16769    if (next == NULL)  /* Next ptr. */
16770     PerlIO_printf(Perl_debug_log, " (0)");
16771    else if (PL_regkind[(U8)op] == BRANCH
16772      && PL_regkind[OP(next)] != BRANCH )
16773     PerlIO_printf(Perl_debug_log, " (FAIL)");
16774    else
16775     PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16776    (void)PerlIO_putc(Perl_debug_log, '\n');
16777   }
16778
16779  after_print:
16780   if (PL_regkind[(U8)op] == BRANCHJ) {
16781    assert(next);
16782    {
16783     const regnode *nnode = (OP(next) == LONGJMP
16784          ? regnext((regnode *)next)
16785          : next);
16786     if (last && nnode > last)
16787      nnode = last;
16788     DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16789    }
16790   }
16791   else if (PL_regkind[(U8)op] == BRANCH) {
16792    assert(next);
16793    DUMPUNTIL(NEXTOPER(node), next);
16794   }
16795   else if ( PL_regkind[(U8)op]  == TRIE ) {
16796    const regnode *this_trie = node;
16797    const char op = OP(node);
16798    const U32 n = ARG(node);
16799    const reg_ac_data * const ac = op>=AHOCORASICK ?
16800    (reg_ac_data *)ri->data->data[n] :
16801    NULL;
16802    const reg_trie_data * const trie =
16803     (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16804 #ifdef DEBUGGING
16805    AV *const trie_words
16806       = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16807 #endif
16808    const regnode *nextbranch= NULL;
16809    I32 word_idx;
16810    sv_setpvs(sv, "");
16811    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16812     SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16813
16814     PerlIO_printf(Perl_debug_log, "%*s%s ",
16815     (int)(2*(indent+3)), "",
16816      elem_ptr
16817      ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
16818         SvCUR(*elem_ptr), 60,
16819         PL_colors[0], PL_colors[1],
16820         (SvUTF8(*elem_ptr)
16821         ? PERL_PV_ESCAPE_UNI
16822         : 0)
16823         | PERL_PV_PRETTY_ELLIPSES
16824         | PERL_PV_PRETTY_LTGT
16825        )
16826      : "???"
16827     );
16828     if (trie->jump) {
16829      U16 dist= trie->jump[word_idx+1];
16830      PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16831        (UV)((dist ? this_trie + dist : next) - start));
16832      if (dist) {
16833       if (!nextbranch)
16834        nextbranch= this_trie + trie->jump[0];
16835       DUMPUNTIL(this_trie + dist, nextbranch);
16836      }
16837      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16838       nextbranch= regnext((regnode *)nextbranch);
16839     } else {
16840      PerlIO_printf(Perl_debug_log, "\n");
16841     }
16842    }
16843    if (last && next > last)
16844     node= last;
16845    else
16846     node= next;
16847   }
16848   else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
16849    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16850      NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16851   }
16852   else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16853    assert(next);
16854    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16855   }
16856   else if ( op == PLUS || op == STAR) {
16857    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16858   }
16859   else if (PL_regkind[(U8)op] == ANYOF) {
16860    /* arglen 1 + class block */
16861    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16862       ? ANYOF_POSIXL_SKIP
16863       : ANYOF_SKIP);
16864    node = NEXTOPER(node);
16865   }
16866   else if (PL_regkind[(U8)op] == EXACT) {
16867    /* Literal string, where present. */
16868    node += NODE_SZ_STR(node) - 1;
16869    node = NEXTOPER(node);
16870   }
16871   else {
16872    node = NEXTOPER(node);
16873    node += regarglen[(U8)op];
16874   }
16875   if (op == CURLYX || op == OPEN)
16876    indent++;
16877  }
16878  CLEAR_OPTSTART;
16879 #ifdef DEBUG_DUMPUNTIL
16880  PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16881 #endif
16882  return node;
16883 }
16884
16885 #endif /* DEBUGGING */
16886
16887 /*
16888  * Local variables:
16889  * c-indentation-style: bsd
16890  * c-basic-offset: 4
16891  * indent-tabs-mode: nil
16892  * End:
16893  *
16894  * ex: set ts=8 sts=4 sw=4 et:
16895  */