src/5021002/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  * Copyright (c) 1986 by University of Toronto.
  42  * Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  * Permission is granted to anyone to use this software for any
  45  * purpose on any computer system, and to redistribute it freely,
  46  * subject to the following restrictions:
  47  *
  48  * 1. The author is not responsible for the consequences of use of
  49  *  this software, no matter how awful, even if they arise
  50  *  from defects in it.
  51  *
  52  * 2. The origin of this software must not be misrepresented, either
  53  *  by explicit claim or by omission.
  54  *
  55  * 3. Altered versions must be plainly marked as such, and must not
  56  *  be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  95  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  97  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 100
 101 #ifndef STATIC
 102 #define STATIC static
 103 #endif
 104
 105
 106 struct RExC_state_t {
 107  U32  flags;   /* RXf_* are we folding, multilining? */
 108  U32  pm_flags;  /* PMf_* stuff from the calling PMOP */
 109  char *precomp;  /* uncompiled string. */
 110  REGEXP *rx_sv;   /* The SV that is the regexp. */
 111  regexp *rx;                    /* perl core regexp structure */
 112  regexp_internal *rxi;           /* internal data for regexp object
 113           pprivate field */
 114  char *start;   /* Start of input for compile */
 115  char *end;   /* End of input for compile */
 116  char *parse;   /* Input-scan pointer. */
 117  SSize_t whilem_seen;  /* number of WHILEM in this expr */
 118  regnode *emit_start;  /* Start of emitted-code area */
 119  regnode *emit_bound;  /* First regnode outside of the
 120           allocated space */
 121  regnode *emit;   /* Code-emit pointer; if = &emit_dummy,
 122           implies compiling, so don't emit */
 123  regnode_ssc emit_dummy;  /* placeholder for emit to point to;
 124           large enough for the largest
 125           non-EXACTish node, so can use it as
 126           scratch in pass1 */
 127  I32  naughty;  /* How bad is this pattern? */
 128  I32  sawback;  /* Did we see \1, ...? */
 129  U32  seen;
 130  SSize_t size;   /* Code size. */
 131  I32                npar;            /* Capture buffer count, (OPEN) plus
 132           one. ("par" 0 is the whole
 133           pattern)*/
 134  I32  nestroot;  /* root parens we are in - used by
 135           accept */
 136  I32  extralen;
 137  I32  seen_zerolen;
 138  regnode **open_parens;  /* pointers to open parens */
 139  regnode **close_parens;  /* pointers to close parens */
 140  regnode *opend;   /* END node in program */
 141  I32  utf8;  /* whether the pattern is utf8 or not */
 142  I32  orig_utf8; /* whether the pattern was originally in utf8 */
 143         /* XXX use this for future optimisation of case
 144         * where pattern must be upgraded to utf8. */
 145  I32  uni_semantics; /* If a d charset modifier should use unicode
 146         rules, even if the pattern is not in
 147         utf8 */
 148  HV  *paren_names;  /* Paren names */
 149
 150  regnode **recurse;  /* Recurse regops */
 151  I32  recurse_count;  /* Number of recurse regops */
 152  U8          *study_chunk_recursed;  /* bitmap of which parens we have moved
 153           through */
 154  U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 155  I32  in_lookbehind;
 156  I32  contains_locale;
 157  I32  contains_i;
 158  I32  override_recoding;
 159  I32  in_multi_char_class;
 160  struct reg_code_block *code_blocks; /* positions of literal (?{})
 161            within pattern */
 162  int  num_code_blocks; /* size of code_blocks[] */
 163  int  code_index;  /* next code_blocks[] slot */
 164  SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 165 #ifdef ADD_TO_REGEXEC
 166  char  *starttry;  /* -Dr: where regtry was called. */
 167 #define RExC_starttry (pRExC_state->starttry)
 168 #endif
 169  SV  *runtime_code_qr; /* qr with the runtime code blocks */
 170 #ifdef DEBUGGING
 171  const char  *lastparse;
 172  I32         lastnum;
 173  AV          *paren_name_list;       /* idx -> name */
 174 #define RExC_lastparse (pRExC_state->lastparse)
 175 #define RExC_lastnum (pRExC_state->lastnum)
 176 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 177 #endif
 178 };
 179
 180 #define RExC_flags (pRExC_state->flags)
 181 #define RExC_pm_flags (pRExC_state->pm_flags)
 182 #define RExC_precomp (pRExC_state->precomp)
 183 #define RExC_rx_sv (pRExC_state->rx_sv)
 184 #define RExC_rx  (pRExC_state->rx)
 185 #define RExC_rxi (pRExC_state->rxi)
 186 #define RExC_start (pRExC_state->start)
 187 #define RExC_end (pRExC_state->end)
 188 #define RExC_parse (pRExC_state->parse)
 189 #define RExC_whilem_seen (pRExC_state->whilem_seen)
 190 #ifdef RE_TRACK_PATTERN_OFFSETS
 191 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
 192               others */
 193 #endif
 194 #define RExC_emit (pRExC_state->emit)
 195 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 196 #define RExC_emit_start (pRExC_state->emit_start)
 197 #define RExC_emit_bound (pRExC_state->emit_bound)
 198 #define RExC_naughty (pRExC_state->naughty)
 199 #define RExC_sawback (pRExC_state->sawback)
 200 #define RExC_seen (pRExC_state->seen)
 201 #define RExC_size (pRExC_state->size)
 202 #define RExC_maxlen        (pRExC_state->maxlen)
 203 #define RExC_npar (pRExC_state->npar)
 204 #define RExC_nestroot   (pRExC_state->nestroot)
 205 #define RExC_extralen (pRExC_state->extralen)
 206 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
 207 #define RExC_utf8 (pRExC_state->utf8)
 208 #define RExC_uni_semantics (pRExC_state->uni_semantics)
 209 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
 210 #define RExC_open_parens (pRExC_state->open_parens)
 211 #define RExC_close_parens (pRExC_state->close_parens)
 212 #define RExC_opend (pRExC_state->opend)
 213 #define RExC_paren_names (pRExC_state->paren_names)
 214 #define RExC_recurse (pRExC_state->recurse)
 215 #define RExC_recurse_count (pRExC_state->recurse_count)
 216 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 217 #define RExC_study_chunk_recursed_bytes  \
 218         (pRExC_state->study_chunk_recursed_bytes)
 219 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
 220 #define RExC_contains_locale (pRExC_state->contains_locale)
 221 #define RExC_contains_i (pRExC_state->contains_i)
 222 #define RExC_override_recoding (pRExC_state->override_recoding)
 223 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 224
 225
 226 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
 227 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 228   ((*s) == '{' && regcurly(s)))
 229
 230 /*
 231  * Flags to be passed up and down.
 232  */
 233 #define WORST  0 /* Worst case. */
 234 #define HASWIDTH 0x01 /* Known to match non-null strings. */
 235
 236 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 237  * character.  (There needs to be a case: in the switch statement in regexec.c
 238  * for any node marked SIMPLE.)  Note that this is not the same thing as
 239  * REGNODE_SIMPLE */
 240 #define SIMPLE  0x02
 241 #define SPSTART  0x04 /* Starts with * or + */
 242 #define POSTPONED 0x08    /* (?1),(?&name), (??{...}) or similar */
 243 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
 244 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 245
 246 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 247
 248 /* whether trie related optimizations are enabled */
 249 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 250 #define TRIE_STUDY_OPT
 251 #define FULL_TRIE_STUDY
 252 #define TRIE_STCLASS
 253 #endif
 254
 255
 256
 257 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 258 #define PBITVAL(paren) (1 << ((paren) & 7))
 259 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 260 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 261 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 262
 263 #define REQUIRE_UTF8 STMT_START {                                       \
 264          if (!UTF) {                           \
 265           *flagp = RESTART_UTF8;            \
 266           return NULL;                      \
 267          }                                     \
 268       } STMT_END
 269
 270 /* This converts the named class defined in regcomp.h to its equivalent class
 271  * number defined in handy.h. */
 272 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 273 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 274
 275 #define _invlist_union_complement_2nd(a, b, output) \
 276       _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 277 #define _invlist_intersection_complement_2nd(a, b, output) \
 278     _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 279
 280 /* About scan_data_t.
 281
 282   During optimisation we recurse through the regexp program performing
 283   various inplace (keyhole style) optimisations. In addition study_chunk
 284   and scan_commit populate this data structure with information about
 285   what strings MUST appear in the pattern. We look for the longest
 286   string that must appear at a fixed location, and we look for the
 287   longest string that may appear at a floating location. So for instance
 288   in the pattern:
 289
 290  /FOO[xX]A.*B[xX]BAR/
 291
 292   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 293   strings (because they follow a .* construct). study_chunk will identify
 294   both FOO and BAR as being the longest fixed and floating strings respectively.
 295
 296   The strings can be composites, for instance
 297
 298  /(f)(o)(o)/
 299
 300   will result in a composite fixed substring 'foo'.
 301
 302   For each string some basic information is maintained:
 303
 304   - offset or min_offset
 305  This is the position the string must appear at, or not before.
 306  It also implicitly (when combined with minlenp) tells us how many
 307  characters must match before the string we are searching for.
 308  Likewise when combined with minlenp and the length of the string it
 309  tells us how many characters must appear after the string we have
 310  found.
 311
 312   - max_offset
 313  Only used for floating strings. This is the rightmost point that
 314  the string can appear at. If set to SSize_t_MAX it indicates that the
 315  string can occur infinitely far to the right.
 316
 317   - minlenp
 318  A pointer to the minimum number of characters of the pattern that the
 319  string was found inside. This is important as in the case of positive
 320  lookahead or positive lookbehind we can have multiple patterns
 321  involved. Consider
 322
 323  /(?=FOO).*F/
 324
 325  The minimum length of the pattern overall is 3, the minimum length
 326  of the lookahead part is 3, but the minimum length of the part that
 327  will actually match is 1. So 'FOO's minimum length is 3, but the
 328  minimum length for the F is 1. This is important as the minimum length
 329  is used to determine offsets in front of and behind the string being
 330  looked for.  Since strings can be composites this is the length of the
 331  pattern at the time it was committed with a scan_commit. Note that
 332  the length is calculated by study_chunk, so that the minimum lengths
 333  are not known until the full pattern has been compiled, thus the
 334  pointer to the value.
 335
 336   - lookbehind
 337
 338  In the case of lookbehind the string being searched for can be
 339  offset past the start point of the final matching string.
 340  If this value was just blithely removed from the min_offset it would
 341  invalidate some of the calculations for how many chars must match
 342  before or after (as they are derived from min_offset and minlen and
 343  the length of the string being searched for).
 344  When the final pattern is compiled and the data is moved from the
 345  scan_data_t structure into the regexp structure the information
 346  about lookbehind is factored in, with the information that would
 347  have been lost precalculated in the end_shift field for the
 348  associated string.
 349
 350   The fields pos_min and pos_delta are used to store the minimum offset
 351   and the delta to the maximum offset at the current point in the pattern.
 352
 353 */
 354
 355 typedef struct scan_data_t {
 356  /*I32 len_min;      unused */
 357  /*I32 len_delta;    unused */
 358  SSize_t pos_min;
 359  SSize_t pos_delta;
 360  SV *last_found;
 361  SSize_t last_end;     /* min value, <0 unless valid. */
 362  SSize_t last_start_min;
 363  SSize_t last_start_max;
 364  SV **longest;     /* Either &l_fixed, or &l_float. */
 365  SV *longest_fixed;      /* longest fixed string found in pattern */
 366  SSize_t offset_fixed;   /* offset where it starts */
 367  SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 368  I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 369  SV *longest_float;      /* longest floating string found in pattern */
 370  SSize_t offset_float_min; /* earliest point in string it can appear */
 371  SSize_t offset_float_max; /* latest point in string it can appear */
 372  SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 373  SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 374  I32 flags;
 375  I32 whilem_c;
 376  SSize_t *last_closep;
 377  regnode_ssc *start_class;
 378 } scan_data_t;
 379
 380 /* The below is perhaps overboard, but this allows us to save a test at the
 381  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 382  * and 'a' differ by a single bit; the same with the upper and lower case of
 383  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 384  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 385  * then inverts it to form a mask, with just a single 0, in the bit position
 386  * where the upper- and lowercase differ.  XXX There are about 40 other
 387  * instances in the Perl core where this micro-optimization could be used.
 388  * Should decide if maintenance cost is worse, before changing those
 389  *
 390  * Returns a boolean as to whether or not 'v' is either a lowercase or
 391  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 392  * compile-time constant, the generated code is better than some optimizing
 393  * compilers figure out, amounting to a mask and test.  The results are
 394  * meaningless if 'c' is not one of [A-Za-z] */
 395 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 396        (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 397
 398 /*
 399  * Forward declarations for pregcomp()'s friends.
 400  */
 401
 402 static const scan_data_t zero_scan_data =
 403   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 404
 405 #define SF_BEFORE_EOL  (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 406 #define SF_BEFORE_SEOL  0x0001
 407 #define SF_BEFORE_MEOL  0x0002
 408 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 409 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 410
 411 #define SF_FIX_SHIFT_EOL (+2)
 412 #define SF_FL_SHIFT_EOL  (+4)
 413
 414 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 415 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 416
 417 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 418 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 419 #define SF_IS_INF  0x0040
 420 #define SF_HAS_PAR  0x0080
 421 #define SF_IN_PAR  0x0100
 422 #define SF_HAS_EVAL  0x0200
 423 #define SCF_DO_SUBSTR  0x0400
 424 #define SCF_DO_STCLASS_AND 0x0800
 425 #define SCF_DO_STCLASS_OR 0x1000
 426 #define SCF_DO_STCLASS  (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 427 #define SCF_WHILEM_VISITED_POS 0x2000
 428
 429 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 430 #define SCF_SEEN_ACCEPT         0x8000
 431 #define SCF_TRIE_DOING_RESTUDY 0x10000
 432
 433 #define UTF cBOOL(RExC_utf8)
 434
 435 /* The enums for all these are ordered so things work out correctly */
 436 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 437 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 438              == REGEX_DEPENDS_CHARSET)
 439 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 440 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 441              >= REGEX_UNICODE_CHARSET)
 442 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 443            == REGEX_ASCII_RESTRICTED_CHARSET)
 444 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 445            >= REGEX_ASCII_RESTRICTED_CHARSET)
 446 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 447           == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 448
 449 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 450
 451 /* For programs that want to be strictly Unicode compatible by dying if any
 452  * attempt is made to match a non-Unicode code point against a Unicode
 453  * property.  */
 454 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 455
 456 #define OOB_NAMEDCLASS  -1
 457
 458 /* There is no code point that is out-of-bounds, so this is problematic.  But
 459  * its only current use is to initialize a variable that is always set before
 460  * looked at. */
 461 #define OOB_UNICODE  0xDEADBEEF
 462
 463 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 464 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 465
 466
 467 /* length of regex to show in messages that don't mark a position within */
 468 #define RegexLengthToShowInErrorMessages 127
 469
 470 /*
 471  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 472  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 473  * op/pragma/warn/regcomp.
 474  */
 475 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 476 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 477
 478 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 479       " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 480
 481 #define REPORT_LOCATION_ARGS(offset)            \
 482     UTF8fARG(UTF, offset, RExC_precomp), \
 483     UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 484
 485 /*
 486  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 487  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 488  * "...".
 489  */
 490 #define _FAIL(code) STMT_START {     \
 491  const char *ellipses = "";      \
 492  IV len = RExC_end - RExC_precomp;     \
 493                   \
 494  if (!SIZE_ONLY)       \
 495   SAVEFREESV(RExC_rx_sv);      \
 496  if (len > RegexLengthToShowInErrorMessages) {   \
 497   /* chop 10 shorter than the max, to ensure meaning of "..." */ \
 498   len = RegexLengthToShowInErrorMessages - 10;   \
 499   ellipses = "...";      \
 500  }         \
 501  code;                                                               \
 502 } STMT_END
 503
 504 #define FAIL(msg) _FAIL(       \
 505  Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",     \
 506    msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 507
 508 #define FAIL2(msg,arg) _FAIL(       \
 509  Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",     \
 510    arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 511
 512 /*
 513  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 514  */
 515 #define Simple_vFAIL(m) STMT_START {     \
 516  const IV offset = RExC_parse - RExC_precomp;   \
 517  Perl_croak(aTHX_ "%s" REPORT_LOCATION,    \
 518    m, REPORT_LOCATION_ARGS(offset)); \
 519 } STMT_END
 520
 521 /*
 522  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 523  */
 524 #define vFAIL(m) STMT_START {    \
 525  if (!SIZE_ONLY)     \
 526   SAVEFREESV(RExC_rx_sv);    \
 527  Simple_vFAIL(m);     \
 528 } STMT_END
 529
 530 /*
 531  * Like Simple_vFAIL(), but accepts two arguments.
 532  */
 533 #define Simple_vFAIL2(m,a1) STMT_START {   \
 534  const IV offset = RExC_parse - RExC_precomp;   \
 535  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,   \
 536      REPORT_LOCATION_ARGS(offset)); \
 537 } STMT_END
 538
 539 /*
 540  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 541  */
 542 #define vFAIL2(m,a1) STMT_START {   \
 543  if (!SIZE_ONLY)     \
 544   SAVEFREESV(RExC_rx_sv);    \
 545  Simple_vFAIL2(m, a1);    \
 546 } STMT_END
 547
 548
 549 /*
 550  * Like Simple_vFAIL(), but accepts three arguments.
 551  */
 552 #define Simple_vFAIL3(m, a1, a2) STMT_START {   \
 553  const IV offset = RExC_parse - RExC_precomp;  \
 554  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
 555    REPORT_LOCATION_ARGS(offset)); \
 556 } STMT_END
 557
 558 /*
 559  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 560  */
 561 #define vFAIL3(m,a1,a2) STMT_START {   \
 562  if (!SIZE_ONLY)     \
 563   SAVEFREESV(RExC_rx_sv);    \
 564  Simple_vFAIL3(m, a1, a2);    \
 565 } STMT_END
 566
 567 /*
 568  * Like Simple_vFAIL(), but accepts four arguments.
 569  */
 570 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {  \
 571  const IV offset = RExC_parse - RExC_precomp;  \
 572  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,  \
 573    REPORT_LOCATION_ARGS(offset)); \
 574 } STMT_END
 575
 576 #define vFAIL4(m,a1,a2,a3) STMT_START {   \
 577  if (!SIZE_ONLY)     \
 578   SAVEFREESV(RExC_rx_sv);    \
 579  Simple_vFAIL4(m, a1, a2, a3);   \
 580 } STMT_END
 581
 582 /* A specialized version of vFAIL2 that works with UTF8f */
 583 #define vFAIL2utf8f(m, a1) STMT_START { \
 584  const IV offset = RExC_parse - RExC_precomp;   \
 585  if (!SIZE_ONLY)                                \
 586   SAVEFREESV(RExC_rx_sv);                    \
 587  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 588    REPORT_LOCATION_ARGS(offset));         \
 589 } STMT_END
 590
 591
 592 /* m is not necessarily a "literal string", in this macro */
 593 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 594  const IV offset = loc - RExC_precomp;                               \
 595  Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 596    m, REPORT_LOCATION_ARGS(offset));       \
 597 } STMT_END
 598
 599 #define ckWARNreg(loc,m) STMT_START {     \
 600  const IV offset = loc - RExC_precomp;    \
 601  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 602    REPORT_LOCATION_ARGS(offset));  \
 603 } STMT_END
 604
 605 #define vWARN_dep(loc, m) STMT_START {            \
 606  const IV offset = loc - RExC_precomp;    \
 607  Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
 608    REPORT_LOCATION_ARGS(offset));         \
 609 } STMT_END
 610
 611 #define ckWARNdep(loc,m) STMT_START {            \
 612  const IV offset = loc - RExC_precomp;    \
 613  Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                 \
 614    m REPORT_LOCATION,      \
 615    REPORT_LOCATION_ARGS(offset));  \
 616 } STMT_END
 617
 618 #define ckWARNregdep(loc,m) STMT_START {    \
 619  const IV offset = loc - RExC_precomp;    \
 620  Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 621    m REPORT_LOCATION,      \
 622    REPORT_LOCATION_ARGS(offset));  \
 623 } STMT_END
 624
 625 #define ckWARN2reg_d(loc,m, a1) STMT_START {    \
 626  const IV offset = loc - RExC_precomp;    \
 627  Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),   \
 628    m REPORT_LOCATION,      \
 629    a1, REPORT_LOCATION_ARGS(offset)); \
 630 } STMT_END
 631
 632 #define ckWARN2reg(loc, m, a1) STMT_START {    \
 633  const IV offset = loc - RExC_precomp;    \
 634  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 635    a1, REPORT_LOCATION_ARGS(offset)); \
 636 } STMT_END
 637
 638 #define vWARN3(loc, m, a1, a2) STMT_START {    \
 639  const IV offset = loc - RExC_precomp;    \
 640  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 641    a1, a2, REPORT_LOCATION_ARGS(offset)); \
 642 } STMT_END
 643
 644 #define ckWARN3reg(loc, m, a1, a2) STMT_START {    \
 645  const IV offset = loc - RExC_precomp;    \
 646  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 647    a1, a2, REPORT_LOCATION_ARGS(offset)); \
 648 } STMT_END
 649
 650 #define vWARN4(loc, m, a1, a2, a3) STMT_START {    \
 651  const IV offset = loc - RExC_precomp;    \
 652  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 653    a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 654 } STMT_END
 655
 656 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {   \
 657  const IV offset = loc - RExC_precomp;    \
 658  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 659    a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 660 } STMT_END
 661
 662 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {   \
 663  const IV offset = loc - RExC_precomp;    \
 664  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 665    a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 666 } STMT_END
 667
 668
 669 /* Allow for side effects in s */
 670 #define REGC(c,s) STMT_START {   \
 671  if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 672 } STMT_END
 673
 674 /* Macros for recording node offsets.   20001227 mjd@plover.com
 675  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 676  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 677  * Element 0 holds the number n.
 678  * Position is 1 indexed.
 679  */
 680 #ifndef RE_TRACK_PATTERN_OFFSETS
 681 #define Set_Node_Offset_To_R(node,byte)
 682 #define Set_Node_Offset(node,byte)
 683 #define Set_Cur_Node_Offset
 684 #define Set_Node_Length_To_R(node,len)
 685 #define Set_Node_Length(node,len)
 686 #define Set_Node_Cur_Length(node,start)
 687 #define Node_Offset(n)
 688 #define Node_Length(n)
 689 #define Set_Node_Offset_Length(node,offset,len)
 690 #define ProgLen(ri) ri->u.proglen
 691 #define SetProgLen(ri,x) ri->u.proglen = x
 692 #else
 693 #define ProgLen(ri) ri->u.offsets[0]
 694 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 695 #define Set_Node_Offset_To_R(node,byte) STMT_START {   \
 696  if (! SIZE_ONLY) {       \
 697   MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",  \
 698      __LINE__, (int)(node), (int)(byte)));  \
 699   if((node) < 0) {      \
 700    Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 701           (int)(node));                  \
 702   } else {       \
 703    RExC_offsets[2*(node)-1] = (byte);    \
 704   }        \
 705  }         \
 706 } STMT_END
 707
 708 #define Set_Node_Offset(node,byte) \
 709  Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 710 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 711
 712 #define Set_Node_Length_To_R(node,len) STMT_START {   \
 713  if (! SIZE_ONLY) {       \
 714   MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",  \
 715     __LINE__, (int)(node), (int)(len)));   \
 716   if((node) < 0) {      \
 717    Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 718           (int)(node));                  \
 719   } else {       \
 720    RExC_offsets[2*(node)] = (len);    \
 721   }        \
 722  }         \
 723 } STMT_END
 724
 725 #define Set_Node_Length(node,len) \
 726  Set_Node_Length_To_R((node)-RExC_emit_start, len)
 727 #define Set_Node_Cur_Length(node, start)                \
 728  Set_Node_Length(node, RExC_parse - start)
 729
 730 /* Get offsets and lengths */
 731 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 732 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 733
 734 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
 735  Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
 736  Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
 737 } STMT_END
 738 #endif
 739
 740 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 741 #define EXPERIMENTAL_INPLACESCAN
 742 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 743
 744 #define DEBUG_RExC_seen() \
 745   DEBUG_OPTIMISE_MORE_r({                                             \
 746    PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 747                    \
 748    if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 749     PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 750                    \
 751    if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 752     PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 753                    \
 754    if (RExC_seen & REG_GPOS_SEEN)                                  \
 755     PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 756                    \
 757    if (RExC_seen & REG_CANY_SEEN)                                  \
 758     PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 759                    \
 760    if (RExC_seen & REG_RECURSE_SEEN)                               \
 761     PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 762                    \
 763    if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 764     PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 765                    \
 766    if (RExC_seen & REG_VERBARG_SEEN)                               \
 767     PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 768                    \
 769    if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 770     PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 771                    \
 772    if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 773     PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 774                    \
 775    if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 776     PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 777                    \
 778    if (RExC_seen & REG_GOSTART_SEEN)                               \
 779     PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 780                    \
 781    if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 782     PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 783                    \
 784    PerlIO_printf(Perl_debug_log,"\n");                             \
 785   });
 786
 787 #define DEBUG_STUDYDATA(str,data,depth)                              \
 788 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 789  PerlIO_printf(Perl_debug_log,                                    \
 790   "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 791   " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 792   (int)(depth)*2, "",                                          \
 793   (IV)((data)->pos_min),                                       \
 794   (IV)((data)->pos_delta),                                     \
 795   (UV)((data)->flags),                                         \
 796   (IV)((data)->whilem_c),                                      \
 797   (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 798   is_inf ? "INF " : ""                                         \
 799  );                                                               \
 800  if ((data)->last_found)                                          \
 801   PerlIO_printf(Perl_debug_log,                                \
 802    "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 803    " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 804    SvPVX_const((data)->last_found),                         \
 805    (IV)((data)->last_end),                                  \
 806    (IV)((data)->last_start_min),                            \
 807    (IV)((data)->last_start_max),                            \
 808    ((data)->longest &&                                      \
 809    (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 810    SvPVX_const((data)->longest_fixed),                      \
 811    (IV)((data)->offset_fixed),                              \
 812    ((data)->longest &&                                      \
 813    (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 814    SvPVX_const((data)->longest_float),                      \
 815    (IV)((data)->offset_float_min),                          \
 816    (IV)((data)->offset_float_max)                           \
 817   );                                                           \
 818  PerlIO_printf(Perl_debug_log,"\n");                              \
 819 });
 820
 821 /* Mark that we cannot extend a found fixed substring at this point.
 822    Update the longest found anchored substring and the longest found
 823    floating substrings if needed. */
 824
 825 STATIC void
 826 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 827      SSize_t *minlenp, int is_inf)
 828 {
 829  const STRLEN l = CHR_SVLEN(data->last_found);
 830  const STRLEN old_l = CHR_SVLEN(*data->longest);
 831  GET_RE_DEBUG_FLAGS_DECL;
 832
 833  PERL_ARGS_ASSERT_SCAN_COMMIT;
 834
 835  if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 836   SvSetMagicSV(*data->longest, data->last_found);
 837   if (*data->longest == data->longest_fixed) {
 838    data->offset_fixed = l ? data->last_start_min : data->pos_min;
 839    if (data->flags & SF_BEFORE_EOL)
 840     data->flags
 841      |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 842    else
 843     data->flags &= ~SF_FIX_BEFORE_EOL;
 844    data->minlen_fixed=minlenp;
 845    data->lookbehind_fixed=0;
 846   }
 847   else { /* *data->longest == data->longest_float */
 848    data->offset_float_min = l ? data->last_start_min : data->pos_min;
 849    data->offset_float_max = (l
 850          ? data->last_start_max
 851          : (data->pos_delta == SSize_t_MAX
 852           ? SSize_t_MAX
 853           : data->pos_min + data->pos_delta));
 854    if (is_inf
 855     || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 856     data->offset_float_max = SSize_t_MAX;
 857    if (data->flags & SF_BEFORE_EOL)
 858     data->flags
 859      |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 860    else
 861     data->flags &= ~SF_FL_BEFORE_EOL;
 862    data->minlen_float=minlenp;
 863    data->lookbehind_float=0;
 864   }
 865  }
 866  SvCUR_set(data->last_found, 0);
 867  {
 868   SV * const sv = data->last_found;
 869   if (SvUTF8(sv) && SvMAGICAL(sv)) {
 870    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 871    if (mg)
 872     mg->mg_len = 0;
 873   }
 874  }
 875  data->last_end = -1;
 876  data->flags &= ~SF_BEFORE_EOL;
 877  DEBUG_STUDYDATA("commit: ",data,0);
 878 }
 879
 880 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 881  * list that describes which code points it matches */
 882
 883 STATIC void
 884 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 885 {
 886  /* Set the SSC 'ssc' to match an empty string or any code point */
 887
 888  PERL_ARGS_ASSERT_SSC_ANYTHING;
 889
 890  assert(is_ANYOF_SYNTHETIC(ssc));
 891
 892  ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 893  _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 894  ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING;    /* Plus match empty string */
 895 }
 896
 897 STATIC int
 898 S_ssc_is_anything(const regnode_ssc *ssc)
 899 {
 900  /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 901  * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 902  * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 903  * in any way, so there's no point in using it */
 904
 905  UV start, end;
 906  bool ret;
 907
 908  PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 909
 910  assert(is_ANYOF_SYNTHETIC(ssc));
 911
 912  if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
 913   return FALSE;
 914  }
 915
 916  /* See if the list consists solely of the range 0 - Infinity */
 917  invlist_iterinit(ssc->invlist);
 918  ret = invlist_iternext(ssc->invlist, &start, &end)
 919   && start == 0
 920   && end == UV_MAX;
 921
 922  invlist_iterfinish(ssc->invlist);
 923
 924  if (ret) {
 925   return TRUE;
 926  }
 927
 928  /* If e.g., both \w and \W are set, matches everything */
 929  if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 930   int i;
 931   for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 932    if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 933     return TRUE;
 934    }
 935   }
 936  }
 937
 938  return FALSE;
 939 }
 940
 941 STATIC void
 942 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 943 {
 944  /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 945  * string, any code point, or any posix class under locale */
 946
 947  PERL_ARGS_ASSERT_SSC_INIT;
 948
 949  Zero(ssc, 1, regnode_ssc);
 950  set_ANYOF_SYNTHETIC(ssc);
 951  ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
 952  ssc_anything(ssc);
 953
 954  /* If any portion of the regex is to operate under locale rules,
 955  * initialization includes it.  The reason this isn't done for all regexes
 956  * is that the optimizer was written under the assumption that locale was
 957  * all-or-nothing.  Given the complexity and lack of documentation in the
 958  * optimizer, and that there are inadequate test cases for locale, many
 959  * parts of it may not work properly, it is safest to avoid locale unless
 960  * necessary. */
 961  if (RExC_contains_locale) {
 962   ANYOF_POSIXL_SETALL(ssc);
 963  }
 964  else {
 965   ANYOF_POSIXL_ZERO(ssc);
 966  }
 967 }
 968
 969 STATIC int
 970 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
 971       const regnode_ssc *ssc)
 972 {
 973  /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
 974  * to the list of code points matched, and locale posix classes; hence does
 975  * not check its flags) */
 976
 977  UV start, end;
 978  bool ret;
 979
 980  PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
 981
 982  assert(is_ANYOF_SYNTHETIC(ssc));
 983
 984  invlist_iterinit(ssc->invlist);
 985  ret = invlist_iternext(ssc->invlist, &start, &end)
 986   && start == 0
 987   && end == UV_MAX;
 988
 989  invlist_iterfinish(ssc->invlist);
 990
 991  if (! ret) {
 992   return FALSE;
 993  }
 994
 995  if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
 996   return FALSE;
 997  }
 998
 999  return TRUE;
1000 }
1001
1002 STATIC SV*
1003 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1004        const regnode_charclass* const node)
1005 {
1006  /* Returns a mortal inversion list defining which code points are matched
1007  * by 'node', which is of type ANYOF.  Handles complementing the result if
1008  * appropriate.  If some code points aren't knowable at this time, the
1009  * returned list must, and will, contain every code point that is a
1010  * possibility. */
1011
1012  SV* invlist = sv_2mortal(_new_invlist(0));
1013  SV* only_utf8_locale_invlist = NULL;
1014  unsigned int i;
1015  const U32 n = ARG(node);
1016  bool new_node_has_latin1 = FALSE;
1017
1018  PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1019
1020  /* Look at the data structure created by S_set_ANYOF_arg() */
1021  if (n != ANYOF_NONBITMAP_EMPTY) {
1022   SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1023   AV * const av = MUTABLE_AV(SvRV(rv));
1024   SV **const ary = AvARRAY(av);
1025   assert(RExC_rxi->data->what[n] == 's');
1026
1027   if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1028    invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1029   }
1030   else if (ary[0] && ary[0] != &PL_sv_undef) {
1031
1032    /* Here, no compile-time swash, and there are things that won't be
1033    * known until runtime -- we have to assume it could be anything */
1034    return _add_range_to_invlist(invlist, 0, UV_MAX);
1035   }
1036   else if (ary[3] && ary[3] != &PL_sv_undef) {
1037
1038    /* Here no compile-time swash, and no run-time only data.  Use the
1039    * node's inversion list */
1040    invlist = sv_2mortal(invlist_clone(ary[3]));
1041   }
1042
1043   /* Get the code points valid only under UTF-8 locales */
1044   if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1045    && ary[2] && ary[2] != &PL_sv_undef)
1046   {
1047    only_utf8_locale_invlist = ary[2];
1048   }
1049  }
1050
1051  /* An ANYOF node contains a bitmap for the first 256 code points, and an
1052  * inversion list for the others, but if there are code points that should
1053  * match only conditionally on the target string being UTF-8, those are
1054  * placed in the inversion list, and not the bitmap.  Since there are
1055  * circumstances under which they could match, they are included in the
1056  * SSC.  But if the ANYOF node is to be inverted, we have to exclude them
1057  * here, so that when we invert below, the end result actually does include
1058  * them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We have to do this here
1059  * before we add the unconditionally matched code points */
1060  if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1061   _invlist_intersection_complement_2nd(invlist,
1062            PL_UpperLatin1,
1063            &invlist);
1064  }
1065
1066  /* Add in the points from the bit map */
1067  for (i = 0; i < 256; i++) {
1068   if (ANYOF_BITMAP_TEST(node, i)) {
1069    invlist = add_cp_to_invlist(invlist, i);
1070    new_node_has_latin1 = TRUE;
1071   }
1072  }
1073
1074  /* If this can match all upper Latin1 code points, have to add them
1075  * as well */
1076  if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
1077   _invlist_union(invlist, PL_UpperLatin1, &invlist);
1078  }
1079
1080  /* Similarly for these */
1081  if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1082   invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1083  }
1084
1085  if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1086   _invlist_invert(invlist);
1087  }
1088  else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1089
1090   /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1091   * locale.  We can skip this if there are no 0-255 at all. */
1092   _invlist_union(invlist, PL_Latin1, &invlist);
1093  }
1094
1095  /* Similarly add the UTF-8 locale possible matches.  These have to be
1096  * deferred until after the non-UTF-8 locale ones are taken care of just
1097  * above, or it leads to wrong results under ANYOF_INVERT */
1098  if (only_utf8_locale_invlist) {
1099   _invlist_union_maybe_complement_2nd(invlist,
1100            only_utf8_locale_invlist,
1101            ANYOF_FLAGS(node) & ANYOF_INVERT,
1102            &invlist);
1103  }
1104
1105  return invlist;
1106 }
1107
1108 /* These two functions currently do the exact same thing */
1109 #define ssc_init_zero  ssc_init
1110
1111 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1112 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1113
1114 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1115  * should not be inverted.  'and_with->flags & ANYOF_POSIXL' should be 0 if
1116  * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1117
1118 STATIC void
1119 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1120     const regnode_charclass *and_with)
1121 {
1122  /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1123  * another SSC or a regular ANYOF class.  Can create false positives. */
1124
1125  SV* anded_cp_list;
1126  U8  anded_flags;
1127
1128  PERL_ARGS_ASSERT_SSC_AND;
1129
1130  assert(is_ANYOF_SYNTHETIC(ssc));
1131
1132  /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1133  * the code point inversion list and just the relevant flags */
1134  if (is_ANYOF_SYNTHETIC(and_with)) {
1135   anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1136   anded_flags = ANYOF_FLAGS(and_with);
1137
1138   /* XXX This is a kludge around what appears to be deficiencies in the
1139   * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1140   * there are paths through the optimizer where it doesn't get weeded
1141   * out when it should.  And if we don't make some extra provision for
1142   * it like the code just below, it doesn't get added when it should.
1143   * This solution is to add it only when AND'ing, which is here, and
1144   * only when what is being AND'ed is the pristine, original node
1145   * matching anything.  Thus it is like adding it to ssc_anything() but
1146   * only when the result is to be AND'ed.  Probably the same solution
1147   * could be adopted for the same problem we have with /l matching,
1148   * which is solved differently in S_ssc_init(), and that would lead to
1149   * fewer false positives than that solution has.  But if this solution
1150   * creates bugs, the consequences are only that a warning isn't raised
1151   * that should be; while the consequences for having /l bugs is
1152   * incorrect matches */
1153   if (ssc_is_anything((regnode_ssc *)and_with)) {
1154    anded_flags |= ANYOF_WARN_SUPER;
1155   }
1156  }
1157  else {
1158   anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1159   anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1160  }
1161
1162  ANYOF_FLAGS(ssc) &= anded_flags;
1163
1164  /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1165  * C2 is the list of code points in 'and-with'; P2, its posix classes.
1166  * 'and_with' may be inverted.  When not inverted, we have the situation of
1167  * computing:
1168  *  (C1 | P1) & (C2 | P2)
1169  *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1170  *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1171  *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1172  *                    <=  ((C1 & C2) | P1 | P2)
1173  * Alternatively, the last few steps could be:
1174  *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1175  *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1176  *                    <=  (C1 | C2 | (P1 & P2))
1177  * We favor the second approach if either P1 or P2 is non-empty.  This is
1178  * because these components are a barrier to doing optimizations, as what
1179  * they match cannot be known until the moment of matching as they are
1180  * dependent on the current locale, 'AND"ing them likely will reduce or
1181  * eliminate them.
1182  * But we can do better if we know that C1,P1 are in their initial state (a
1183  * frequent occurrence), each matching everything:
1184  *  (<everything>) & (C2 | P2) =  C2 | P2
1185  * Similarly, if C2,P2 are in their initial state (again a frequent
1186  * occurrence), the result is a no-op
1187  *  (C1 | P1) & (<everything>) =  C1 | P1
1188  *
1189  * Inverted, we have
1190  *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1191  *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1192  *                         <=  (C1 & ~C2) | (P1 & ~P2)
1193  * */
1194
1195  if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1196   && ! is_ANYOF_SYNTHETIC(and_with))
1197  {
1198   unsigned int i;
1199
1200   ssc_intersection(ssc,
1201       anded_cp_list,
1202       FALSE /* Has already been inverted */
1203       );
1204
1205   /* If either P1 or P2 is empty, the intersection will be also; can skip
1206   * the loop */
1207   if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1208    ANYOF_POSIXL_ZERO(ssc);
1209   }
1210   else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1211
1212    /* Note that the Posix class component P from 'and_with' actually
1213    * looks like:
1214    *      P = Pa | Pb | ... | Pn
1215    * where each component is one posix class, such as in [\w\s].
1216    * Thus
1217    *      ~P = ~(Pa | Pb | ... | Pn)
1218    *         = ~Pa & ~Pb & ... & ~Pn
1219    *        <= ~Pa | ~Pb | ... | ~Pn
1220    * The last is something we can easily calculate, but unfortunately
1221    * is likely to have many false positives.  We could do better
1222    * in some (but certainly not all) instances if two classes in
1223    * P have known relationships.  For example
1224    *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1225    * So
1226    *      :lower: & :print: = :lower:
1227    * And similarly for classes that must be disjoint.  For example,
1228    * since \s and \w can have no elements in common based on rules in
1229    * the POSIX standard,
1230    *      \w & ^\S = nothing
1231    * Unfortunately, some vendor locales do not meet the Posix
1232    * standard, in particular almost everything by Microsoft.
1233    * The loop below just changes e.g., \w into \W and vice versa */
1234
1235    regnode_charclass_posixl temp;
1236    int add = 1;    /* To calculate the index of the complement */
1237
1238    ANYOF_POSIXL_ZERO(&temp);
1239    for (i = 0; i < ANYOF_MAX; i++) {
1240     assert(i % 2 != 0
1241      || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1242      || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1243
1244     if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1245      ANYOF_POSIXL_SET(&temp, i + add);
1246     }
1247     add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1248    }
1249    ANYOF_POSIXL_AND(&temp, ssc);
1250
1251   } /* else ssc already has no posixes */
1252  } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1253   in its initial state */
1254  else if (! is_ANYOF_SYNTHETIC(and_with)
1255    || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1256  {
1257   /* But if 'ssc' is in its initial state, the result is just 'and_with';
1258   * copy it over 'ssc' */
1259   if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1260    if (is_ANYOF_SYNTHETIC(and_with)) {
1261     StructCopy(and_with, ssc, regnode_ssc);
1262    }
1263    else {
1264     ssc->invlist = anded_cp_list;
1265     ANYOF_POSIXL_ZERO(ssc);
1266     if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1267      ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1268     }
1269    }
1270   }
1271   else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1272     || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1273   {
1274    /* One or the other of P1, P2 is non-empty. */
1275    if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1276     ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1277    }
1278    ssc_union(ssc, anded_cp_list, FALSE);
1279   }
1280   else { /* P1 = P2 = empty */
1281    ssc_intersection(ssc, anded_cp_list, FALSE);
1282   }
1283  }
1284 }
1285
1286 STATIC void
1287 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1288    const regnode_charclass *or_with)
1289 {
1290  /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1291  * another SSC or a regular ANYOF class.  Can create false positives if
1292  * 'or_with' is to be inverted. */
1293
1294  SV* ored_cp_list;
1295  U8 ored_flags;
1296
1297  PERL_ARGS_ASSERT_SSC_OR;
1298
1299  assert(is_ANYOF_SYNTHETIC(ssc));
1300
1301  /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1302  * the code point inversion list and just the relevant flags */
1303  if (is_ANYOF_SYNTHETIC(or_with)) {
1304   ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1305   ored_flags = ANYOF_FLAGS(or_with);
1306  }
1307  else {
1308   ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1309   ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1310  }
1311
1312  ANYOF_FLAGS(ssc) |= ored_flags;
1313
1314  /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1315  * C2 is the list of code points in 'or-with'; P2, its posix classes.
1316  * 'or_with' may be inverted.  When not inverted, we have the simple
1317  * situation of computing:
1318  *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1319  * If P1|P2 yields a situation with both a class and its complement are
1320  * set, like having both \w and \W, this matches all code points, and we
1321  * can delete these from the P component of the ssc going forward.  XXX We
1322  * might be able to delete all the P components, but I (khw) am not certain
1323  * about this, and it is better to be safe.
1324  *
1325  * Inverted, we have
1326  *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1327  *                         <=  (C1 | P1) | ~C2
1328  *                         <=  (C1 | ~C2) | P1
1329  * (which results in actually simpler code than the non-inverted case)
1330  * */
1331
1332  if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1333   && ! is_ANYOF_SYNTHETIC(or_with))
1334  {
1335   /* We ignore P2, leaving P1 going forward */
1336  }   /* else  Not inverted */
1337  else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
1338   ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1339   if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1340    unsigned int i;
1341    for (i = 0; i < ANYOF_MAX; i += 2) {
1342     if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1343     {
1344      ssc_match_all_cp(ssc);
1345      ANYOF_POSIXL_CLEAR(ssc, i);
1346      ANYOF_POSIXL_CLEAR(ssc, i+1);
1347     }
1348    }
1349   }
1350  }
1351
1352  ssc_union(ssc,
1353    ored_cp_list,
1354    FALSE /* Already has been inverted */
1355    );
1356 }
1357
1358 PERL_STATIC_INLINE void
1359 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1360 {
1361  PERL_ARGS_ASSERT_SSC_UNION;
1362
1363  assert(is_ANYOF_SYNTHETIC(ssc));
1364
1365  _invlist_union_maybe_complement_2nd(ssc->invlist,
1366           invlist,
1367           invert2nd,
1368           &ssc->invlist);
1369 }
1370
1371 PERL_STATIC_INLINE void
1372 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1373       SV* const invlist,
1374       const bool invert2nd)
1375 {
1376  PERL_ARGS_ASSERT_SSC_INTERSECTION;
1377
1378  assert(is_ANYOF_SYNTHETIC(ssc));
1379
1380  _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1381            invlist,
1382            invert2nd,
1383            &ssc->invlist);
1384 }
1385
1386 PERL_STATIC_INLINE void
1387 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1388 {
1389  PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1390
1391  assert(is_ANYOF_SYNTHETIC(ssc));
1392
1393  ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1394 }
1395
1396 PERL_STATIC_INLINE void
1397 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1398 {
1399  /* AND just the single code point 'cp' into the SSC 'ssc' */
1400
1401  SV* cp_list = _new_invlist(2);
1402
1403  PERL_ARGS_ASSERT_SSC_CP_AND;
1404
1405  assert(is_ANYOF_SYNTHETIC(ssc));
1406
1407  cp_list = add_cp_to_invlist(cp_list, cp);
1408  ssc_intersection(ssc, cp_list,
1409      FALSE /* Not inverted */
1410      );
1411  SvREFCNT_dec_NN(cp_list);
1412 }
1413
1414 PERL_STATIC_INLINE void
1415 S_ssc_clear_locale(regnode_ssc *ssc)
1416 {
1417  /* Set the SSC 'ssc' to not match any locale things */
1418  PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1419
1420  assert(is_ANYOF_SYNTHETIC(ssc));
1421
1422  ANYOF_POSIXL_ZERO(ssc);
1423  ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1424 }
1425
1426 STATIC void
1427 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1428 {
1429  /* The inversion list in the SSC is marked mortal; now we need a more
1430  * permanent copy, which is stored the same way that is done in a regular
1431  * ANYOF node, with the first 256 code points in a bit map */
1432
1433  SV* invlist = invlist_clone(ssc->invlist);
1434
1435  PERL_ARGS_ASSERT_SSC_FINALIZE;
1436
1437  assert(is_ANYOF_SYNTHETIC(ssc));
1438
1439  /* The code in this file assumes that all but these flags aren't relevant
1440  * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1441  * time we reach here */
1442  assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1443
1444  populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1445
1446  set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1447         NULL, NULL, NULL, FALSE);
1448
1449  /* Make sure is clone-safe */
1450  ssc->invlist = NULL;
1451
1452  if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1453   ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
1454  }
1455
1456  assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1457 }
1458
1459 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1460 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1461 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1462 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1463        ? (TRIE_LIST_CUR( idx ) - 1)           \
1464        : 0 )
1465
1466
1467 #ifdef DEBUGGING
1468 /*
1469    dump_trie(trie,widecharmap,revcharmap)
1470    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1471    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1472
1473    These routines dump out a trie in a somewhat readable format.
1474    The _interim_ variants are used for debugging the interim
1475    tables that are used to generate the final compressed
1476    representation which is what dump_trie expects.
1477
1478    Part of the reason for their existence is to provide a form
1479    of documentation as to how the different representations function.
1480
1481 */
1482
1483 /*
1484   Dumps the final compressed table form of the trie to Perl_debug_log.
1485   Used for debugging make_trie().
1486 */
1487
1488 STATIC void
1489 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1490    AV *revcharmap, U32 depth)
1491 {
1492  U32 state;
1493  SV *sv=sv_newmortal();
1494  int colwidth= widecharmap ? 6 : 4;
1495  U16 word;
1496  GET_RE_DEBUG_FLAGS_DECL;
1497
1498  PERL_ARGS_ASSERT_DUMP_TRIE;
1499
1500  PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1501   (int)depth * 2 + 2,"",
1502   "Match","Base","Ofs" );
1503
1504  for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1505   SV ** const tmp = av_fetch( revcharmap, state, 0);
1506   if ( tmp ) {
1507    PerlIO_printf( Perl_debug_log, "%*s",
1508     colwidth,
1509     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1510        PL_colors[0], PL_colors[1],
1511        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1512        PERL_PV_ESCAPE_FIRSTCHAR
1513     )
1514    );
1515   }
1516  }
1517  PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1518   (int)depth * 2 + 2,"");
1519
1520  for( state = 0 ; state < trie->uniquecharcount ; state++ )
1521   PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1522  PerlIO_printf( Perl_debug_log, "\n");
1523
1524  for( state = 1 ; state < trie->statecount ; state++ ) {
1525   const U32 base = trie->states[ state ].trans.base;
1526
1527   PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1528          (int)depth * 2 + 2,"", (UV)state);
1529
1530   if ( trie->states[ state ].wordnum ) {
1531    PerlIO_printf( Perl_debug_log, " W%4X",
1532           trie->states[ state ].wordnum );
1533   } else {
1534    PerlIO_printf( Perl_debug_log, "%6s", "" );
1535   }
1536
1537   PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1538
1539   if ( base ) {
1540    U32 ofs = 0;
1541
1542    while( ( base + ofs  < trie->uniquecharcount ) ||
1543     ( base + ofs - trie->uniquecharcount < trie->lasttrans
1544      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1545                  != state))
1546      ofs++;
1547
1548    PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1549
1550    for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1551     if ( ( base + ofs >= trie->uniquecharcount )
1552       && ( base + ofs - trie->uniquecharcount
1553               < trie->lasttrans )
1554       && trie->trans[ base + ofs
1555          - trie->uniquecharcount ].check == state )
1556     {
1557     PerlIO_printf( Perl_debug_log, "%*"UVXf,
1558      colwidth,
1559      (UV)trie->trans[ base + ofs
1560            - trie->uniquecharcount ].next );
1561     } else {
1562      PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1563     }
1564    }
1565
1566    PerlIO_printf( Perl_debug_log, "]");
1567
1568   }
1569   PerlIO_printf( Perl_debug_log, "\n" );
1570  }
1571  PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1572         (int)depth*2, "");
1573  for (word=1; word <= trie->wordcount; word++) {
1574   PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1575    (int)word, (int)(trie->wordinfo[word].prev),
1576    (int)(trie->wordinfo[word].len));
1577  }
1578  PerlIO_printf(Perl_debug_log, "\n" );
1579 }
1580 /*
1581   Dumps a fully constructed but uncompressed trie in list form.
1582   List tries normally only are used for construction when the number of
1583   possible chars (trie->uniquecharcount) is very high.
1584   Used for debugging make_trie().
1585 */
1586 STATIC void
1587 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1588       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1589       U32 depth)
1590 {
1591  U32 state;
1592  SV *sv=sv_newmortal();
1593  int colwidth= widecharmap ? 6 : 4;
1594  GET_RE_DEBUG_FLAGS_DECL;
1595
1596  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1597
1598  /* print out the table precompression.  */
1599  PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1600   (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1601   "------:-----+-----------------\n" );
1602
1603  for( state=1 ; state < next_alloc ; state ++ ) {
1604   U16 charid;
1605
1606   PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1607    (int)depth * 2 + 2,"", (UV)state  );
1608   if ( ! trie->states[ state ].wordnum ) {
1609    PerlIO_printf( Perl_debug_log, "%5s| ","");
1610   } else {
1611    PerlIO_printf( Perl_debug_log, "W%4x| ",
1612     trie->states[ state ].wordnum
1613    );
1614   }
1615   for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1616    SV ** const tmp = av_fetch( revcharmap,
1617           TRIE_LIST_ITEM(state,charid).forid, 0);
1618    if ( tmp ) {
1619     PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1620      colwidth,
1621      pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1622        colwidth,
1623        PL_colors[0], PL_colors[1],
1624        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1625        | PERL_PV_ESCAPE_FIRSTCHAR
1626      ) ,
1627      TRIE_LIST_ITEM(state,charid).forid,
1628      (UV)TRIE_LIST_ITEM(state,charid).newstate
1629     );
1630     if (!(charid % 10))
1631      PerlIO_printf(Perl_debug_log, "\n%*s| ",
1632       (int)((depth * 2) + 14), "");
1633    }
1634   }
1635   PerlIO_printf( Perl_debug_log, "\n");
1636  }
1637 }
1638
1639 /*
1640   Dumps a fully constructed but uncompressed trie in table form.
1641   This is the normal DFA style state transition table, with a few
1642   twists to facilitate compression later.
1643   Used for debugging make_trie().
1644 */
1645 STATIC void
1646 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1647       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1648       U32 depth)
1649 {
1650  U32 state;
1651  U16 charid;
1652  SV *sv=sv_newmortal();
1653  int colwidth= widecharmap ? 6 : 4;
1654  GET_RE_DEBUG_FLAGS_DECL;
1655
1656  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1657
1658  /*
1659  print out the table precompression so that we can do a visual check
1660  that they are identical.
1661  */
1662
1663  PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1664
1665  for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1666   SV ** const tmp = av_fetch( revcharmap, charid, 0);
1667   if ( tmp ) {
1668    PerlIO_printf( Perl_debug_log, "%*s",
1669     colwidth,
1670     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1671        PL_colors[0], PL_colors[1],
1672        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1673        PERL_PV_ESCAPE_FIRSTCHAR
1674     )
1675    );
1676   }
1677  }
1678
1679  PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1680
1681  for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1682   PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1683  }
1684
1685  PerlIO_printf( Perl_debug_log, "\n" );
1686
1687  for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1688
1689   PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1690    (int)depth * 2 + 2,"",
1691    (UV)TRIE_NODENUM( state ) );
1692
1693   for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1694    UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1695    if (v)
1696     PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1697    else
1698     PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1699   }
1700   if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1701    PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1702            (UV)trie->trans[ state ].check );
1703   } else {
1704    PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1705            (UV)trie->trans[ state ].check,
1706    trie->states[ TRIE_NODENUM( state ) ].wordnum );
1707   }
1708  }
1709 }
1710
1711 #endif
1712
1713
1714 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1715   startbranch: the first branch in the whole branch sequence
1716   first      : start branch of sequence of branch-exact nodes.
1717    May be the same as startbranch
1718   last       : Thing following the last branch.
1719    May be the same as tail.
1720   tail       : item following the branch sequence
1721   count      : words in the sequence
1722   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
1723   depth      : indent depth
1724
1725 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1726
1727 A trie is an N'ary tree where the branches are determined by digital
1728 decomposition of the key. IE, at the root node you look up the 1st character and
1729 follow that branch repeat until you find the end of the branches. Nodes can be
1730 marked as "accepting" meaning they represent a complete word. Eg:
1731
1732   /he|she|his|hers/
1733
1734 would convert into the following structure. Numbers represent states, letters
1735 following numbers represent valid transitions on the letter from that state, if
1736 the number is in square brackets it represents an accepting state, otherwise it
1737 will be in parenthesis.
1738
1739  +-h->+-e->[3]-+-r->(8)-+-s->[9]
1740  |    |
1741  |   (2)
1742  |    |
1743  (1)   +-i->(6)-+-s->[7]
1744  |
1745  +-s->(3)-+-h->(4)-+-e->[5]
1746
1747  Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1748
1749 This shows that when matching against the string 'hers' we will begin at state 1
1750 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1751 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1752 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1753 single traverse. We store a mapping from accepting to state to which word was
1754 matched, and then when we have multiple possibilities we try to complete the
1755 rest of the regex in the order in which they occured in the alternation.
1756
1757 The only prior NFA like behaviour that would be changed by the TRIE support is
1758 the silent ignoring of duplicate alternations which are of the form:
1759
1760  / (DUPE|DUPE) X? (?{ ... }) Y /x
1761
1762 Thus EVAL blocks following a trie may be called a different number of times with
1763 and without the optimisation. With the optimisations dupes will be silently
1764 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1765 the following demonstrates:
1766
1767  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1768
1769 which prints out 'word' three times, but
1770
1771  'words'=~/(word|word|word)(?{ print $1 })S/
1772
1773 which doesnt print it out at all. This is due to other optimisations kicking in.
1774
1775 Example of what happens on a structural level:
1776
1777 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1778
1779    1: CURLYM[1] {1,32767}(18)
1780    5:   BRANCH(8)
1781    6:     EXACT <ac>(16)
1782    8:   BRANCH(11)
1783    9:     EXACT <ad>(16)
1784   11:   BRANCH(14)
1785   12:     EXACT <ab>(16)
1786   16:   SUCCEED(0)
1787   17:   NOTHING(18)
1788   18: END(0)
1789
1790 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1791 and should turn into:
1792
1793    1: CURLYM[1] {1,32767}(18)
1794    5:   TRIE(16)
1795   [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1796   <ac>
1797   <ad>
1798   <ab>
1799   16:   SUCCEED(0)
1800   17:   NOTHING(18)
1801   18: END(0)
1802
1803 Cases where tail != last would be like /(?foo|bar)baz/:
1804
1805    1: BRANCH(4)
1806    2:   EXACT <foo>(8)
1807    4: BRANCH(7)
1808    5:   EXACT <bar>(8)
1809    7: TAIL(8)
1810    8: EXACT <baz>(10)
1811   10: END(0)
1812
1813 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1814 and would end up looking like:
1815
1816  1: TRIE(8)
1817  [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1818   <foo>
1819   <bar>
1820    7: TAIL(8)
1821    8: EXACT <baz>(10)
1822   10: END(0)
1823
1824  d = uvchr_to_utf8_flags(d, uv, 0);
1825
1826 is the recommended Unicode-aware way of saying
1827
1828  *(d++) = uv;
1829 */
1830
1831 #define TRIE_STORE_REVCHAR(val)                                            \
1832  STMT_START {                                                           \
1833   if (UTF) {          \
1834    SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1835    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);    \
1836    unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1837    SvCUR_set(zlopp, kapow - flrbbbbb);       \
1838    SvPOK_on(zlopp);         \
1839    SvUTF8_on(zlopp);         \
1840    av_push(revcharmap, zlopp);        \
1841   } else {          \
1842    char ooooff = (char)val;                                           \
1843    av_push(revcharmap, newSVpvn(&ooooff, 1));      \
1844   }           \
1845   } STMT_END
1846
1847 /* This gets the next character from the input, folding it if not already
1848  * folded. */
1849 #define TRIE_READ_CHAR STMT_START {                                           \
1850  wordlen++;                                                                \
1851  if ( UTF ) {                                                              \
1852   /* if it is UTF then it is either already folded, or does not need    \
1853   * folding */                                                         \
1854   uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
1855  }                                                                         \
1856  else if (folder == PL_fold_latin1) {                                      \
1857   /* This folder implies Unicode rules, which in the range expressible  \
1858   *  by not UTF is the lower case, with the two exceptions, one of     \
1859   *  which should have been taken care of before calling this */       \
1860   assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
1861   uvc = toLOWER_L1(*uc);                                                \
1862   if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
1863   len = 1;                                                              \
1864  } else {                                                                  \
1865   /* raw data, will be folded later if needed */                        \
1866   uvc = (U32)*uc;                                                       \
1867   len = 1;                                                              \
1868  }                                                                         \
1869 } STMT_END
1870
1871
1872
1873 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1874  if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1875   U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1876   Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1877  }                                                           \
1878  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1879  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1880  TRIE_LIST_CUR( state )++;                                   \
1881 } STMT_END
1882
1883 #define TRIE_LIST_NEW(state) STMT_START {                       \
1884  Newxz( trie->states[ state ].trans.list,               \
1885   4, reg_trie_trans_le );                                 \
1886  TRIE_LIST_CUR( state ) = 1;                                \
1887  TRIE_LIST_LEN( state ) = 4;                                \
1888 } STMT_END
1889
1890 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1891  U16 dupe= trie->states[ state ].wordnum;                    \
1892  regnode * const noper_next = regnext( noper );              \
1893                 \
1894  DEBUG_r({                                                   \
1895   /* store the word for dumping */                        \
1896   SV* tmp;                                                \
1897   if (OP(noper) != NOTHING)                               \
1898    tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1899   else                                                    \
1900    tmp = newSVpvn_utf8( "", 0, UTF );   \
1901   av_push( trie_words, tmp );                             \
1902  });                                                         \
1903                 \
1904  curword++;                                                  \
1905  trie->wordinfo[curword].prev   = 0;                         \
1906  trie->wordinfo[curword].len    = wordlen;                   \
1907  trie->wordinfo[curword].accept = state;                     \
1908                 \
1909  if ( noper_next < tail ) {                                  \
1910   if (!trie->jump)                                        \
1911    trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1912             sizeof(U16) ); \
1913   trie->jump[curword] = (U16)(noper_next - convert);      \
1914   if (!jumper)                                            \
1915    jumper = noper_next;                                \
1916   if (!nextbranch)                                        \
1917    nextbranch= regnext(cur);                           \
1918  }                                                           \
1919                 \
1920  if ( dupe ) {                                               \
1921   /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1922   /* chain, so that when the bits of chain are later    */\
1923   /* linked together, the dups appear in the chain      */\
1924   trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1925   trie->wordinfo[dupe].prev = curword;                    \
1926  } else {                                                    \
1927   /* we haven't inserted this word yet.                */ \
1928   trie->states[ state ].wordnum = curword;                \
1929  }                                                           \
1930 } STMT_END
1931
1932
1933 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)  \
1934  ( ( base + charid >=  ucharcount     \
1935   && base + charid < ubound     \
1936   && state == trie->trans[ base - ucharcount + charid ].check \
1937   && trie->trans[ base - ucharcount + charid ].next )  \
1938   ? trie->trans[ base - ucharcount + charid ].next  \
1939   : ( state==1 ? special : 0 )     \
1940  )
1941
1942 #define MADE_TRIE       1
1943 #define MADE_JUMP_TRIE  2
1944 #define MADE_EXACT_TRIE 4
1945
1946 STATIC I32
1947 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1948     regnode *first, regnode *last, regnode *tail,
1949     U32 word_count, U32 flags, U32 depth)
1950 {
1951  /* first pass, loop through and scan words */
1952  reg_trie_data *trie;
1953  HV *widecharmap = NULL;
1954  AV *revcharmap = newAV();
1955  regnode *cur;
1956  STRLEN len = 0;
1957  UV uvc = 0;
1958  U16 curword = 0;
1959  U32 next_alloc = 0;
1960  regnode *jumper = NULL;
1961  regnode *nextbranch = NULL;
1962  regnode *convert = NULL;
1963  U32 *prev_states; /* temp array mapping each state to previous one */
1964  /* we just use folder as a flag in utf8 */
1965  const U8 * folder = NULL;
1966
1967 #ifdef DEBUGGING
1968  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1969  AV *trie_words = NULL;
1970  /* along with revcharmap, this only used during construction but both are
1971  * useful during debugging so we store them in the struct when debugging.
1972  */
1973 #else
1974  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1975  STRLEN trie_charcount=0;
1976 #endif
1977  SV *re_trie_maxbuff;
1978  GET_RE_DEBUG_FLAGS_DECL;
1979
1980  PERL_ARGS_ASSERT_MAKE_TRIE;
1981 #ifndef DEBUGGING
1982  PERL_UNUSED_ARG(depth);
1983 #endif
1984
1985  switch (flags) {
1986   case EXACT: break;
1987   case EXACTFA:
1988   case EXACTFU_SS:
1989   case EXACTFU: folder = PL_fold_latin1; break;
1990   case EXACTF:  folder = PL_fold; break;
1991   default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1992  }
1993
1994  trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1995  trie->refcount = 1;
1996  trie->startstate = 1;
1997  trie->wordcount = word_count;
1998  RExC_rxi->data->data[ data_slot ] = (void*)trie;
1999  trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2000  if (flags == EXACT)
2001   trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2002  trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2003      trie->wordcount+1, sizeof(reg_trie_wordinfo));
2004
2005  DEBUG_r({
2006   trie_words = newAV();
2007  });
2008
2009  re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2010  assert(re_trie_maxbuff);
2011  if (!SvIOK(re_trie_maxbuff)) {
2012   sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2013  }
2014  DEBUG_TRIE_COMPILE_r({
2015   PerlIO_printf( Perl_debug_log,
2016   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2017   (int)depth * 2 + 2, "",
2018   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2019   REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2020  });
2021
2022    /* Find the node we are going to overwrite */
2023  if ( first == startbranch && OP( last ) != BRANCH ) {
2024   /* whole branch chain */
2025   convert = first;
2026  } else {
2027   /* branch sub-chain */
2028   convert = NEXTOPER( first );
2029  }
2030
2031  /*  -- First loop and Setup --
2032
2033  We first traverse the branches and scan each word to determine if it
2034  contains widechars, and how many unique chars there are, this is
2035  important as we have to build a table with at least as many columns as we
2036  have unique chars.
2037
2038  We use an array of integers to represent the character codes 0..255
2039  (trie->charmap) and we use a an HV* to store Unicode characters. We use
2040  the native representation of the character value as the key and IV's for
2041  the coded index.
2042
2043  *TODO* If we keep track of how many times each character is used we can
2044  remap the columns so that the table compression later on is more
2045  efficient in terms of memory by ensuring the most common value is in the
2046  middle and the least common are on the outside.  IMO this would be better
2047  than a most to least common mapping as theres a decent chance the most
2048  common letter will share a node with the least common, meaning the node
2049  will not be compressible. With a middle is most common approach the worst
2050  case is when we have the least common nodes twice.
2051
2052  */
2053
2054  for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2055   regnode *noper = NEXTOPER( cur );
2056   const U8 *uc = (U8*)STRING( noper );
2057   const U8 *e  = uc + STR_LEN( noper );
2058   int foldlen = 0;
2059   U32 wordlen      = 0;         /* required init */
2060   STRLEN minchars = 0;
2061   STRLEN maxchars = 0;
2062   bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2063            bitmap?*/
2064
2065   if (OP(noper) == NOTHING) {
2066    regnode *noper_next= regnext(noper);
2067    if (noper_next != tail && OP(noper_next) == flags) {
2068     noper = noper_next;
2069     uc= (U8*)STRING(noper);
2070     e= uc + STR_LEN(noper);
2071     trie->minlen= STR_LEN(noper);
2072    } else {
2073     trie->minlen= 0;
2074     continue;
2075    }
2076   }
2077
2078   if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2079    TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2080           regardless of encoding */
2081    if (OP( noper ) == EXACTFU_SS) {
2082     /* false positives are ok, so just set this */
2083     TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2084    }
2085   }
2086   for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2087           branch */
2088    TRIE_CHARCOUNT(trie)++;
2089    TRIE_READ_CHAR;
2090
2091    /* TRIE_READ_CHAR returns the current character, or its fold if /i
2092    * is in effect.  Under /i, this character can match itself, or
2093    * anything that folds to it.  If not under /i, it can match just
2094    * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2095    * all fold to k, and all are single characters.   But some folds
2096    * expand to more than one character, so for example LATIN SMALL
2097    * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2098    * the string beginning at 'uc' is 'ffi', it could be matched by
2099    * three characters, or just by the one ligature character. (It
2100    * could also be matched by two characters: LATIN SMALL LIGATURE FF
2101    * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2102    * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2103    * match.)  The trie needs to know the minimum and maximum number
2104    * of characters that could match so that it can use size alone to
2105    * quickly reject many match attempts.  The max is simple: it is
2106    * the number of folded characters in this branch (since a fold is
2107    * never shorter than what folds to it. */
2108
2109    maxchars++;
2110
2111    /* And the min is equal to the max if not under /i (indicated by
2112    * 'folder' being NULL), or there are no multi-character folds.  If
2113    * there is a multi-character fold, the min is incremented just
2114    * once, for the character that folds to the sequence.  Each
2115    * character in the sequence needs to be added to the list below of
2116    * characters in the trie, but we count only the first towards the
2117    * min number of characters needed.  This is done through the
2118    * variable 'foldlen', which is returned by the macros that look
2119    * for these sequences as the number of bytes the sequence
2120    * occupies.  Each time through the loop, we decrement 'foldlen' by
2121    * how many bytes the current char occupies.  Only when it reaches
2122    * 0 do we increment 'minchars' or look for another multi-character
2123    * sequence. */
2124    if (folder == NULL) {
2125     minchars++;
2126    }
2127    else if (foldlen > 0) {
2128     foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2129    }
2130    else {
2131     minchars++;
2132
2133     /* See if *uc is the beginning of a multi-character fold.  If
2134     * so, we decrement the length remaining to look at, to account
2135     * for the current character this iteration.  (We can use 'uc'
2136     * instead of the fold returned by TRIE_READ_CHAR because for
2137     * non-UTF, the latin1_safe macro is smart enough to account
2138     * for all the unfolded characters, and because for UTF, the
2139     * string will already have been folded earlier in the
2140     * compilation process */
2141     if (UTF) {
2142      if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2143       foldlen -= UTF8SKIP(uc);
2144      }
2145     }
2146     else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2147      foldlen--;
2148     }
2149    }
2150
2151    /* The current character (and any potential folds) should be added
2152    * to the possible matching characters for this position in this
2153    * branch */
2154    if ( uvc < 256 ) {
2155     if ( folder ) {
2156      U8 folded= folder[ (U8) uvc ];
2157      if ( !trie->charmap[ folded ] ) {
2158       trie->charmap[ folded ]=( ++trie->uniquecharcount );
2159       TRIE_STORE_REVCHAR( folded );
2160      }
2161     }
2162     if ( !trie->charmap[ uvc ] ) {
2163      trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2164      TRIE_STORE_REVCHAR( uvc );
2165     }
2166     if ( set_bit ) {
2167      /* store the codepoint in the bitmap, and its folded
2168      * equivalent. */
2169      TRIE_BITMAP_SET(trie, uvc);
2170
2171      /* store the folded codepoint */
2172      if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2173
2174      if ( !UTF ) {
2175       /* store first byte of utf8 representation of
2176       variant codepoints */
2177       if (! UVCHR_IS_INVARIANT(uvc)) {
2178        TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2179       }
2180      }
2181      set_bit = 0; /* We've done our bit :-) */
2182     }
2183    } else {
2184
2185     /* XXX We could come up with the list of code points that fold
2186     * to this using PL_utf8_foldclosures, except not for
2187     * multi-char folds, as there may be multiple combinations
2188     * there that could work, which needs to wait until runtime to
2189     * resolve (The comment about LIGATURE FFI above is such an
2190     * example */
2191
2192     SV** svpp;
2193     if ( !widecharmap )
2194      widecharmap = newHV();
2195
2196     svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2197
2198     if ( !svpp )
2199      Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2200
2201     if ( !SvTRUE( *svpp ) ) {
2202      sv_setiv( *svpp, ++trie->uniquecharcount );
2203      TRIE_STORE_REVCHAR(uvc);
2204     }
2205    }
2206   } /* end loop through characters in this branch of the trie */
2207
2208   /* We take the min and max for this branch and combine to find the min
2209   * and max for all branches processed so far */
2210   if( cur == first ) {
2211    trie->minlen = minchars;
2212    trie->maxlen = maxchars;
2213   } else if (minchars < trie->minlen) {
2214    trie->minlen = minchars;
2215   } else if (maxchars > trie->maxlen) {
2216    trie->maxlen = maxchars;
2217   }
2218  } /* end first pass */
2219  DEBUG_TRIE_COMPILE_r(
2220   PerlIO_printf( Perl_debug_log,
2221     "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2222     (int)depth * 2 + 2,"",
2223     ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2224     (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2225     (int)trie->minlen, (int)trie->maxlen )
2226  );
2227
2228  /*
2229   We now know what we are dealing with in terms of unique chars and
2230   string sizes so we can calculate how much memory a naive
2231   representation using a flat table  will take. If it's over a reasonable
2232   limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2233   conservative but potentially much slower representation using an array
2234   of lists.
2235
2236   At the end we convert both representations into the same compressed
2237   form that will be used in regexec.c for matching with. The latter
2238   is a form that cannot be used to construct with but has memory
2239   properties similar to the list form and access properties similar
2240   to the table form making it both suitable for fast searches and
2241   small enough that its feasable to store for the duration of a program.
2242
2243   See the comment in the code where the compressed table is produced
2244   inplace from the flat tabe representation for an explanation of how
2245   the compression works.
2246
2247  */
2248
2249
2250  Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2251  prev_states[1] = 0;
2252
2253  if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2254              > SvIV(re_trie_maxbuff) )
2255  {
2256   /*
2257    Second Pass -- Array Of Lists Representation
2258
2259    Each state will be represented by a list of charid:state records
2260    (reg_trie_trans_le) the first such element holds the CUR and LEN
2261    points of the allocated array. (See defines above).
2262
2263    We build the initial structure using the lists, and then convert
2264    it into the compressed table form which allows faster lookups
2265    (but cant be modified once converted).
2266   */
2267
2268   STRLEN transcount = 1;
2269
2270   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2271    "%*sCompiling trie using list compiler\n",
2272    (int)depth * 2 + 2, ""));
2273
2274   trie->states = (reg_trie_state *)
2275    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2276         sizeof(reg_trie_state) );
2277   TRIE_LIST_NEW(1);
2278   next_alloc = 2;
2279
2280   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2281
2282    regnode *noper   = NEXTOPER( cur );
2283    U8 *uc           = (U8*)STRING( noper );
2284    const U8 *e      = uc + STR_LEN( noper );
2285    U32 state        = 1;         /* required init */
2286    U16 charid       = 0;         /* sanity init */
2287    U32 wordlen      = 0;         /* required init */
2288
2289    if (OP(noper) == NOTHING) {
2290     regnode *noper_next= regnext(noper);
2291     if (noper_next != tail && OP(noper_next) == flags) {
2292      noper = noper_next;
2293      uc= (U8*)STRING(noper);
2294      e= uc + STR_LEN(noper);
2295     }
2296    }
2297
2298    if (OP(noper) != NOTHING) {
2299     for ( ; uc < e ; uc += len ) {
2300
2301      TRIE_READ_CHAR;
2302
2303      if ( uvc < 256 ) {
2304       charid = trie->charmap[ uvc ];
2305      } else {
2306       SV** const svpp = hv_fetch( widecharmap,
2307              (char*)&uvc,
2308              sizeof( UV ),
2309              0);
2310       if ( !svpp ) {
2311        charid = 0;
2312       } else {
2313        charid=(U16)SvIV( *svpp );
2314       }
2315      }
2316      /* charid is now 0 if we dont know the char read, or
2317      * nonzero if we do */
2318      if ( charid ) {
2319
2320       U16 check;
2321       U32 newstate = 0;
2322
2323       charid--;
2324       if ( !trie->states[ state ].trans.list ) {
2325        TRIE_LIST_NEW( state );
2326       }
2327       for ( check = 1;
2328        check <= TRIE_LIST_USED( state );
2329        check++ )
2330       {
2331        if ( TRIE_LIST_ITEM( state, check ).forid
2332                  == charid )
2333        {
2334         newstate = TRIE_LIST_ITEM( state, check ).newstate;
2335         break;
2336        }
2337       }
2338       if ( ! newstate ) {
2339        newstate = next_alloc++;
2340        prev_states[newstate] = state;
2341        TRIE_LIST_PUSH( state, charid, newstate );
2342        transcount++;
2343       }
2344       state = newstate;
2345      } else {
2346       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2347      }
2348     }
2349    }
2350    TRIE_HANDLE_WORD(state);
2351
2352   } /* end second pass */
2353
2354   /* next alloc is the NEXT state to be allocated */
2355   trie->statecount = next_alloc;
2356   trie->states = (reg_trie_state *)
2357    PerlMemShared_realloc( trie->states,
2358         next_alloc
2359         * sizeof(reg_trie_state) );
2360
2361   /* and now dump it out before we compress it */
2362   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2363               revcharmap, next_alloc,
2364               depth+1)
2365   );
2366
2367   trie->trans = (reg_trie_trans *)
2368    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2369   {
2370    U32 state;
2371    U32 tp = 0;
2372    U32 zp = 0;
2373
2374
2375    for( state=1 ; state < next_alloc ; state ++ ) {
2376     U32 base=0;
2377
2378     /*
2379     DEBUG_TRIE_COMPILE_MORE_r(
2380      PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2381     );
2382     */
2383
2384     if (trie->states[state].trans.list) {
2385      U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2386      U16 maxid=minid;
2387      U16 idx;
2388
2389      for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2390       const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2391       if ( forid < minid ) {
2392        minid=forid;
2393       } else if ( forid > maxid ) {
2394        maxid=forid;
2395       }
2396      }
2397      if ( transcount < tp + maxid - minid + 1) {
2398       transcount *= 2;
2399       trie->trans = (reg_trie_trans *)
2400        PerlMemShared_realloc( trie->trans,
2401              transcount
2402              * sizeof(reg_trie_trans) );
2403       Zero( trie->trans + (transcount / 2),
2404        transcount / 2,
2405        reg_trie_trans );
2406      }
2407      base = trie->uniquecharcount + tp - minid;
2408      if ( maxid == minid ) {
2409       U32 set = 0;
2410       for ( ; zp < tp ; zp++ ) {
2411        if ( ! trie->trans[ zp ].next ) {
2412         base = trie->uniquecharcount + zp - minid;
2413         trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2414                 1).newstate;
2415         trie->trans[ zp ].check = state;
2416         set = 1;
2417         break;
2418        }
2419       }
2420       if ( !set ) {
2421        trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2422                 1).newstate;
2423        trie->trans[ tp ].check = state;
2424        tp++;
2425        zp = tp;
2426       }
2427      } else {
2428       for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2429        const U32 tid = base
2430           - trie->uniquecharcount
2431           + TRIE_LIST_ITEM( state, idx ).forid;
2432        trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2433                 idx ).newstate;
2434        trie->trans[ tid ].check = state;
2435       }
2436       tp += ( maxid - minid + 1 );
2437      }
2438      Safefree(trie->states[ state ].trans.list);
2439     }
2440     /*
2441     DEBUG_TRIE_COMPILE_MORE_r(
2442      PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2443     );
2444     */
2445     trie->states[ state ].trans.base=base;
2446    }
2447    trie->lasttrans = tp + 1;
2448   }
2449  } else {
2450   /*
2451   Second Pass -- Flat Table Representation.
2452
2453   we dont use the 0 slot of either trans[] or states[] so we add 1 to
2454   each.  We know that we will need Charcount+1 trans at most to store
2455   the data (one row per char at worst case) So we preallocate both
2456   structures assuming worst case.
2457
2458   We then construct the trie using only the .next slots of the entry
2459   structs.
2460
2461   We use the .check field of the first entry of the node temporarily
2462   to make compression both faster and easier by keeping track of how
2463   many non zero fields are in the node.
2464
2465   Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2466   transition.
2467
2468   There are two terms at use here: state as a TRIE_NODEIDX() which is
2469   a number representing the first entry of the node, and state as a
2470   TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2471   and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2472   if there are 2 entrys per node. eg:
2473
2474    A B       A B
2475   1. 2 4    1. 3 7
2476   2. 0 3    3. 0 5
2477   3. 0 0    5. 0 0
2478   4. 0 0    7. 0 0
2479
2480   The table is internally in the right hand, idx form. However as we
2481   also have to deal with the states array which is indexed by nodenum
2482   we have to use TRIE_NODENUM() to convert.
2483
2484   */
2485   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2486    "%*sCompiling trie using table compiler\n",
2487    (int)depth * 2 + 2, ""));
2488
2489   trie->trans = (reg_trie_trans *)
2490    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2491         * trie->uniquecharcount + 1,
2492         sizeof(reg_trie_trans) );
2493   trie->states = (reg_trie_state *)
2494    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2495         sizeof(reg_trie_state) );
2496   next_alloc = trie->uniquecharcount + 1;
2497
2498
2499   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2500
2501    regnode *noper   = NEXTOPER( cur );
2502    const U8 *uc     = (U8*)STRING( noper );
2503    const U8 *e      = uc + STR_LEN( noper );
2504
2505    U32 state        = 1;         /* required init */
2506
2507    U16 charid       = 0;         /* sanity init */
2508    U32 accept_state = 0;         /* sanity init */
2509
2510    U32 wordlen      = 0;         /* required init */
2511
2512    if (OP(noper) == NOTHING) {
2513     regnode *noper_next= regnext(noper);
2514     if (noper_next != tail && OP(noper_next) == flags) {
2515      noper = noper_next;
2516      uc= (U8*)STRING(noper);
2517      e= uc + STR_LEN(noper);
2518     }
2519    }
2520
2521    if ( OP(noper) != NOTHING ) {
2522     for ( ; uc < e ; uc += len ) {
2523
2524      TRIE_READ_CHAR;
2525
2526      if ( uvc < 256 ) {
2527       charid = trie->charmap[ uvc ];
2528      } else {
2529       SV* const * const svpp = hv_fetch( widecharmap,
2530               (char*)&uvc,
2531               sizeof( UV ),
2532               0);
2533       charid = svpp ? (U16)SvIV(*svpp) : 0;
2534      }
2535      if ( charid ) {
2536       charid--;
2537       if ( !trie->trans[ state + charid ].next ) {
2538        trie->trans[ state + charid ].next = next_alloc;
2539        trie->trans[ state ].check++;
2540        prev_states[TRIE_NODENUM(next_alloc)]
2541          = TRIE_NODENUM(state);
2542        next_alloc += trie->uniquecharcount;
2543       }
2544       state = trie->trans[ state + charid ].next;
2545      } else {
2546       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2547      }
2548      /* charid is now 0 if we dont know the char read, or
2549      * nonzero if we do */
2550     }
2551    }
2552    accept_state = TRIE_NODENUM( state );
2553    TRIE_HANDLE_WORD(accept_state);
2554
2555   } /* end second pass */
2556
2557   /* and now dump it out before we compress it */
2558   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2559               revcharmap,
2560               next_alloc, depth+1));
2561
2562   {
2563   /*
2564   * Inplace compress the table.*
2565
2566   For sparse data sets the table constructed by the trie algorithm will
2567   be mostly 0/FAIL transitions or to put it another way mostly empty.
2568   (Note that leaf nodes will not contain any transitions.)
2569
2570   This algorithm compresses the tables by eliminating most such
2571   transitions, at the cost of a modest bit of extra work during lookup:
2572
2573   - Each states[] entry contains a .base field which indicates the
2574   index in the state[] array wheres its transition data is stored.
2575
2576   - If .base is 0 there are no valid transitions from that node.
2577
2578   - If .base is nonzero then charid is added to it to find an entry in
2579   the trans array.
2580
2581   -If trans[states[state].base+charid].check!=state then the
2582   transition is taken to be a 0/Fail transition. Thus if there are fail
2583   transitions at the front of the node then the .base offset will point
2584   somewhere inside the previous nodes data (or maybe even into a node
2585   even earlier), but the .check field determines if the transition is
2586   valid.
2587
2588   XXX - wrong maybe?
2589   The following process inplace converts the table to the compressed
2590   table: We first do not compress the root node 1,and mark all its
2591   .check pointers as 1 and set its .base pointer as 1 as well. This
2592   allows us to do a DFA construction from the compressed table later,
2593   and ensures that any .base pointers we calculate later are greater
2594   than 0.
2595
2596   - We set 'pos' to indicate the first entry of the second node.
2597
2598   - We then iterate over the columns of the node, finding the first and
2599   last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2600   and set the .check pointers accordingly, and advance pos
2601   appropriately and repreat for the next node. Note that when we copy
2602   the next pointers we have to convert them from the original
2603   NODEIDX form to NODENUM form as the former is not valid post
2604   compression.
2605
2606   - If a node has no transitions used we mark its base as 0 and do not
2607   advance the pos pointer.
2608
2609   - If a node only has one transition we use a second pointer into the
2610   structure to fill in allocated fail transitions from other states.
2611   This pointer is independent of the main pointer and scans forward
2612   looking for null transitions that are allocated to a state. When it
2613   finds one it writes the single transition into the "hole".  If the
2614   pointer doesnt find one the single transition is appended as normal.
2615
2616   - Once compressed we can Renew/realloc the structures to release the
2617   excess space.
2618
2619   See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2620   specifically Fig 3.47 and the associated pseudocode.
2621
2622   demq
2623   */
2624   const U32 laststate = TRIE_NODENUM( next_alloc );
2625   U32 state, charid;
2626   U32 pos = 0, zp=0;
2627   trie->statecount = laststate;
2628
2629   for ( state = 1 ; state < laststate ; state++ ) {
2630    U8 flag = 0;
2631    const U32 stateidx = TRIE_NODEIDX( state );
2632    const U32 o_used = trie->trans[ stateidx ].check;
2633    U32 used = trie->trans[ stateidx ].check;
2634    trie->trans[ stateidx ].check = 0;
2635
2636    for ( charid = 0;
2637     used && charid < trie->uniquecharcount;
2638     charid++ )
2639    {
2640     if ( flag || trie->trans[ stateidx + charid ].next ) {
2641      if ( trie->trans[ stateidx + charid ].next ) {
2642       if (o_used == 1) {
2643        for ( ; zp < pos ; zp++ ) {
2644         if ( ! trie->trans[ zp ].next ) {
2645          break;
2646         }
2647        }
2648        trie->states[ state ].trans.base
2649              = zp
2650              + trie->uniquecharcount
2651              - charid ;
2652        trie->trans[ zp ].next
2653         = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2654                + charid ].next );
2655        trie->trans[ zp ].check = state;
2656        if ( ++zp > pos ) pos = zp;
2657        break;
2658       }
2659       used--;
2660      }
2661      if ( !flag ) {
2662       flag = 1;
2663       trie->states[ state ].trans.base
2664          = pos + trie->uniquecharcount - charid ;
2665      }
2666      trie->trans[ pos ].next
2667       = SAFE_TRIE_NODENUM(
2668          trie->trans[ stateidx + charid ].next );
2669      trie->trans[ pos ].check = state;
2670      pos++;
2671     }
2672    }
2673   }
2674   trie->lasttrans = pos + 1;
2675   trie->states = (reg_trie_state *)
2676    PerlMemShared_realloc( trie->states, laststate
2677         * sizeof(reg_trie_state) );
2678   DEBUG_TRIE_COMPILE_MORE_r(
2679    PerlIO_printf( Perl_debug_log,
2680     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2681     (int)depth * 2 + 2,"",
2682     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2683      + 1 ),
2684     (IV)next_alloc,
2685     (IV)pos,
2686     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2687    );
2688
2689   } /* end table compress */
2690  }
2691  DEBUG_TRIE_COMPILE_MORE_r(
2692    PerlIO_printf(Perl_debug_log,
2693     "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2694     (int)depth * 2 + 2, "",
2695     (UV)trie->statecount,
2696     (UV)trie->lasttrans)
2697  );
2698  /* resize the trans array to remove unused space */
2699  trie->trans = (reg_trie_trans *)
2700   PerlMemShared_realloc( trie->trans, trie->lasttrans
2701        * sizeof(reg_trie_trans) );
2702
2703  {   /* Modify the program and insert the new TRIE node */
2704   U8 nodetype =(U8)(flags & 0xFF);
2705   char *str=NULL;
2706
2707 #ifdef DEBUGGING
2708   regnode *optimize = NULL;
2709 #ifdef RE_TRACK_PATTERN_OFFSETS
2710
2711   U32 mjd_offset = 0;
2712   U32 mjd_nodelen = 0;
2713 #endif /* RE_TRACK_PATTERN_OFFSETS */
2714 #endif /* DEBUGGING */
2715   /*
2716   This means we convert either the first branch or the first Exact,
2717   depending on whether the thing following (in 'last') is a branch
2718   or not and whther first is the startbranch (ie is it a sub part of
2719   the alternation or is it the whole thing.)
2720   Assuming its a sub part we convert the EXACT otherwise we convert
2721   the whole branch sequence, including the first.
2722   */
2723   /* Find the node we are going to overwrite */
2724   if ( first != startbranch || OP( last ) == BRANCH ) {
2725    /* branch sub-chain */
2726    NEXT_OFF( first ) = (U16)(last - first);
2727 #ifdef RE_TRACK_PATTERN_OFFSETS
2728    DEBUG_r({
2729     mjd_offset= Node_Offset((convert));
2730     mjd_nodelen= Node_Length((convert));
2731    });
2732 #endif
2733    /* whole branch chain */
2734   }
2735 #ifdef RE_TRACK_PATTERN_OFFSETS
2736   else {
2737    DEBUG_r({
2738     const  regnode *nop = NEXTOPER( convert );
2739     mjd_offset= Node_Offset((nop));
2740     mjd_nodelen= Node_Length((nop));
2741    });
2742   }
2743   DEBUG_OPTIMISE_r(
2744    PerlIO_printf(Perl_debug_log,
2745     "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2746     (int)depth * 2 + 2, "",
2747     (UV)mjd_offset, (UV)mjd_nodelen)
2748   );
2749 #endif
2750   /* But first we check to see if there is a common prefix we can
2751   split out as an EXACT and put in front of the TRIE node.  */
2752   trie->startstate= 1;
2753   if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2754    U32 state;
2755    for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2756     U32 ofs = 0;
2757     I32 idx = -1;
2758     U32 count = 0;
2759     const U32 base = trie->states[ state ].trans.base;
2760
2761     if ( trie->states[state].wordnum )
2762       count = 1;
2763
2764     for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2765      if ( ( base + ofs >= trie->uniquecharcount ) &&
2766       ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2767       trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2768      {
2769       if ( ++count > 1 ) {
2770        SV **tmp = av_fetch( revcharmap, ofs, 0);
2771        const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2772        if ( state == 1 ) break;
2773        if ( count == 2 ) {
2774         Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2775         DEBUG_OPTIMISE_r(
2776          PerlIO_printf(Perl_debug_log,
2777           "%*sNew Start State=%"UVuf" Class: [",
2778           (int)depth * 2 + 2, "",
2779           (UV)state));
2780         if (idx >= 0) {
2781          SV ** const tmp = av_fetch( revcharmap, idx, 0);
2782          const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2783
2784          TRIE_BITMAP_SET(trie,*ch);
2785          if ( folder )
2786           TRIE_BITMAP_SET(trie, folder[ *ch ]);
2787          DEBUG_OPTIMISE_r(
2788           PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2789          );
2790         }
2791        }
2792        TRIE_BITMAP_SET(trie,*ch);
2793        if ( folder )
2794         TRIE_BITMAP_SET(trie,folder[ *ch ]);
2795        DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2796       }
2797       idx = ofs;
2798      }
2799     }
2800     if ( count == 1 ) {
2801      SV **tmp = av_fetch( revcharmap, idx, 0);
2802      STRLEN len;
2803      char *ch = SvPV( *tmp, len );
2804      DEBUG_OPTIMISE_r({
2805       SV *sv=sv_newmortal();
2806       PerlIO_printf( Perl_debug_log,
2807        "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2808        (int)depth * 2 + 2, "",
2809        (UV)state, (UV)idx,
2810        pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2811         PL_colors[0], PL_colors[1],
2812         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2813         PERL_PV_ESCAPE_FIRSTCHAR
2814        )
2815       );
2816      });
2817      if ( state==1 ) {
2818       OP( convert ) = nodetype;
2819       str=STRING(convert);
2820       STR_LEN(convert)=0;
2821      }
2822      STR_LEN(convert) += len;
2823      while (len--)
2824       *str++ = *ch++;
2825     } else {
2826 #ifdef DEBUGGING
2827      if (state>1)
2828       DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2829 #endif
2830      break;
2831     }
2832    }
2833    trie->prefixlen = (state-1);
2834    if (str) {
2835     regnode *n = convert+NODE_SZ_STR(convert);
2836     NEXT_OFF(convert) = NODE_SZ_STR(convert);
2837     trie->startstate = state;
2838     trie->minlen -= (state - 1);
2839     trie->maxlen -= (state - 1);
2840 #ifdef DEBUGGING
2841    /* At least the UNICOS C compiler choked on this
2842     * being argument to DEBUG_r(), so let's just have
2843     * it right here. */
2844    if (
2845 #ifdef PERL_EXT_RE_BUILD
2846     1
2847 #else
2848     DEBUG_r_TEST
2849 #endif
2850     ) {
2851     regnode *fix = convert;
2852     U32 word = trie->wordcount;
2853     mjd_nodelen++;
2854     Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2855     while( ++fix < n ) {
2856      Set_Node_Offset_Length(fix, 0, 0);
2857     }
2858     while (word--) {
2859      SV ** const tmp = av_fetch( trie_words, word, 0 );
2860      if (tmp) {
2861       if ( STR_LEN(convert) <= SvCUR(*tmp) )
2862        sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2863       else
2864        sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2865      }
2866     }
2867    }
2868 #endif
2869     if (trie->maxlen) {
2870      convert = n;
2871     } else {
2872      NEXT_OFF(convert) = (U16)(tail - convert);
2873      DEBUG_r(optimize= n);
2874     }
2875    }
2876   }
2877   if (!jumper)
2878    jumper = last;
2879   if ( trie->maxlen ) {
2880    NEXT_OFF( convert ) = (U16)(tail - convert);
2881    ARG_SET( convert, data_slot );
2882    /* Store the offset to the first unabsorbed branch in
2883    jump[0], which is otherwise unused by the jump logic.
2884    We use this when dumping a trie and during optimisation. */
2885    if (trie->jump)
2886     trie->jump[0] = (U16)(nextbranch - convert);
2887
2888    /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2889    *   and there is a bitmap
2890    *   and the first "jump target" node we found leaves enough room
2891    * then convert the TRIE node into a TRIEC node, with the bitmap
2892    * embedded inline in the opcode - this is hypothetically faster.
2893    */
2894    if ( !trie->states[trie->startstate].wordnum
2895     && trie->bitmap
2896     && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2897    {
2898     OP( convert ) = TRIEC;
2899     Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2900     PerlMemShared_free(trie->bitmap);
2901     trie->bitmap= NULL;
2902    } else
2903     OP( convert ) = TRIE;
2904
2905    /* store the type in the flags */
2906    convert->flags = nodetype;
2907    DEBUG_r({
2908    optimize = convert
2909      + NODE_STEP_REGNODE
2910      + regarglen[ OP( convert ) ];
2911    });
2912    /* XXX We really should free up the resource in trie now,
2913     as we won't use them - (which resources?) dmq */
2914   }
2915   /* needed for dumping*/
2916   DEBUG_r(if (optimize) {
2917    regnode *opt = convert;
2918
2919    while ( ++opt < optimize) {
2920     Set_Node_Offset_Length(opt,0,0);
2921    }
2922    /*
2923     Try to clean up some of the debris left after the
2924     optimisation.
2925    */
2926    while( optimize < jumper ) {
2927     mjd_nodelen += Node_Length((optimize));
2928     OP( optimize ) = OPTIMIZED;
2929     Set_Node_Offset_Length(optimize,0,0);
2930     optimize++;
2931    }
2932    Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2933   });
2934  } /* end node insert */
2935  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2936
2937  /*  Finish populating the prev field of the wordinfo array.  Walk back
2938  *  from each accept state until we find another accept state, and if
2939  *  so, point the first word's .prev field at the second word. If the
2940  *  second already has a .prev field set, stop now. This will be the
2941  *  case either if we've already processed that word's accept state,
2942  *  or that state had multiple words, and the overspill words were
2943  *  already linked up earlier.
2944  */
2945  {
2946   U16 word;
2947   U32 state;
2948   U16 prev;
2949
2950   for (word=1; word <= trie->wordcount; word++) {
2951    prev = 0;
2952    if (trie->wordinfo[word].prev)
2953     continue;
2954    state = trie->wordinfo[word].accept;
2955    while (state) {
2956     state = prev_states[state];
2957     if (!state)
2958      break;
2959     prev = trie->states[state].wordnum;
2960     if (prev)
2961      break;
2962    }
2963    trie->wordinfo[word].prev = prev;
2964   }
2965   Safefree(prev_states);
2966  }
2967
2968
2969  /* and now dump out the compressed format */
2970  DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2971
2972  RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2973 #ifdef DEBUGGING
2974  RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2975  RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2976 #else
2977  SvREFCNT_dec_NN(revcharmap);
2978 #endif
2979  return trie->jump
2980   ? MADE_JUMP_TRIE
2981   : trie->startstate>1
2982    ? MADE_EXACT_TRIE
2983    : MADE_TRIE;
2984 }
2985
2986 STATIC regnode *
2987 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
2988 {
2989 /* The Trie is constructed and compressed now so we can build a fail array if
2990  * it's needed
2991
2992    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2993    3.32 in the
2994    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2995    Ullman 1985/88
2996    ISBN 0-201-10088-6
2997
2998    We find the fail state for each state in the trie, this state is the longest
2999    proper suffix of the current state's 'word' that is also a proper prefix of
3000    another word in our trie. State 1 represents the word '' and is thus the
3001    default fail state. This allows the DFA not to have to restart after its
3002    tried and failed a word at a given point, it simply continues as though it
3003    had been matching the other word in the first place.
3004    Consider
3005  'abcdgu'=~/abcdefg|cdgu/
3006    When we get to 'd' we are still matching the first word, we would encounter
3007    'g' which would fail, which would bring us to the state representing 'd' in
3008    the second word where we would try 'g' and succeed, proceeding to match
3009    'cdgu'.
3010  */
3011  /* add a fail transition */
3012  const U32 trie_offset = ARG(source);
3013  reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3014  U32 *q;
3015  const U32 ucharcount = trie->uniquecharcount;
3016  const U32 numstates = trie->statecount;
3017  const U32 ubound = trie->lasttrans + ucharcount;
3018  U32 q_read = 0;
3019  U32 q_write = 0;
3020  U32 charid;
3021  U32 base = trie->states[ 1 ].trans.base;
3022  U32 *fail;
3023  reg_ac_data *aho;
3024  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3025  regnode *stclass;
3026  GET_RE_DEBUG_FLAGS_DECL;
3027
3028  PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3029  PERL_UNUSED_CONTEXT;
3030 #ifndef DEBUGGING
3031  PERL_UNUSED_ARG(depth);
3032 #endif
3033
3034  if ( OP(source) == TRIE ) {
3035   struct regnode_1 *op = (struct regnode_1 *)
3036    PerlMemShared_calloc(1, sizeof(struct regnode_1));
3037   StructCopy(source,op,struct regnode_1);
3038   stclass = (regnode *)op;
3039  } else {
3040   struct regnode_charclass *op = (struct regnode_charclass *)
3041    PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3042   StructCopy(source,op,struct regnode_charclass);
3043   stclass = (regnode *)op;
3044  }
3045  OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
3046
3047  ARG_SET( stclass, data_slot );
3048  aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3049  RExC_rxi->data->data[ data_slot ] = (void*)aho;
3050  aho->trie=trie_offset;
3051  aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3052  Copy( trie->states, aho->states, numstates, reg_trie_state );
3053  Newxz( q, numstates, U32);
3054  aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3055  aho->refcount = 1;
3056  fail = aho->fail;
3057  /* initialize fail[0..1] to be 1 so that we always have
3058  a valid final fail state */
3059  fail[ 0 ] = fail[ 1 ] = 1;
3060
3061  for ( charid = 0; charid < ucharcount ; charid++ ) {
3062   const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3063   if ( newstate ) {
3064    q[ q_write ] = newstate;
3065    /* set to point at the root */
3066    fail[ q[ q_write++ ] ]=1;
3067   }
3068  }
3069  while ( q_read < q_write) {
3070   const U32 cur = q[ q_read++ % numstates ];
3071   base = trie->states[ cur ].trans.base;
3072
3073   for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3074    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3075    if (ch_state) {
3076     U32 fail_state = cur;
3077     U32 fail_base;
3078     do {
3079      fail_state = fail[ fail_state ];
3080      fail_base = aho->states[ fail_state ].trans.base;
3081     } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3082
3083     fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3084     fail[ ch_state ] = fail_state;
3085     if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3086     {
3087       aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3088     }
3089     q[ q_write++ % numstates] = ch_state;
3090    }
3091   }
3092  }
3093  /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3094  when we fail in state 1, this allows us to use the
3095  charclass scan to find a valid start char. This is based on the principle
3096  that theres a good chance the string being searched contains lots of stuff
3097  that cant be a start char.
3098  */
3099  fail[ 0 ] = fail[ 1 ] = 0;
3100  DEBUG_TRIE_COMPILE_r({
3101   PerlIO_printf(Perl_debug_log,
3102      "%*sStclass Failtable (%"UVuf" states): 0",
3103      (int)(depth * 2), "", (UV)numstates
3104   );
3105   for( q_read=1; q_read<numstates; q_read++ ) {
3106    PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3107   }
3108   PerlIO_printf(Perl_debug_log, "\n");
3109  });
3110  Safefree(q);
3111  /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3112  return stclass;
3113 }
3114
3115
3116 #define DEBUG_PEEP(str,scan,depth) \
3117  DEBUG_OPTIMISE_r({if (scan){ \
3118  SV * const mysv=sv_newmortal(); \
3119  regnode *Next = regnext(scan); \
3120  regprop(RExC_rx, mysv, scan, NULL); \
3121  PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3122  (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3123  Next ? (REG_NODE_NUM(Next)) : 0 ); \
3124    }});
3125
3126
3127 /* The below joins as many adjacent EXACTish nodes as possible into a single
3128  * one.  The regop may be changed if the node(s) contain certain sequences that
3129  * require special handling.  The joining is only done if:
3130  * 1) there is room in the current conglomerated node to entirely contain the
3131  *    next one.
3132  * 2) they are the exact same node type
3133  *
3134  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3135  * these get optimized out
3136  *
3137  * If a node is to match under /i (folded), the number of characters it matches
3138  * can be different than its character length if it contains a multi-character
3139  * fold.  *min_subtract is set to the total delta number of characters of the
3140  * input nodes.
3141  *
3142  * And *unfolded_multi_char is set to indicate whether or not the node contains
3143  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3144  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3145  * SMALL LETTER SHARP S, as only if the target string being matched against
3146  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3147  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3148  * whose components are all above the Latin1 range are not run-time locale
3149  * dependent, and have already been folded by the time this function is
3150  * called.)
3151  *
3152  * This is as good a place as any to discuss the design of handling these
3153  * multi-character fold sequences.  It's been wrong in Perl for a very long
3154  * time.  There are three code points in Unicode whose multi-character folds
3155  * were long ago discovered to mess things up.  The previous designs for
3156  * dealing with these involved assigning a special node for them.  This
3157  * approach doesn't always work, as evidenced by this example:
3158  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3159  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3160  * would match just the \xDF, it won't be able to handle the case where a
3161  * successful match would have to cross the node's boundary.  The new approach
3162  * that hopefully generally solves the problem generates an EXACTFU_SS node
3163  * that is "sss" in this case.
3164  *
3165  * It turns out that there are problems with all multi-character folds, and not
3166  * just these three.  Now the code is general, for all such cases.  The
3167  * approach taken is:
3168  * 1)   This routine examines each EXACTFish node that could contain multi-
3169  *      character folded sequences.  Since a single character can fold into
3170  *      such a sequence, the minimum match length for this node is less than
3171  *      the number of characters in the node.  This routine returns in
3172  *      *min_subtract how many characters to subtract from the the actual
3173  *      length of the string to get a real minimum match length; it is 0 if
3174  *      there are no multi-char foldeds.  This delta is used by the caller to
3175  *      adjust the min length of the match, and the delta between min and max,
3176  *      so that the optimizer doesn't reject these possibilities based on size
3177  *      constraints.
3178  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3179  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3180  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3181  *      there is a possible fold length change.  That means that a regular
3182  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3183  *      with length changes, and so can be processed faster.  regexec.c takes
3184  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3185  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3186  *      known until runtime).  This saves effort in regex matching.  However,
3187  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3188  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3189  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3190  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3191  *      possibilities for the non-UTF8 patterns are quite simple, except for
3192  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3193  *      members of a fold-pair, and arrays are set up for all of them so that
3194  *      the other member of the pair can be found quickly.  Code elsewhere in
3195  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3196  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3197  *      described in the next item.
3198  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3199  *      validity of the fold won't be known until runtime, and so must remain
3200  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3201  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3202  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3203  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3204  *      The reason this is a problem is that the optimizer part of regexec.c
3205  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3206  *      that a character in the pattern corresponds to at most a single
3207  *      character in the target string.  (And I do mean character, and not byte
3208  *      here, unlike other parts of the documentation that have never been
3209  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3210  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3211  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3212  *      nodes, violate the assumption, and they are the only instances where it
3213  *      is violated.  I'm reluctant to try to change the assumption, as the
3214  *      code involved is impenetrable to me (khw), so instead the code here
3215  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3216  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3217  *      boolean indicating whether or not the node contains such a fold.  When
3218  *      it is true, the caller sets a flag that later causes the optimizer in
3219  *      this file to not set values for the floating and fixed string lengths,
3220  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3221  *      assumption.  Thus, there is no optimization based on string lengths for
3222  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3223  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3224  *      assumption is wrong only in these cases is that all other non-UTF-8
3225  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3226  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3227  *      EXACTF nodes because we don't know at compile time if it actually
3228  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3229  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3230  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3231  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3232  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3233  *      string would require the pattern to be forced into UTF-8, the overhead
3234  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3235  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3236  *      locale.)
3237  *
3238  *      Similarly, the code that generates tries doesn't currently handle
3239  *      not-already-folded multi-char folds, and it looks like a pain to change
3240  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3241  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3242  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3243  *      using /iaa matching will be doing so almost entirely with ASCII
3244  *      strings, so this should rarely be encountered in practice */
3245
3246 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3247  if (PL_regkind[OP(scan)] == EXACT) \
3248   join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3249
3250 STATIC U32
3251 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3252     UV *min_subtract, bool *unfolded_multi_char,
3253     U32 flags,regnode *val, U32 depth)
3254 {
3255  /* Merge several consecutive EXACTish nodes into one. */
3256  regnode *n = regnext(scan);
3257  U32 stringok = 1;
3258  regnode *next = scan + NODE_SZ_STR(scan);
3259  U32 merged = 0;
3260  U32 stopnow = 0;
3261 #ifdef DEBUGGING
3262  regnode *stop = scan;
3263  GET_RE_DEBUG_FLAGS_DECL;
3264 #else
3265  PERL_UNUSED_ARG(depth);
3266 #endif
3267
3268  PERL_ARGS_ASSERT_JOIN_EXACT;
3269 #ifndef EXPERIMENTAL_INPLACESCAN
3270  PERL_UNUSED_ARG(flags);
3271  PERL_UNUSED_ARG(val);
3272 #endif
3273  DEBUG_PEEP("join",scan,depth);
3274
3275  /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3276  * EXACT ones that are mergeable to the current one. */
3277  while (n
3278   && (PL_regkind[OP(n)] == NOTHING
3279    || (stringok && OP(n) == OP(scan)))
3280   && NEXT_OFF(n)
3281   && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3282  {
3283
3284   if (OP(n) == TAIL || n > next)
3285    stringok = 0;
3286   if (PL_regkind[OP(n)] == NOTHING) {
3287    DEBUG_PEEP("skip:",n,depth);
3288    NEXT_OFF(scan) += NEXT_OFF(n);
3289    next = n + NODE_STEP_REGNODE;
3290 #ifdef DEBUGGING
3291    if (stringok)
3292     stop = n;
3293 #endif
3294    n = regnext(n);
3295   }
3296   else if (stringok) {
3297    const unsigned int oldl = STR_LEN(scan);
3298    regnode * const nnext = regnext(n);
3299
3300    /* XXX I (khw) kind of doubt that this works on platforms (should
3301    * Perl ever run on one) where U8_MAX is above 255 because of lots
3302    * of other assumptions */
3303    /* Don't join if the sum can't fit into a single node */
3304    if (oldl + STR_LEN(n) > U8_MAX)
3305     break;
3306
3307    DEBUG_PEEP("merg",n,depth);
3308    merged++;
3309
3310    NEXT_OFF(scan) += NEXT_OFF(n);
3311    STR_LEN(scan) += STR_LEN(n);
3312    next = n + NODE_SZ_STR(n);
3313    /* Now we can overwrite *n : */
3314    Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3315 #ifdef DEBUGGING
3316    stop = next - 1;
3317 #endif
3318    n = nnext;
3319    if (stopnow) break;
3320   }
3321
3322 #ifdef EXPERIMENTAL_INPLACESCAN
3323   if (flags && !NEXT_OFF(n)) {
3324    DEBUG_PEEP("atch", val, depth);
3325    if (reg_off_by_arg[OP(n)]) {
3326     ARG_SET(n, val - n);
3327    }
3328    else {
3329     NEXT_OFF(n) = val - n;
3330    }
3331    stopnow = 1;
3332   }
3333 #endif
3334  }
3335
3336  *min_subtract = 0;
3337  *unfolded_multi_char = FALSE;
3338
3339  /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3340  * can now analyze for sequences of problematic code points.  (Prior to
3341  * this final joining, sequences could have been split over boundaries, and
3342  * hence missed).  The sequences only happen in folding, hence for any
3343  * non-EXACT EXACTish node */
3344  if (OP(scan) != EXACT) {
3345   U8* s0 = (U8*) STRING(scan);
3346   U8* s = s0;
3347   U8* s_end = s0 + STR_LEN(scan);
3348
3349   int total_count_delta = 0;  /* Total delta number of characters that
3350          multi-char folds expand to */
3351
3352   /* One pass is made over the node's string looking for all the
3353   * possibilities.  To avoid some tests in the loop, there are two main
3354   * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3355   * non-UTF-8 */
3356   if (UTF) {
3357    U8* folded = NULL;
3358
3359    if (OP(scan) == EXACTFL) {
3360     U8 *d;
3361
3362     /* An EXACTFL node would already have been changed to another
3363     * node type unless there is at least one character in it that
3364     * is problematic; likely a character whose fold definition
3365     * won't be known until runtime, and so has yet to be folded.
3366     * For all but the UTF-8 locale, folds are 1-1 in length, but
3367     * to handle the UTF-8 case, we need to create a temporary
3368     * folded copy using UTF-8 locale rules in order to analyze it.
3369     * This is because our macros that look to see if a sequence is
3370     * a multi-char fold assume everything is folded (otherwise the
3371     * tests in those macros would be too complicated and slow).
3372     * Note that here, the non-problematic folds will have already
3373     * been done, so we can just copy such characters.  We actually
3374     * don't completely fold the EXACTFL string.  We skip the
3375     * unfolded multi-char folds, as that would just create work
3376     * below to figure out the size they already are */
3377
3378     Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3379     d = folded;
3380     while (s < s_end) {
3381      STRLEN s_len = UTF8SKIP(s);
3382      if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3383       Copy(s, d, s_len, U8);
3384       d += s_len;
3385      }
3386      else if (is_FOLDS_TO_MULTI_utf8(s)) {
3387       *unfolded_multi_char = TRUE;
3388       Copy(s, d, s_len, U8);
3389       d += s_len;
3390      }
3391      else if (isASCII(*s)) {
3392       *(d++) = toFOLD(*s);
3393      }
3394      else {
3395       STRLEN len;
3396       _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3397       d += len;
3398      }
3399      s += s_len;
3400     }
3401
3402     /* Point the remainder of the routine to look at our temporary
3403     * folded copy */
3404     s = folded;
3405     s_end = d;
3406    } /* End of creating folded copy of EXACTFL string */
3407
3408    /* Examine the string for a multi-character fold sequence.  UTF-8
3409    * patterns have all characters pre-folded by the time this code is
3410    * executed */
3411    while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3412          length sequence we are looking for is 2 */
3413    {
3414     int count = 0;  /* How many characters in a multi-char fold */
3415     int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3416     if (! len) {    /* Not a multi-char fold: get next char */
3417      s += UTF8SKIP(s);
3418      continue;
3419     }
3420
3421     /* Nodes with 'ss' require special handling, except for
3422     * EXACTFA-ish for which there is no multi-char fold to this */
3423     if (len == 2 && *s == 's' && *(s+1) == 's'
3424      && OP(scan) != EXACTFA
3425      && OP(scan) != EXACTFA_NO_TRIE)
3426     {
3427      count = 2;
3428      if (OP(scan) != EXACTFL) {
3429       OP(scan) = EXACTFU_SS;
3430      }
3431      s += 2;
3432     }
3433     else { /* Here is a generic multi-char fold. */
3434      U8* multi_end  = s + len;
3435
3436      /* Count how many characters are in it.  In the case of
3437      * /aa, no folds which contain ASCII code points are
3438      * allowed, so check for those, and skip if found. */
3439      if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3440       count = utf8_length(s, multi_end);
3441       s = multi_end;
3442      }
3443      else {
3444       while (s < multi_end) {
3445        if (isASCII(*s)) {
3446         s++;
3447         goto next_iteration;
3448        }
3449        else {
3450         s += UTF8SKIP(s);
3451        }
3452        count++;
3453       }
3454      }
3455     }
3456
3457     /* The delta is how long the sequence is minus 1 (1 is how long
3458     * the character that folds to the sequence is) */
3459     total_count_delta += count - 1;
3460    next_iteration: ;
3461    }
3462
3463    /* We created a temporary folded copy of the string in EXACTFL
3464    * nodes.  Therefore we need to be sure it doesn't go below zero,
3465    * as the real string could be shorter */
3466    if (OP(scan) == EXACTFL) {
3467     int total_chars = utf8_length((U8*) STRING(scan),
3468           (U8*) STRING(scan) + STR_LEN(scan));
3469     if (total_count_delta > total_chars) {
3470      total_count_delta = total_chars;
3471     }
3472    }
3473
3474    *min_subtract += total_count_delta;
3475    Safefree(folded);
3476   }
3477   else if (OP(scan) == EXACTFA) {
3478
3479    /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3480    * fold to the ASCII range (and there are no existing ones in the
3481    * upper latin1 range).  But, as outlined in the comments preceding
3482    * this function, we need to flag any occurrences of the sharp s.
3483    * This character forbids trie formation (because of added
3484    * complexity) */
3485    while (s < s_end) {
3486     if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3487      OP(scan) = EXACTFA_NO_TRIE;
3488      *unfolded_multi_char = TRUE;
3489      break;
3490     }
3491     s++;
3492     continue;
3493    }
3494   }
3495   else {
3496
3497    /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3498    * folds that are all Latin1.  As explained in the comments
3499    * preceding this function, we look also for the sharp s in EXACTF
3500    * and EXACTFL nodes; it can be in the final position.  Otherwise
3501    * we can stop looking 1 byte earlier because have to find at least
3502    * two characters for a multi-fold */
3503    const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3504        ? s_end
3505        : s_end -1;
3506
3507    while (s < upper) {
3508     int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3509     if (! len) {    /* Not a multi-char fold. */
3510      if (*s == LATIN_SMALL_LETTER_SHARP_S
3511       && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3512      {
3513       *unfolded_multi_char = TRUE;
3514      }
3515      s++;
3516      continue;
3517     }
3518
3519     if (len == 2
3520      && isARG2_lower_or_UPPER_ARG1('s', *s)
3521      && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3522     {
3523
3524      /* EXACTF nodes need to know that the minimum length
3525      * changed so that a sharp s in the string can match this
3526      * ss in the pattern, but they remain EXACTF nodes, as they
3527      * won't match this unless the target string is is UTF-8,
3528      * which we don't know until runtime.  EXACTFL nodes can't
3529      * transform into EXACTFU nodes */
3530      if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3531       OP(scan) = EXACTFU_SS;
3532      }
3533     }
3534
3535     *min_subtract += len - 1;
3536     s += len;
3537    }
3538   }
3539  }
3540
3541 #ifdef DEBUGGING
3542  /* Allow dumping but overwriting the collection of skipped
3543  * ops and/or strings with fake optimized ops */
3544  n = scan + NODE_SZ_STR(scan);
3545  while (n <= stop) {
3546   OP(n) = OPTIMIZED;
3547   FLAGS(n) = 0;
3548   NEXT_OFF(n) = 0;
3549   n++;
3550  }
3551 #endif
3552  DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3553  return stopnow;
3554 }
3555
3556 /* REx optimizer.  Converts nodes into quicker variants "in place".
3557    Finds fixed substrings.  */
3558
3559 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3560    to the position after last scanned or to NULL. */
3561
3562 #define INIT_AND_WITHP \
3563  assert(!and_withp); \
3564  Newx(and_withp,1, regnode_ssc); \
3565  SAVEFREEPV(and_withp)
3566
3567 /* this is a chain of data about sub patterns we are processing that
3568    need to be handled separately/specially in study_chunk. Its so
3569    we can simulate recursion without losing state.  */
3570 struct scan_frame;
3571 typedef struct scan_frame {
3572  regnode *last;  /* last node to process in this frame */
3573  regnode *next;  /* next node to process when last is reached */
3574  struct scan_frame *prev; /*previous frame*/
3575  U32 prev_recursed_depth;
3576  I32 stop; /* what stopparen do we use */
3577 } scan_frame;
3578
3579
3580 STATIC SSize_t
3581 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3582       SSize_t *minlenp, SSize_t *deltap,
3583       regnode *last,
3584       scan_data_t *data,
3585       I32 stopparen,
3586       U32 recursed_depth,
3587       regnode_ssc *and_withp,
3588       U32 flags, U32 depth)
3589       /* scanp: Start here (read-write). */
3590       /* deltap: Write maxlen-minlen here. */
3591       /* last: Stop before this one. */
3592       /* data: string data about the pattern */
3593       /* stopparen: treat close N as END */
3594       /* recursed: which subroutines have we recursed into */
3595       /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3596 {
3597  /* There must be at least this number of characters to match */
3598  SSize_t min = 0;
3599  I32 pars = 0, code;
3600  regnode *scan = *scanp, *next;
3601  SSize_t delta = 0;
3602  int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3603  int is_inf_internal = 0;  /* The studied chunk is infinite */
3604  I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3605  scan_data_t data_fake;
3606  SV *re_trie_maxbuff = NULL;
3607  regnode *first_non_open = scan;
3608  SSize_t stopmin = SSize_t_MAX;
3609  scan_frame *frame = NULL;
3610  GET_RE_DEBUG_FLAGS_DECL;
3611
3612  PERL_ARGS_ASSERT_STUDY_CHUNK;
3613
3614 #ifdef DEBUGGING
3615  StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3616 #endif
3617  if ( depth == 0 ) {
3618   while (first_non_open && OP(first_non_open) == OPEN)
3619    first_non_open=regnext(first_non_open);
3620  }
3621
3622
3623   fake_study_recurse:
3624  while ( scan && OP(scan) != END && scan < last ){
3625   UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3626         node length to get a real minimum (because
3627         the folded version may be shorter) */
3628   bool unfolded_multi_char = FALSE;
3629   /* Peephole optimizer: */
3630   DEBUG_OPTIMISE_MORE_r(
3631   {
3632    PerlIO_printf(Perl_debug_log,
3633     "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3634     ((int) depth*2), "", (long)stopparen,
3635     (unsigned long)depth, (unsigned long)recursed_depth);
3636    if (recursed_depth) {
3637     U32 i;
3638     U32 j;
3639     for ( j = 0 ; j < recursed_depth ; j++ ) {
3640      PerlIO_printf(Perl_debug_log,"[");
3641      for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3642       PerlIO_printf(Perl_debug_log,"%d",
3643        PAREN_TEST(RExC_study_chunk_recursed +
3644          (j * RExC_study_chunk_recursed_bytes), i)
3645        ? 1 : 0
3646       );
3647      PerlIO_printf(Perl_debug_log,"]");
3648     }
3649    }
3650    PerlIO_printf(Perl_debug_log,"\n");
3651   }
3652   );
3653   DEBUG_STUDYDATA("Peep:", data, depth);
3654   DEBUG_PEEP("Peep", scan, depth);
3655
3656
3657   /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3658   * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3659   * by a different invocation of reg() -- Yves
3660   */
3661   JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3662
3663   /* Follow the next-chain of the current node and optimize
3664   away all the NOTHINGs from it.  */
3665   if (OP(scan) != CURLYX) {
3666    const int max = (reg_off_by_arg[OP(scan)]
3667      ? I32_MAX
3668      /* I32 may be smaller than U16 on CRAYs! */
3669      : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3670    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3671    int noff;
3672    regnode *n = scan;
3673
3674    /* Skip NOTHING and LONGJMP. */
3675    while ((n = regnext(n))
3676     && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3677      || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3678     && off + noff < max)
3679     off += noff;
3680    if (reg_off_by_arg[OP(scan)])
3681     ARG(scan) = off;
3682    else
3683     NEXT_OFF(scan) = off;
3684   }
3685
3686
3687
3688   /* The principal pseudo-switch.  Cannot be a switch, since we
3689   look into several different things.  */
3690   if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3691     || OP(scan) == IFTHEN) {
3692    next = regnext(scan);
3693    code = OP(scan);
3694    /* demq: the op(next)==code check is to see if we have
3695    * "branch-branch" AFAICT */
3696
3697    if (OP(next) == code || code == IFTHEN) {
3698     /* NOTE - There is similar code to this block below for
3699     * handling TRIE nodes on a re-study.  If you change stuff here
3700     * check there too. */
3701     SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3702     regnode_ssc accum;
3703     regnode * const startbranch=scan;
3704
3705     if (flags & SCF_DO_SUBSTR) {
3706      /* Cannot merge strings after this. */
3707      scan_commit(pRExC_state, data, minlenp, is_inf);
3708     }
3709
3710     if (flags & SCF_DO_STCLASS)
3711      ssc_init_zero(pRExC_state, &accum);
3712
3713     while (OP(scan) == code) {
3714      SSize_t deltanext, minnext, fake;
3715      I32 f = 0;
3716      regnode_ssc this_class;
3717
3718      num++;
3719      data_fake.flags = 0;
3720      if (data) {
3721       data_fake.whilem_c = data->whilem_c;
3722       data_fake.last_closep = data->last_closep;
3723      }
3724      else
3725       data_fake.last_closep = &fake;
3726
3727      data_fake.pos_delta = delta;
3728      next = regnext(scan);
3729      scan = NEXTOPER(scan);
3730      if (code != BRANCH)
3731       scan = NEXTOPER(scan);
3732      if (flags & SCF_DO_STCLASS) {
3733       ssc_init(pRExC_state, &this_class);
3734       data_fake.start_class = &this_class;
3735       f = SCF_DO_STCLASS_AND;
3736      }
3737      if (flags & SCF_WHILEM_VISITED_POS)
3738       f |= SCF_WHILEM_VISITED_POS;
3739
3740      /* we suppose the run is continuous, last=next...*/
3741      minnext = study_chunk(pRExC_state, &scan, minlenp,
3742          &deltanext, next, &data_fake, stopparen,
3743          recursed_depth, NULL, f,depth+1);
3744      if (min1 > minnext)
3745       min1 = minnext;
3746      if (deltanext == SSize_t_MAX) {
3747       is_inf = is_inf_internal = 1;
3748       max1 = SSize_t_MAX;
3749      } else if (max1 < minnext + deltanext)
3750       max1 = minnext + deltanext;
3751      scan = next;
3752      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3753       pars++;
3754      if (data_fake.flags & SCF_SEEN_ACCEPT) {
3755       if ( stopmin > minnext)
3756        stopmin = min + min1;
3757       flags &= ~SCF_DO_SUBSTR;
3758       if (data)
3759        data->flags |= SCF_SEEN_ACCEPT;
3760      }
3761      if (data) {
3762       if (data_fake.flags & SF_HAS_EVAL)
3763        data->flags |= SF_HAS_EVAL;
3764       data->whilem_c = data_fake.whilem_c;
3765      }
3766      if (flags & SCF_DO_STCLASS)
3767       ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3768     }
3769     if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3770      min1 = 0;
3771     if (flags & SCF_DO_SUBSTR) {
3772      data->pos_min += min1;
3773      if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3774       data->pos_delta = SSize_t_MAX;
3775      else
3776       data->pos_delta += max1 - min1;
3777      if (max1 != min1 || is_inf)
3778       data->longest = &(data->longest_float);
3779     }
3780     min += min1;
3781     if (delta == SSize_t_MAX
3782     || SSize_t_MAX - delta - (max1 - min1) < 0)
3783      delta = SSize_t_MAX;
3784     else
3785      delta += max1 - min1;
3786     if (flags & SCF_DO_STCLASS_OR) {
3787      ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3788      if (min1) {
3789       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3790       flags &= ~SCF_DO_STCLASS;
3791      }
3792     }
3793     else if (flags & SCF_DO_STCLASS_AND) {
3794      if (min1) {
3795       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3796       flags &= ~SCF_DO_STCLASS;
3797      }
3798      else {
3799       /* Switch to OR mode: cache the old value of
3800       * data->start_class */
3801       INIT_AND_WITHP;
3802       StructCopy(data->start_class, and_withp, regnode_ssc);
3803       flags &= ~SCF_DO_STCLASS_AND;
3804       StructCopy(&accum, data->start_class, regnode_ssc);
3805       flags |= SCF_DO_STCLASS_OR;
3806      }
3807     }
3808
3809     if (PERL_ENABLE_TRIE_OPTIMISATION &&
3810       OP( startbranch ) == BRANCH )
3811     {
3812     /* demq.
3813
3814     Assuming this was/is a branch we are dealing with: 'scan'
3815     now points at the item that follows the branch sequence,
3816     whatever it is. We now start at the beginning of the
3817     sequence and look for subsequences of
3818
3819     BRANCH->EXACT=>x1
3820     BRANCH->EXACT=>x2
3821     tail
3822
3823     which would be constructed from a pattern like
3824     /A|LIST|OF|WORDS/
3825
3826     If we can find such a subsequence we need to turn the first
3827     element into a trie and then add the subsequent branch exact
3828     strings to the trie.
3829
3830     We have two cases
3831
3832      1. patterns where the whole set of branches can be
3833       converted.
3834
3835      2. patterns where only a subset can be converted.
3836
3837     In case 1 we can replace the whole set with a single regop
3838     for the trie. In case 2 we need to keep the start and end
3839     branches so
3840
3841      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3842      becomes BRANCH TRIE; BRANCH X;
3843
3844     There is an additional case, that being where there is a
3845     common prefix, which gets split out into an EXACT like node
3846     preceding the TRIE node.
3847
3848     If x(1..n)==tail then we can do a simple trie, if not we make
3849     a "jump" trie, such that when we match the appropriate word
3850     we "jump" to the appropriate tail node. Essentially we turn
3851     a nested if into a case structure of sorts.
3852
3853     */
3854
3855      int made=0;
3856      if (!re_trie_maxbuff) {
3857       re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3858       if (!SvIOK(re_trie_maxbuff))
3859        sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3860      }
3861      if ( SvIV(re_trie_maxbuff)>=0  ) {
3862       regnode *cur;
3863       regnode *first = (regnode *)NULL;
3864       regnode *last = (regnode *)NULL;
3865       regnode *tail = scan;
3866       U8 trietype = 0;
3867       U32 count=0;
3868
3869 #ifdef DEBUGGING
3870       SV * const mysv = sv_newmortal();   /* for dumping */
3871 #endif
3872       /* var tail is used because there may be a TAIL
3873       regop in the way. Ie, the exacts will point to the
3874       thing following the TAIL, but the last branch will
3875       point at the TAIL. So we advance tail. If we
3876       have nested (?:) we may have to move through several
3877       tails.
3878       */
3879
3880       while ( OP( tail ) == TAIL ) {
3881        /* this is the TAIL generated by (?:) */
3882        tail = regnext( tail );
3883       }
3884
3885
3886       DEBUG_TRIE_COMPILE_r({
3887        regprop(RExC_rx, mysv, tail, NULL);
3888        PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3889        (int)depth * 2 + 2, "",
3890        "Looking for TRIE'able sequences. Tail node is: ",
3891        SvPV_nolen_const( mysv )
3892        );
3893       });
3894
3895       /*
3896
3897        Step through the branches
3898         cur represents each branch,
3899         noper is the first thing to be matched as part
3900          of that branch
3901         noper_next is the regnext() of that node.
3902
3903        We normally handle a case like this
3904        /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3905        support building with NOJUMPTRIE, which restricts
3906        the trie logic to structures like /FOO|BAR/.
3907
3908        If noper is a trieable nodetype then the branch is
3909        a possible optimization target. If we are building
3910        under NOJUMPTRIE then we require that noper_next is
3911        the same as scan (our current position in the regex
3912        program).
3913
3914        Once we have two or more consecutive such branches
3915        we can create a trie of the EXACT's contents and
3916        stitch it in place into the program.
3917
3918        If the sequence represents all of the branches in
3919        the alternation we replace the entire thing with a
3920        single TRIE node.
3921
3922        Otherwise when it is a subsequence we need to
3923        stitch it in place and replace only the relevant
3924        branches. This means the first branch has to remain
3925        as it is used by the alternation logic, and its
3926        next pointer, and needs to be repointed at the item
3927        on the branch chain following the last branch we
3928        have optimized away.
3929
3930        This could be either a BRANCH, in which case the
3931        subsequence is internal, or it could be the item
3932        following the branch sequence in which case the
3933        subsequence is at the end (which does not
3934        necessarily mean the first node is the start of the
3935        alternation).
3936
3937        TRIE_TYPE(X) is a define which maps the optype to a
3938        trietype.
3939
3940         optype          |  trietype
3941         ----------------+-----------
3942         NOTHING         | NOTHING
3943         EXACT           | EXACT
3944         EXACTFU         | EXACTFU
3945         EXACTFU_SS      | EXACTFU
3946         EXACTFA         | EXACTFA
3947
3948
3949       */
3950 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3951      ( EXACT == (X) )   ? EXACT :        \
3952      ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
3953      ( EXACTFA == (X) ) ? EXACTFA :        \
3954      0 )
3955
3956       /* dont use tail as the end marker for this traverse */
3957       for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3958        regnode * const noper = NEXTOPER( cur );
3959        U8 noper_type = OP( noper );
3960        U8 noper_trietype = TRIE_TYPE( noper_type );
3961 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3962        regnode * const noper_next = regnext( noper );
3963        U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3964        U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3965 #endif
3966
3967        DEBUG_TRIE_COMPILE_r({
3968         regprop(RExC_rx, mysv, cur, NULL);
3969         PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3970         (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3971
3972         regprop(RExC_rx, mysv, noper, NULL);
3973         PerlIO_printf( Perl_debug_log, " -> %s",
3974          SvPV_nolen_const(mysv));
3975
3976         if ( noper_next ) {
3977         regprop(RExC_rx, mysv, noper_next, NULL);
3978         PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3979          SvPV_nolen_const(mysv));
3980         }
3981         PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3982         REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3983         PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3984         );
3985        });
3986
3987        /* Is noper a trieable nodetype that can be merged
3988        * with the current trie (if there is one)? */
3989        if ( noper_trietype
3990         &&
3991         (
3992           ( noper_trietype == NOTHING)
3993           || ( trietype == NOTHING )
3994           || ( trietype == noper_trietype )
3995         )
3996 #ifdef NOJUMPTRIE
3997         && noper_next == tail
3998 #endif
3999         && count < U16_MAX)
4000        {
4001         /* Handle mergable triable node Either we are
4002         * the first node in a new trieable sequence,
4003         * in which case we do some bookkeeping,
4004         * otherwise we update the end pointer. */
4005         if ( !first ) {
4006          first = cur;
4007          if ( noper_trietype == NOTHING ) {
4008 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4009           regnode * const noper_next = regnext( noper );
4010           U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4011           U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4012 #endif
4013
4014           if ( noper_next_trietype ) {
4015            trietype = noper_next_trietype;
4016           } else if (noper_next_type)  {
4017            /* a NOTHING regop is 1 regop wide.
4018            * We need at least two for a trie
4019            * so we can't merge this in */
4020            first = NULL;
4021           }
4022          } else {
4023           trietype = noper_trietype;
4024          }
4025         } else {
4026          if ( trietype == NOTHING )
4027           trietype = noper_trietype;
4028          last = cur;
4029         }
4030         if (first)
4031          count++;
4032        } /* end handle mergable triable node */
4033        else {
4034         /* handle unmergable node -
4035         * noper may either be a triable node which can
4036         * not be tried together with the current trie,
4037         * or a non triable node */
4038         if ( last ) {
4039          /* If last is set and trietype is not
4040          * NOTHING then we have found at least two
4041          * triable branch sequences in a row of a
4042          * similar trietype so we can turn them
4043          * into a trie. If/when we allow NOTHING to
4044          * start a trie sequence this condition
4045          * will be required, and it isn't expensive
4046          * so we leave it in for now. */
4047          if ( trietype && trietype != NOTHING )
4048           make_trie( pRExC_state,
4049             startbranch, first, cur, tail,
4050             count, trietype, depth+1 );
4051          last = NULL; /* note: we clear/update
4052              first, trietype etc below,
4053              so we dont do it here */
4054         }
4055         if ( noper_trietype
4056 #ifdef NOJUMPTRIE
4057          && noper_next == tail
4058 #endif
4059         ){
4060          /* noper is triable, so we can start a new
4061          * trie sequence */
4062          count = 1;
4063          first = cur;
4064          trietype = noper_trietype;
4065         } else if (first) {
4066          /* if we already saw a first but the
4067          * current node is not triable then we have
4068          * to reset the first information. */
4069          count = 0;
4070          first = NULL;
4071          trietype = 0;
4072         }
4073        } /* end handle unmergable node */
4074       } /* loop over branches */
4075       DEBUG_TRIE_COMPILE_r({
4076        regprop(RExC_rx, mysv, cur, NULL);
4077        PerlIO_printf( Perl_debug_log,
4078        "%*s- %s (%d) <SCAN FINISHED>\n",
4079        (int)depth * 2 + 2,
4080        "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4081
4082       });
4083       if ( last && trietype ) {
4084        if ( trietype != NOTHING ) {
4085         /* the last branch of the sequence was part of
4086         * a trie, so we have to construct it here
4087         * outside of the loop */
4088         made= make_trie( pRExC_state, startbranch,
4089             first, scan, tail, count,
4090             trietype, depth+1 );
4091 #ifdef TRIE_STUDY_OPT
4092         if ( ((made == MADE_EXACT_TRIE &&
4093          startbranch == first)
4094          || ( first_non_open == first )) &&
4095          depth==0 ) {
4096          flags |= SCF_TRIE_RESTUDY;
4097          if ( startbranch == first
4098           && scan == tail )
4099          {
4100           RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4101          }
4102         }
4103 #endif
4104        } else {
4105         /* at this point we know whatever we have is a
4106         * NOTHING sequence/branch AND if 'startbranch'
4107         * is 'first' then we can turn the whole thing
4108         * into a NOTHING
4109         */
4110         if ( startbranch == first ) {
4111          regnode *opt;
4112          /* the entire thing is a NOTHING sequence,
4113          * something like this: (?:|) So we can
4114          * turn it into a plain NOTHING op. */
4115          DEBUG_TRIE_COMPILE_r({
4116           regprop(RExC_rx, mysv, cur, NULL);
4117           PerlIO_printf( Perl_debug_log,
4118           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4119           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4120
4121          });
4122          OP(startbranch)= NOTHING;
4123          NEXT_OFF(startbranch)= tail - startbranch;
4124          for ( opt= startbranch + 1; opt < tail ; opt++ )
4125           OP(opt)= OPTIMIZED;
4126         }
4127        }
4128       } /* end if ( last) */
4129      } /* TRIE_MAXBUF is non zero */
4130
4131     } /* do trie */
4132
4133    }
4134    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4135     scan = NEXTOPER(NEXTOPER(scan));
4136    } else   /* single branch is optimized. */
4137     scan = NEXTOPER(scan);
4138    continue;
4139   } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4140    scan_frame *newframe = NULL;
4141    I32 paren;
4142    regnode *start;
4143    regnode *end;
4144    U32 my_recursed_depth= recursed_depth;
4145
4146    if (OP(scan) != SUSPEND) {
4147     /* set the pointer */
4148     if (OP(scan) == GOSUB) {
4149      paren = ARG(scan);
4150      RExC_recurse[ARG2L(scan)] = scan;
4151      start = RExC_open_parens[paren-1];
4152      end   = RExC_close_parens[paren-1];
4153     } else {
4154      paren = 0;
4155      start = RExC_rxi->program + 1;
4156      end   = RExC_opend;
4157     }
4158     if (!recursed_depth
4159      ||
4160      !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4161     ) {
4162      if (!recursed_depth) {
4163       Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4164      } else {
4165       Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4166        RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4167        RExC_study_chunk_recursed_bytes, U8);
4168      }
4169      /* we havent recursed into this paren yet, so recurse into it */
4170      DEBUG_STUDYDATA("set:", data,depth);
4171      PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4172      my_recursed_depth= recursed_depth + 1;
4173      Newx(newframe,1,scan_frame);
4174     } else {
4175      DEBUG_STUDYDATA("inf:", data,depth);
4176      /* some form of infinite recursion, assume infinite length
4177      * */
4178      if (flags & SCF_DO_SUBSTR) {
4179       scan_commit(pRExC_state, data, minlenp, is_inf);
4180       data->longest = &(data->longest_float);
4181      }
4182      is_inf = is_inf_internal = 1;
4183      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4184       ssc_anything(data->start_class);
4185      flags &= ~SCF_DO_STCLASS;
4186     }
4187    } else {
4188     Newx(newframe,1,scan_frame);
4189     paren = stopparen;
4190     start = scan+2;
4191     end = regnext(scan);
4192    }
4193    if (newframe) {
4194     assert(start);
4195     assert(end);
4196     SAVEFREEPV(newframe);
4197     newframe->next = regnext(scan);
4198     newframe->last = last;
4199     newframe->stop = stopparen;
4200     newframe->prev = frame;
4201     newframe->prev_recursed_depth = recursed_depth;
4202
4203     DEBUG_STUDYDATA("frame-new:",data,depth);
4204     DEBUG_PEEP("fnew", scan, depth);
4205
4206     frame = newframe;
4207     scan =  start;
4208     stopparen = paren;
4209     last = end;
4210     depth = depth + 1;
4211     recursed_depth= my_recursed_depth;
4212
4213     continue;
4214    }
4215   }
4216   else if (OP(scan) == EXACT) {
4217    SSize_t l = STR_LEN(scan);
4218    UV uc;
4219    if (UTF) {
4220     const U8 * const s = (U8*)STRING(scan);
4221     uc = utf8_to_uvchr_buf(s, s + l, NULL);
4222     l = utf8_length(s, s + l);
4223    } else {
4224     uc = *((U8*)STRING(scan));
4225    }
4226    min += l;
4227    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4228     /* The code below prefers earlier match for fixed
4229     offset, later match for variable offset.  */
4230     if (data->last_end == -1) { /* Update the start info. */
4231      data->last_start_min = data->pos_min;
4232      data->last_start_max = is_inf
4233       ? SSize_t_MAX : data->pos_min + data->pos_delta;
4234     }
4235     sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4236     if (UTF)
4237      SvUTF8_on(data->last_found);
4238     {
4239      SV * const sv = data->last_found;
4240      MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4241       mg_find(sv, PERL_MAGIC_utf8) : NULL;
4242      if (mg && mg->mg_len >= 0)
4243       mg->mg_len += utf8_length((U8*)STRING(scan),
4244            (U8*)STRING(scan)+STR_LEN(scan));
4245     }
4246     data->last_end = data->pos_min + l;
4247     data->pos_min += l; /* As in the first entry. */
4248     data->flags &= ~SF_BEFORE_EOL;
4249    }
4250
4251    /* ANDing the code point leaves at most it, and not in locale, and
4252    * can't match null string */
4253    if (flags & SCF_DO_STCLASS_AND) {
4254     ssc_cp_and(data->start_class, uc);
4255     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4256     ssc_clear_locale(data->start_class);
4257    }
4258    else if (flags & SCF_DO_STCLASS_OR) {
4259     ssc_add_cp(data->start_class, uc);
4260     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4261
4262     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4263     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4264    }
4265    flags &= ~SCF_DO_STCLASS;
4266   }
4267   else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is
4268              EXACTFish */
4269    SSize_t l = STR_LEN(scan);
4270    UV uc = *((U8*)STRING(scan));
4271    SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4272              separate code points */
4273    const U8 * s = (U8*)STRING(scan);
4274
4275    /* Search for fixed substrings supports EXACT only. */
4276    if (flags & SCF_DO_SUBSTR) {
4277     assert(data);
4278     scan_commit(pRExC_state, data, minlenp, is_inf);
4279    }
4280    if (UTF) {
4281     uc = utf8_to_uvchr_buf(s, s + l, NULL);
4282     l = utf8_length(s, s + l);
4283    }
4284    if (unfolded_multi_char) {
4285     RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4286    }
4287    min += l - min_subtract;
4288    assert (min >= 0);
4289    delta += min_subtract;
4290    if (flags & SCF_DO_SUBSTR) {
4291     data->pos_min += l - min_subtract;
4292     if (data->pos_min < 0) {
4293      data->pos_min = 0;
4294     }
4295     data->pos_delta += min_subtract;
4296     if (min_subtract) {
4297      data->longest = &(data->longest_float);
4298     }
4299    }
4300
4301    if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
4302     ssc_clear_locale(data->start_class);
4303    }
4304
4305    if (! UTF) {
4306
4307     /* We punt and assume can match anything if the node begins
4308     * with a multi-character fold.  Things are complicated.  For
4309     * example, /ffi/i could match any of:
4310     *  "\N{LATIN SMALL LIGATURE FFI}"
4311     *  "\N{LATIN SMALL LIGATURE FF}I"
4312     *  "F\N{LATIN SMALL LIGATURE FI}"
4313     *  plus several other things; and making sure we have all the
4314     *  possibilities is hard. */
4315     if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
4316      EXACTF_invlist =
4317        _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4318     }
4319     else {
4320
4321      /* Any Latin1 range character can potentially match any
4322      * other depending on the locale */
4323      if (OP(scan) == EXACTFL) {
4324       _invlist_union(EXACTF_invlist, PL_Latin1,
4325                &EXACTF_invlist);
4326      }
4327      else {
4328       /* But otherwise, it matches at least itself.  We can
4329       * quickly tell if it has a distinct fold, and if so,
4330       * it matches that as well */
4331       EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4332       if (IS_IN_SOME_FOLD_L1(uc)) {
4333        EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4334               PL_fold_latin1[uc]);
4335       }
4336      }
4337
4338      /* Some characters match above-Latin1 ones under /i.  This
4339      * is true of EXACTFL ones when the locale is UTF-8 */
4340      if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
4341       && (! isASCII(uc) || (OP(scan) != EXACTFA
4342            && OP(scan) != EXACTFA_NO_TRIE)))
4343      {
4344       add_above_Latin1_folds(pRExC_state,
4345            (U8) uc,
4346            &EXACTF_invlist);
4347      }
4348     }
4349    }
4350    else {  /* Pattern is UTF-8 */
4351     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4352     STRLEN foldlen = UTF8SKIP(s);
4353     const U8* e = s + STR_LEN(scan);
4354     SV** listp;
4355
4356     /* The only code points that aren't folded in a UTF EXACTFish
4357     * node are are the problematic ones in EXACTFL nodes */
4358     if (OP(scan) == EXACTFL
4359      && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
4360     {
4361      /* We need to check for the possibility that this EXACTFL
4362      * node begins with a multi-char fold.  Therefore we fold
4363      * the first few characters of it so that we can make that
4364      * check */
4365      U8 *d = folded;
4366      int i;
4367
4368      for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
4369       if (isASCII(*s)) {
4370        *(d++) = (U8) toFOLD(*s);
4371        s++;
4372       }
4373       else {
4374        STRLEN len;
4375        to_utf8_fold(s, d, &len);
4376        d += len;
4377        s += UTF8SKIP(s);
4378       }
4379      }
4380
4381      /* And set up so the code below that looks in this folded
4382      * buffer instead of the node's string */
4383      e = d;
4384      foldlen = UTF8SKIP(folded);
4385      s = folded;
4386     }
4387
4388     /* When we reach here 's' points to the fold of the first
4389     * character(s) of the node; and 'e' points to far enough along
4390     * the folded string to be just past any possible multi-char
4391     * fold. 'foldlen' is the length in bytes of the first
4392     * character in 's'
4393     *
4394     * Unlike the non-UTF-8 case, the macro for determining if a
4395     * string is a multi-char fold requires all the characters to
4396     * already be folded.  This is because of all the complications
4397     * if not.  Note that they are folded anyway, except in EXACTFL
4398     * nodes.  Like the non-UTF case above, we punt if the node
4399     * begins with a multi-char fold  */
4400
4401     if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
4402      EXACTF_invlist =
4403        _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4404     }
4405     else {  /* Single char fold */
4406
4407      /* It matches all the things that fold to it, which are
4408      * found in PL_utf8_foldclosures (including itself) */
4409      EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4410      if (! PL_utf8_foldclosures) {
4411       _load_PL_utf8_foldclosures();
4412      }
4413      if ((listp = hv_fetch(PL_utf8_foldclosures,
4414           (char *) s, foldlen, FALSE)))
4415      {
4416       AV* list = (AV*) *listp;
4417       IV k;
4418       for (k = 0; k <= av_tindex(list); k++) {
4419        SV** c_p = av_fetch(list, k, FALSE);
4420        UV c;
4421        assert(c_p);
4422
4423        c = SvUV(*c_p);
4424
4425        /* /aa doesn't allow folds between ASCII and non- */
4426        if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
4427         && isASCII(c) != isASCII(uc))
4428        {
4429         continue;
4430        }
4431
4432        EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
4433       }
4434      }
4435     }
4436    }
4437    if (flags & SCF_DO_STCLASS_AND) {
4438     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4439     ANYOF_POSIXL_ZERO(data->start_class);
4440     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4441    }
4442    else if (flags & SCF_DO_STCLASS_OR) {
4443     ssc_union(data->start_class, EXACTF_invlist, FALSE);
4444     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4445
4446     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4447     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4448    }
4449    flags &= ~SCF_DO_STCLASS;
4450    SvREFCNT_dec(EXACTF_invlist);
4451   }
4452   else if (REGNODE_VARIES(OP(scan))) {
4453    SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4454    I32 fl = 0, f = flags;
4455    regnode * const oscan = scan;
4456    regnode_ssc this_class;
4457    regnode_ssc *oclass = NULL;
4458    I32 next_is_eval = 0;
4459
4460    switch (PL_regkind[OP(scan)]) {
4461    case WHILEM:  /* End of (?:...)* . */
4462     scan = NEXTOPER(scan);
4463     goto finish;
4464    case PLUS:
4465     if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4466      next = NEXTOPER(scan);
4467      if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4468       mincount = 1;
4469       maxcount = REG_INFTY;
4470       next = regnext(scan);
4471       scan = NEXTOPER(scan);
4472       goto do_curly;
4473      }
4474     }
4475     if (flags & SCF_DO_SUBSTR)
4476      data->pos_min++;
4477     min++;
4478     /* FALLTHROUGH */
4479    case STAR:
4480     if (flags & SCF_DO_STCLASS) {
4481      mincount = 0;
4482      maxcount = REG_INFTY;
4483      next = regnext(scan);
4484      scan = NEXTOPER(scan);
4485      goto do_curly;
4486     }
4487     if (flags & SCF_DO_SUBSTR) {
4488      scan_commit(pRExC_state, data, minlenp, is_inf);
4489      /* Cannot extend fixed substrings */
4490      data->longest = &(data->longest_float);
4491     }
4492     is_inf = is_inf_internal = 1;
4493     scan = regnext(scan);
4494     goto optimize_curly_tail;
4495    case CURLY:
4496     if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4497      && (scan->flags == stopparen))
4498     {
4499      mincount = 1;
4500      maxcount = 1;
4501     } else {
4502      mincount = ARG1(scan);
4503      maxcount = ARG2(scan);
4504     }
4505     next = regnext(scan);
4506     if (OP(scan) == CURLYX) {
4507      I32 lp = (data ? *(data->last_closep) : 0);
4508      scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4509     }
4510     scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4511     next_is_eval = (OP(scan) == EVAL);
4512    do_curly:
4513     if (flags & SCF_DO_SUBSTR) {
4514      if (mincount == 0)
4515       scan_commit(pRExC_state, data, minlenp, is_inf);
4516      /* Cannot extend fixed substrings */
4517      pos_before = data->pos_min;
4518     }
4519     if (data) {
4520      fl = data->flags;
4521      data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4522      if (is_inf)
4523       data->flags |= SF_IS_INF;
4524     }
4525     if (flags & SCF_DO_STCLASS) {
4526      ssc_init(pRExC_state, &this_class);
4527      oclass = data->start_class;
4528      data->start_class = &this_class;
4529      f |= SCF_DO_STCLASS_AND;
4530      f &= ~SCF_DO_STCLASS_OR;
4531     }
4532     /* Exclude from super-linear cache processing any {n,m}
4533     regops for which the combination of input pos and regex
4534     pos is not enough information to determine if a match
4535     will be possible.
4536
4537     For example, in the regex /foo(bar\s*){4,8}baz/ with the
4538     regex pos at the \s*, the prospects for a match depend not
4539     only on the input position but also on how many (bar\s*)
4540     repeats into the {4,8} we are. */
4541    if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4542      f &= ~SCF_WHILEM_VISITED_POS;
4543
4544     /* This will finish on WHILEM, setting scan, or on NULL: */
4545     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4546         last, data, stopparen, recursed_depth, NULL,
4547         (mincount == 0
4548         ? (f & ~SCF_DO_SUBSTR)
4549         : f)
4550         ,depth+1);
4551
4552     if (flags & SCF_DO_STCLASS)
4553      data->start_class = oclass;
4554     if (mincount == 0 || minnext == 0) {
4555      if (flags & SCF_DO_STCLASS_OR) {
4556       ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4557      }
4558      else if (flags & SCF_DO_STCLASS_AND) {
4559       /* Switch to OR mode: cache the old value of
4560       * data->start_class */
4561       INIT_AND_WITHP;
4562       StructCopy(data->start_class, and_withp, regnode_ssc);
4563       flags &= ~SCF_DO_STCLASS_AND;
4564       StructCopy(&this_class, data->start_class, regnode_ssc);
4565       flags |= SCF_DO_STCLASS_OR;
4566       ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4567      }
4568     } else {  /* Non-zero len */
4569      if (flags & SCF_DO_STCLASS_OR) {
4570       ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4571       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4572      }
4573      else if (flags & SCF_DO_STCLASS_AND)
4574       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4575      flags &= ~SCF_DO_STCLASS;
4576     }
4577     if (!scan)   /* It was not CURLYX, but CURLY. */
4578      scan = next;
4579     if (!(flags & SCF_TRIE_DOING_RESTUDY)
4580      /* ? quantifier ok, except for (?{ ... }) */
4581      && (next_is_eval || !(mincount == 0 && maxcount == 1))
4582      && (minnext == 0) && (deltanext == 0)
4583      && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4584      && maxcount <= REG_INFTY/3) /* Complement check for big
4585             count */
4586     {
4587      /* Fatal warnings may leak the regexp without this: */
4588      SAVEFREESV(RExC_rx_sv);
4589      ckWARNreg(RExC_parse,
4590        "Quantifier unexpected on zero-length expression");
4591      (void)ReREFCNT_inc(RExC_rx_sv);
4592     }
4593
4594     min += minnext * mincount;
4595     is_inf_internal |= deltanext == SSize_t_MAX
4596       || (maxcount == REG_INFTY && minnext + deltanext > 0);
4597     is_inf |= is_inf_internal;
4598     if (is_inf) {
4599      delta = SSize_t_MAX;
4600     } else {
4601      delta += (minnext + deltanext) * maxcount
4602        - minnext * mincount;
4603     }
4604     /* Try powerful optimization CURLYX => CURLYN. */
4605     if (  OP(oscan) == CURLYX && data
4606      && data->flags & SF_IN_PAR
4607      && !(data->flags & SF_HAS_EVAL)
4608      && !deltanext && minnext == 1 ) {
4609      /* Try to optimize to CURLYN.  */
4610      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4611      regnode * const nxt1 = nxt;
4612 #ifdef DEBUGGING
4613      regnode *nxt2;
4614 #endif
4615
4616      /* Skip open. */
4617      nxt = regnext(nxt);
4618      if (!REGNODE_SIMPLE(OP(nxt))
4619       && !(PL_regkind[OP(nxt)] == EXACT
4620        && STR_LEN(nxt) == 1))
4621       goto nogo;
4622 #ifdef DEBUGGING
4623      nxt2 = nxt;
4624 #endif
4625      nxt = regnext(nxt);
4626      if (OP(nxt) != CLOSE)
4627       goto nogo;
4628      if (RExC_open_parens) {
4629       RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4630       RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4631      }
4632      /* Now we know that nxt2 is the only contents: */
4633      oscan->flags = (U8)ARG(nxt);
4634      OP(oscan) = CURLYN;
4635      OP(nxt1) = NOTHING; /* was OPEN. */
4636
4637 #ifdef DEBUGGING
4638      OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4639      NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4640      NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4641      OP(nxt) = OPTIMIZED; /* was CLOSE. */
4642      OP(nxt + 1) = OPTIMIZED; /* was count. */
4643      NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4644 #endif
4645     }
4646    nogo:
4647
4648     /* Try optimization CURLYX => CURLYM. */
4649     if (  OP(oscan) == CURLYX && data
4650      && !(data->flags & SF_HAS_PAR)
4651      && !(data->flags & SF_HAS_EVAL)
4652      && !deltanext /* atom is fixed width */
4653      && minnext != 0 /* CURLYM can't handle zero width */
4654
4655       /* Nor characters whose fold at run-time may be
4656       * multi-character */
4657      && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4658     ) {
4659      /* XXXX How to optimize if data == 0? */
4660      /* Optimize to a simpler form.  */
4661      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4662      regnode *nxt2;
4663
4664      OP(oscan) = CURLYM;
4665      while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4666        && (OP(nxt2) != WHILEM))
4667       nxt = nxt2;
4668      OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4669      /* Need to optimize away parenths. */
4670      if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4671       /* Set the parenth number.  */
4672       regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4673
4674       oscan->flags = (U8)ARG(nxt);
4675       if (RExC_open_parens) {
4676        RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4677        RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4678       }
4679       OP(nxt1) = OPTIMIZED; /* was OPEN. */
4680       OP(nxt) = OPTIMIZED; /* was CLOSE. */
4681
4682 #ifdef DEBUGGING
4683       OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4684       OP(nxt + 1) = OPTIMIZED; /* was count. */
4685       NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4686       NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4687 #endif
4688 #if 0
4689       while ( nxt1 && (OP(nxt1) != WHILEM)) {
4690        regnode *nnxt = regnext(nxt1);
4691        if (nnxt == nxt) {
4692         if (reg_off_by_arg[OP(nxt1)])
4693          ARG_SET(nxt1, nxt2 - nxt1);
4694         else if (nxt2 - nxt1 < U16_MAX)
4695          NEXT_OFF(nxt1) = nxt2 - nxt1;
4696         else
4697          OP(nxt) = NOTHING; /* Cannot beautify */
4698        }
4699        nxt1 = nnxt;
4700       }
4701 #endif
4702       /* Optimize again: */
4703       study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4704          NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4705      }
4706      else
4707       oscan->flags = 0;
4708     }
4709     else if ((OP(oscan) == CURLYX)
4710       && (flags & SCF_WHILEM_VISITED_POS)
4711       /* See the comment on a similar expression above.
4712        However, this time it's not a subexpression
4713        we care about, but the expression itself. */
4714       && (maxcount == REG_INFTY)
4715       && data && ++data->whilem_c < 16) {
4716      /* This stays as CURLYX, we can put the count/of pair. */
4717      /* Find WHILEM (as in regexec.c) */
4718      regnode *nxt = oscan + NEXT_OFF(oscan);
4719
4720      if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4721       nxt += ARG(nxt);
4722      PREVOPER(nxt)->flags = (U8)(data->whilem_c
4723       | (RExC_whilem_seen << 4)); /* On WHILEM */
4724     }
4725     if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4726      pars++;
4727     if (flags & SCF_DO_SUBSTR) {
4728      SV *last_str = NULL;
4729      STRLEN last_chrs = 0;
4730      int counted = mincount != 0;
4731
4732      if (data->last_end > 0 && mincount != 0) { /* Ends with a
4733                 string. */
4734       SSize_t b = pos_before >= data->last_start_min
4735        ? pos_before : data->last_start_min;
4736       STRLEN l;
4737       const char * const s = SvPV_const(data->last_found, l);
4738       SSize_t old = b - data->last_start_min;
4739
4740       if (UTF)
4741        old = utf8_hop((U8*)s, old) - (U8*)s;
4742       l -= old;
4743       /* Get the added string: */
4744       last_str = newSVpvn_utf8(s  + old, l, UTF);
4745       last_chrs = UTF ? utf8_length((U8*)(s + old),
4746            (U8*)(s + old + l)) : l;
4747       if (deltanext == 0 && pos_before == b) {
4748        /* What was added is a constant string */
4749        if (mincount > 1) {
4750
4751         SvGROW(last_str, (mincount * l) + 1);
4752         repeatcpy(SvPVX(last_str) + l,
4753           SvPVX_const(last_str), l,
4754           mincount - 1);
4755         SvCUR_set(last_str, SvCUR(last_str) * mincount);
4756         /* Add additional parts. */
4757         SvCUR_set(data->last_found,
4758           SvCUR(data->last_found) - l);
4759         sv_catsv(data->last_found, last_str);
4760         {
4761          SV * sv = data->last_found;
4762          MAGIC *mg =
4763           SvUTF8(sv) && SvMAGICAL(sv) ?
4764           mg_find(sv, PERL_MAGIC_utf8) : NULL;
4765          if (mg && mg->mg_len >= 0)
4766           mg->mg_len += last_chrs * (mincount-1);
4767         }
4768         last_chrs *= mincount;
4769         data->last_end += l * (mincount - 1);
4770        }
4771       } else {
4772        /* start offset must point into the last copy */
4773        data->last_start_min += minnext * (mincount - 1);
4774        data->last_start_max += is_inf ? SSize_t_MAX
4775         : (maxcount - 1) * (minnext + data->pos_delta);
4776       }
4777      }
4778      /* It is counted once already... */
4779      data->pos_min += minnext * (mincount - counted);
4780 #if 0
4781 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
4782        " SSize_t_MAX=%"UVuf" minnext=%"UVuf
4783        " maxcount=%"UVuf" mincount=%"UVuf"\n",
4784  (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4785  (UV)mincount);
4786 if (deltanext != SSize_t_MAX)
4787 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
4788  (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4789   - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4790 #endif
4791      if (deltanext == SSize_t_MAX
4792       || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4793       data->pos_delta = SSize_t_MAX;
4794      else
4795       data->pos_delta += - counted * deltanext +
4796       (minnext + deltanext) * maxcount - minnext * mincount;
4797      if (mincount != maxcount) {
4798       /* Cannot extend fixed substrings found inside
4799        the group.  */
4800       scan_commit(pRExC_state, data, minlenp, is_inf);
4801       if (mincount && last_str) {
4802        SV * const sv = data->last_found;
4803        MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4804         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4805
4806        if (mg)
4807         mg->mg_len = -1;
4808        sv_setsv(sv, last_str);
4809        data->last_end = data->pos_min;
4810        data->last_start_min = data->pos_min - last_chrs;
4811        data->last_start_max = is_inf
4812         ? SSize_t_MAX
4813         : data->pos_min + data->pos_delta - last_chrs;
4814       }
4815       data->longest = &(data->longest_float);
4816      }
4817      SvREFCNT_dec(last_str);
4818     }
4819     if (data && (fl & SF_HAS_EVAL))
4820      data->flags |= SF_HAS_EVAL;
4821    optimize_curly_tail:
4822     if (OP(oscan) != CURLYX) {
4823      while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4824       && NEXT_OFF(next))
4825       NEXT_OFF(oscan) += NEXT_OFF(next);
4826     }
4827     continue;
4828
4829    default:
4830 #ifdef DEBUGGING
4831     Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4832                  OP(scan));
4833 #endif
4834    case REF:
4835    case CLUMP:
4836     if (flags & SCF_DO_SUBSTR) {
4837      /* Cannot expect anything... */
4838      scan_commit(pRExC_state, data, minlenp, is_inf);
4839      data->longest = &(data->longest_float);
4840     }
4841     is_inf = is_inf_internal = 1;
4842     if (flags & SCF_DO_STCLASS_OR) {
4843      if (OP(scan) == CLUMP) {
4844       /* Actually is any start char, but very few code points
4845       * aren't start characters */
4846       ssc_match_all_cp(data->start_class);
4847      }
4848      else {
4849       ssc_anything(data->start_class);
4850      }
4851     }
4852     flags &= ~SCF_DO_STCLASS;
4853     break;
4854    }
4855   }
4856   else if (OP(scan) == LNBREAK) {
4857    if (flags & SCF_DO_STCLASS) {
4858      if (flags & SCF_DO_STCLASS_AND) {
4859      ssc_intersection(data->start_class,
4860          PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4861      ssc_clear_locale(data->start_class);
4862      ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4863     }
4864     else if (flags & SCF_DO_STCLASS_OR) {
4865      ssc_union(data->start_class,
4866        PL_XPosix_ptrs[_CC_VERTSPACE],
4867        FALSE);
4868      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4869
4870      /* See commit msg for
4871      * 749e076fceedeb708a624933726e7989f2302f6a */
4872      ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4873     }
4874     flags &= ~SCF_DO_STCLASS;
4875    }
4876    min++;
4877    delta++;    /* Because of the 2 char string cr-lf */
4878    if (flags & SCF_DO_SUBSTR) {
4879     /* Cannot expect anything... */
4880     scan_commit(pRExC_state, data, minlenp, is_inf);
4881      data->pos_min += 1;
4882     data->pos_delta += 1;
4883     data->longest = &(data->longest_float);
4884     }
4885   }
4886   else if (REGNODE_SIMPLE(OP(scan))) {
4887
4888    if (flags & SCF_DO_SUBSTR) {
4889     scan_commit(pRExC_state, data, minlenp, is_inf);
4890     data->pos_min++;
4891    }
4892    min++;
4893    if (flags & SCF_DO_STCLASS) {
4894     bool invert = 0;
4895     SV* my_invlist = sv_2mortal(_new_invlist(0));
4896     U8 namedclass;
4897
4898     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4899     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4900
4901     /* Some of the logic below assumes that switching
4902     locale on will only add false positives. */
4903     switch (OP(scan)) {
4904
4905     default:
4906 #ifdef DEBUGGING
4907     Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4908                  OP(scan));
4909 #endif
4910     case CANY:
4911     case SANY:
4912      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4913       ssc_match_all_cp(data->start_class);
4914      break;
4915
4916     case REG_ANY:
4917      {
4918       SV* REG_ANY_invlist = _new_invlist(2);
4919       REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4920                '\n');
4921       if (flags & SCF_DO_STCLASS_OR) {
4922        ssc_union(data->start_class,
4923          REG_ANY_invlist,
4924          TRUE /* TRUE => invert, hence all but \n
4925            */
4926          );
4927       }
4928       else if (flags & SCF_DO_STCLASS_AND) {
4929        ssc_intersection(data->start_class,
4930            REG_ANY_invlist,
4931            TRUE  /* TRUE => invert */
4932            );
4933        ssc_clear_locale(data->start_class);
4934       }
4935       SvREFCNT_dec_NN(REG_ANY_invlist);
4936      }
4937      break;
4938
4939     case ANYOF:
4940      if (flags & SCF_DO_STCLASS_AND)
4941       ssc_and(pRExC_state, data->start_class,
4942         (regnode_charclass *) scan);
4943      else
4944       ssc_or(pRExC_state, data->start_class,
4945               (regnode_charclass *) scan);
4946      break;
4947
4948     case NPOSIXL:
4949      invert = 1;
4950      /* FALLTHROUGH */
4951
4952     case POSIXL:
4953      namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4954      if (flags & SCF_DO_STCLASS_AND) {
4955       bool was_there = cBOOL(
4956           ANYOF_POSIXL_TEST(data->start_class,
4957                 namedclass));
4958       ANYOF_POSIXL_ZERO(data->start_class);
4959       if (was_there) {    /* Do an AND */
4960        ANYOF_POSIXL_SET(data->start_class, namedclass);
4961       }
4962       /* No individual code points can now match */
4963       data->start_class->invlist
4964             = sv_2mortal(_new_invlist(0));
4965      }
4966      else {
4967       int complement = namedclass + ((invert) ? -1 : 1);
4968
4969       assert(flags & SCF_DO_STCLASS_OR);
4970
4971       /* If the complement of this class was already there,
4972       * the result is that they match all code points,
4973       * (\d + \D == everything).  Remove the classes from
4974       * future consideration.  Locale is not relevant in
4975       * this case */
4976       if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4977        ssc_match_all_cp(data->start_class);
4978        ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4979        ANYOF_POSIXL_CLEAR(data->start_class, complement);
4980       }
4981       else {  /* The usual case; just add this class to the
4982         existing set */
4983        ANYOF_POSIXL_SET(data->start_class, namedclass);
4984       }
4985      }
4986      break;
4987
4988     case NPOSIXA:   /* For these, we always know the exact set of
4989         what's matched */
4990      invert = 1;
4991      /* FALLTHROUGH */
4992     case POSIXA:
4993      if (FLAGS(scan) == _CC_ASCII) {
4994       my_invlist = PL_XPosix_ptrs[_CC_ASCII];
4995      }
4996      else {
4997       _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
4998            PL_XPosix_ptrs[_CC_ASCII],
4999            &my_invlist);
5000      }
5001      goto join_posix;
5002
5003     case NPOSIXD:
5004     case NPOSIXU:
5005      invert = 1;
5006      /* FALLTHROUGH */
5007     case POSIXD:
5008     case POSIXU:
5009      my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5010
5011      /* NPOSIXD matches all upper Latin1 code points unless the
5012      * target string being matched is UTF-8, which is
5013      * unknowable until match time.  Since we are going to
5014      * invert, we want to get rid of all of them so that the
5015      * inversion will match all */
5016      if (OP(scan) == NPOSIXD) {
5017       _invlist_subtract(my_invlist, PL_UpperLatin1,
5018           &my_invlist);
5019      }
5020
5021     join_posix:
5022
5023      if (flags & SCF_DO_STCLASS_AND) {
5024       ssc_intersection(data->start_class, my_invlist, invert);
5025       ssc_clear_locale(data->start_class);
5026      }
5027      else {
5028       assert(flags & SCF_DO_STCLASS_OR);
5029       ssc_union(data->start_class, my_invlist, invert);
5030      }
5031     }
5032     if (flags & SCF_DO_STCLASS_OR)
5033      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5034     flags &= ~SCF_DO_STCLASS;
5035    }
5036   }
5037   else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5038    data->flags |= (OP(scan) == MEOL
5039        ? SF_BEFORE_MEOL
5040        : SF_BEFORE_SEOL);
5041    scan_commit(pRExC_state, data, minlenp, is_inf);
5042
5043   }
5044   else if (  PL_regkind[OP(scan)] == BRANCHJ
5045     /* Lookbehind, or need to calculate parens/evals/stclass: */
5046     && (scan->flags || data || (flags & SCF_DO_STCLASS))
5047     && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
5048    if ( OP(scan) == UNLESSM &&
5049     scan->flags == 0 &&
5050     OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
5051     OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
5052    ) {
5053     regnode *opt;
5054     regnode *upto= regnext(scan);
5055     DEBUG_PARSE_r({
5056      SV * const mysv_val=sv_newmortal();
5057      DEBUG_STUDYDATA("OPFAIL",data,depth);
5058
5059      /*DEBUG_PARSE_MSG("opfail");*/
5060      regprop(RExC_rx, mysv_val, upto, NULL);
5061      PerlIO_printf(Perl_debug_log,
5062       "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
5063       SvPV_nolen_const(mysv_val),
5064       (IV)REG_NODE_NUM(upto),
5065       (IV)(upto - scan)
5066      );
5067     });
5068     OP(scan) = OPFAIL;
5069     NEXT_OFF(scan) = upto - scan;
5070     for (opt= scan + 1; opt < upto ; opt++)
5071      OP(opt) = OPTIMIZED;
5072     scan= upto;
5073     continue;
5074    }
5075    if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5076     || OP(scan) == UNLESSM )
5077    {
5078     /* Negative Lookahead/lookbehind
5079     In this case we can't do fixed string optimisation.
5080     */
5081
5082     SSize_t deltanext, minnext, fake = 0;
5083     regnode *nscan;
5084     regnode_ssc intrnl;
5085     int f = 0;
5086
5087     data_fake.flags = 0;
5088     if (data) {
5089      data_fake.whilem_c = data->whilem_c;
5090      data_fake.last_closep = data->last_closep;
5091     }
5092     else
5093      data_fake.last_closep = &fake;
5094     data_fake.pos_delta = delta;
5095     if ( flags & SCF_DO_STCLASS && !scan->flags
5096      && OP(scan) == IFMATCH ) { /* Lookahead */
5097      ssc_init(pRExC_state, &intrnl);
5098      data_fake.start_class = &intrnl;
5099      f |= SCF_DO_STCLASS_AND;
5100     }
5101     if (flags & SCF_WHILEM_VISITED_POS)
5102      f |= SCF_WHILEM_VISITED_POS;
5103     next = regnext(scan);
5104     nscan = NEXTOPER(NEXTOPER(scan));
5105     minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5106          last, &data_fake, stopparen,
5107          recursed_depth, NULL, f, depth+1);
5108     if (scan->flags) {
5109      if (deltanext) {
5110       FAIL("Variable length lookbehind not implemented");
5111      }
5112      else if (minnext > (I32)U8_MAX) {
5113       FAIL2("Lookbehind longer than %"UVuf" not implemented",
5114        (UV)U8_MAX);
5115      }
5116      scan->flags = (U8)minnext;
5117     }
5118     if (data) {
5119      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5120       pars++;
5121      if (data_fake.flags & SF_HAS_EVAL)
5122       data->flags |= SF_HAS_EVAL;
5123      data->whilem_c = data_fake.whilem_c;
5124     }
5125     if (f & SCF_DO_STCLASS_AND) {
5126      if (flags & SCF_DO_STCLASS_OR) {
5127       /* OR before, AND after: ideally we would recurse with
5128       * data_fake to get the AND applied by study of the
5129       * remainder of the pattern, and then derecurse;
5130       * *** HACK *** for now just treat as "no information".
5131       * See [perl #56690].
5132       */
5133       ssc_init(pRExC_state, data->start_class);
5134      }  else {
5135       /* AND before and after: combine and continue */
5136       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5137      }
5138     }
5139    }
5140 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5141    else {
5142     /* Positive Lookahead/lookbehind
5143     In this case we can do fixed string optimisation,
5144     but we must be careful about it. Note in the case of
5145     lookbehind the positions will be offset by the minimum
5146     length of the pattern, something we won't know about
5147     until after the recurse.
5148     */
5149     SSize_t deltanext, fake = 0;
5150     regnode *nscan;
5151     regnode_ssc intrnl;
5152     int f = 0;
5153     /* We use SAVEFREEPV so that when the full compile
5154      is finished perl will clean up the allocated
5155      minlens when it's all done. This way we don't
5156      have to worry about freeing them when we know
5157      they wont be used, which would be a pain.
5158     */
5159     SSize_t *minnextp;
5160     Newx( minnextp, 1, SSize_t );
5161     SAVEFREEPV(minnextp);
5162
5163     if (data) {
5164      StructCopy(data, &data_fake, scan_data_t);
5165      if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5166       f |= SCF_DO_SUBSTR;
5167       if (scan->flags)
5168        scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5169       data_fake.last_found=newSVsv(data->last_found);
5170      }
5171     }
5172     else
5173      data_fake.last_closep = &fake;
5174     data_fake.flags = 0;
5175     data_fake.pos_delta = delta;
5176     if (is_inf)
5177      data_fake.flags |= SF_IS_INF;
5178     if ( flags & SCF_DO_STCLASS && !scan->flags
5179      && OP(scan) == IFMATCH ) { /* Lookahead */
5180      ssc_init(pRExC_state, &intrnl);
5181      data_fake.start_class = &intrnl;
5182      f |= SCF_DO_STCLASS_AND;
5183     }
5184     if (flags & SCF_WHILEM_VISITED_POS)
5185      f |= SCF_WHILEM_VISITED_POS;
5186     next = regnext(scan);
5187     nscan = NEXTOPER(NEXTOPER(scan));
5188
5189     *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5190           &deltanext, last, &data_fake,
5191           stopparen, recursed_depth, NULL,
5192           f,depth+1);
5193     if (scan->flags) {
5194      if (deltanext) {
5195       FAIL("Variable length lookbehind not implemented");
5196      }
5197      else if (*minnextp > (I32)U8_MAX) {
5198       FAIL2("Lookbehind longer than %"UVuf" not implemented",
5199        (UV)U8_MAX);
5200      }
5201      scan->flags = (U8)*minnextp;
5202     }
5203
5204     *minnextp += min;
5205
5206     if (f & SCF_DO_STCLASS_AND) {
5207      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5208     }
5209     if (data) {
5210      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5211       pars++;
5212      if (data_fake.flags & SF_HAS_EVAL)
5213       data->flags |= SF_HAS_EVAL;
5214      data->whilem_c = data_fake.whilem_c;
5215      if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5216       if (RExC_rx->minlen<*minnextp)
5217        RExC_rx->minlen=*minnextp;
5218       scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5219       SvREFCNT_dec_NN(data_fake.last_found);
5220
5221       if ( data_fake.minlen_fixed != minlenp )
5222       {
5223        data->offset_fixed= data_fake.offset_fixed;
5224        data->minlen_fixed= data_fake.minlen_fixed;
5225        data->lookbehind_fixed+= scan->flags;
5226       }
5227       if ( data_fake.minlen_float != minlenp )
5228       {
5229        data->minlen_float= data_fake.minlen_float;
5230        data->offset_float_min=data_fake.offset_float_min;
5231        data->offset_float_max=data_fake.offset_float_max;
5232        data->lookbehind_float+= scan->flags;
5233       }
5234      }
5235     }
5236    }
5237 #endif
5238   }
5239   else if (OP(scan) == OPEN) {
5240    if (stopparen != (I32)ARG(scan))
5241     pars++;
5242   }
5243   else if (OP(scan) == CLOSE) {
5244    if (stopparen == (I32)ARG(scan)) {
5245     break;
5246    }
5247    if ((I32)ARG(scan) == is_par) {
5248     next = regnext(scan);
5249
5250     if ( next && (OP(next) != WHILEM) && next < last)
5251      is_par = 0;  /* Disable optimization */
5252    }
5253    if (data)
5254     *(data->last_closep) = ARG(scan);
5255   }
5256   else if (OP(scan) == EVAL) {
5257     if (data)
5258      data->flags |= SF_HAS_EVAL;
5259   }
5260   else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5261    if (flags & SCF_DO_SUBSTR) {
5262     scan_commit(pRExC_state, data, minlenp, is_inf);
5263     flags &= ~SCF_DO_SUBSTR;
5264    }
5265    if (data && OP(scan)==ACCEPT) {
5266     data->flags |= SCF_SEEN_ACCEPT;
5267     if (stopmin > min)
5268      stopmin = min;
5269    }
5270   }
5271   else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5272   {
5273     if (flags & SCF_DO_SUBSTR) {
5274      scan_commit(pRExC_state, data, minlenp, is_inf);
5275      data->longest = &(data->longest_float);
5276     }
5277     is_inf = is_inf_internal = 1;
5278     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5279      ssc_anything(data->start_class);
5280     flags &= ~SCF_DO_STCLASS;
5281   }
5282   else if (OP(scan) == GPOS) {
5283    if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5284     !(delta || is_inf || (data && data->pos_delta)))
5285    {
5286     if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5287      RExC_rx->intflags |= PREGf_ANCH_GPOS;
5288     if (RExC_rx->gofs < (STRLEN)min)
5289      RExC_rx->gofs = min;
5290    } else {
5291     RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5292     RExC_rx->gofs = 0;
5293    }
5294   }
5295 #ifdef TRIE_STUDY_OPT
5296 #ifdef FULL_TRIE_STUDY
5297   else if (PL_regkind[OP(scan)] == TRIE) {
5298    /* NOTE - There is similar code to this block above for handling
5299    BRANCH nodes on the initial study.  If you change stuff here
5300    check there too. */
5301    regnode *trie_node= scan;
5302    regnode *tail= regnext(scan);
5303    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5304    SSize_t max1 = 0, min1 = SSize_t_MAX;
5305    regnode_ssc accum;
5306
5307    if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5308     /* Cannot merge strings after this. */
5309     scan_commit(pRExC_state, data, minlenp, is_inf);
5310    }
5311    if (flags & SCF_DO_STCLASS)
5312     ssc_init_zero(pRExC_state, &accum);
5313
5314    if (!trie->jump) {
5315     min1= trie->minlen;
5316     max1= trie->maxlen;
5317    } else {
5318     const regnode *nextbranch= NULL;
5319     U32 word;
5320
5321     for ( word=1 ; word <= trie->wordcount ; word++)
5322     {
5323      SSize_t deltanext=0, minnext=0, f = 0, fake;
5324      regnode_ssc this_class;
5325
5326      data_fake.flags = 0;
5327      if (data) {
5328       data_fake.whilem_c = data->whilem_c;
5329       data_fake.last_closep = data->last_closep;
5330      }
5331      else
5332       data_fake.last_closep = &fake;
5333      data_fake.pos_delta = delta;
5334      if (flags & SCF_DO_STCLASS) {
5335       ssc_init(pRExC_state, &this_class);
5336       data_fake.start_class = &this_class;
5337       f = SCF_DO_STCLASS_AND;
5338      }
5339      if (flags & SCF_WHILEM_VISITED_POS)
5340       f |= SCF_WHILEM_VISITED_POS;
5341
5342      if (trie->jump[word]) {
5343       if (!nextbranch)
5344        nextbranch = trie_node + trie->jump[0];
5345       scan= trie_node + trie->jump[word];
5346       /* We go from the jump point to the branch that follows
5347       it. Note this means we need the vestigal unused
5348       branches even though they arent otherwise used. */
5349       minnext = study_chunk(pRExC_state, &scan, minlenp,
5350        &deltanext, (regnode *)nextbranch, &data_fake,
5351        stopparen, recursed_depth, NULL, f,depth+1);
5352      }
5353      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5354       nextbranch= regnext((regnode*)nextbranch);
5355
5356      if (min1 > (SSize_t)(minnext + trie->minlen))
5357       min1 = minnext + trie->minlen;
5358      if (deltanext == SSize_t_MAX) {
5359       is_inf = is_inf_internal = 1;
5360       max1 = SSize_t_MAX;
5361      } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5362       max1 = minnext + deltanext + trie->maxlen;
5363
5364      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5365       pars++;
5366      if (data_fake.flags & SCF_SEEN_ACCEPT) {
5367       if ( stopmin > min + min1)
5368        stopmin = min + min1;
5369       flags &= ~SCF_DO_SUBSTR;
5370       if (data)
5371        data->flags |= SCF_SEEN_ACCEPT;
5372      }
5373      if (data) {
5374       if (data_fake.flags & SF_HAS_EVAL)
5375        data->flags |= SF_HAS_EVAL;
5376       data->whilem_c = data_fake.whilem_c;
5377      }
5378      if (flags & SCF_DO_STCLASS)
5379       ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5380     }
5381    }
5382    if (flags & SCF_DO_SUBSTR) {
5383     data->pos_min += min1;
5384     data->pos_delta += max1 - min1;
5385     if (max1 != min1 || is_inf)
5386      data->longest = &(data->longest_float);
5387    }
5388    min += min1;
5389    delta += max1 - min1;
5390    if (flags & SCF_DO_STCLASS_OR) {
5391     ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5392     if (min1) {
5393      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5394      flags &= ~SCF_DO_STCLASS;
5395     }
5396    }
5397    else if (flags & SCF_DO_STCLASS_AND) {
5398     if (min1) {
5399      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5400      flags &= ~SCF_DO_STCLASS;
5401     }
5402     else {
5403      /* Switch to OR mode: cache the old value of
5404      * data->start_class */
5405      INIT_AND_WITHP;
5406      StructCopy(data->start_class, and_withp, regnode_ssc);
5407      flags &= ~SCF_DO_STCLASS_AND;
5408      StructCopy(&accum, data->start_class, regnode_ssc);
5409      flags |= SCF_DO_STCLASS_OR;
5410     }
5411    }
5412    scan= tail;
5413    continue;
5414   }
5415 #else
5416   else if (PL_regkind[OP(scan)] == TRIE) {
5417    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5418    U8*bang=NULL;
5419
5420    min += trie->minlen;
5421    delta += (trie->maxlen - trie->minlen);
5422    flags &= ~SCF_DO_STCLASS; /* xxx */
5423    if (flags & SCF_DO_SUBSTR) {
5424     /* Cannot expect anything... */
5425     scan_commit(pRExC_state, data, minlenp, is_inf);
5426      data->pos_min += trie->minlen;
5427      data->pos_delta += (trie->maxlen - trie->minlen);
5428     if (trie->maxlen != trie->minlen)
5429      data->longest = &(data->longest_float);
5430     }
5431     if (trie->jump) /* no more substrings -- for now /grr*/
5432    flags &= ~SCF_DO_SUBSTR;
5433   }
5434 #endif /* old or new */
5435 #endif /* TRIE_STUDY_OPT */
5436
5437   /* Else: zero-length, ignore. */
5438   scan = regnext(scan);
5439  }
5440  /* If we are exiting a recursion we can unset its recursed bit
5441  * and allow ourselves to enter it again - no danger of an
5442  * infinite loop there.
5443  if (stopparen > -1 && recursed) {
5444   DEBUG_STUDYDATA("unset:", data,depth);
5445   PAREN_UNSET( recursed, stopparen);
5446  }
5447  */
5448  if (frame) {
5449   DEBUG_STUDYDATA("frame-end:",data,depth);
5450   DEBUG_PEEP("fend", scan, depth);
5451   /* restore previous context */
5452   last = frame->last;
5453   scan = frame->next;
5454   stopparen = frame->stop;
5455   recursed_depth = frame->prev_recursed_depth;
5456   depth = depth - 1;
5457
5458   frame = frame->prev;
5459   goto fake_study_recurse;
5460  }
5461
5462   finish:
5463  assert(!frame);
5464  DEBUG_STUDYDATA("pre-fin:",data,depth);
5465
5466  *scanp = scan;
5467  *deltap = is_inf_internal ? SSize_t_MAX : delta;
5468
5469  if (flags & SCF_DO_SUBSTR && is_inf)
5470   data->pos_delta = SSize_t_MAX - data->pos_min;
5471  if (is_par > (I32)U8_MAX)
5472   is_par = 0;
5473  if (is_par && pars==1 && data) {
5474   data->flags |= SF_IN_PAR;
5475   data->flags &= ~SF_HAS_PAR;
5476  }
5477  else if (pars && data) {
5478   data->flags |= SF_HAS_PAR;
5479   data->flags &= ~SF_IN_PAR;
5480  }
5481  if (flags & SCF_DO_STCLASS_OR)
5482   ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5483  if (flags & SCF_TRIE_RESTUDY)
5484   data->flags |=  SCF_TRIE_RESTUDY;
5485
5486  DEBUG_STUDYDATA("post-fin:",data,depth);
5487
5488  {
5489   SSize_t final_minlen= min < stopmin ? min : stopmin;
5490
5491   if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5492    RExC_maxlen = final_minlen + delta;
5493   }
5494   return final_minlen;
5495  }
5496  /* not-reached */
5497 }
5498
5499 STATIC U32
5500 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5501 {
5502  U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5503
5504  PERL_ARGS_ASSERT_ADD_DATA;
5505
5506  Renewc(RExC_rxi->data,
5507   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5508   char, struct reg_data);
5509  if(count)
5510   Renew(RExC_rxi->data->what, count + n, U8);
5511  else
5512   Newx(RExC_rxi->data->what, n, U8);
5513  RExC_rxi->data->count = count + n;
5514  Copy(s, RExC_rxi->data->what + count, n, U8);
5515  return count;
5516 }
5517
5518 /*XXX: todo make this not included in a non debugging perl, but appears to be
5519  * used anyway there, in 'use re' */
5520 #ifndef PERL_IN_XSUB_RE
5521 void
5522 Perl_reginitcolors(pTHX)
5523 {
5524  const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5525  if (s) {
5526   char *t = savepv(s);
5527   int i = 0;
5528   PL_colors[0] = t;
5529   while (++i < 6) {
5530    t = strchr(t, '\t');
5531    if (t) {
5532     *t = '\0';
5533     PL_colors[i] = ++t;
5534    }
5535    else
5536     PL_colors[i] = t = (char *)"";
5537   }
5538  } else {
5539   int i = 0;
5540   while (i < 6)
5541    PL_colors[i++] = (char *)"";
5542  }
5543  PL_colorset = 1;
5544 }
5545 #endif
5546
5547
5548 #ifdef TRIE_STUDY_OPT
5549 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5550  STMT_START {                                            \
5551   if (                                                \
5552    (data.flags & SCF_TRIE_RESTUDY)               \
5553    && ! restudied++                              \
5554   ) {                                                 \
5555    dOsomething;                                    \
5556    goto reStudy;                                   \
5557   }                                                   \
5558  } STMT_END
5559 #else
5560 #define CHECK_RESTUDY_GOTO_butfirst
5561 #endif
5562
5563 /*
5564  * pregcomp - compile a regular expression into internal code
5565  *
5566  * Decides which engine's compiler to call based on the hint currently in
5567  * scope
5568  */
5569
5570 #ifndef PERL_IN_XSUB_RE
5571
5572 /* return the currently in-scope regex engine (or the default if none)  */
5573
5574 regexp_engine const *
5575 Perl_current_re_engine(pTHX)
5576 {
5577  if (IN_PERL_COMPILETIME) {
5578   HV * const table = GvHV(PL_hintgv);
5579   SV **ptr;
5580
5581   if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5582    return &reh_regexp_engine;
5583   ptr = hv_fetchs(table, "regcomp", FALSE);
5584   if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5585    return &reh_regexp_engine;
5586   return INT2PTR(regexp_engine*,SvIV(*ptr));
5587  }
5588  else {
5589   SV *ptr;
5590   if (!PL_curcop->cop_hints_hash)
5591    return &reh_regexp_engine;
5592   ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5593   if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5594    return &reh_regexp_engine;
5595   return INT2PTR(regexp_engine*,SvIV(ptr));
5596  }
5597 }
5598
5599
5600 REGEXP *
5601 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5602 {
5603  regexp_engine const *eng = current_re_engine();
5604  GET_RE_DEBUG_FLAGS_DECL;
5605
5606  PERL_ARGS_ASSERT_PREGCOMP;
5607
5608  /* Dispatch a request to compile a regexp to correct regexp engine. */
5609  DEBUG_COMPILE_r({
5610   PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5611       PTR2UV(eng));
5612  });
5613  return CALLREGCOMP_ENG(eng, pattern, flags);
5614 }
5615 #endif
5616
5617 /* public(ish) entry point for the perl core's own regex compiling code.
5618  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5619  * pattern rather than a list of OPs, and uses the internal engine rather
5620  * than the current one */
5621
5622 REGEXP *
5623 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5624 {
5625  SV *pat = pattern; /* defeat constness! */
5626  PERL_ARGS_ASSERT_RE_COMPILE;
5627  return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5628 #ifdef PERL_IN_XSUB_RE
5629         &my_reg_engine,
5630 #else
5631         &reh_regexp_engine,
5632 #endif
5633         NULL, NULL, rx_flags, 0);
5634 }
5635
5636
5637 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5638  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5639  * point to the realloced string and length.
5640  *
5641  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5642  * stuff added */
5643
5644 static void
5645 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5646      char **pat_p, STRLEN *plen_p, int num_code_blocks)
5647 {
5648  U8 *const src = (U8*)*pat_p;
5649  U8 *dst;
5650  int n=0;
5651  STRLEN s = 0, d = 0;
5652  bool do_end = 0;
5653  GET_RE_DEBUG_FLAGS_DECL;
5654
5655  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5656   "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5657
5658  Newx(dst, *plen_p * 2 + 1, U8);
5659
5660  while (s < *plen_p) {
5661   if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5662    dst[d]   = src[s];
5663   else {
5664    dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5665    dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
5666   }
5667   if (n < num_code_blocks) {
5668    if (!do_end && pRExC_state->code_blocks[n].start == s) {
5669     pRExC_state->code_blocks[n].start = d;
5670     assert(dst[d] == '(');
5671     do_end = 1;
5672    }
5673    else if (do_end && pRExC_state->code_blocks[n].end == s) {
5674     pRExC_state->code_blocks[n].end = d;
5675     assert(dst[d] == ')');
5676     do_end = 0;
5677     n++;
5678    }
5679   }
5680   s++;
5681   d++;
5682  }
5683  dst[d] = '\0';
5684  *plen_p = d;
5685  *pat_p = (char*) dst;
5686  SAVEFREEPV(*pat_p);
5687  RExC_orig_utf8 = RExC_utf8 = 1;
5688 }
5689
5690
5691
5692 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5693  * while recording any code block indices, and handling overloading,
5694  * nested qr// objects etc.  If pat is null, it will allocate a new
5695  * string, or just return the first arg, if there's only one.
5696  *
5697  * Returns the malloced/updated pat.
5698  * patternp and pat_count is the array of SVs to be concatted;
5699  * oplist is the optional list of ops that generated the SVs;
5700  * recompile_p is a pointer to a boolean that will be set if
5701  *   the regex will need to be recompiled.
5702  * delim, if non-null is an SV that will be inserted between each element
5703  */
5704
5705 static SV*
5706 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5707     SV *pat, SV ** const patternp, int pat_count,
5708     OP *oplist, bool *recompile_p, SV *delim)
5709 {
5710  SV **svp;
5711  int n = 0;
5712  bool use_delim = FALSE;
5713  bool alloced = FALSE;
5714
5715  /* if we know we have at least two args, create an empty string,
5716  * then concatenate args to that. For no args, return an empty string */
5717  if (!pat && pat_count != 1) {
5718   pat = newSVpvs("");
5719   SAVEFREESV(pat);
5720   alloced = TRUE;
5721  }
5722
5723  for (svp = patternp; svp < patternp + pat_count; svp++) {
5724   SV *sv;
5725   SV *rx  = NULL;
5726   STRLEN orig_patlen = 0;
5727   bool code = 0;
5728   SV *msv = use_delim ? delim : *svp;
5729   if (!msv) msv = &PL_sv_undef;
5730
5731   /* if we've got a delimiter, we go round the loop twice for each
5732   * svp slot (except the last), using the delimiter the second
5733   * time round */
5734   if (use_delim) {
5735    svp--;
5736    use_delim = FALSE;
5737   }
5738   else if (delim)
5739    use_delim = TRUE;
5740
5741   if (SvTYPE(msv) == SVt_PVAV) {
5742    /* we've encountered an interpolated array within
5743    * the pattern, e.g. /...@a..../. Expand the list of elements,
5744    * then recursively append elements.
5745    * The code in this block is based on S_pushav() */
5746
5747    AV *const av = (AV*)msv;
5748    const SSize_t maxarg = AvFILL(av) + 1;
5749    SV **array;
5750
5751    if (oplist) {
5752     assert(oplist->op_type == OP_PADAV
5753      || oplist->op_type == OP_RV2AV);
5754     oplist = OP_SIBLING(oplist);
5755    }
5756
5757    if (SvRMAGICAL(av)) {
5758     SSize_t i;
5759
5760     Newx(array, maxarg, SV*);
5761     SAVEFREEPV(array);
5762     for (i=0; i < maxarg; i++) {
5763      SV ** const svp = av_fetch(av, i, FALSE);
5764      array[i] = svp ? *svp : &PL_sv_undef;
5765     }
5766    }
5767    else
5768     array = AvARRAY(av);
5769
5770    pat = S_concat_pat(aTHX_ pRExC_state, pat,
5771         array, maxarg, NULL, recompile_p,
5772         /* $" */
5773         GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5774
5775    continue;
5776   }
5777
5778
5779   /* we make the assumption here that each op in the list of
5780   * op_siblings maps to one SV pushed onto the stack,
5781   * except for code blocks, with have both an OP_NULL and
5782   * and OP_CONST.
5783   * This allows us to match up the list of SVs against the
5784   * list of OPs to find the next code block.
5785   *
5786   * Note that       PUSHMARK PADSV PADSV ..
5787   * is optimised to
5788   *                 PADRANGE PADSV  PADSV  ..
5789   * so the alignment still works. */
5790
5791   if (oplist) {
5792    if (oplist->op_type == OP_NULL
5793     && (oplist->op_flags & OPf_SPECIAL))
5794    {
5795     assert(n < pRExC_state->num_code_blocks);
5796     pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5797     pRExC_state->code_blocks[n].block = oplist;
5798     pRExC_state->code_blocks[n].src_regex = NULL;
5799     n++;
5800     code = 1;
5801     oplist = OP_SIBLING(oplist); /* skip CONST */
5802     assert(oplist);
5803    }
5804    oplist = OP_SIBLING(oplist);;
5805   }
5806
5807   /* apply magic and QR overloading to arg */
5808
5809   SvGETMAGIC(msv);
5810   if (SvROK(msv) && SvAMAGIC(msv)) {
5811    SV *sv = AMG_CALLunary(msv, regexp_amg);
5812    if (sv) {
5813     if (SvROK(sv))
5814      sv = SvRV(sv);
5815     if (SvTYPE(sv) != SVt_REGEXP)
5816      Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5817     msv = sv;
5818    }
5819   }
5820
5821   /* try concatenation overload ... */
5822   if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5823     (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5824   {
5825    sv_setsv(pat, sv);
5826    /* overloading involved: all bets are off over literal
5827    * code. Pretend we haven't seen it */
5828    pRExC_state->num_code_blocks -= n;
5829    n = 0;
5830   }
5831   else  {
5832    /* ... or failing that, try "" overload */
5833    while (SvAMAGIC(msv)
5834      && (sv = AMG_CALLunary(msv, string_amg))
5835      && sv != msv
5836      &&  !(   SvROK(msv)
5837       && SvROK(sv)
5838       && SvRV(msv) == SvRV(sv))
5839    ) {
5840     msv = sv;
5841     SvGETMAGIC(msv);
5842    }
5843    if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5844     msv = SvRV(msv);
5845
5846    if (pat) {
5847     /* this is a partially unrolled
5848     *     sv_catsv_nomg(pat, msv);
5849     * that allows us to adjust code block indices if
5850     * needed */
5851     STRLEN dlen;
5852     char *dst = SvPV_force_nomg(pat, dlen);
5853     orig_patlen = dlen;
5854     if (SvUTF8(msv) && !SvUTF8(pat)) {
5855      S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5856      sv_setpvn(pat, dst, dlen);
5857      SvUTF8_on(pat);
5858     }
5859     sv_catsv_nomg(pat, msv);
5860     rx = msv;
5861    }
5862    else
5863     pat = msv;
5864
5865    if (code)
5866     pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5867   }
5868
5869   /* extract any code blocks within any embedded qr//'s */
5870   if (rx && SvTYPE(rx) == SVt_REGEXP
5871    && RX_ENGINE((REGEXP*)rx)->op_comp)
5872   {
5873
5874    RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5875    if (ri->num_code_blocks) {
5876     int i;
5877     /* the presence of an embedded qr// with code means
5878     * we should always recompile: the text of the
5879     * qr// may not have changed, but it may be a
5880     * different closure than last time */
5881     *recompile_p = 1;
5882     Renew(pRExC_state->code_blocks,
5883      pRExC_state->num_code_blocks + ri->num_code_blocks,
5884      struct reg_code_block);
5885     pRExC_state->num_code_blocks += ri->num_code_blocks;
5886
5887     for (i=0; i < ri->num_code_blocks; i++) {
5888      struct reg_code_block *src, *dst;
5889      STRLEN offset =  orig_patlen
5890       + ReANY((REGEXP *)rx)->pre_prefix;
5891      assert(n < pRExC_state->num_code_blocks);
5892      src = &ri->code_blocks[i];
5893      dst = &pRExC_state->code_blocks[n];
5894      dst->start     = src->start + offset;
5895      dst->end     = src->end   + offset;
5896      dst->block     = src->block;
5897      dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5898            src->src_regex
5899             ? src->src_regex
5900             : (REGEXP*)rx);
5901      n++;
5902     }
5903    }
5904   }
5905  }
5906  /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5907  if (alloced)
5908   SvSETMAGIC(pat);
5909
5910  return pat;
5911 }
5912
5913
5914
5915 /* see if there are any run-time code blocks in the pattern.
5916  * False positives are allowed */
5917
5918 static bool
5919 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5920      char *pat, STRLEN plen)
5921 {
5922  int n = 0;
5923  STRLEN s;
5924
5925  PERL_UNUSED_CONTEXT;
5926
5927  for (s = 0; s < plen; s++) {
5928   if (n < pRExC_state->num_code_blocks
5929    && s == pRExC_state->code_blocks[n].start)
5930   {
5931    s = pRExC_state->code_blocks[n].end;
5932    n++;
5933    continue;
5934   }
5935   /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5936   * positives here */
5937   if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5938    (pat[s+2] == '{'
5939     || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5940   )
5941    return 1;
5942  }
5943  return 0;
5944 }
5945
5946 /* Handle run-time code blocks. We will already have compiled any direct
5947  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5948  * copy of it, but with any literal code blocks blanked out and
5949  * appropriate chars escaped; then feed it into
5950  *
5951  *    eval "qr'modified_pattern'"
5952  *
5953  * For example,
5954  *
5955  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5956  *
5957  * becomes
5958  *
5959  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5960  *
5961  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5962  * and merge them with any code blocks of the original regexp.
5963  *
5964  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5965  * instead, just save the qr and return FALSE; this tells our caller that
5966  * the original pattern needs upgrading to utf8.
5967  */
5968
5969 static bool
5970 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5971  char *pat, STRLEN plen)
5972 {
5973  SV *qr;
5974
5975  GET_RE_DEBUG_FLAGS_DECL;
5976
5977  if (pRExC_state->runtime_code_qr) {
5978   /* this is the second time we've been called; this should
5979   * only happen if the main pattern got upgraded to utf8
5980   * during compilation; re-use the qr we compiled first time
5981   * round (which should be utf8 too)
5982   */
5983   qr = pRExC_state->runtime_code_qr;
5984   pRExC_state->runtime_code_qr = NULL;
5985   assert(RExC_utf8 && SvUTF8(qr));
5986  }
5987  else {
5988   int n = 0;
5989   STRLEN s;
5990   char *p, *newpat;
5991   int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5992   SV *sv, *qr_ref;
5993   dSP;
5994
5995   /* determine how many extra chars we need for ' and \ escaping */
5996   for (s = 0; s < plen; s++) {
5997    if (pat[s] == '\'' || pat[s] == '\\')
5998     newlen++;
5999   }
6000
6001   Newx(newpat, newlen, char);
6002   p = newpat;
6003   *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6004
6005   for (s = 0; s < plen; s++) {
6006    if (n < pRExC_state->num_code_blocks
6007     && s == pRExC_state->code_blocks[n].start)
6008    {
6009     /* blank out literal code block */
6010     assert(pat[s] == '(');
6011     while (s <= pRExC_state->code_blocks[n].end) {
6012      *p++ = '_';
6013      s++;
6014     }
6015     s--;
6016     n++;
6017     continue;
6018    }
6019    if (pat[s] == '\'' || pat[s] == '\\')
6020     *p++ = '\\';
6021    *p++ = pat[s];
6022   }
6023   *p++ = '\'';
6024   if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6025    *p++ = 'x';
6026   *p++ = '\0';
6027   DEBUG_COMPILE_r({
6028    PerlIO_printf(Perl_debug_log,
6029     "%sre-parsing pattern for runtime code:%s %s\n",
6030     PL_colors[4],PL_colors[5],newpat);
6031   });
6032
6033   sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6034   Safefree(newpat);
6035
6036   ENTER;
6037   SAVETMPS;
6038   save_re_context();
6039   PUSHSTACKi(PERLSI_REQUIRE);
6040   /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6041   * parsing qr''; normally only q'' does this. It also alters
6042   * hints handling */
6043   eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6044   SvREFCNT_dec_NN(sv);
6045   SPAGAIN;
6046   qr_ref = POPs;
6047   PUTBACK;
6048   {
6049    SV * const errsv = ERRSV;
6050    if (SvTRUE_NN(errsv))
6051    {
6052     Safefree(pRExC_state->code_blocks);
6053     /* use croak_sv ? */
6054     Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6055    }
6056   }
6057   assert(SvROK(qr_ref));
6058   qr = SvRV(qr_ref);
6059   assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6060   /* the leaving below frees the tmp qr_ref.
6061   * Give qr a life of its own */
6062   SvREFCNT_inc(qr);
6063   POPSTACK;
6064   FREETMPS;
6065   LEAVE;
6066
6067  }
6068
6069  if (!RExC_utf8 && SvUTF8(qr)) {
6070   /* first time through; the pattern got upgraded; save the
6071   * qr for the next time through */
6072   assert(!pRExC_state->runtime_code_qr);
6073   pRExC_state->runtime_code_qr = qr;
6074   return 0;
6075  }
6076
6077
6078  /* extract any code blocks within the returned qr//  */
6079
6080
6081  /* merge the main (r1) and run-time (r2) code blocks into one */
6082  {
6083   RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6084   struct reg_code_block *new_block, *dst;
6085   RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6086   int i1 = 0, i2 = 0;
6087
6088   if (!r2->num_code_blocks) /* we guessed wrong */
6089   {
6090    SvREFCNT_dec_NN(qr);
6091    return 1;
6092   }
6093
6094   Newx(new_block,
6095    r1->num_code_blocks + r2->num_code_blocks,
6096    struct reg_code_block);
6097   dst = new_block;
6098
6099   while (    i1 < r1->num_code_blocks
6100     || i2 < r2->num_code_blocks)
6101   {
6102    struct reg_code_block *src;
6103    bool is_qr = 0;
6104
6105    if (i1 == r1->num_code_blocks) {
6106     src = &r2->code_blocks[i2++];
6107     is_qr = 1;
6108    }
6109    else if (i2 == r2->num_code_blocks)
6110     src = &r1->code_blocks[i1++];
6111    else if (  r1->code_blocks[i1].start
6112      < r2->code_blocks[i2].start)
6113    {
6114     src = &r1->code_blocks[i1++];
6115     assert(src->end < r2->code_blocks[i2].start);
6116    }
6117    else {
6118     assert(  r1->code_blocks[i1].start
6119      > r2->code_blocks[i2].start);
6120     src = &r2->code_blocks[i2++];
6121     is_qr = 1;
6122     assert(src->end < r1->code_blocks[i1].start);
6123    }
6124
6125    assert(pat[src->start] == '(');
6126    assert(pat[src->end]   == ')');
6127    dst->start     = src->start;
6128    dst->end     = src->end;
6129    dst->block     = src->block;
6130    dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6131          : src->src_regex;
6132    dst++;
6133   }
6134   r1->num_code_blocks += r2->num_code_blocks;
6135   Safefree(r1->code_blocks);
6136   r1->code_blocks = new_block;
6137  }
6138
6139  SvREFCNT_dec_NN(qr);
6140  return 1;
6141 }
6142
6143
6144 STATIC bool
6145 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6146      SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6147      SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6148      STRLEN longest_length, bool eol, bool meol)
6149 {
6150  /* This is the common code for setting up the floating and fixed length
6151  * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6152  * as to whether succeeded or not */
6153
6154  I32 t;
6155  SSize_t ml;
6156
6157  if (! (longest_length
6158   || (eol /* Can't have SEOL and MULTI */
6159    && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6160   )
6161    /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6162   || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6163  {
6164   return FALSE;
6165  }
6166
6167  /* copy the information about the longest from the reg_scan_data
6168   over to the program. */
6169  if (SvUTF8(sv_longest)) {
6170   *rx_utf8 = sv_longest;
6171   *rx_substr = NULL;
6172  } else {
6173   *rx_substr = sv_longest;
6174   *rx_utf8 = NULL;
6175  }
6176  /* end_shift is how many chars that must be matched that
6177   follow this item. We calculate it ahead of time as once the
6178   lookbehind offset is added in we lose the ability to correctly
6179   calculate it.*/
6180  ml = minlen ? *(minlen) : (SSize_t)longest_length;
6181  *rx_end_shift = ml - offset
6182   - longest_length + (SvTAIL(sv_longest) != 0)
6183   + lookbehind;
6184
6185  t = (eol/* Can't have SEOL and MULTI */
6186   && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6187  fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6188
6189  return TRUE;
6190 }
6191
6192 /*
6193  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6194  * regular expression into internal code.
6195  * The pattern may be passed either as:
6196  *    a list of SVs (patternp plus pat_count)
6197  *    a list of OPs (expr)
6198  * If both are passed, the SV list is used, but the OP list indicates
6199  * which SVs are actually pre-compiled code blocks
6200  *
6201  * The SVs in the list have magic and qr overloading applied to them (and
6202  * the list may be modified in-place with replacement SVs in the latter
6203  * case).
6204  *
6205  * If the pattern hasn't changed from old_re, then old_re will be
6206  * returned.
6207  *
6208  * eng is the current engine. If that engine has an op_comp method, then
6209  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6210  * do the initial concatenation of arguments and pass on to the external
6211  * engine.
6212  *
6213  * If is_bare_re is not null, set it to a boolean indicating whether the
6214  * arg list reduced (after overloading) to a single bare regex which has
6215  * been returned (i.e. /$qr/).
6216  *
6217  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6218  *
6219  * pm_flags contains the PMf_* flags, typically based on those from the
6220  * pm_flags field of the related PMOP. Currently we're only interested in
6221  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6222  *
6223  * We can't allocate space until we know how big the compiled form will be,
6224  * but we can't compile it (and thus know how big it is) until we've got a
6225  * place to put the code.  So we cheat:  we compile it twice, once with code
6226  * generation turned off and size counting turned on, and once "for real".
6227  * This also means that we don't allocate space until we are sure that the
6228  * thing really will compile successfully, and we never have to move the
6229  * code and thus invalidate pointers into it.  (Note that it has to be in
6230  * one piece because free() must be able to free it all.) [NB: not true in perl]
6231  *
6232  * Beware that the optimization-preparation code in here knows about some
6233  * of the structure of the compiled regexp.  [I'll say.]
6234  */
6235
6236 REGEXP *
6237 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6238      OP *expr, const regexp_engine* eng, REGEXP *old_re,
6239      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6240 {
6241  REGEXP *rx;
6242  struct regexp *r;
6243  regexp_internal *ri;
6244  STRLEN plen;
6245  char *exp;
6246  regnode *scan;
6247  I32 flags;
6248  SSize_t minlen = 0;
6249  U32 rx_flags;
6250  SV *pat;
6251  SV *code_blocksv = NULL;
6252  SV** new_patternp = patternp;
6253
6254  /* these are all flags - maybe they should be turned
6255  * into a single int with different bit masks */
6256  I32 sawlookahead = 0;
6257  I32 sawplus = 0;
6258  I32 sawopen = 0;
6259  I32 sawminmod = 0;
6260
6261  regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6262  bool recompile = 0;
6263  bool runtime_code = 0;
6264  scan_data_t data;
6265  RExC_state_t RExC_state;
6266  RExC_state_t * const pRExC_state = &RExC_state;
6267 #ifdef TRIE_STUDY_OPT
6268  int restudied = 0;
6269  RExC_state_t copyRExC_state;
6270 #endif
6271  GET_RE_DEBUG_FLAGS_DECL;
6272
6273  PERL_ARGS_ASSERT_RE_OP_COMPILE;
6274
6275  DEBUG_r(if (!PL_colorset) reginitcolors());
6276
6277 #ifndef PERL_IN_XSUB_RE
6278  /* Initialize these here instead of as-needed, as is quick and avoids
6279  * having to test them each time otherwise */
6280  if (! PL_AboveLatin1) {
6281   PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6282   PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6283   PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6284   PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6285   PL_HasMultiCharFold =
6286      _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6287  }
6288 #endif
6289
6290  pRExC_state->code_blocks = NULL;
6291  pRExC_state->num_code_blocks = 0;
6292
6293  if (is_bare_re)
6294   *is_bare_re = FALSE;
6295
6296  if (expr && (expr->op_type == OP_LIST ||
6297     (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6298   /* allocate code_blocks if needed */
6299   OP *o;
6300   int ncode = 0;
6301
6302   for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o))
6303    if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6304     ncode++; /* count of DO blocks */
6305   if (ncode) {
6306    pRExC_state->num_code_blocks = ncode;
6307    Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6308   }
6309  }
6310
6311  if (!pat_count) {
6312   /* compile-time pattern with just OP_CONSTs and DO blocks */
6313
6314   int n;
6315   OP *o;
6316
6317   /* find how many CONSTs there are */
6318   assert(expr);
6319   n = 0;
6320   if (expr->op_type == OP_CONST)
6321    n = 1;
6322   else
6323    for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6324     if (o->op_type == OP_CONST)
6325      n++;
6326    }
6327
6328   /* fake up an SV array */
6329
6330   assert(!new_patternp);
6331   Newx(new_patternp, n, SV*);
6332   SAVEFREEPV(new_patternp);
6333   pat_count = n;
6334
6335   n = 0;
6336   if (expr->op_type == OP_CONST)
6337    new_patternp[n] = cSVOPx_sv(expr);
6338   else
6339    for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6340     if (o->op_type == OP_CONST)
6341      new_patternp[n++] = cSVOPo_sv;
6342    }
6343
6344  }
6345
6346  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6347   "Assembling pattern from %d elements%s\n", pat_count,
6348    orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6349
6350  /* set expr to the first arg op */
6351
6352  if (pRExC_state->num_code_blocks
6353   && expr->op_type != OP_CONST)
6354  {
6355    expr = cLISTOPx(expr)->op_first;
6356    assert(   expr->op_type == OP_PUSHMARK
6357     || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6358     || expr->op_type == OP_PADRANGE);
6359    expr = OP_SIBLING(expr);
6360  }
6361
6362  pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6363       expr, &recompile, NULL);
6364
6365  /* handle bare (possibly after overloading) regex: foo =~ $re */
6366  {
6367   SV *re = pat;
6368   if (SvROK(re))
6369    re = SvRV(re);
6370   if (SvTYPE(re) == SVt_REGEXP) {
6371    if (is_bare_re)
6372     *is_bare_re = TRUE;
6373    SvREFCNT_inc(re);
6374    Safefree(pRExC_state->code_blocks);
6375    DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6376     "Precompiled pattern%s\n",
6377      orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6378
6379    return (REGEXP*)re;
6380   }
6381  }
6382
6383  exp = SvPV_nomg(pat, plen);
6384
6385  if (!eng->op_comp) {
6386   if ((SvUTF8(pat) && IN_BYTES)
6387     || SvGMAGICAL(pat) || SvAMAGIC(pat))
6388   {
6389    /* make a temporary copy; either to convert to bytes,
6390    * or to avoid repeating get-magic / overloaded stringify */
6391    pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6392           (IN_BYTES ? 0 : SvUTF8(pat)));
6393   }
6394   Safefree(pRExC_state->code_blocks);
6395   return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6396  }
6397
6398  /* ignore the utf8ness if the pattern is 0 length */
6399  RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6400  RExC_uni_semantics = 0;
6401  RExC_contains_locale = 0;
6402  RExC_contains_i = 0;
6403  pRExC_state->runtime_code_qr = NULL;
6404
6405  DEBUG_COMPILE_r({
6406    SV *dsv= sv_newmortal();
6407    RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6408    PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6409       PL_colors[4],PL_colors[5],s);
6410   });
6411
6412   redo_first_pass:
6413  /* we jump here if we upgrade the pattern to utf8 and have to
6414  * recompile */
6415
6416  if ((pm_flags & PMf_USE_RE_EVAL)
6417     /* this second condition covers the non-regex literal case,
6418     * i.e.  $foo =~ '(?{})'. */
6419     || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6420  )
6421   runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6422
6423  /* return old regex if pattern hasn't changed */
6424  /* XXX: note in the below we have to check the flags as well as the
6425  * pattern.
6426  *
6427  * Things get a touch tricky as we have to compare the utf8 flag
6428  * independently from the compile flags.  */
6429
6430  if (   old_re
6431   && !recompile
6432   && !!RX_UTF8(old_re) == !!RExC_utf8
6433   && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6434   && RX_PRECOMP(old_re)
6435   && RX_PRELEN(old_re) == plen
6436   && memEQ(RX_PRECOMP(old_re), exp, plen)
6437   && !runtime_code /* with runtime code, always recompile */ )
6438  {
6439   Safefree(pRExC_state->code_blocks);
6440   return old_re;
6441  }
6442
6443  rx_flags = orig_rx_flags;
6444
6445  if (rx_flags & PMf_FOLD) {
6446   RExC_contains_i = 1;
6447  }
6448  if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6449
6450   /* Set to use unicode semantics if the pattern is in utf8 and has the
6451   * 'depends' charset specified, as it means unicode when utf8  */
6452   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6453  }
6454
6455  RExC_precomp = exp;
6456  RExC_flags = rx_flags;
6457  RExC_pm_flags = pm_flags;
6458
6459  if (runtime_code) {
6460   if (TAINTING_get && TAINT_get)
6461    Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6462
6463   if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6464    /* whoops, we have a non-utf8 pattern, whilst run-time code
6465    * got compiled as utf8. Try again with a utf8 pattern */
6466    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6467          pRExC_state->num_code_blocks);
6468    goto redo_first_pass;
6469   }
6470  }
6471  assert(!pRExC_state->runtime_code_qr);
6472
6473  RExC_sawback = 0;
6474
6475  RExC_seen = 0;
6476  RExC_maxlen = 0;
6477  RExC_in_lookbehind = 0;
6478  RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6479  RExC_extralen = 0;
6480  RExC_override_recoding = 0;
6481  RExC_in_multi_char_class = 0;
6482
6483  /* First pass: determine size, legality. */
6484  RExC_parse = exp;
6485  RExC_start = exp;
6486  RExC_end = exp + plen;
6487  RExC_naughty = 0;
6488  RExC_npar = 1;
6489  RExC_nestroot = 0;
6490  RExC_size = 0L;
6491  RExC_emit = (regnode *) &RExC_emit_dummy;
6492  RExC_whilem_seen = 0;
6493  RExC_open_parens = NULL;
6494  RExC_close_parens = NULL;
6495  RExC_opend = NULL;
6496  RExC_paren_names = NULL;
6497 #ifdef DEBUGGING
6498  RExC_paren_name_list = NULL;
6499 #endif
6500  RExC_recurse = NULL;
6501  RExC_study_chunk_recursed = NULL;
6502  RExC_study_chunk_recursed_bytes= 0;
6503  RExC_recurse_count = 0;
6504  pRExC_state->code_index = 0;
6505
6506 #if 0 /* REGC() is (currently) a NOP at the first pass.
6507  * Clever compilers notice this and complain. --jhi */
6508  REGC((U8)REG_MAGIC, (char*)RExC_emit);
6509 #endif
6510  DEBUG_PARSE_r(
6511   PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6512   RExC_lastnum=0;
6513   RExC_lastparse=NULL;
6514  );
6515  /* reg may croak on us, not giving us a chance to free
6516  pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6517  need it to survive as long as the regexp (qr/(?{})/).
6518  We must check that code_blocksv is not already set, because we may
6519  have jumped back to restart the sizing pass. */
6520  if (pRExC_state->code_blocks && !code_blocksv) {
6521   code_blocksv = newSV_type(SVt_PV);
6522   SAVEFREESV(code_blocksv);
6523   SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6524   SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6525  }
6526  if (reg(pRExC_state, 0, &flags,1) == NULL) {
6527   /* It's possible to write a regexp in ascii that represents Unicode
6528   codepoints outside of the byte range, such as via \x{100}. If we
6529   detect such a sequence we have to convert the entire pattern to utf8
6530   and then recompile, as our sizing calculation will have been based
6531   on 1 byte == 1 character, but we will need to use utf8 to encode
6532   at least some part of the pattern, and therefore must convert the whole
6533   thing.
6534   -- dmq */
6535   if (flags & RESTART_UTF8) {
6536    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6537          pRExC_state->num_code_blocks);
6538    goto redo_first_pass;
6539   }
6540   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6541  }
6542  if (code_blocksv)
6543   SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6544
6545  DEBUG_PARSE_r({
6546   PerlIO_printf(Perl_debug_log,
6547    "Required size %"IVdf" nodes\n"
6548    "Starting second pass (creation)\n",
6549    (IV)RExC_size);
6550   RExC_lastnum=0;
6551   RExC_lastparse=NULL;
6552  });
6553
6554  /* The first pass could have found things that force Unicode semantics */
6555  if ((RExC_utf8 || RExC_uni_semantics)
6556   && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6557  {
6558   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6559  }
6560
6561  /* Small enough for pointer-storage convention?
6562  If extralen==0, this means that we will not need long jumps. */
6563  if (RExC_size >= 0x10000L && RExC_extralen)
6564   RExC_size += RExC_extralen;
6565  else
6566   RExC_extralen = 0;
6567  if (RExC_whilem_seen > 15)
6568   RExC_whilem_seen = 15;
6569
6570  /* Allocate space and zero-initialize. Note, the two step process
6571  of zeroing when in debug mode, thus anything assigned has to
6572  happen after that */
6573  rx = (REGEXP*) newSV_type(SVt_REGEXP);
6574  r = ReANY(rx);
6575  Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6576   char, regexp_internal);
6577  if ( r == NULL || ri == NULL )
6578   FAIL("Regexp out of space");
6579 #ifdef DEBUGGING
6580  /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6581  Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6582   char);
6583 #else
6584  /* bulk initialize base fields with 0. */
6585  Zero(ri, sizeof(regexp_internal), char);
6586 #endif
6587
6588  /* non-zero initialization begins here */
6589  RXi_SET( r, ri );
6590  r->engine= eng;
6591  r->extflags = rx_flags;
6592  RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6593
6594  if (pm_flags & PMf_IS_QR) {
6595   ri->code_blocks = pRExC_state->code_blocks;
6596   ri->num_code_blocks = pRExC_state->num_code_blocks;
6597  }
6598  else
6599  {
6600   int n;
6601   for (n = 0; n < pRExC_state->num_code_blocks; n++)
6602    if (pRExC_state->code_blocks[n].src_regex)
6603     SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6604   SAVEFREEPV(pRExC_state->code_blocks);
6605  }
6606
6607  {
6608   bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6609   bool has_charset = (get_regex_charset(r->extflags)
6610              != REGEX_DEPENDS_CHARSET);
6611
6612   /* The caret is output if there are any defaults: if not all the STD
6613   * flags are set, or if no character set specifier is needed */
6614   bool has_default =
6615      (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6616      || ! has_charset);
6617   bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6618             == REG_RUN_ON_COMMENT_SEEN);
6619   U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6620        >> RXf_PMf_STD_PMMOD_SHIFT);
6621   const char *fptr = STD_PAT_MODS;        /*"msix"*/
6622   char *p;
6623   /* Allocate for the worst case, which is all the std flags are turned
6624   * on.  If more precision is desired, we could do a population count of
6625   * the flags set.  This could be done with a small lookup table, or by
6626   * shifting, masking and adding, or even, when available, assembly
6627   * language for a machine-language population count.
6628   * We never output a minus, as all those are defaults, so are
6629   * covered by the caret */
6630   const STRLEN wraplen = plen + has_p + has_runon
6631    + has_default       /* If needs a caret */
6632
6633     /* If needs a character set specifier */
6634    + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6635    + (sizeof(STD_PAT_MODS) - 1)
6636    + (sizeof("(?:)") - 1);
6637
6638   Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6639   r->xpv_len_u.xpvlenu_pv = p;
6640   if (RExC_utf8)
6641    SvFLAGS(rx) |= SVf_UTF8;
6642   *p++='('; *p++='?';
6643
6644   /* If a default, cover it using the caret */
6645   if (has_default) {
6646    *p++= DEFAULT_PAT_MOD;
6647   }
6648   if (has_charset) {
6649    STRLEN len;
6650    const char* const name = get_regex_charset_name(r->extflags, &len);
6651    Copy(name, p, len, char);
6652    p += len;
6653   }
6654   if (has_p)
6655    *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6656   {
6657    char ch;
6658    while((ch = *fptr++)) {
6659     if(reganch & 1)
6660      *p++ = ch;
6661     reganch >>= 1;
6662    }
6663   }
6664
6665   *p++ = ':';
6666   Copy(RExC_precomp, p, plen, char);
6667   assert ((RX_WRAPPED(rx) - p) < 16);
6668   r->pre_prefix = p - RX_WRAPPED(rx);
6669   p += plen;
6670   if (has_runon)
6671    *p++ = '\n';
6672   *p++ = ')';
6673   *p = 0;
6674   SvCUR_set(rx, p - RX_WRAPPED(rx));
6675  }
6676
6677  r->intflags = 0;
6678  r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6679
6680  /* setup various meta data about recursion, this all requires
6681  * RExC_npar to be correctly set, and a bit later on we clear it */
6682  if (RExC_seen & REG_RECURSE_SEEN) {
6683   Newxz(RExC_open_parens, RExC_npar,regnode *);
6684   SAVEFREEPV(RExC_open_parens);
6685   Newxz(RExC_close_parens,RExC_npar,regnode *);
6686   SAVEFREEPV(RExC_close_parens);
6687  }
6688  if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6689   /* Note, RExC_npar is 1 + the number of parens in a pattern.
6690   * So its 1 if there are no parens. */
6691   RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6692           ((RExC_npar & 0x07) != 0);
6693   Newx(RExC_study_chunk_recursed,
6694    RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6695   SAVEFREEPV(RExC_study_chunk_recursed);
6696  }
6697
6698  /* Useful during FAIL. */
6699 #ifdef RE_TRACK_PATTERN_OFFSETS
6700  Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6701  DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6702       "%s %"UVuf" bytes for offset annotations.\n",
6703       ri->u.offsets ? "Got" : "Couldn't get",
6704       (UV)((2*RExC_size+1) * sizeof(U32))));
6705 #endif
6706  SetProgLen(ri,RExC_size);
6707  RExC_rx_sv = rx;
6708  RExC_rx = r;
6709  RExC_rxi = ri;
6710  REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
6711
6712  /* Second pass: emit code. */
6713  RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6714  RExC_pm_flags = pm_flags;
6715  RExC_parse = exp;
6716  RExC_end = exp + plen;
6717  RExC_naughty = 0;
6718  RExC_npar = 1;
6719  RExC_emit_start = ri->program;
6720  RExC_emit = ri->program;
6721  RExC_emit_bound = ri->program + RExC_size + 1;
6722  pRExC_state->code_index = 0;
6723
6724  REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6725  if (reg(pRExC_state, 0, &flags,1) == NULL) {
6726   ReREFCNT_dec(rx);
6727   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6728  }
6729  /* XXXX To minimize changes to RE engine we always allocate
6730  3-units-long substrs field. */
6731  Newx(r->substrs, 1, struct reg_substr_data);
6732  if (RExC_recurse_count) {
6733   Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6734   SAVEFREEPV(RExC_recurse);
6735  }
6736
6737 reStudy:
6738  r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6739  Zero(r->substrs, 1, struct reg_substr_data);
6740  if (RExC_study_chunk_recursed)
6741   Zero(RExC_study_chunk_recursed,
6742    RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6743
6744 #ifdef TRIE_STUDY_OPT
6745  if (!restudied) {
6746   StructCopy(&zero_scan_data, &data, scan_data_t);
6747   copyRExC_state = RExC_state;
6748  } else {
6749   U32 seen=RExC_seen;
6750   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6751
6752   RExC_state = copyRExC_state;
6753   if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6754    RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6755   else
6756    RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6757   StructCopy(&zero_scan_data, &data, scan_data_t);
6758  }
6759 #else
6760  StructCopy(&zero_scan_data, &data, scan_data_t);
6761 #endif
6762
6763  /* Dig out information for optimizations. */
6764  r->extflags = RExC_flags; /* was pm_op */
6765  /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6766
6767  if (UTF)
6768   SvUTF8_on(rx); /* Unicode in it? */
6769  ri->regstclass = NULL;
6770  if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6771   r->intflags |= PREGf_NAUGHTY;
6772  scan = ri->program + 1;  /* First BRANCH. */
6773
6774  /* testing for BRANCH here tells us whether there is "must appear"
6775  data in the pattern. If there is then we can use it for optimisations */
6776  if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
6777             */
6778   SSize_t fake;
6779   STRLEN longest_float_length, longest_fixed_length;
6780   regnode_ssc ch_class; /* pointed to by data */
6781   int stclass_flag;
6782   SSize_t last_close = 0; /* pointed to by data */
6783   regnode *first= scan;
6784   regnode *first_next= regnext(first);
6785   /*
6786   * Skip introductions and multiplicators >= 1
6787   * so that we can extract the 'meat' of the pattern that must
6788   * match in the large if() sequence following.
6789   * NOTE that EXACT is NOT covered here, as it is normally
6790   * picked up by the optimiser separately.
6791   *
6792   * This is unfortunate as the optimiser isnt handling lookahead
6793   * properly currently.
6794   *
6795   */
6796   while ((OP(first) == OPEN && (sawopen = 1)) ||
6797    /* An OR of *one* alternative - should not happen now. */
6798    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6799    /* for now we can't handle lookbehind IFMATCH*/
6800    (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6801    (OP(first) == PLUS) ||
6802    (OP(first) == MINMOD) ||
6803    /* An {n,m} with n>0 */
6804    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6805    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6806   {
6807     /*
6808     * the only op that could be a regnode is PLUS, all the rest
6809     * will be regnode_1 or regnode_2.
6810     *
6811     * (yves doesn't think this is true)
6812     */
6813     if (OP(first) == PLUS)
6814      sawplus = 1;
6815     else {
6816      if (OP(first) == MINMOD)
6817       sawminmod = 1;
6818      first += regarglen[OP(first)];
6819     }
6820     first = NEXTOPER(first);
6821     first_next= regnext(first);
6822   }
6823
6824   /* Starting-point info. */
6825  again:
6826   DEBUG_PEEP("first:",first,0);
6827   /* Ignore EXACT as we deal with it later. */
6828   if (PL_regkind[OP(first)] == EXACT) {
6829    if (OP(first) == EXACT)
6830     NOOP; /* Empty, get anchored substr later. */
6831    else
6832     ri->regstclass = first;
6833   }
6834 #ifdef TRIE_STCLASS
6835   else if (PL_regkind[OP(first)] == TRIE &&
6836     ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6837   {
6838    /* this can happen only on restudy */
6839    ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
6840   }
6841 #endif
6842   else if (REGNODE_SIMPLE(OP(first)))
6843    ri->regstclass = first;
6844   else if (PL_regkind[OP(first)] == BOUND ||
6845     PL_regkind[OP(first)] == NBOUND)
6846    ri->regstclass = first;
6847   else if (PL_regkind[OP(first)] == BOL) {
6848    r->intflags |= (OP(first) == MBOL
6849       ? PREGf_ANCH_MBOL
6850       : (OP(first) == SBOL
6851        ? PREGf_ANCH_SBOL
6852        : PREGf_ANCH_BOL));
6853    first = NEXTOPER(first);
6854    goto again;
6855   }
6856   else if (OP(first) == GPOS) {
6857    r->intflags |= PREGf_ANCH_GPOS;
6858    first = NEXTOPER(first);
6859    goto again;
6860   }
6861   else if ((!sawopen || !RExC_sawback) &&
6862    !sawlookahead &&
6863    (OP(first) == STAR &&
6864    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6865    !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
6866   {
6867    /* turn .* into ^.* with an implied $*=1 */
6868    const int type =
6869     (OP(NEXTOPER(first)) == REG_ANY)
6870      ? PREGf_ANCH_MBOL
6871      : PREGf_ANCH_SBOL;
6872    r->intflags |= (type | PREGf_IMPLICIT);
6873    first = NEXTOPER(first);
6874    goto again;
6875   }
6876   if (sawplus && !sawminmod && !sawlookahead
6877    && (!sawopen || !RExC_sawback)
6878    && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6879    /* x+ must match at the 1st pos of run of x's */
6880    r->intflags |= PREGf_SKIP;
6881
6882   /* Scan is after the zeroth branch, first is atomic matcher. */
6883 #ifdef TRIE_STUDY_OPT
6884   DEBUG_PARSE_r(
6885    if (!restudied)
6886     PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6887        (IV)(first - scan + 1))
6888   );
6889 #else
6890   DEBUG_PARSE_r(
6891    PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6892     (IV)(first - scan + 1))
6893   );
6894 #endif
6895
6896
6897   /*
6898   * If there's something expensive in the r.e., find the
6899   * longest literal string that must appear and make it the
6900   * regmust.  Resolve ties in favor of later strings, since
6901   * the regstart check works with the beginning of the r.e.
6902   * and avoiding duplication strengthens checking.  Not a
6903   * strong reason, but sufficient in the absence of others.
6904   * [Now we resolve ties in favor of the earlier string if
6905   * it happens that c_offset_min has been invalidated, since the
6906   * earlier string may buy us something the later one won't.]
6907   */
6908
6909   data.longest_fixed = newSVpvs("");
6910   data.longest_float = newSVpvs("");
6911   data.last_found = newSVpvs("");
6912   data.longest = &(data.longest_fixed);
6913   ENTER_with_name("study_chunk");
6914   SAVEFREESV(data.longest_fixed);
6915   SAVEFREESV(data.longest_float);
6916   SAVEFREESV(data.last_found);
6917   first = scan;
6918   if (!ri->regstclass) {
6919    ssc_init(pRExC_state, &ch_class);
6920    data.start_class = &ch_class;
6921    stclass_flag = SCF_DO_STCLASS_AND;
6922   } else    /* XXXX Check for BOUND? */
6923    stclass_flag = 0;
6924   data.last_closep = &last_close;
6925
6926   DEBUG_RExC_seen();
6927   minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6928        scan + RExC_size, /* Up to end */
6929    &data, -1, 0, NULL,
6930    SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6931       | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6932    0);
6933
6934
6935   CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6936
6937
6938   if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6939    && data.last_start_min == 0 && data.last_end > 0
6940    && !RExC_seen_zerolen
6941    && !(RExC_seen & REG_VERBARG_SEEN)
6942    && !(RExC_seen & REG_GPOS_SEEN)
6943   ){
6944    r->extflags |= RXf_CHECK_ALL;
6945   }
6946   scan_commit(pRExC_state, &data,&minlen,0);
6947
6948   longest_float_length = CHR_SVLEN(data.longest_float);
6949
6950   if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6951     && data.offset_fixed == data.offset_float_min
6952     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6953    && S_setup_longest (aTHX_ pRExC_state,
6954          data.longest_float,
6955          &(r->float_utf8),
6956          &(r->float_substr),
6957          &(r->float_end_shift),
6958          data.lookbehind_float,
6959          data.offset_float_min,
6960          data.minlen_float,
6961          longest_float_length,
6962          cBOOL(data.flags & SF_FL_BEFORE_EOL),
6963          cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6964   {
6965    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6966    r->float_max_offset = data.offset_float_max;
6967    if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6968     r->float_max_offset -= data.lookbehind_float;
6969    SvREFCNT_inc_simple_void_NN(data.longest_float);
6970   }
6971   else {
6972    r->float_substr = r->float_utf8 = NULL;
6973    longest_float_length = 0;
6974   }
6975
6976   longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6977
6978   if (S_setup_longest (aTHX_ pRExC_state,
6979         data.longest_fixed,
6980         &(r->anchored_utf8),
6981         &(r->anchored_substr),
6982         &(r->anchored_end_shift),
6983         data.lookbehind_fixed,
6984         data.offset_fixed,
6985         data.minlen_fixed,
6986         longest_fixed_length,
6987         cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6988         cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6989   {
6990    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6991    SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6992   }
6993   else {
6994    r->anchored_substr = r->anchored_utf8 = NULL;
6995    longest_fixed_length = 0;
6996   }
6997   LEAVE_with_name("study_chunk");
6998
6999   if (ri->regstclass
7000    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7001    ri->regstclass = NULL;
7002
7003   if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7004    && stclass_flag
7005    && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7006    && !ssc_is_anything(data.start_class))
7007   {
7008    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7009
7010    ssc_finalize(pRExC_state, data.start_class);
7011
7012    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7013    StructCopy(data.start_class,
7014      (regnode_ssc*)RExC_rxi->data->data[n],
7015      regnode_ssc);
7016    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7017    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7018    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7019      regprop(r, sv, (regnode*)data.start_class, NULL);
7020      PerlIO_printf(Perl_debug_log,
7021          "synthetic stclass \"%s\".\n",
7022          SvPVX_const(sv));});
7023    data.start_class = NULL;
7024   }
7025
7026   /* A temporary algorithm prefers floated substr to fixed one to dig
7027   * more info. */
7028   if (longest_fixed_length > longest_float_length) {
7029    r->substrs->check_ix = 0;
7030    r->check_end_shift = r->anchored_end_shift;
7031    r->check_substr = r->anchored_substr;
7032    r->check_utf8 = r->anchored_utf8;
7033    r->check_offset_min = r->check_offset_max = r->anchored_offset;
7034    if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7035     r->intflags |= PREGf_NOSCAN;
7036   }
7037   else {
7038    r->substrs->check_ix = 1;
7039    r->check_end_shift = r->float_end_shift;
7040    r->check_substr = r->float_substr;
7041    r->check_utf8 = r->float_utf8;
7042    r->check_offset_min = r->float_min_offset;
7043    r->check_offset_max = r->float_max_offset;
7044   }
7045   if ((r->check_substr || r->check_utf8) ) {
7046    r->extflags |= RXf_USE_INTUIT;
7047    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7048     r->extflags |= RXf_INTUIT_TAIL;
7049   }
7050   r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7051
7052   /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7053   if ( (STRLEN)minlen < longest_float_length )
7054    minlen= longest_float_length;
7055   if ( (STRLEN)minlen < longest_fixed_length )
7056    minlen= longest_fixed_length;
7057   */
7058  }
7059  else {
7060   /* Several toplevels. Best we can is to set minlen. */
7061   SSize_t fake;
7062   regnode_ssc ch_class;
7063   SSize_t last_close = 0;
7064
7065   DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7066
7067   scan = ri->program + 1;
7068   ssc_init(pRExC_state, &ch_class);
7069   data.start_class = &ch_class;
7070   data.last_closep = &last_close;
7071
7072   DEBUG_RExC_seen();
7073   minlen = study_chunk(pRExC_state,
7074    &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7075    SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7076              ? SCF_TRIE_DOING_RESTUDY
7077              : 0),
7078    0);
7079
7080   CHECK_RESTUDY_GOTO_butfirst(NOOP);
7081
7082   r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7083     = r->float_substr = r->float_utf8 = NULL;
7084
7085   if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7086    && ! ssc_is_anything(data.start_class))
7087   {
7088    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7089
7090    ssc_finalize(pRExC_state, data.start_class);
7091
7092    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7093    StructCopy(data.start_class,
7094      (regnode_ssc*)RExC_rxi->data->data[n],
7095      regnode_ssc);
7096    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7097    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7098    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7099      regprop(r, sv, (regnode*)data.start_class, NULL);
7100      PerlIO_printf(Perl_debug_log,
7101          "synthetic stclass \"%s\".\n",
7102          SvPVX_const(sv));});
7103    data.start_class = NULL;
7104   }
7105  }
7106
7107  if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7108   r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7109   r->maxlen = REG_INFTY;
7110  }
7111  else {
7112   r->maxlen = RExC_maxlen;
7113  }
7114
7115  /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7116  the "real" pattern. */
7117  DEBUG_OPTIMISE_r({
7118   PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
7119      (IV)minlen, (IV)r->minlen, RExC_maxlen);
7120  });
7121  r->minlenret = minlen;
7122  if (r->minlen < minlen)
7123   r->minlen = minlen;
7124
7125  if (RExC_seen & REG_GPOS_SEEN)
7126   r->intflags |= PREGf_GPOS_SEEN;
7127  if (RExC_seen & REG_LOOKBEHIND_SEEN)
7128   r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7129             lookbehind */
7130  if (pRExC_state->num_code_blocks)
7131   r->extflags |= RXf_EVAL_SEEN;
7132  if (RExC_seen & REG_CANY_SEEN)
7133   r->intflags |= PREGf_CANY_SEEN;
7134  if (RExC_seen & REG_VERBARG_SEEN)
7135  {
7136   r->intflags |= PREGf_VERBARG_SEEN;
7137   r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7138  }
7139  if (RExC_seen & REG_CUTGROUP_SEEN)
7140   r->intflags |= PREGf_CUTGROUP_SEEN;
7141  if (pm_flags & PMf_USE_RE_EVAL)
7142   r->intflags |= PREGf_USE_RE_EVAL;
7143  if (RExC_paren_names)
7144   RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7145  else
7146   RXp_PAREN_NAMES(r) = NULL;
7147
7148  /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7149  * so it can be used in pp.c */
7150  if (r->intflags & PREGf_ANCH)
7151   r->extflags |= RXf_IS_ANCHORED;
7152
7153
7154  {
7155   /* this is used to identify "special" patterns that might result
7156   * in Perl NOT calling the regex engine and instead doing the match "itself",
7157   * particularly special cases in split//. By having the regex compiler
7158   * do this pattern matching at a regop level (instead of by inspecting the pattern)
7159   * we avoid weird issues with equivalent patterns resulting in different behavior,
7160   * AND we allow non Perl engines to get the same optimizations by the setting the
7161   * flags appropriately - Yves */
7162   regnode *first = ri->program + 1;
7163   U8 fop = OP(first);
7164   regnode *next = NEXTOPER(first);
7165   U8 nop = OP(next);
7166
7167   if (PL_regkind[fop] == NOTHING && nop == END)
7168    r->extflags |= RXf_NULL;
7169   else if (PL_regkind[fop] == BOL && nop == END)
7170    r->extflags |= RXf_START_ONLY;
7171   else if (fop == PLUS
7172     && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7173     && OP(regnext(first)) == END)
7174    r->extflags |= RXf_WHITE;
7175   else if ( r->extflags & RXf_SPLIT
7176     && fop == EXACT
7177     && STR_LEN(first) == 1
7178     && *(STRING(first)) == ' '
7179     && OP(regnext(first)) == END )
7180    r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7181
7182  }
7183
7184  if (RExC_contains_locale) {
7185   RXp_EXTFLAGS(r) |= RXf_TAINTED;
7186  }
7187
7188 #ifdef DEBUGGING
7189  if (RExC_paren_names) {
7190   ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7191   ri->data->data[ri->name_list_idx]
7192         = (void*)SvREFCNT_inc(RExC_paren_name_list);
7193  } else
7194 #endif
7195   ri->name_list_idx = 0;
7196
7197  if (RExC_recurse_count) {
7198   for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7199    const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7200    ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7201   }
7202  }
7203  Newxz(r->offs, RExC_npar, regexp_paren_pair);
7204  /* assume we don't need to swap parens around before we match */
7205
7206  DEBUG_DUMP_r({
7207   DEBUG_RExC_seen();
7208   PerlIO_printf(Perl_debug_log,"Final program:\n");
7209   regdump(r);
7210  });
7211 #ifdef RE_TRACK_PATTERN_OFFSETS
7212  DEBUG_OFFSETS_r(if (ri->u.offsets) {
7213   const STRLEN len = ri->u.offsets[0];
7214   STRLEN i;
7215   GET_RE_DEBUG_FLAGS_DECL;
7216   PerlIO_printf(Perl_debug_log,
7217      "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7218   for (i = 1; i <= len; i++) {
7219    if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7220     PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7221     (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7222    }
7223   PerlIO_printf(Perl_debug_log, "\n");
7224  });
7225 #endif
7226
7227 #ifdef USE_ITHREADS
7228  /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7229  * by setting the regexp SV to readonly-only instead. If the
7230  * pattern's been recompiled, the USEDness should remain. */
7231  if (old_re && SvREADONLY(old_re))
7232   SvREADONLY_on(rx);
7233 #endif
7234  return rx;
7235 }
7236
7237
7238 SV*
7239 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7240      const U32 flags)
7241 {
7242  PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7243
7244  PERL_UNUSED_ARG(value);
7245
7246  if (flags & RXapif_FETCH) {
7247   return reg_named_buff_fetch(rx, key, flags);
7248  } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7249   Perl_croak_no_modify();
7250   return NULL;
7251  } else if (flags & RXapif_EXISTS) {
7252   return reg_named_buff_exists(rx, key, flags)
7253    ? &PL_sv_yes
7254    : &PL_sv_no;
7255  } else if (flags & RXapif_REGNAMES) {
7256   return reg_named_buff_all(rx, flags);
7257  } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7258   return reg_named_buff_scalar(rx, flags);
7259  } else {
7260   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7261   return NULL;
7262  }
7263 }
7264
7265 SV*
7266 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7267       const U32 flags)
7268 {
7269  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7270  PERL_UNUSED_ARG(lastkey);
7271
7272  if (flags & RXapif_FIRSTKEY)
7273   return reg_named_buff_firstkey(rx, flags);
7274  else if (flags & RXapif_NEXTKEY)
7275   return reg_named_buff_nextkey(rx, flags);
7276  else {
7277   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7278            (int)flags);
7279   return NULL;
7280  }
7281 }
7282
7283 SV*
7284 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7285       const U32 flags)
7286 {
7287  AV *retarray = NULL;
7288  SV *ret;
7289  struct regexp *const rx = ReANY(r);
7290
7291  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7292
7293  if (flags & RXapif_ALL)
7294   retarray=newAV();
7295
7296  if (rx && RXp_PAREN_NAMES(rx)) {
7297   HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7298   if (he_str) {
7299    IV i;
7300    SV* sv_dat=HeVAL(he_str);
7301    I32 *nums=(I32*)SvPVX(sv_dat);
7302    for ( i=0; i<SvIVX(sv_dat); i++ ) {
7303     if ((I32)(rx->nparens) >= nums[i]
7304      && rx->offs[nums[i]].start != -1
7305      && rx->offs[nums[i]].end != -1)
7306     {
7307      ret = newSVpvs("");
7308      CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7309      if (!retarray)
7310       return ret;
7311     } else {
7312      if (retarray)
7313       ret = newSVsv(&PL_sv_undef);
7314     }
7315     if (retarray)
7316      av_push(retarray, ret);
7317    }
7318    if (retarray)
7319     return newRV_noinc(MUTABLE_SV(retarray));
7320   }
7321  }
7322  return NULL;
7323 }
7324
7325 bool
7326 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7327       const U32 flags)
7328 {
7329  struct regexp *const rx = ReANY(r);
7330
7331  PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7332
7333  if (rx && RXp_PAREN_NAMES(rx)) {
7334   if (flags & RXapif_ALL) {
7335    return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7336   } else {
7337    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7338    if (sv) {
7339     SvREFCNT_dec_NN(sv);
7340     return TRUE;
7341    } else {
7342     return FALSE;
7343    }
7344   }
7345  } else {
7346   return FALSE;
7347  }
7348 }
7349
7350 SV*
7351 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7352 {
7353  struct regexp *const rx = ReANY(r);
7354
7355  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7356
7357  if ( rx && RXp_PAREN_NAMES(rx) ) {
7358   (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7359
7360   return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7361  } else {
7362   return FALSE;
7363  }
7364 }
7365
7366 SV*
7367 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7368 {
7369  struct regexp *const rx = ReANY(r);
7370  GET_RE_DEBUG_FLAGS_DECL;
7371
7372  PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7373
7374  if (rx && RXp_PAREN_NAMES(rx)) {
7375   HV *hv = RXp_PAREN_NAMES(rx);
7376   HE *temphe;
7377   while ( (temphe = hv_iternext_flags(hv,0)) ) {
7378    IV i;
7379    IV parno = 0;
7380    SV* sv_dat = HeVAL(temphe);
7381    I32 *nums = (I32*)SvPVX(sv_dat);
7382    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7383     if ((I32)(rx->lastparen) >= nums[i] &&
7384      rx->offs[nums[i]].start != -1 &&
7385      rx->offs[nums[i]].end != -1)
7386     {
7387      parno = nums[i];
7388      break;
7389     }
7390    }
7391    if (parno || flags & RXapif_ALL) {
7392     return newSVhek(HeKEY_hek(temphe));
7393    }
7394   }
7395  }
7396  return NULL;
7397 }
7398
7399 SV*
7400 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7401 {
7402  SV *ret;
7403  AV *av;
7404  SSize_t length;
7405  struct regexp *const rx = ReANY(r);
7406
7407  PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7408
7409  if (rx && RXp_PAREN_NAMES(rx)) {
7410   if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7411    return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7412   } else if (flags & RXapif_ONE) {
7413    ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7414    av = MUTABLE_AV(SvRV(ret));
7415    length = av_tindex(av);
7416    SvREFCNT_dec_NN(ret);
7417    return newSViv(length + 1);
7418   } else {
7419    Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7420             (int)flags);
7421    return NULL;
7422   }
7423  }
7424  return &PL_sv_undef;
7425 }
7426
7427 SV*
7428 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7429 {
7430  struct regexp *const rx = ReANY(r);
7431  AV *av = newAV();
7432
7433  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7434
7435  if (rx && RXp_PAREN_NAMES(rx)) {
7436   HV *hv= RXp_PAREN_NAMES(rx);
7437   HE *temphe;
7438   (void)hv_iterinit(hv);
7439   while ( (temphe = hv_iternext_flags(hv,0)) ) {
7440    IV i;
7441    IV parno = 0;
7442    SV* sv_dat = HeVAL(temphe);
7443    I32 *nums = (I32*)SvPVX(sv_dat);
7444    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7445     if ((I32)(rx->lastparen) >= nums[i] &&
7446      rx->offs[nums[i]].start != -1 &&
7447      rx->offs[nums[i]].end != -1)
7448     {
7449      parno = nums[i];
7450      break;
7451     }
7452    }
7453    if (parno || flags & RXapif_ALL) {
7454     av_push(av, newSVhek(HeKEY_hek(temphe)));
7455    }
7456   }
7457  }
7458
7459  return newRV_noinc(MUTABLE_SV(av));
7460 }
7461
7462 void
7463 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7464        SV * const sv)
7465 {
7466  struct regexp *const rx = ReANY(r);
7467  char *s = NULL;
7468  SSize_t i = 0;
7469  SSize_t s1, t1;
7470  I32 n = paren;
7471
7472  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7473
7474  if (      n == RX_BUFF_IDX_CARET_PREMATCH
7475   || n == RX_BUFF_IDX_CARET_FULLMATCH
7476   || n == RX_BUFF_IDX_CARET_POSTMATCH
7477  )
7478  {
7479   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7480   if (!keepcopy) {
7481    /* on something like
7482    *    $r = qr/.../;
7483    *    /$qr/p;
7484    * the KEEPCOPY is set on the PMOP rather than the regex */
7485    if (PL_curpm && r == PM_GETRE(PL_curpm))
7486     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7487   }
7488   if (!keepcopy)
7489    goto ret_undef;
7490  }
7491
7492  if (!rx->subbeg)
7493   goto ret_undef;
7494
7495  if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7496   /* no need to distinguish between them any more */
7497   n = RX_BUFF_IDX_FULLMATCH;
7498
7499  if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7500   && rx->offs[0].start != -1)
7501  {
7502   /* $`, ${^PREMATCH} */
7503   i = rx->offs[0].start;
7504   s = rx->subbeg;
7505  }
7506  else
7507  if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7508   && rx->offs[0].end != -1)
7509  {
7510   /* $', ${^POSTMATCH} */
7511   s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7512   i = rx->sublen + rx->suboffset - rx->offs[0].end;
7513  }
7514  else
7515  if ( 0 <= n && n <= (I32)rx->nparens &&
7516   (s1 = rx->offs[n].start) != -1 &&
7517   (t1 = rx->offs[n].end) != -1)
7518  {
7519   /* $&, ${^MATCH},  $1 ... */
7520   i = t1 - s1;
7521   s = rx->subbeg + s1 - rx->suboffset;
7522  } else {
7523   goto ret_undef;
7524  }
7525
7526  assert(s >= rx->subbeg);
7527  assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7528  if (i >= 0) {
7529 #ifdef NO_TAINT_SUPPORT
7530   sv_setpvn(sv, s, i);
7531 #else
7532   const int oldtainted = TAINT_get;
7533   TAINT_NOT;
7534   sv_setpvn(sv, s, i);
7535   TAINT_set(oldtainted);
7536 #endif
7537   if ( (rx->intflags & PREGf_CANY_SEEN)
7538    ? (RXp_MATCH_UTF8(rx)
7539       && (!i || is_utf8_string((U8*)s, i)))
7540    : (RXp_MATCH_UTF8(rx)) )
7541   {
7542    SvUTF8_on(sv);
7543   }
7544   else
7545    SvUTF8_off(sv);
7546   if (TAINTING_get) {
7547    if (RXp_MATCH_TAINTED(rx)) {
7548     if (SvTYPE(sv) >= SVt_PVMG) {
7549      MAGIC* const mg = SvMAGIC(sv);
7550      MAGIC* mgt;
7551      TAINT;
7552      SvMAGIC_set(sv, mg->mg_moremagic);
7553      SvTAINT(sv);
7554      if ((mgt = SvMAGIC(sv))) {
7555       mg->mg_moremagic = mgt;
7556       SvMAGIC_set(sv, mg);
7557      }
7558     } else {
7559      TAINT;
7560      SvTAINT(sv);
7561     }
7562    } else
7563     SvTAINTED_off(sv);
7564   }
7565  } else {
7566  ret_undef:
7567   sv_setsv(sv,&PL_sv_undef);
7568   return;
7569  }
7570 }
7571
7572 void
7573 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7574               SV const * const value)
7575 {
7576  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7577
7578  PERL_UNUSED_ARG(rx);
7579  PERL_UNUSED_ARG(paren);
7580  PERL_UNUSED_ARG(value);
7581
7582  if (!PL_localizing)
7583   Perl_croak_no_modify();
7584 }
7585
7586 I32
7587 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7588        const I32 paren)
7589 {
7590  struct regexp *const rx = ReANY(r);
7591  I32 i;
7592  I32 s1, t1;
7593
7594  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7595
7596  if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7597   || paren == RX_BUFF_IDX_CARET_FULLMATCH
7598   || paren == RX_BUFF_IDX_CARET_POSTMATCH
7599  )
7600  {
7601   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7602   if (!keepcopy) {
7603    /* on something like
7604    *    $r = qr/.../;
7605    *    /$qr/p;
7606    * the KEEPCOPY is set on the PMOP rather than the regex */
7607    if (PL_curpm && r == PM_GETRE(PL_curpm))
7608     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7609   }
7610   if (!keepcopy)
7611    goto warn_undef;
7612  }
7613
7614  /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7615  switch (paren) {
7616  case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7617  case RX_BUFF_IDX_PREMATCH:       /* $` */
7618   if (rx->offs[0].start != -1) {
7619       i = rx->offs[0].start;
7620       if (i > 0) {
7621         s1 = 0;
7622         t1 = i;
7623         goto getlen;
7624       }
7625    }
7626   return 0;
7627
7628  case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7629  case RX_BUFF_IDX_POSTMATCH:       /* $' */
7630    if (rx->offs[0].end != -1) {
7631       i = rx->sublen - rx->offs[0].end;
7632       if (i > 0) {
7633         s1 = rx->offs[0].end;
7634         t1 = rx->sublen;
7635         goto getlen;
7636       }
7637    }
7638   return 0;
7639
7640  default: /* $& / ${^MATCH}, $1, $2, ... */
7641    if (paren <= (I32)rx->nparens &&
7642    (s1 = rx->offs[paren].start) != -1 &&
7643    (t1 = rx->offs[paren].end) != -1)
7644    {
7645    i = t1 - s1;
7646    goto getlen;
7647   } else {
7648   warn_undef:
7649    if (ckWARN(WARN_UNINITIALIZED))
7650     report_uninit((const SV *)sv);
7651    return 0;
7652   }
7653  }
7654   getlen:
7655  if (i > 0 && RXp_MATCH_UTF8(rx)) {
7656   const char * const s = rx->subbeg - rx->suboffset + s1;
7657   const U8 *ep;
7658   STRLEN el;
7659
7660   i = t1 - s1;
7661   if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7662       i = el;
7663  }
7664  return i;
7665 }
7666
7667 SV*
7668 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7669 {
7670  PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7671   PERL_UNUSED_ARG(rx);
7672   if (0)
7673    return NULL;
7674   else
7675    return newSVpvs("Regexp");
7676 }
7677
7678 /* Scans the name of a named buffer from the pattern.
7679  * If flags is REG_RSN_RETURN_NULL returns null.
7680  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7681  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7682  * to the parsed name as looked up in the RExC_paren_names hash.
7683  * If there is an error throws a vFAIL().. type exception.
7684  */
7685
7686 #define REG_RSN_RETURN_NULL    0
7687 #define REG_RSN_RETURN_NAME    1
7688 #define REG_RSN_RETURN_DATA    2
7689
7690 STATIC SV*
7691 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7692 {
7693  char *name_start = RExC_parse;
7694
7695  PERL_ARGS_ASSERT_REG_SCAN_NAME;
7696
7697  assert (RExC_parse <= RExC_end);
7698  if (RExC_parse == RExC_end) NOOP;
7699  else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7700   /* skip IDFIRST by using do...while */
7701   if (UTF)
7702    do {
7703     RExC_parse += UTF8SKIP(RExC_parse);
7704    } while (isWORDCHAR_utf8((U8*)RExC_parse));
7705   else
7706    do {
7707     RExC_parse++;
7708    } while (isWORDCHAR(*RExC_parse));
7709  } else {
7710   RExC_parse++; /* so the <- from the vFAIL is after the offending
7711       character */
7712   vFAIL("Group name must start with a non-digit word character");
7713  }
7714  if ( flags ) {
7715   SV* sv_name
7716    = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7717        SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7718   if ( flags == REG_RSN_RETURN_NAME)
7719    return sv_name;
7720   else if (flags==REG_RSN_RETURN_DATA) {
7721    HE *he_str = NULL;
7722    SV *sv_dat = NULL;
7723    if ( ! sv_name )      /* should not happen*/
7724     Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7725    if (RExC_paren_names)
7726     he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7727    if ( he_str )
7728     sv_dat = HeVAL(he_str);
7729    if ( ! sv_dat )
7730     vFAIL("Reference to nonexistent named group");
7731    return sv_dat;
7732   }
7733   else {
7734    Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7735      (unsigned long) flags);
7736   }
7737   assert(0); /* NOT REACHED */
7738  }
7739  return NULL;
7740 }
7741
7742 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7743  int rem=(int)(RExC_end - RExC_parse);                       \
7744  int cut;                                                    \
7745  int num;                                                    \
7746  int iscut=0;                                                \
7747  if (rem>10) {                                               \
7748   rem=10;                                                 \
7749   iscut=1;                                                \
7750  }                                                           \
7751  cut=10-rem;                                                 \
7752  if (RExC_lastparse!=RExC_parse)                             \
7753   PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
7754    rem, RExC_parse,                                    \
7755    cut + 4,                                            \
7756    iscut ? "..." : "<"                                 \
7757   );                                                      \
7758  else                                                        \
7759   PerlIO_printf(Perl_debug_log,"%16s","");                \
7760                 \
7761  if (SIZE_ONLY)                                              \
7762  num = RExC_size + 1;                                     \
7763  else                                                        \
7764  num=REG_NODE_NUM(RExC_emit);                             \
7765  if (RExC_lastnum!=num)                                      \
7766  PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7767  else                                                        \
7768  PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7769  PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7770   (int)((depth*2)), "",                                   \
7771   (funcname)                                              \
7772  );                                                          \
7773  RExC_lastnum=num;                                           \
7774  RExC_lastparse=RExC_parse;                                  \
7775 })
7776
7777
7778
7779 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7780  DEBUG_PARSE_MSG((funcname));                            \
7781  PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7782 })
7783 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7784  DEBUG_PARSE_MSG((funcname));                            \
7785  PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7786 })
7787
7788 /* This section of code defines the inversion list object and its methods.  The
7789  * interfaces are highly subject to change, so as much as possible is static to
7790  * this file.  An inversion list is here implemented as a malloc'd C UV array
7791  * as an SVt_INVLIST scalar.
7792  *
7793  * An inversion list for Unicode is an array of code points, sorted by ordinal
7794  * number.  The zeroth element is the first code point in the list.  The 1th
7795  * element is the first element beyond that not in the list.  In other words,
7796  * the first range is
7797  *  invlist[0]..(invlist[1]-1)
7798  * The other ranges follow.  Thus every element whose index is divisible by two
7799  * marks the beginning of a range that is in the list, and every element not
7800  * divisible by two marks the beginning of a range not in the list.  A single
7801  * element inversion list that contains the single code point N generally
7802  * consists of two elements
7803  *  invlist[0] == N
7804  *  invlist[1] == N+1
7805  * (The exception is when N is the highest representable value on the
7806  * machine, in which case the list containing just it would be a single
7807  * element, itself.  By extension, if the last range in the list extends to
7808  * infinity, then the first element of that range will be in the inversion list
7809  * at a position that is divisible by two, and is the final element in the
7810  * list.)
7811  * Taking the complement (inverting) an inversion list is quite simple, if the
7812  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7813  * This implementation reserves an element at the beginning of each inversion
7814  * list to always contain 0; there is an additional flag in the header which
7815  * indicates if the list begins at the 0, or is offset to begin at the next
7816  * element.
7817  *
7818  * More about inversion lists can be found in "Unicode Demystified"
7819  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7820  * More will be coming when functionality is added later.
7821  *
7822  * The inversion list data structure is currently implemented as an SV pointing
7823  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7824  * array of UV whose memory management is automatically handled by the existing
7825  * facilities for SV's.
7826  *
7827  * Some of the methods should always be private to the implementation, and some
7828  * should eventually be made public */
7829
7830 /* The header definitions are in F<inline_invlist.c> */
7831
7832 PERL_STATIC_INLINE UV*
7833 S__invlist_array_init(SV* const invlist, const bool will_have_0)
7834 {
7835  /* Returns a pointer to the first element in the inversion list's array.
7836  * This is called upon initialization of an inversion list.  Where the
7837  * array begins depends on whether the list has the code point U+0000 in it
7838  * or not.  The other parameter tells it whether the code that follows this
7839  * call is about to put a 0 in the inversion list or not.  The first
7840  * element is either the element reserved for 0, if TRUE, or the element
7841  * after it, if FALSE */
7842
7843  bool* offset = get_invlist_offset_addr(invlist);
7844  UV* zero_addr = (UV *) SvPVX(invlist);
7845
7846  PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7847
7848  /* Must be empty */
7849  assert(! _invlist_len(invlist));
7850
7851  *zero_addr = 0;
7852
7853  /* 1^1 = 0; 1^0 = 1 */
7854  *offset = 1 ^ will_have_0;
7855  return zero_addr + *offset;
7856 }
7857
7858 PERL_STATIC_INLINE UV*
7859 S_invlist_array(SV* const invlist)
7860 {
7861  /* Returns the pointer to the inversion list's array.  Every time the
7862  * length changes, this needs to be called in case malloc or realloc moved
7863  * it */
7864
7865  PERL_ARGS_ASSERT_INVLIST_ARRAY;
7866
7867  /* Must not be empty.  If these fail, you probably didn't check for <len>
7868  * being non-zero before trying to get the array */
7869  assert(_invlist_len(invlist));
7870
7871  /* The very first element always contains zero, The array begins either
7872  * there, or if the inversion list is offset, at the element after it.
7873  * The offset header field determines which; it contains 0 or 1 to indicate
7874  * how much additionally to add */
7875  assert(0 == *(SvPVX(invlist)));
7876  return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7877 }
7878
7879 PERL_STATIC_INLINE void
7880 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7881 {
7882  /* Sets the current number of elements stored in the inversion list.
7883  * Updates SvCUR correspondingly */
7884  PERL_UNUSED_CONTEXT;
7885  PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7886
7887  assert(SvTYPE(invlist) == SVt_INVLIST);
7888
7889  SvCUR_set(invlist,
7890    (len == 0)
7891    ? 0
7892    : TO_INTERNAL_SIZE(len + offset));
7893  assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7894 }
7895
7896 PERL_STATIC_INLINE IV*
7897 S_get_invlist_previous_index_addr(SV* invlist)
7898 {
7899  /* Return the address of the IV that is reserved to hold the cached index
7900  * */
7901  PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7902
7903  assert(SvTYPE(invlist) == SVt_INVLIST);
7904
7905  return &(((XINVLIST*) SvANY(invlist))->prev_index);
7906 }
7907
7908 PERL_STATIC_INLINE IV
7909 S_invlist_previous_index(SV* const invlist)
7910 {
7911  /* Returns cached index of previous search */
7912
7913  PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7914
7915  return *get_invlist_previous_index_addr(invlist);
7916 }
7917
7918 PERL_STATIC_INLINE void
7919 S_invlist_set_previous_index(SV* const invlist, const IV index)
7920 {
7921  /* Caches <index> for later retrieval */
7922
7923  PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7924
7925  assert(index == 0 || index < (int) _invlist_len(invlist));
7926
7927  *get_invlist_previous_index_addr(invlist) = index;
7928 }
7929
7930 PERL_STATIC_INLINE UV
7931 S_invlist_max(SV* const invlist)
7932 {
7933  /* Returns the maximum number of elements storable in the inversion list's
7934  * array, without having to realloc() */
7935
7936  PERL_ARGS_ASSERT_INVLIST_MAX;
7937
7938  assert(SvTYPE(invlist) == SVt_INVLIST);
7939
7940  /* Assumes worst case, in which the 0 element is not counted in the
7941  * inversion list, so subtracts 1 for that */
7942  return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7943   ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7944   : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7945 }
7946
7947 #ifndef PERL_IN_XSUB_RE
7948 SV*
7949 Perl__new_invlist(pTHX_ IV initial_size)
7950 {
7951
7952  /* Return a pointer to a newly constructed inversion list, with enough
7953  * space to store 'initial_size' elements.  If that number is negative, a
7954  * system default is used instead */
7955
7956  SV* new_list;
7957
7958  if (initial_size < 0) {
7959   initial_size = 10;
7960  }
7961
7962  /* Allocate the initial space */
7963  new_list = newSV_type(SVt_INVLIST);
7964
7965  /* First 1 is in case the zero element isn't in the list; second 1 is for
7966  * trailing NUL */
7967  SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7968  invlist_set_len(new_list, 0, 0);
7969
7970  /* Force iterinit() to be used to get iteration to work */
7971  *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7972
7973  *get_invlist_previous_index_addr(new_list) = 0;
7974
7975  return new_list;
7976 }
7977
7978 SV*
7979 Perl__new_invlist_C_array(pTHX_ const UV* const list)
7980 {
7981  /* Return a pointer to a newly constructed inversion list, initialized to
7982  * point to <list>, which has to be in the exact correct inversion list
7983  * form, including internal fields.  Thus this is a dangerous routine that
7984  * should not be used in the wrong hands.  The passed in 'list' contains
7985  * several header fields at the beginning that are not part of the
7986  * inversion list body proper */
7987
7988  const STRLEN length = (STRLEN) list[0];
7989  const UV version_id =          list[1];
7990  const bool offset   =    cBOOL(list[2]);
7991 #define HEADER_LENGTH 3
7992  /* If any of the above changes in any way, you must change HEADER_LENGTH
7993  * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7994  *      perl -E 'say int(rand 2**31-1)'
7995  */
7996 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7997           data structure type, so that one being
7998           passed in can be validated to be an
7999           inversion list of the correct vintage.
8000          */
8001
8002  SV* invlist = newSV_type(SVt_INVLIST);
8003
8004  PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8005
8006  if (version_id != INVLIST_VERSION_ID) {
8007   Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8008  }
8009
8010  /* The generated array passed in includes header elements that aren't part
8011  * of the list proper, so start it just after them */
8012  SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8013
8014  SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
8015        shouldn't touch it */
8016
8017  *(get_invlist_offset_addr(invlist)) = offset;
8018
8019  /* The 'length' passed to us is the physical number of elements in the
8020  * inversion list.  But if there is an offset the logical number is one
8021  * less than that */
8022  invlist_set_len(invlist, length  - offset, offset);
8023
8024  invlist_set_previous_index(invlist, 0);
8025
8026  /* Initialize the iteration pointer. */
8027  invlist_iterfinish(invlist);
8028
8029  SvREADONLY_on(invlist);
8030
8031  return invlist;
8032 }
8033 #endif /* ifndef PERL_IN_XSUB_RE */
8034
8035 STATIC void
8036 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8037 {
8038  /* Grow the maximum size of an inversion list */
8039
8040  PERL_ARGS_ASSERT_INVLIST_EXTEND;
8041
8042  assert(SvTYPE(invlist) == SVt_INVLIST);
8043
8044  /* Add one to account for the zero element at the beginning which may not
8045  * be counted by the calling parameters */
8046  SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8047 }
8048
8049 PERL_STATIC_INLINE void
8050 S_invlist_trim(SV* const invlist)
8051 {
8052  PERL_ARGS_ASSERT_INVLIST_TRIM;
8053
8054  assert(SvTYPE(invlist) == SVt_INVLIST);
8055
8056  /* Change the length of the inversion list to how many entries it currently
8057  * has */
8058  SvPV_shrink_to_cur((SV *) invlist);
8059 }
8060
8061 STATIC void
8062 S__append_range_to_invlist(pTHX_ SV* const invlist,
8063         const UV start, const UV end)
8064 {
8065    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8066  * the end of the inversion list.  The range must be above any existing
8067  * ones. */
8068
8069  UV* array;
8070  UV max = invlist_max(invlist);
8071  UV len = _invlist_len(invlist);
8072  bool offset;
8073
8074  PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8075
8076  if (len == 0) { /* Empty lists must be initialized */
8077   offset = start != 0;
8078   array = _invlist_array_init(invlist, ! offset);
8079  }
8080  else {
8081   /* Here, the existing list is non-empty. The current max entry in the
8082   * list is generally the first value not in the set, except when the
8083   * set extends to the end of permissible values, in which case it is
8084   * the first entry in that final set, and so this call is an attempt to
8085   * append out-of-order */
8086
8087   UV final_element = len - 1;
8088   array = invlist_array(invlist);
8089   if (array[final_element] > start
8090    || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8091   {
8092    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8093      array[final_element], start,
8094      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8095   }
8096
8097   /* Here, it is a legal append.  If the new range begins with the first
8098   * value not in the set, it is extending the set, so the new first
8099   * value not in the set is one greater than the newly extended range.
8100   * */
8101   offset = *get_invlist_offset_addr(invlist);
8102   if (array[final_element] == start) {
8103    if (end != UV_MAX) {
8104     array[final_element] = end + 1;
8105    }
8106    else {
8107     /* But if the end is the maximum representable on the machine,
8108     * just let the range that this would extend to have no end */
8109     invlist_set_len(invlist, len - 1, offset);
8110    }
8111    return;
8112   }
8113  }
8114
8115  /* Here the new range doesn't extend any existing set.  Add it */
8116
8117  len += 2; /* Includes an element each for the start and end of range */
8118
8119  /* If wll overflow the existing space, extend, which may cause the array to
8120  * be moved */
8121  if (max < len) {
8122   invlist_extend(invlist, len);
8123
8124   /* Have to set len here to avoid assert failure in invlist_array() */
8125   invlist_set_len(invlist, len, offset);
8126
8127   array = invlist_array(invlist);
8128  }
8129  else {
8130   invlist_set_len(invlist, len, offset);
8131  }
8132
8133  /* The next item on the list starts the range, the one after that is
8134  * one past the new range.  */
8135  array[len - 2] = start;
8136  if (end != UV_MAX) {
8137   array[len - 1] = end + 1;
8138  }
8139  else {
8140   /* But if the end is the maximum representable on the machine, just let
8141   * the range have no end */
8142   invlist_set_len(invlist, len - 1, offset);
8143  }
8144 }
8145
8146 #ifndef PERL_IN_XSUB_RE
8147
8148 IV
8149 Perl__invlist_search(SV* const invlist, const UV cp)
8150 {
8151  /* Searches the inversion list for the entry that contains the input code
8152  * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8153  * return value is the index into the list's array of the range that
8154  * contains <cp> */
8155
8156  IV low = 0;
8157  IV mid;
8158  IV high = _invlist_len(invlist);
8159  const IV highest_element = high - 1;
8160  const UV* array;
8161
8162  PERL_ARGS_ASSERT__INVLIST_SEARCH;
8163
8164  /* If list is empty, return failure. */
8165  if (high == 0) {
8166   return -1;
8167  }
8168
8169  /* (We can't get the array unless we know the list is non-empty) */
8170  array = invlist_array(invlist);
8171
8172  mid = invlist_previous_index(invlist);
8173  assert(mid >=0 && mid <= highest_element);
8174
8175  /* <mid> contains the cache of the result of the previous call to this
8176  * function (0 the first time).  See if this call is for the same result,
8177  * or if it is for mid-1.  This is under the theory that calls to this
8178  * function will often be for related code points that are near each other.
8179  * And benchmarks show that caching gives better results.  We also test
8180  * here if the code point is within the bounds of the list.  These tests
8181  * replace others that would have had to be made anyway to make sure that
8182  * the array bounds were not exceeded, and these give us extra information
8183  * at the same time */
8184  if (cp >= array[mid]) {
8185   if (cp >= array[highest_element]) {
8186    return highest_element;
8187   }
8188
8189   /* Here, array[mid] <= cp < array[highest_element].  This means that
8190   * the final element is not the answer, so can exclude it; it also
8191   * means that <mid> is not the final element, so can refer to 'mid + 1'
8192   * safely */
8193   if (cp < array[mid + 1]) {
8194    return mid;
8195   }
8196   high--;
8197   low = mid + 1;
8198  }
8199  else { /* cp < aray[mid] */
8200   if (cp < array[0]) { /* Fail if outside the array */
8201    return -1;
8202   }
8203   high = mid;
8204   if (cp >= array[mid - 1]) {
8205    goto found_entry;
8206   }
8207  }
8208
8209  /* Binary search.  What we are looking for is <i> such that
8210  * array[i] <= cp < array[i+1]
8211  * The loop below converges on the i+1.  Note that there may not be an
8212  * (i+1)th element in the array, and things work nonetheless */
8213  while (low < high) {
8214   mid = (low + high) / 2;
8215   assert(mid <= highest_element);
8216   if (array[mid] <= cp) { /* cp >= array[mid] */
8217    low = mid + 1;
8218
8219    /* We could do this extra test to exit the loop early.
8220    if (cp < array[low]) {
8221     return mid;
8222    }
8223    */
8224   }
8225   else { /* cp < array[mid] */
8226    high = mid;
8227   }
8228  }
8229
8230   found_entry:
8231  high--;
8232  invlist_set_previous_index(invlist, high);
8233  return high;
8234 }
8235
8236 void
8237 Perl__invlist_populate_swatch(SV* const invlist,
8238        const UV start, const UV end, U8* swatch)
8239 {
8240  /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8241  * but is used when the swash has an inversion list.  This makes this much
8242  * faster, as it uses a binary search instead of a linear one.  This is
8243  * intimately tied to that function, and perhaps should be in utf8.c,
8244  * except it is intimately tied to inversion lists as well.  It assumes
8245  * that <swatch> is all 0's on input */
8246
8247  UV current = start;
8248  const IV len = _invlist_len(invlist);
8249  IV i;
8250  const UV * array;
8251
8252  PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8253
8254  if (len == 0) { /* Empty inversion list */
8255   return;
8256  }
8257
8258  array = invlist_array(invlist);
8259
8260  /* Find which element it is */
8261  i = _invlist_search(invlist, start);
8262
8263  /* We populate from <start> to <end> */
8264  while (current < end) {
8265   UV upper;
8266
8267   /* The inversion list gives the results for every possible code point
8268   * after the first one in the list.  Only those ranges whose index is
8269   * even are ones that the inversion list matches.  For the odd ones,
8270   * and if the initial code point is not in the list, we have to skip
8271   * forward to the next element */
8272   if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8273    i++;
8274    if (i >= len) { /* Finished if beyond the end of the array */
8275     return;
8276    }
8277    current = array[i];
8278    if (current >= end) {   /* Finished if beyond the end of what we
8279          are populating */
8280     if (LIKELY(end < UV_MAX)) {
8281      return;
8282     }
8283
8284     /* We get here when the upper bound is the maximum
8285     * representable on the machine, and we are looking for just
8286     * that code point.  Have to special case it */
8287     i = len;
8288     goto join_end_of_list;
8289    }
8290   }
8291   assert(current >= start);
8292
8293   /* The current range ends one below the next one, except don't go past
8294   * <end> */
8295   i++;
8296   upper = (i < len && array[i] < end) ? array[i] : end;
8297
8298   /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8299   * for each code point in it */
8300   for (; current < upper; current++) {
8301    const STRLEN offset = (STRLEN)(current - start);
8302    swatch[offset >> 3] |= 1 << (offset & 7);
8303   }
8304
8305  join_end_of_list:
8306
8307   /* Quit if at the end of the list */
8308   if (i >= len) {
8309
8310    /* But first, have to deal with the highest possible code point on
8311    * the platform.  The previous code assumes that <end> is one
8312    * beyond where we want to populate, but that is impossible at the
8313    * platform's infinity, so have to handle it specially */
8314    if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8315    {
8316     const STRLEN offset = (STRLEN)(end - start);
8317     swatch[offset >> 3] |= 1 << (offset & 7);
8318    }
8319    return;
8320   }
8321
8322   /* Advance to the next range, which will be for code points not in the
8323   * inversion list */
8324   current = array[i];
8325  }
8326
8327  return;
8328 }
8329
8330 void
8331 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8332           const bool complement_b, SV** output)
8333 {
8334  /* Take the union of two inversion lists and point <output> to it.  *output
8335  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8336  * the reference count to that list will be decremented if not already a
8337  * temporary (mortal); otherwise *output will be made correspondingly
8338  * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8339  * second list is returned.  If <complement_b> is TRUE, the union is taken
8340  * of the complement (inversion) of <b> instead of b itself.
8341  *
8342  * The basis for this comes from "Unicode Demystified" Chapter 13 by
8343  * Richard Gillam, published by Addison-Wesley, and explained at some
8344  * length there.  The preface says to incorporate its examples into your
8345  * code at your own risk.
8346  *
8347  * The algorithm is like a merge sort.
8348  *
8349  * XXX A potential performance improvement is to keep track as we go along
8350  * if only one of the inputs contributes to the result, meaning the other
8351  * is a subset of that one.  In that case, we can skip the final copy and
8352  * return the larger of the input lists, but then outside code might need
8353  * to keep track of whether to free the input list or not */
8354
8355  const UV* array_a;    /* a's array */
8356  const UV* array_b;
8357  UV len_a;     /* length of a's array */
8358  UV len_b;
8359
8360  SV* u;   /* the resulting union */
8361  UV* array_u;
8362  UV len_u;
8363
8364  UV i_a = 0;      /* current index into a's array */
8365  UV i_b = 0;
8366  UV i_u = 0;
8367
8368  /* running count, as explained in the algorithm source book; items are
8369  * stopped accumulating and are output when the count changes to/from 0.
8370  * The count is incremented when we start a range that's in the set, and
8371  * decremented when we start a range that's not in the set.  So its range
8372  * is 0 to 2.  Only when the count is zero is something not in the set.
8373  */
8374  UV count = 0;
8375
8376  PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8377  assert(a != b);
8378
8379  /* If either one is empty, the union is the other one */
8380  if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8381   bool make_temp = FALSE; /* Should we mortalize the result? */
8382
8383   if (*output == a) {
8384    if (a != NULL) {
8385     if (! (make_temp = cBOOL(SvTEMP(a)))) {
8386      SvREFCNT_dec_NN(a);
8387     }
8388    }
8389   }
8390   if (*output != b) {
8391    *output = invlist_clone(b);
8392    if (complement_b) {
8393     _invlist_invert(*output);
8394    }
8395   } /* else *output already = b; */
8396
8397   if (make_temp) {
8398    sv_2mortal(*output);
8399   }
8400   return;
8401  }
8402  else if ((len_b = _invlist_len(b)) == 0) {
8403   bool make_temp = FALSE;
8404   if (*output == b) {
8405    if (! (make_temp = cBOOL(SvTEMP(b)))) {
8406     SvREFCNT_dec_NN(b);
8407    }
8408   }
8409
8410   /* The complement of an empty list is a list that has everything in it,
8411   * so the union with <a> includes everything too */
8412   if (complement_b) {
8413    if (a == *output) {
8414     if (! (make_temp = cBOOL(SvTEMP(a)))) {
8415      SvREFCNT_dec_NN(a);
8416     }
8417    }
8418    *output = _new_invlist(1);
8419    _append_range_to_invlist(*output, 0, UV_MAX);
8420   }
8421   else if (*output != a) {
8422    *output = invlist_clone(a);
8423   }
8424   /* else *output already = a; */
8425
8426   if (make_temp) {
8427    sv_2mortal(*output);
8428   }
8429   return;
8430  }
8431
8432  /* Here both lists exist and are non-empty */
8433  array_a = invlist_array(a);
8434  array_b = invlist_array(b);
8435
8436  /* If are to take the union of 'a' with the complement of b, set it
8437  * up so are looking at b's complement. */
8438  if (complement_b) {
8439
8440   /* To complement, we invert: if the first element is 0, remove it.  To
8441   * do this, we just pretend the array starts one later */
8442   if (array_b[0] == 0) {
8443    array_b++;
8444    len_b--;
8445   }
8446   else {
8447
8448    /* But if the first element is not zero, we pretend the list starts
8449    * at the 0 that is always stored immediately before the array. */
8450    array_b--;
8451    len_b++;
8452   }
8453  }
8454
8455  /* Size the union for the worst case: that the sets are completely
8456  * disjoint */
8457  u = _new_invlist(len_a + len_b);
8458
8459  /* Will contain U+0000 if either component does */
8460  array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8461          || (len_b > 0 && array_b[0] == 0));
8462
8463  /* Go through each list item by item, stopping when exhausted one of
8464  * them */
8465  while (i_a < len_a && i_b < len_b) {
8466   UV cp;     /* The element to potentially add to the union's array */
8467   bool cp_in_set;   /* is it in the the input list's set or not */
8468
8469   /* We need to take one or the other of the two inputs for the union.
8470   * Since we are merging two sorted lists, we take the smaller of the
8471   * next items.  In case of a tie, we take the one that is in its set
8472   * first.  If we took one not in the set first, it would decrement the
8473   * count, possibly to 0 which would cause it to be output as ending the
8474   * range, and the next time through we would take the same number, and
8475   * output it again as beginning the next range.  By doing it the
8476   * opposite way, there is no possibility that the count will be
8477   * momentarily decremented to 0, and thus the two adjoining ranges will
8478   * be seamlessly merged.  (In a tie and both are in the set or both not
8479   * in the set, it doesn't matter which we take first.) */
8480   if (array_a[i_a] < array_b[i_b]
8481    || (array_a[i_a] == array_b[i_b]
8482     && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8483   {
8484    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8485    cp= array_a[i_a++];
8486   }
8487   else {
8488    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8489    cp = array_b[i_b++];
8490   }
8491
8492   /* Here, have chosen which of the two inputs to look at.  Only output
8493   * if the running count changes to/from 0, which marks the
8494   * beginning/end of a range in that's in the set */
8495   if (cp_in_set) {
8496    if (count == 0) {
8497     array_u[i_u++] = cp;
8498    }
8499    count++;
8500   }
8501   else {
8502    count--;
8503    if (count == 0) {
8504     array_u[i_u++] = cp;
8505    }
8506   }
8507  }
8508
8509  /* Here, we are finished going through at least one of the lists, which
8510  * means there is something remaining in at most one.  We check if the list
8511  * that hasn't been exhausted is positioned such that we are in the middle
8512  * of a range in its set or not.  (i_a and i_b point to the element beyond
8513  * the one we care about.) If in the set, we decrement 'count'; if 0, there
8514  * is potentially more to output.
8515  * There are four cases:
8516  * 1) Both weren't in their sets, count is 0, and remains 0.  What's left
8517  *    in the union is entirely from the non-exhausted set.
8518  * 2) Both were in their sets, count is 2.  Nothing further should
8519  *    be output, as everything that remains will be in the exhausted
8520  *    list's set, hence in the union; decrementing to 1 but not 0 insures
8521  *    that
8522  * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8523  *    Nothing further should be output because the union includes
8524  *    everything from the exhausted set.  Not decrementing ensures that.
8525  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8526  *    decrementing to 0 insures that we look at the remainder of the
8527  *    non-exhausted set */
8528  if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8529   || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8530  {
8531   count--;
8532  }
8533
8534  /* The final length is what we've output so far, plus what else is about to
8535  * be output.  (If 'count' is non-zero, then the input list we exhausted
8536  * has everything remaining up to the machine's limit in its set, and hence
8537  * in the union, so there will be no further output. */
8538  len_u = i_u;
8539  if (count == 0) {
8540   /* At most one of the subexpressions will be non-zero */
8541   len_u += (len_a - i_a) + (len_b - i_b);
8542  }
8543
8544  /* Set result to final length, which can change the pointer to array_u, so
8545  * re-find it */
8546  if (len_u != _invlist_len(u)) {
8547   invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8548   invlist_trim(u);
8549   array_u = invlist_array(u);
8550  }
8551
8552  /* When 'count' is 0, the list that was exhausted (if one was shorter than
8553  * the other) ended with everything above it not in its set.  That means
8554  * that the remaining part of the union is precisely the same as the
8555  * non-exhausted list, so can just copy it unchanged.  (If both list were
8556  * exhausted at the same time, then the operations below will be both 0.)
8557  */
8558  if (count == 0) {
8559   IV copy_count; /* At most one will have a non-zero copy count */
8560   if ((copy_count = len_a - i_a) > 0) {
8561    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8562   }
8563   else if ((copy_count = len_b - i_b) > 0) {
8564    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8565   }
8566  }
8567
8568  /*  We may be removing a reference to one of the inputs.  If so, the output
8569  *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8570  *  count decremented) */
8571  if (a == *output || b == *output) {
8572   assert(! invlist_is_iterating(*output));
8573   if ((SvTEMP(*output))) {
8574    sv_2mortal(u);
8575   }
8576   else {
8577    SvREFCNT_dec_NN(*output);
8578   }
8579  }
8580
8581  *output = u;
8582
8583  return;
8584 }
8585
8586 void
8587 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8588            const bool complement_b, SV** i)
8589 {
8590  /* Take the intersection of two inversion lists and point <i> to it.  *i
8591  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8592  * the reference count to that list will be decremented if not already a
8593  * temporary (mortal); otherwise *i will be made correspondingly mortal.
8594  * The first list, <a>, may be NULL, in which case an empty list is
8595  * returned.  If <complement_b> is TRUE, the result will be the
8596  * intersection of <a> and the complement (or inversion) of <b> instead of
8597  * <b> directly.
8598  *
8599  * The basis for this comes from "Unicode Demystified" Chapter 13 by
8600  * Richard Gillam, published by Addison-Wesley, and explained at some
8601  * length there.  The preface says to incorporate its examples into your
8602  * code at your own risk.  In fact, it had bugs
8603  *
8604  * The algorithm is like a merge sort, and is essentially the same as the
8605  * union above
8606  */
8607
8608  const UV* array_a;  /* a's array */
8609  const UV* array_b;
8610  UV len_a; /* length of a's array */
8611  UV len_b;
8612
8613  SV* r;       /* the resulting intersection */
8614  UV* array_r;
8615  UV len_r;
8616
8617  UV i_a = 0;      /* current index into a's array */
8618  UV i_b = 0;
8619  UV i_r = 0;
8620
8621  /* running count, as explained in the algorithm source book; items are
8622  * stopped accumulating and are output when the count changes to/from 2.
8623  * The count is incremented when we start a range that's in the set, and
8624  * decremented when we start a range that's not in the set.  So its range
8625  * is 0 to 2.  Only when the count is 2 is something in the intersection.
8626  */
8627  UV count = 0;
8628
8629  PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8630  assert(a != b);
8631
8632  /* Special case if either one is empty */
8633  len_a = (a == NULL) ? 0 : _invlist_len(a);
8634  if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8635   bool make_temp = FALSE;
8636
8637   if (len_a != 0 && complement_b) {
8638
8639    /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8640    * be empty.  Here, also we are using 'b's complement, which hence
8641    * must be every possible code point.  Thus the intersection is
8642    * simply 'a'. */
8643    if (*i != a) {
8644     if (*i == b) {
8645      if (! (make_temp = cBOOL(SvTEMP(b)))) {
8646       SvREFCNT_dec_NN(b);
8647      }
8648     }
8649
8650     *i = invlist_clone(a);
8651    }
8652    /* else *i is already 'a' */
8653
8654    if (make_temp) {
8655     sv_2mortal(*i);
8656    }
8657    return;
8658   }
8659
8660   /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8661   * intersection must be empty */
8662   if (*i == a) {
8663    if (! (make_temp = cBOOL(SvTEMP(a)))) {
8664     SvREFCNT_dec_NN(a);
8665    }
8666   }
8667   else if (*i == b) {
8668    if (! (make_temp = cBOOL(SvTEMP(b)))) {
8669     SvREFCNT_dec_NN(b);
8670    }
8671   }
8672   *i = _new_invlist(0);
8673   if (make_temp) {
8674    sv_2mortal(*i);
8675   }
8676
8677   return;
8678  }
8679
8680  /* Here both lists exist and are non-empty */
8681  array_a = invlist_array(a);
8682  array_b = invlist_array(b);
8683
8684  /* If are to take the intersection of 'a' with the complement of b, set it
8685  * up so are looking at b's complement. */
8686  if (complement_b) {
8687
8688   /* To complement, we invert: if the first element is 0, remove it.  To
8689   * do this, we just pretend the array starts one later */
8690   if (array_b[0] == 0) {
8691    array_b++;
8692    len_b--;
8693   }
8694   else {
8695
8696    /* But if the first element is not zero, we pretend the list starts
8697    * at the 0 that is always stored immediately before the array. */
8698    array_b--;
8699    len_b++;
8700   }
8701  }
8702
8703  /* Size the intersection for the worst case: that the intersection ends up
8704  * fragmenting everything to be completely disjoint */
8705  r= _new_invlist(len_a + len_b);
8706
8707  /* Will contain U+0000 iff both components do */
8708  array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8709          && len_b > 0 && array_b[0] == 0);
8710
8711  /* Go through each list item by item, stopping when exhausted one of
8712  * them */
8713  while (i_a < len_a && i_b < len_b) {
8714   UV cp;     /* The element to potentially add to the intersection's
8715      array */
8716   bool cp_in_set; /* Is it in the input list's set or not */
8717
8718   /* We need to take one or the other of the two inputs for the
8719   * intersection.  Since we are merging two sorted lists, we take the
8720   * smaller of the next items.  In case of a tie, we take the one that
8721   * is not in its set first (a difference from the union algorithm).  If
8722   * we took one in the set first, it would increment the count, possibly
8723   * to 2 which would cause it to be output as starting a range in the
8724   * intersection, and the next time through we would take that same
8725   * number, and output it again as ending the set.  By doing it the
8726   * opposite of this, there is no possibility that the count will be
8727   * momentarily incremented to 2.  (In a tie and both are in the set or
8728   * both not in the set, it doesn't matter which we take first.) */
8729   if (array_a[i_a] < array_b[i_b]
8730    || (array_a[i_a] == array_b[i_b]
8731     && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8732   {
8733    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8734    cp= array_a[i_a++];
8735   }
8736   else {
8737    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8738    cp= array_b[i_b++];
8739   }
8740
8741   /* Here, have chosen which of the two inputs to look at.  Only output
8742   * if the running count changes to/from 2, which marks the
8743   * beginning/end of a range that's in the intersection */
8744   if (cp_in_set) {
8745    count++;
8746    if (count == 2) {
8747     array_r[i_r++] = cp;
8748    }
8749   }
8750   else {
8751    if (count == 2) {
8752     array_r[i_r++] = cp;
8753    }
8754    count--;
8755   }
8756  }
8757
8758  /* Here, we are finished going through at least one of the lists, which
8759  * means there is something remaining in at most one.  We check if the list
8760  * that has been exhausted is positioned such that we are in the middle
8761  * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8762  * the ones we care about.)  There are four cases:
8763  * 1) Both weren't in their sets, count is 0, and remains 0.  There's
8764  *    nothing left in the intersection.
8765  * 2) Both were in their sets, count is 2 and perhaps is incremented to
8766  *    above 2.  What should be output is exactly that which is in the
8767  *    non-exhausted set, as everything it has is also in the intersection
8768  *    set, and everything it doesn't have can't be in the intersection
8769  * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8770  *    gets incremented to 2.  Like the previous case, the intersection is
8771  *    everything that remains in the non-exhausted set.
8772  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8773  *    remains 1.  And the intersection has nothing more. */
8774  if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8775   || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8776  {
8777   count++;
8778  }
8779
8780  /* The final length is what we've output so far plus what else is in the
8781  * intersection.  At most one of the subexpressions below will be non-zero
8782  * */
8783  len_r = i_r;
8784  if (count >= 2) {
8785   len_r += (len_a - i_a) + (len_b - i_b);
8786  }
8787
8788  /* Set result to final length, which can change the pointer to array_r, so
8789  * re-find it */
8790  if (len_r != _invlist_len(r)) {
8791   invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8792   invlist_trim(r);
8793   array_r = invlist_array(r);
8794  }
8795
8796  /* Finish outputting any remaining */
8797  if (count >= 2) { /* At most one will have a non-zero copy count */
8798   IV copy_count;
8799   if ((copy_count = len_a - i_a) > 0) {
8800    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8801   }
8802   else if ((copy_count = len_b - i_b) > 0) {
8803    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8804   }
8805  }
8806
8807  /*  We may be removing a reference to one of the inputs.  If so, the output
8808  *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8809  *  count decremented) */
8810  if (a == *i || b == *i) {
8811   assert(! invlist_is_iterating(*i));
8812   if (SvTEMP(*i)) {
8813    sv_2mortal(r);
8814   }
8815   else {
8816    SvREFCNT_dec_NN(*i);
8817   }
8818  }
8819
8820  *i = r;
8821
8822  return;
8823 }
8824
8825 SV*
8826 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8827 {
8828  /* Add the range from 'start' to 'end' inclusive to the inversion list's
8829  * set.  A pointer to the inversion list is returned.  This may actually be
8830  * a new list, in which case the passed in one has been destroyed.  The
8831  * passed in inversion list can be NULL, in which case a new one is created
8832  * with just the one range in it */
8833
8834  SV* range_invlist;
8835  UV len;
8836
8837  if (invlist == NULL) {
8838   invlist = _new_invlist(2);
8839   len = 0;
8840  }
8841  else {
8842   len = _invlist_len(invlist);
8843  }
8844
8845  /* If comes after the final entry actually in the list, can just append it
8846  * to the end, */
8847  if (len == 0
8848   || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8849    && start >= invlist_array(invlist)[len - 1]))
8850  {
8851   _append_range_to_invlist(invlist, start, end);
8852   return invlist;
8853  }
8854
8855  /* Here, can't just append things, create and return a new inversion list
8856  * which is the union of this range and the existing inversion list */
8857  range_invlist = _new_invlist(2);
8858  _append_range_to_invlist(range_invlist, start, end);
8859
8860  _invlist_union(invlist, range_invlist, &invlist);
8861
8862  /* The temporary can be freed */
8863  SvREFCNT_dec_NN(range_invlist);
8864
8865  return invlist;
8866 }
8867
8868 SV*
8869 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8870         UV** other_elements_ptr)
8871 {
8872  /* Create and return an inversion list whose contents are to be populated
8873  * by the caller.  The caller gives the number of elements (in 'size') and
8874  * the very first element ('element0').  This function will set
8875  * '*other_elements_ptr' to an array of UVs, where the remaining elements
8876  * are to be placed.
8877  *
8878  * Obviously there is some trust involved that the caller will properly
8879  * fill in the other elements of the array.
8880  *
8881  * (The first element needs to be passed in, as the underlying code does
8882  * things differently depending on whether it is zero or non-zero) */
8883
8884  SV* invlist = _new_invlist(size);
8885  bool offset;
8886
8887  PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8888
8889  _append_range_to_invlist(invlist, element0, element0);
8890  offset = *get_invlist_offset_addr(invlist);
8891
8892  invlist_set_len(invlist, size, offset);
8893  *other_elements_ptr = invlist_array(invlist) + 1;
8894  return invlist;
8895 }
8896
8897 #endif
8898
8899 PERL_STATIC_INLINE SV*
8900 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8901  return _add_range_to_invlist(invlist, cp, cp);
8902 }
8903
8904 #ifndef PERL_IN_XSUB_RE
8905 void
8906 Perl__invlist_invert(pTHX_ SV* const invlist)
8907 {
8908  /* Complement the input inversion list.  This adds a 0 if the list didn't
8909  * have a zero; removes it otherwise.  As described above, the data
8910  * structure is set up so that this is very efficient */
8911
8912  PERL_ARGS_ASSERT__INVLIST_INVERT;
8913
8914  assert(! invlist_is_iterating(invlist));
8915
8916  /* The inverse of matching nothing is matching everything */
8917  if (_invlist_len(invlist) == 0) {
8918   _append_range_to_invlist(invlist, 0, UV_MAX);
8919   return;
8920  }
8921
8922  *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8923 }
8924
8925 #endif
8926
8927 PERL_STATIC_INLINE SV*
8928 S_invlist_clone(pTHX_ SV* const invlist)
8929 {
8930
8931  /* Return a new inversion list that is a copy of the input one, which is
8932  * unchanged.  The new list will not be mortal even if the old one was. */
8933
8934  /* Need to allocate extra space to accommodate Perl's addition of a
8935  * trailing NUL to SvPV's, since it thinks they are always strings */
8936  SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8937  STRLEN physical_length = SvCUR(invlist);
8938  bool offset = *(get_invlist_offset_addr(invlist));
8939
8940  PERL_ARGS_ASSERT_INVLIST_CLONE;
8941
8942  *(get_invlist_offset_addr(new_invlist)) = offset;
8943  invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8944  Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8945
8946  return new_invlist;
8947 }
8948
8949 PERL_STATIC_INLINE STRLEN*
8950 S_get_invlist_iter_addr(SV* invlist)
8951 {
8952  /* Return the address of the UV that contains the current iteration
8953  * position */
8954
8955  PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8956
8957  assert(SvTYPE(invlist) == SVt_INVLIST);
8958
8959  return &(((XINVLIST*) SvANY(invlist))->iterator);
8960 }
8961
8962 PERL_STATIC_INLINE void
8963 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
8964 {
8965  PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8966
8967  *get_invlist_iter_addr(invlist) = 0;
8968 }
8969
8970 PERL_STATIC_INLINE void
8971 S_invlist_iterfinish(SV* invlist)
8972 {
8973  /* Terminate iterator for invlist.  This is to catch development errors.
8974  * Any iteration that is interrupted before completed should call this
8975  * function.  Functions that add code points anywhere else but to the end
8976  * of an inversion list assert that they are not in the middle of an
8977  * iteration.  If they were, the addition would make the iteration
8978  * problematical: if the iteration hadn't reached the place where things
8979  * were being added, it would be ok */
8980
8981  PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8982
8983  *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8984 }
8985
8986 STATIC bool
8987 S_invlist_iternext(SV* invlist, UV* start, UV* end)
8988 {
8989  /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8990  * This call sets in <*start> and <*end>, the next range in <invlist>.
8991  * Returns <TRUE> if successful and the next call will return the next
8992  * range; <FALSE> if was already at the end of the list.  If the latter,
8993  * <*start> and <*end> are unchanged, and the next call to this function
8994  * will start over at the beginning of the list */
8995
8996  STRLEN* pos = get_invlist_iter_addr(invlist);
8997  UV len = _invlist_len(invlist);
8998  UV *array;
8999
9000  PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9001
9002  if (*pos >= len) {
9003   *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9004   return FALSE;
9005  }
9006
9007  array = invlist_array(invlist);
9008
9009  *start = array[(*pos)++];
9010
9011  if (*pos >= len) {
9012   *end = UV_MAX;
9013  }
9014  else {
9015   *end = array[(*pos)++] - 1;
9016  }
9017
9018  return TRUE;
9019 }
9020
9021 PERL_STATIC_INLINE bool
9022 S_invlist_is_iterating(SV* const invlist)
9023 {
9024  PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9025
9026  return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9027 }
9028
9029 PERL_STATIC_INLINE UV
9030 S_invlist_highest(SV* const invlist)
9031 {
9032  /* Returns the highest code point that matches an inversion list.  This API
9033  * has an ambiguity, as it returns 0 under either the highest is actually
9034  * 0, or if the list is empty.  If this distinction matters to you, check
9035  * for emptiness before calling this function */
9036
9037  UV len = _invlist_len(invlist);
9038  UV *array;
9039
9040  PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9041
9042  if (len == 0) {
9043   return 0;
9044  }
9045
9046  array = invlist_array(invlist);
9047
9048  /* The last element in the array in the inversion list always starts a
9049  * range that goes to infinity.  That range may be for code points that are
9050  * matched in the inversion list, or it may be for ones that aren't
9051  * matched.  In the latter case, the highest code point in the set is one
9052  * less than the beginning of this range; otherwise it is the final element
9053  * of this range: infinity */
9054  return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9055   ? UV_MAX
9056   : array[len - 1] - 1;
9057 }
9058
9059 #ifndef PERL_IN_XSUB_RE
9060 SV *
9061 Perl__invlist_contents(pTHX_ SV* const invlist)
9062 {
9063  /* Get the contents of an inversion list into a string SV so that they can
9064  * be printed out.  It uses the format traditionally done for debug tracing
9065  */
9066
9067  UV start, end;
9068  SV* output = newSVpvs("\n");
9069
9070  PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9071
9072  assert(! invlist_is_iterating(invlist));
9073
9074  invlist_iterinit(invlist);
9075  while (invlist_iternext(invlist, &start, &end)) {
9076   if (end == UV_MAX) {
9077    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9078   }
9079   else if (end != start) {
9080    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9081      start,       end);
9082   }
9083   else {
9084    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9085   }
9086  }
9087
9088  return output;
9089 }
9090 #endif
9091
9092 #ifndef PERL_IN_XSUB_RE
9093 void
9094 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9095       const char * const indent, SV* const invlist)
9096 {
9097  /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9098  * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9099  * the string 'indent'.  The output looks like this:
9100   [0] 0x000A .. 0x000D
9101   [2] 0x0085
9102   [4] 0x2028 .. 0x2029
9103   [6] 0x3104 .. INFINITY
9104  * This means that the first range of code points matched by the list are
9105  * 0xA through 0xD; the second range contains only the single code point
9106  * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9107  * are used to define each range (except if the final range extends to
9108  * infinity, only a single element is needed).  The array index of the
9109  * first element for the corresponding range is given in brackets. */
9110
9111  UV start, end;
9112  STRLEN count = 0;
9113
9114  PERL_ARGS_ASSERT__INVLIST_DUMP;
9115
9116  if (invlist_is_iterating(invlist)) {
9117   Perl_dump_indent(aTHX_ level, file,
9118    "%sCan't dump inversion list because is in middle of iterating\n",
9119    indent);
9120   return;
9121  }
9122
9123  invlist_iterinit(invlist);
9124  while (invlist_iternext(invlist, &start, &end)) {
9125   if (end == UV_MAX) {
9126    Perl_dump_indent(aTHX_ level, file,
9127          "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9128         indent, (UV)count, start);
9129   }
9130   else if (end != start) {
9131    Perl_dump_indent(aTHX_ level, file,
9132          "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9133         indent, (UV)count, start,         end);
9134   }
9135   else {
9136    Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9137            indent, (UV)count, start);
9138   }
9139   count += 2;
9140  }
9141 }
9142
9143 void
9144 Perl__load_PL_utf8_foldclosures (pTHX)
9145 {
9146  assert(! PL_utf8_foldclosures);
9147
9148  /* If the folds haven't been read in, call a fold function
9149  * to force that */
9150  if (! PL_utf8_tofold) {
9151   U8 dummy[UTF8_MAXBYTES_CASE+1];
9152
9153   /* This string is just a short named one above \xff */
9154   to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9155   assert(PL_utf8_tofold); /* Verify that worked */
9156  }
9157  PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9158 }
9159 #endif
9160
9161 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9162 bool
9163 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9164 {
9165  /* Return a boolean as to if the two passed in inversion lists are
9166  * identical.  The final argument, if TRUE, says to take the complement of
9167  * the second inversion list before doing the comparison */
9168
9169  const UV* array_a = invlist_array(a);
9170  const UV* array_b = invlist_array(b);
9171  UV len_a = _invlist_len(a);
9172  UV len_b = _invlist_len(b);
9173
9174  UV i = 0;      /* current index into the arrays */
9175  bool retval = TRUE;     /* Assume are identical until proven otherwise */
9176
9177  PERL_ARGS_ASSERT__INVLISTEQ;
9178
9179  /* If are to compare 'a' with the complement of b, set it
9180  * up so are looking at b's complement. */
9181  if (complement_b) {
9182
9183   /* The complement of nothing is everything, so <a> would have to have
9184   * just one element, starting at zero (ending at infinity) */
9185   if (len_b == 0) {
9186    return (len_a == 1 && array_a[0] == 0);
9187   }
9188   else if (array_b[0] == 0) {
9189
9190    /* Otherwise, to complement, we invert.  Here, the first element is
9191    * 0, just remove it.  To do this, we just pretend the array starts
9192    * one later */
9193
9194    array_b++;
9195    len_b--;
9196   }
9197   else {
9198
9199    /* But if the first element is not zero, we pretend the list starts
9200    * at the 0 that is always stored immediately before the array. */
9201    array_b--;
9202    len_b++;
9203   }
9204  }
9205
9206  /* Make sure that the lengths are the same, as well as the final element
9207  * before looping through the remainder.  (Thus we test the length, final,
9208  * and first elements right off the bat) */
9209  if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9210   retval = FALSE;
9211  }
9212  else for (i = 0; i < len_a - 1; i++) {
9213   if (array_a[i] != array_b[i]) {
9214    retval = FALSE;
9215    break;
9216   }
9217  }
9218
9219  return retval;
9220 }
9221 #endif
9222
9223 #undef HEADER_LENGTH
9224 #undef TO_INTERNAL_SIZE
9225 #undef FROM_INTERNAL_SIZE
9226 #undef INVLIST_VERSION_ID
9227
9228 /* End of inversion list object */
9229
9230 STATIC void
9231 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9232 {
9233  /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9234  * constructs, and updates RExC_flags with them.  On input, RExC_parse
9235  * should point to the first flag; it is updated on output to point to the
9236  * final ')' or ':'.  There needs to be at least one flag, or this will
9237  * abort */
9238
9239  /* for (?g), (?gc), and (?o) warnings; warning
9240  about (?c) will warn about (?g) -- japhy    */
9241
9242 #define WASTED_O  0x01
9243 #define WASTED_G  0x02
9244 #define WASTED_C  0x04
9245 #define WASTED_GC (WASTED_G|WASTED_C)
9246  I32 wastedflags = 0x00;
9247  U32 posflags = 0, negflags = 0;
9248  U32 *flagsp = &posflags;
9249  char has_charset_modifier = '\0';
9250  regex_charset cs;
9251  bool has_use_defaults = FALSE;
9252  const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9253
9254  PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9255
9256  /* '^' as an initial flag sets certain defaults */
9257  if (UCHARAT(RExC_parse) == '^') {
9258   RExC_parse++;
9259   has_use_defaults = TRUE;
9260   STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9261   set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9262           ? REGEX_UNICODE_CHARSET
9263           : REGEX_DEPENDS_CHARSET);
9264  }
9265
9266  cs = get_regex_charset(RExC_flags);
9267  if (cs == REGEX_DEPENDS_CHARSET
9268   && (RExC_utf8 || RExC_uni_semantics))
9269  {
9270   cs = REGEX_UNICODE_CHARSET;
9271  }
9272
9273  while (*RExC_parse) {
9274   /* && strchr("iogcmsx", *RExC_parse) */
9275   /* (?g), (?gc) and (?o) are useless here
9276   and must be globally applied -- japhy */
9277   switch (*RExC_parse) {
9278
9279    /* Code for the imsx flags */
9280    CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9281
9282    case LOCALE_PAT_MOD:
9283     if (has_charset_modifier) {
9284      goto excess_modifier;
9285     }
9286     else if (flagsp == &negflags) {
9287      goto neg_modifier;
9288     }
9289     cs = REGEX_LOCALE_CHARSET;
9290     has_charset_modifier = LOCALE_PAT_MOD;
9291     break;
9292    case UNICODE_PAT_MOD:
9293     if (has_charset_modifier) {
9294      goto excess_modifier;
9295     }
9296     else if (flagsp == &negflags) {
9297      goto neg_modifier;
9298     }
9299     cs = REGEX_UNICODE_CHARSET;
9300     has_charset_modifier = UNICODE_PAT_MOD;
9301     break;
9302    case ASCII_RESTRICT_PAT_MOD:
9303     if (flagsp == &negflags) {
9304      goto neg_modifier;
9305     }
9306     if (has_charset_modifier) {
9307      if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9308       goto excess_modifier;
9309      }
9310      /* Doubled modifier implies more restricted */
9311      cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9312     }
9313     else {
9314      cs = REGEX_ASCII_RESTRICTED_CHARSET;
9315     }
9316     has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9317     break;
9318    case DEPENDS_PAT_MOD:
9319     if (has_use_defaults) {
9320      goto fail_modifiers;
9321     }
9322     else if (flagsp == &negflags) {
9323      goto neg_modifier;
9324     }
9325     else if (has_charset_modifier) {
9326      goto excess_modifier;
9327     }
9328
9329     /* The dual charset means unicode semantics if the
9330     * pattern (or target, not known until runtime) are
9331     * utf8, or something in the pattern indicates unicode
9332     * semantics */
9333     cs = (RExC_utf8 || RExC_uni_semantics)
9334      ? REGEX_UNICODE_CHARSET
9335      : REGEX_DEPENDS_CHARSET;
9336     has_charset_modifier = DEPENDS_PAT_MOD;
9337     break;
9338    excess_modifier:
9339     RExC_parse++;
9340     if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9341      vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9342     }
9343     else if (has_charset_modifier == *(RExC_parse - 1)) {
9344      vFAIL2("Regexp modifier \"%c\" may not appear twice",
9345           *(RExC_parse - 1));
9346     }
9347     else {
9348      vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9349     }
9350     /*NOTREACHED*/
9351    neg_modifier:
9352     RExC_parse++;
9353     vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9354          *(RExC_parse - 1));
9355     /*NOTREACHED*/
9356    case ONCE_PAT_MOD: /* 'o' */
9357    case GLOBAL_PAT_MOD: /* 'g' */
9358     if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9359      const I32 wflagbit = *RExC_parse == 'o'
9360           ? WASTED_O
9361           : WASTED_G;
9362      if (! (wastedflags & wflagbit) ) {
9363       wastedflags |= wflagbit;
9364       /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9365       vWARN5(
9366        RExC_parse + 1,
9367        "Useless (%s%c) - %suse /%c modifier",
9368        flagsp == &negflags ? "?-" : "?",
9369        *RExC_parse,
9370        flagsp == &negflags ? "don't " : "",
9371        *RExC_parse
9372       );
9373      }
9374     }
9375     break;
9376
9377    case CONTINUE_PAT_MOD: /* 'c' */
9378     if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9379      if (! (wastedflags & WASTED_C) ) {
9380       wastedflags |= WASTED_GC;
9381       /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9382       vWARN3(
9383        RExC_parse + 1,
9384        "Useless (%sc) - %suse /gc modifier",
9385        flagsp == &negflags ? "?-" : "?",
9386        flagsp == &negflags ? "don't " : ""
9387       );
9388      }
9389     }
9390     break;
9391    case KEEPCOPY_PAT_MOD: /* 'p' */
9392     if (flagsp == &negflags) {
9393      if (SIZE_ONLY)
9394       ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9395     } else {
9396      *flagsp |= RXf_PMf_KEEPCOPY;
9397     }
9398     break;
9399    case '-':
9400     /* A flag is a default iff it is following a minus, so
9401     * if there is a minus, it means will be trying to
9402     * re-specify a default which is an error */
9403     if (has_use_defaults || flagsp == &negflags) {
9404      goto fail_modifiers;
9405     }
9406     flagsp = &negflags;
9407     wastedflags = 0;  /* reset so (?g-c) warns twice */
9408     break;
9409    case ':':
9410    case ')':
9411     RExC_flags |= posflags;
9412     RExC_flags &= ~negflags;
9413     set_regex_charset(&RExC_flags, cs);
9414     if (RExC_flags & RXf_PMf_FOLD) {
9415      RExC_contains_i = 1;
9416     }
9417     return;
9418     /*NOTREACHED*/
9419    default:
9420    fail_modifiers:
9421     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9422     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9423     vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9424      UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9425     /*NOTREACHED*/
9426   }
9427
9428   ++RExC_parse;
9429  }
9430 }
9431
9432 /*
9433  - reg - regular expression, i.e. main body or parenthesized thing
9434  *
9435  * Caller must absorb opening parenthesis.
9436  *
9437  * Combining parenthesis handling with the base level of regular expression
9438  * is a trifle forced, but the need to tie the tails of the branches to what
9439  * follows makes it hard to avoid.
9440  */
9441 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9442 #ifdef DEBUGGING
9443 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9444 #else
9445 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9446 #endif
9447
9448 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9449    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9450    needs to be restarted.
9451    Otherwise would only return NULL if regbranch() returns NULL, which
9452    cannot happen.  */
9453 STATIC regnode *
9454 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9455  /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9456  * 2 is like 1, but indicates that nextchar() has been called to advance
9457  * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9458  * this flag alerts us to the need to check for that */
9459 {
9460  regnode *ret;  /* Will be the head of the group. */
9461  regnode *br;
9462  regnode *lastbr;
9463  regnode *ender = NULL;
9464  I32 parno = 0;
9465  I32 flags;
9466  U32 oregflags = RExC_flags;
9467  bool have_branch = 0;
9468  bool is_open = 0;
9469  I32 freeze_paren = 0;
9470  I32 after_freeze = 0;
9471  I32 num; /* numeric backreferences */
9472
9473  char * parse_start = RExC_parse; /* MJD */
9474  char * const oregcomp_parse = RExC_parse;
9475
9476  GET_RE_DEBUG_FLAGS_DECL;
9477
9478  PERL_ARGS_ASSERT_REG;
9479  DEBUG_PARSE("reg ");
9480
9481  *flagp = 0;    /* Tentatively. */
9482
9483
9484  /* Make an OPEN node, if parenthesized. */
9485  if (paren) {
9486
9487   /* Under /x, space and comments can be gobbled up between the '(' and
9488   * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9489   * intervening space, as the sequence is a token, and a token should be
9490   * indivisible */
9491   bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9492
9493   if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9494    char *start_verb = RExC_parse;
9495    STRLEN verb_len = 0;
9496    char *start_arg = NULL;
9497    unsigned char op = 0;
9498    int argok = 1;
9499    int internal_argval = 0; /* internal_argval is only useful if
9500           !argok */
9501
9502    if (has_intervening_patws) {
9503     RExC_parse++;
9504     vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9505    }
9506    while ( *RExC_parse && *RExC_parse != ')' ) {
9507     if ( *RExC_parse == ':' ) {
9508      start_arg = RExC_parse + 1;
9509      break;
9510     }
9511     RExC_parse++;
9512    }
9513    ++start_verb;
9514    verb_len = RExC_parse - start_verb;
9515    if ( start_arg ) {
9516     RExC_parse++;
9517     while ( *RExC_parse && *RExC_parse != ')' )
9518      RExC_parse++;
9519     if ( *RExC_parse != ')' )
9520      vFAIL("Unterminated verb pattern argument");
9521     if ( RExC_parse == start_arg )
9522      start_arg = NULL;
9523    } else {
9524     if ( *RExC_parse != ')' )
9525      vFAIL("Unterminated verb pattern");
9526    }
9527
9528    switch ( *start_verb ) {
9529    case 'A':  /* (*ACCEPT) */
9530     if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9531      op = ACCEPT;
9532      internal_argval = RExC_nestroot;
9533     }
9534     break;
9535    case 'C':  /* (*COMMIT) */
9536     if ( memEQs(start_verb,verb_len,"COMMIT") )
9537      op = COMMIT;
9538     break;
9539    case 'F':  /* (*FAIL) */
9540     if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9541      op = OPFAIL;
9542      argok = 0;
9543     }
9544     break;
9545    case ':':  /* (*:NAME) */
9546    case 'M':  /* (*MARK:NAME) */
9547     if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9548      op = MARKPOINT;
9549      argok = -1;
9550     }
9551     break;
9552    case 'P':  /* (*PRUNE) */
9553     if ( memEQs(start_verb,verb_len,"PRUNE") )
9554      op = PRUNE;
9555     break;
9556    case 'S':   /* (*SKIP) */
9557     if ( memEQs(start_verb,verb_len,"SKIP") )
9558      op = SKIP;
9559     break;
9560    case 'T':  /* (*THEN) */
9561     /* [19:06] <TimToady> :: is then */
9562     if ( memEQs(start_verb,verb_len,"THEN") ) {
9563      op = CUTGROUP;
9564      RExC_seen |= REG_CUTGROUP_SEEN;
9565     }
9566     break;
9567    }
9568    if ( ! op ) {
9569     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9570     vFAIL2utf8f(
9571      "Unknown verb pattern '%"UTF8f"'",
9572      UTF8fARG(UTF, verb_len, start_verb));
9573    }
9574    if ( argok ) {
9575     if ( start_arg && internal_argval ) {
9576      vFAIL3("Verb pattern '%.*s' may not have an argument",
9577       verb_len, start_verb);
9578     } else if ( argok < 0 && !start_arg ) {
9579      vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9580       verb_len, start_verb);
9581     } else {
9582      ret = reganode(pRExC_state, op, internal_argval);
9583      if ( ! internal_argval && ! SIZE_ONLY ) {
9584       if (start_arg) {
9585        SV *sv = newSVpvn( start_arg,
9586            RExC_parse - start_arg);
9587        ARG(ret) = add_data( pRExC_state,
9588             STR_WITH_LEN("S"));
9589        RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9590        ret->flags = 0;
9591       } else {
9592        ret->flags = 1;
9593       }
9594      }
9595     }
9596     if (!internal_argval)
9597      RExC_seen |= REG_VERBARG_SEEN;
9598    } else if ( start_arg ) {
9599     vFAIL3("Verb pattern '%.*s' may not have an argument",
9600       verb_len, start_verb);
9601    } else {
9602     ret = reg_node(pRExC_state, op);
9603    }
9604    nextchar(pRExC_state);
9605    return ret;
9606   }
9607   else if (*RExC_parse == '?') { /* (?...) */
9608    bool is_logical = 0;
9609    const char * const seqstart = RExC_parse;
9610    if (has_intervening_patws) {
9611     RExC_parse++;
9612     vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9613    }
9614
9615    RExC_parse++;
9616    paren = *RExC_parse++;
9617    ret = NULL;   /* For look-ahead/behind. */
9618    switch (paren) {
9619
9620    case 'P': /* (?P...) variants for those used to PCRE/Python */
9621     paren = *RExC_parse++;
9622     if ( paren == '<')         /* (?P<...>) named capture */
9623      goto named_capture;
9624     else if (paren == '>') {   /* (?P>name) named recursion */
9625      goto named_recursion;
9626     }
9627     else if (paren == '=') {   /* (?P=...)  named backref */
9628      /* this pretty much dupes the code for \k<NAME> in
9629      * regatom(), if you change this make sure you change that
9630      * */
9631      char* name_start = RExC_parse;
9632      U32 num = 0;
9633      SV *sv_dat = reg_scan_name(pRExC_state,
9634       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9635      if (RExC_parse == name_start || *RExC_parse != ')')
9636       /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9637       vFAIL2("Sequence %.3s... not terminated",parse_start);
9638
9639      if (!SIZE_ONLY) {
9640       num = add_data( pRExC_state, STR_WITH_LEN("S"));
9641       RExC_rxi->data->data[num]=(void*)sv_dat;
9642       SvREFCNT_inc_simple_void(sv_dat);
9643      }
9644      RExC_sawback = 1;
9645      ret = reganode(pRExC_state,
9646         ((! FOLD)
9647          ? NREF
9648          : (ASCII_FOLD_RESTRICTED)
9649          ? NREFFA
9650          : (AT_LEAST_UNI_SEMANTICS)
9651           ? NREFFU
9652           : (LOC)
9653           ? NREFFL
9654           : NREFF),
9655          num);
9656      *flagp |= HASWIDTH;
9657
9658      Set_Node_Offset(ret, parse_start+1);
9659      Set_Node_Cur_Length(ret, parse_start);
9660
9661      nextchar(pRExC_state);
9662      return ret;
9663     }
9664     RExC_parse++;
9665     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9666     vFAIL3("Sequence (%.*s...) not recognized",
9667         RExC_parse-seqstart, seqstart);
9668     /*NOTREACHED*/
9669    case '<':           /* (?<...) */
9670     if (*RExC_parse == '!')
9671      paren = ',';
9672     else if (*RExC_parse != '=')
9673    named_capture:
9674     {               /* (?<...>) */
9675      char *name_start;
9676      SV *svname;
9677      paren= '>';
9678    case '\'':          /* (?'...') */
9679       name_start= RExC_parse;
9680       svname = reg_scan_name(pRExC_state,
9681       SIZE_ONLY    /* reverse test from the others */
9682       ? REG_RSN_RETURN_NAME
9683       : REG_RSN_RETURN_NULL);
9684      if (RExC_parse == name_start || *RExC_parse != paren)
9685       vFAIL2("Sequence (?%c... not terminated",
9686        paren=='>' ? '<' : paren);
9687      if (SIZE_ONLY) {
9688       HE *he_str;
9689       SV *sv_dat = NULL;
9690       if (!svname) /* shouldn't happen */
9691        Perl_croak(aTHX_
9692         "panic: reg_scan_name returned NULL");
9693       if (!RExC_paren_names) {
9694        RExC_paren_names= newHV();
9695        sv_2mortal(MUTABLE_SV(RExC_paren_names));
9696 #ifdef DEBUGGING
9697        RExC_paren_name_list= newAV();
9698        sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9699 #endif
9700       }
9701       he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9702       if ( he_str )
9703        sv_dat = HeVAL(he_str);
9704       if ( ! sv_dat ) {
9705        /* croak baby croak */
9706        Perl_croak(aTHX_
9707         "panic: paren_name hash element allocation failed");
9708       } else if ( SvPOK(sv_dat) ) {
9709        /* (?|...) can mean we have dupes so scan to check
9710        its already been stored. Maybe a flag indicating
9711        we are inside such a construct would be useful,
9712        but the arrays are likely to be quite small, so
9713        for now we punt -- dmq */
9714        IV count = SvIV(sv_dat);
9715        I32 *pv = (I32*)SvPVX(sv_dat);
9716        IV i;
9717        for ( i = 0 ; i < count ; i++ ) {
9718         if ( pv[i] == RExC_npar ) {
9719          count = 0;
9720          break;
9721         }
9722        }
9723        if ( count ) {
9724         pv = (I32*)SvGROW(sv_dat,
9725             SvCUR(sv_dat) + sizeof(I32)+1);
9726         SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9727         pv[count] = RExC_npar;
9728         SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9729        }
9730       } else {
9731        (void)SvUPGRADE(sv_dat,SVt_PVNV);
9732        sv_setpvn(sv_dat, (char *)&(RExC_npar),
9733                 sizeof(I32));
9734        SvIOK_on(sv_dat);
9735        SvIV_set(sv_dat, 1);
9736       }
9737 #ifdef DEBUGGING
9738       /* Yes this does cause a memory leak in debugging Perls
9739       * */
9740       if (!av_store(RExC_paren_name_list,
9741          RExC_npar, SvREFCNT_inc(svname)))
9742        SvREFCNT_dec_NN(svname);
9743 #endif
9744
9745       /*sv_dump(sv_dat);*/
9746      }
9747      nextchar(pRExC_state);
9748      paren = 1;
9749      goto capturing_parens;
9750     }
9751     RExC_seen |= REG_LOOKBEHIND_SEEN;
9752     RExC_in_lookbehind++;
9753     RExC_parse++;
9754     /* FALLTHROUGH */
9755    case '=':           /* (?=...) */
9756     RExC_seen_zerolen++;
9757     break;
9758    case '!':           /* (?!...) */
9759     RExC_seen_zerolen++;
9760     if (*RExC_parse == ')') {
9761      ret=reg_node(pRExC_state, OPFAIL);
9762      nextchar(pRExC_state);
9763      return ret;
9764     }
9765     break;
9766    case '|':           /* (?|...) */
9767     /* branch reset, behave like a (?:...) except that
9768     buffers in alternations share the same numbers */
9769     paren = ':';
9770     after_freeze = freeze_paren = RExC_npar;
9771     break;
9772    case ':':           /* (?:...) */
9773    case '>':           /* (?>...) */
9774     break;
9775    case '$':           /* (?$...) */
9776    case '@':           /* (?@...) */
9777     vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9778     break;
9779    case '0' :           /* (?0) */
9780    case 'R' :           /* (?R) */
9781     if (*RExC_parse != ')')
9782      FAIL("Sequence (?R) not terminated");
9783     ret = reg_node(pRExC_state, GOSTART);
9784      RExC_seen |= REG_GOSTART_SEEN;
9785     *flagp |= POSTPONED;
9786     nextchar(pRExC_state);
9787     return ret;
9788     /*notreached*/
9789    /* named and numeric backreferences */
9790    case '&':            /* (?&NAME) */
9791     parse_start = RExC_parse - 1;
9792    named_recursion:
9793     {
9794       SV *sv_dat = reg_scan_name(pRExC_state,
9795        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9796       num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9797     }
9798     if (RExC_parse == RExC_end || *RExC_parse != ')')
9799      vFAIL("Sequence (?&... not terminated");
9800     goto gen_recurse_regop;
9801     assert(0); /* NOT REACHED */
9802    case '+':
9803     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9804      RExC_parse++;
9805      vFAIL("Illegal pattern");
9806     }
9807     goto parse_recursion;
9808     /* NOT REACHED*/
9809    case '-': /* (?-1) */
9810     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9811      RExC_parse--; /* rewind to let it be handled later */
9812      goto parse_flags;
9813     }
9814     /* FALLTHROUGH */
9815    case '1': case '2': case '3': case '4': /* (?1) */
9816    case '5': case '6': case '7': case '8': case '9':
9817     RExC_parse--;
9818    parse_recursion:
9819     num = atoi(RExC_parse);
9820     parse_start = RExC_parse - 1; /* MJD */
9821     if (*RExC_parse == '-')
9822      RExC_parse++;
9823     while (isDIGIT(*RExC_parse))
9824       RExC_parse++;
9825     if (*RExC_parse!=')')
9826      vFAIL("Expecting close bracket");
9827
9828    gen_recurse_regop:
9829     if ( paren == '-' ) {
9830      /*
9831      Diagram of capture buffer numbering.
9832      Top line is the normal capture buffer numbers
9833      Bottom line is the negative indexing as from
9834      the X (the (?-2))
9835
9836      +   1 2    3 4 5 X          6 7
9837      /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9838      -   5 4    3 2 1 X          x x
9839
9840      */
9841      num = RExC_npar + num;
9842      if (num < 1)  {
9843       RExC_parse++;
9844       vFAIL("Reference to nonexistent group");
9845      }
9846     } else if ( paren == '+' ) {
9847      num = RExC_npar + num - 1;
9848     }
9849
9850     ret = reganode(pRExC_state, GOSUB, num);
9851     if (!SIZE_ONLY) {
9852      if (num > (I32)RExC_rx->nparens) {
9853       RExC_parse++;
9854       vFAIL("Reference to nonexistent group");
9855      }
9856      ARG2L_SET( ret, RExC_recurse_count++);
9857      RExC_emit++;
9858      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9859       "Recurse #%"UVuf" to %"IVdf"\n",
9860        (UV)ARG(ret), (IV)ARG2L(ret)));
9861     } else {
9862      RExC_size++;
9863      }
9864      RExC_seen |= REG_RECURSE_SEEN;
9865     Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9866     Set_Node_Offset(ret, parse_start); /* MJD */
9867
9868     *flagp |= POSTPONED;
9869     nextchar(pRExC_state);
9870     return ret;
9871
9872    assert(0); /* NOT REACHED */
9873
9874    case '?':           /* (??...) */
9875     is_logical = 1;
9876     if (*RExC_parse != '{') {
9877      RExC_parse++;
9878      /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9879      vFAIL2utf8f(
9880       "Sequence (%"UTF8f"...) not recognized",
9881       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9882      /*NOTREACHED*/
9883     }
9884     *flagp |= POSTPONED;
9885     paren = *RExC_parse++;
9886     /* FALLTHROUGH */
9887    case '{':           /* (?{...}) */
9888    {
9889     U32 n = 0;
9890     struct reg_code_block *cb;
9891
9892     RExC_seen_zerolen++;
9893
9894     if (   !pRExC_state->num_code_blocks
9895      || pRExC_state->code_index >= pRExC_state->num_code_blocks
9896      || pRExC_state->code_blocks[pRExC_state->code_index].start
9897       != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9898        - RExC_start)
9899     ) {
9900      if (RExC_pm_flags & PMf_USE_RE_EVAL)
9901       FAIL("panic: Sequence (?{...}): no code block found\n");
9902      FAIL("Eval-group not allowed at runtime, use re 'eval'");
9903     }
9904     /* this is a pre-compiled code block (?{...}) */
9905     cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9906     RExC_parse = RExC_start + cb->end;
9907     if (!SIZE_ONLY) {
9908      OP *o = cb->block;
9909      if (cb->src_regex) {
9910       n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9911       RExC_rxi->data->data[n] =
9912        (void*)SvREFCNT_inc((SV*)cb->src_regex);
9913       RExC_rxi->data->data[n+1] = (void*)o;
9914      }
9915      else {
9916       n = add_data(pRExC_state,
9917        (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9918       RExC_rxi->data->data[n] = (void*)o;
9919      }
9920     }
9921     pRExC_state->code_index++;
9922     nextchar(pRExC_state);
9923
9924     if (is_logical) {
9925      regnode *eval;
9926      ret = reg_node(pRExC_state, LOGICAL);
9927      eval = reganode(pRExC_state, EVAL, n);
9928      if (!SIZE_ONLY) {
9929       ret->flags = 2;
9930       /* for later propagation into (??{}) return value */
9931       eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9932      }
9933      REGTAIL(pRExC_state, ret, eval);
9934      /* deal with the length of this later - MJD */
9935      return ret;
9936     }
9937     ret = reganode(pRExC_state, EVAL, n);
9938     Set_Node_Length(ret, RExC_parse - parse_start + 1);
9939     Set_Node_Offset(ret, parse_start);
9940     return ret;
9941    }
9942    case '(':           /* (?(?{...})...) and (?(?=...)...) */
9943    {
9944     int is_define= 0;
9945     if (RExC_parse[0] == '?') {        /* (?(?...)) */
9946      if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9947       || RExC_parse[1] == '<'
9948       || RExC_parse[1] == '{') { /* Lookahead or eval. */
9949       I32 flag;
9950       regnode *tail;
9951
9952       ret = reg_node(pRExC_state, LOGICAL);
9953       if (!SIZE_ONLY)
9954        ret->flags = 1;
9955
9956       tail = reg(pRExC_state, 1, &flag, depth+1);
9957       if (flag & RESTART_UTF8) {
9958        *flagp = RESTART_UTF8;
9959        return NULL;
9960       }
9961       REGTAIL(pRExC_state, ret, tail);
9962       goto insert_if;
9963      }
9964     }
9965     else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9966       || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9967     {
9968      char ch = RExC_parse[0] == '<' ? '>' : '\'';
9969      char *name_start= RExC_parse++;
9970      U32 num = 0;
9971      SV *sv_dat=reg_scan_name(pRExC_state,
9972       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9973      if (RExC_parse == name_start || *RExC_parse != ch)
9974       vFAIL2("Sequence (?(%c... not terminated",
9975        (ch == '>' ? '<' : ch));
9976      RExC_parse++;
9977      if (!SIZE_ONLY) {
9978       num = add_data( pRExC_state, STR_WITH_LEN("S"));
9979       RExC_rxi->data->data[num]=(void*)sv_dat;
9980       SvREFCNT_inc_simple_void(sv_dat);
9981      }
9982      ret = reganode(pRExC_state,NGROUPP,num);
9983      goto insert_if_check_paren;
9984     }
9985     else if (RExC_parse[0] == 'D' &&
9986       RExC_parse[1] == 'E' &&
9987       RExC_parse[2] == 'F' &&
9988       RExC_parse[3] == 'I' &&
9989       RExC_parse[4] == 'N' &&
9990       RExC_parse[5] == 'E')
9991     {
9992      ret = reganode(pRExC_state,DEFINEP,0);
9993      RExC_parse +=6 ;
9994      is_define = 1;
9995      goto insert_if_check_paren;
9996     }
9997     else if (RExC_parse[0] == 'R') {
9998      RExC_parse++;
9999      parno = 0;
10000      if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10001       parno = atoi(RExC_parse++);
10002       while (isDIGIT(*RExC_parse))
10003        RExC_parse++;
10004      } else if (RExC_parse[0] == '&') {
10005       SV *sv_dat;
10006       RExC_parse++;
10007       sv_dat = reg_scan_name(pRExC_state,
10008        SIZE_ONLY
10009        ? REG_RSN_RETURN_NULL
10010        : REG_RSN_RETURN_DATA);
10011        parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10012      }
10013      ret = reganode(pRExC_state,INSUBP,parno);
10014      goto insert_if_check_paren;
10015     }
10016     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10017      /* (?(1)...) */
10018      char c;
10019      char *tmp;
10020      parno = atoi(RExC_parse++);
10021
10022      while (isDIGIT(*RExC_parse))
10023       RExC_parse++;
10024      ret = reganode(pRExC_state, GROUPP, parno);
10025
10026     insert_if_check_paren:
10027      if (*(tmp = nextchar(pRExC_state)) != ')') {
10028       /* nextchar also skips comments, so undo its work
10029       * and skip over the the next character.
10030       */
10031       RExC_parse = tmp;
10032       RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10033       vFAIL("Switch condition not recognized");
10034      }
10035     insert_if:
10036      REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10037      br = regbranch(pRExC_state, &flags, 1,depth+1);
10038      if (br == NULL) {
10039       if (flags & RESTART_UTF8) {
10040        *flagp = RESTART_UTF8;
10041        return NULL;
10042       }
10043       FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10044        (UV) flags);
10045      } else
10046       REGTAIL(pRExC_state, br, reganode(pRExC_state,
10047               LONGJMP, 0));
10048      c = *nextchar(pRExC_state);
10049      if (flags&HASWIDTH)
10050       *flagp |= HASWIDTH;
10051      if (c == '|') {
10052       if (is_define)
10053        vFAIL("(?(DEFINE)....) does not allow branches");
10054
10055       /* Fake one for optimizer.  */
10056       lastbr = reganode(pRExC_state, IFTHEN, 0);
10057
10058       if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10059        if (flags & RESTART_UTF8) {
10060         *flagp = RESTART_UTF8;
10061         return NULL;
10062        }
10063        FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10064         (UV) flags);
10065       }
10066       REGTAIL(pRExC_state, ret, lastbr);
10067       if (flags&HASWIDTH)
10068        *flagp |= HASWIDTH;
10069       c = *nextchar(pRExC_state);
10070      }
10071      else
10072       lastbr = NULL;
10073      if (c != ')')
10074       vFAIL("Switch (?(condition)... contains too many branches");
10075      ender = reg_node(pRExC_state, TAIL);
10076      REGTAIL(pRExC_state, br, ender);
10077      if (lastbr) {
10078       REGTAIL(pRExC_state, lastbr, ender);
10079       REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10080      }
10081      else
10082       REGTAIL(pRExC_state, ret, ender);
10083      RExC_size++; /* XXX WHY do we need this?!!
10084          For large programs it seems to be required
10085          but I can't figure out why. -- dmq*/
10086      return ret;
10087     }
10088     else {
10089      RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10090      vFAIL("Unknown switch condition (?(...))");
10091     }
10092    }
10093    case '[':           /* (?[ ... ]) */
10094     return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10095           oregcomp_parse);
10096    case 0:
10097     RExC_parse--; /* for vFAIL to print correctly */
10098     vFAIL("Sequence (? incomplete");
10099     break;
10100    default: /* e.g., (?i) */
10101     --RExC_parse;
10102    parse_flags:
10103     parse_lparen_question_flags(pRExC_state);
10104     if (UCHARAT(RExC_parse) != ':') {
10105      nextchar(pRExC_state);
10106      *flagp = TRYAGAIN;
10107      return NULL;
10108     }
10109     paren = ':';
10110     nextchar(pRExC_state);
10111     ret = NULL;
10112     goto parse_rest;
10113    } /* end switch */
10114   }
10115   else {                  /* (...) */
10116   capturing_parens:
10117    parno = RExC_npar;
10118    RExC_npar++;
10119
10120    ret = reganode(pRExC_state, OPEN, parno);
10121    if (!SIZE_ONLY ){
10122     if (!RExC_nestroot)
10123      RExC_nestroot = parno;
10124     if (RExC_seen & REG_RECURSE_SEEN
10125      && !RExC_open_parens[parno-1])
10126     {
10127      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10128       "Setting open paren #%"IVdf" to %d\n",
10129       (IV)parno, REG_NODE_NUM(ret)));
10130      RExC_open_parens[parno-1]= ret;
10131     }
10132    }
10133    Set_Node_Length(ret, 1); /* MJD */
10134    Set_Node_Offset(ret, RExC_parse); /* MJD */
10135    is_open = 1;
10136   }
10137  }
10138  else                        /* ! paren */
10139   ret = NULL;
10140
10141    parse_rest:
10142  /* Pick up the branches, linking them together. */
10143  parse_start = RExC_parse;   /* MJD */
10144  br = regbranch(pRExC_state, &flags, 1,depth+1);
10145
10146  /*     branch_len = (paren != 0); */
10147
10148  if (br == NULL) {
10149   if (flags & RESTART_UTF8) {
10150    *flagp = RESTART_UTF8;
10151    return NULL;
10152   }
10153   FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10154  }
10155  if (*RExC_parse == '|') {
10156   if (!SIZE_ONLY && RExC_extralen) {
10157    reginsert(pRExC_state, BRANCHJ, br, depth+1);
10158   }
10159   else {                  /* MJD */
10160    reginsert(pRExC_state, BRANCH, br, depth+1);
10161    Set_Node_Length(br, paren != 0);
10162    Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10163   }
10164   have_branch = 1;
10165   if (SIZE_ONLY)
10166    RExC_extralen += 1;  /* For BRANCHJ-BRANCH. */
10167  }
10168  else if (paren == ':') {
10169   *flagp |= flags&SIMPLE;
10170  }
10171  if (is_open) {    /* Starts with OPEN. */
10172   REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10173  }
10174  else if (paren != '?')  /* Not Conditional */
10175   ret = br;
10176  *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10177  lastbr = br;
10178  while (*RExC_parse == '|') {
10179   if (!SIZE_ONLY && RExC_extralen) {
10180    ender = reganode(pRExC_state, LONGJMP,0);
10181
10182    /* Append to the previous. */
10183    REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10184   }
10185   if (SIZE_ONLY)
10186    RExC_extralen += 2;  /* Account for LONGJMP. */
10187   nextchar(pRExC_state);
10188   if (freeze_paren) {
10189    if (RExC_npar > after_freeze)
10190     after_freeze = RExC_npar;
10191    RExC_npar = freeze_paren;
10192   }
10193   br = regbranch(pRExC_state, &flags, 0, depth+1);
10194
10195   if (br == NULL) {
10196    if (flags & RESTART_UTF8) {
10197     *flagp = RESTART_UTF8;
10198     return NULL;
10199    }
10200    FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10201   }
10202   REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10203   lastbr = br;
10204   *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10205  }
10206
10207  if (have_branch || paren != ':') {
10208   /* Make a closing node, and hook it on the end. */
10209   switch (paren) {
10210   case ':':
10211    ender = reg_node(pRExC_state, TAIL);
10212    break;
10213   case 1: case 2:
10214    ender = reganode(pRExC_state, CLOSE, parno);
10215    if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10216     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10217       "Setting close paren #%"IVdf" to %d\n",
10218       (IV)parno, REG_NODE_NUM(ender)));
10219     RExC_close_parens[parno-1]= ender;
10220     if (RExC_nestroot == parno)
10221      RExC_nestroot = 0;
10222    }
10223    Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10224    Set_Node_Length(ender,1); /* MJD */
10225    break;
10226   case '<':
10227   case ',':
10228   case '=':
10229   case '!':
10230    *flagp &= ~HASWIDTH;
10231    /* FALLTHROUGH */
10232   case '>':
10233    ender = reg_node(pRExC_state, SUCCEED);
10234    break;
10235   case 0:
10236    ender = reg_node(pRExC_state, END);
10237    if (!SIZE_ONLY) {
10238     assert(!RExC_opend); /* there can only be one! */
10239     RExC_opend = ender;
10240    }
10241    break;
10242   }
10243   DEBUG_PARSE_r(if (!SIZE_ONLY) {
10244    SV * const mysv_val1=sv_newmortal();
10245    SV * const mysv_val2=sv_newmortal();
10246    DEBUG_PARSE_MSG("lsbr");
10247    regprop(RExC_rx, mysv_val1, lastbr, NULL);
10248    regprop(RExC_rx, mysv_val2, ender, NULL);
10249    PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10250       SvPV_nolen_const(mysv_val1),
10251       (IV)REG_NODE_NUM(lastbr),
10252       SvPV_nolen_const(mysv_val2),
10253       (IV)REG_NODE_NUM(ender),
10254       (IV)(ender - lastbr)
10255    );
10256   });
10257   REGTAIL(pRExC_state, lastbr, ender);
10258
10259   if (have_branch && !SIZE_ONLY) {
10260    char is_nothing= 1;
10261    if (depth==1)
10262     RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10263
10264    /* Hook the tails of the branches to the closing node. */
10265    for (br = ret; br; br = regnext(br)) {
10266     const U8 op = PL_regkind[OP(br)];
10267     if (op == BRANCH) {
10268      REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10269      if ( OP(NEXTOPER(br)) != NOTHING
10270       || regnext(NEXTOPER(br)) != ender)
10271       is_nothing= 0;
10272     }
10273     else if (op == BRANCHJ) {
10274      REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10275      /* for now we always disable this optimisation * /
10276      if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10277       || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10278      */
10279       is_nothing= 0;
10280     }
10281    }
10282    if (is_nothing) {
10283     br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10284     DEBUG_PARSE_r(if (!SIZE_ONLY) {
10285      SV * const mysv_val1=sv_newmortal();
10286      SV * const mysv_val2=sv_newmortal();
10287      DEBUG_PARSE_MSG("NADA");
10288      regprop(RExC_rx, mysv_val1, ret, NULL);
10289      regprop(RExC_rx, mysv_val2, ender, NULL);
10290      PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10291         SvPV_nolen_const(mysv_val1),
10292         (IV)REG_NODE_NUM(ret),
10293         SvPV_nolen_const(mysv_val2),
10294         (IV)REG_NODE_NUM(ender),
10295         (IV)(ender - ret)
10296      );
10297     });
10298     OP(br)= NOTHING;
10299     if (OP(ender) == TAIL) {
10300      NEXT_OFF(br)= 0;
10301      RExC_emit= br + 1;
10302     } else {
10303      regnode *opt;
10304      for ( opt= br + 1; opt < ender ; opt++ )
10305       OP(opt)= OPTIMIZED;
10306      NEXT_OFF(br)= ender - br;
10307     }
10308    }
10309   }
10310  }
10311
10312  {
10313   const char *p;
10314   static const char parens[] = "=!<,>";
10315
10316   if (paren && (p = strchr(parens, paren))) {
10317    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10318    int flag = (p - parens) > 1;
10319
10320    if (paren == '>')
10321     node = SUSPEND, flag = 0;
10322    reginsert(pRExC_state, node,ret, depth+1);
10323    Set_Node_Cur_Length(ret, parse_start);
10324    Set_Node_Offset(ret, parse_start + 1);
10325    ret->flags = flag;
10326    REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10327   }
10328  }
10329
10330  /* Check for proper termination. */
10331  if (paren) {
10332   /* restore original flags, but keep (?p) */
10333   RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10334   if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10335    RExC_parse = oregcomp_parse;
10336    vFAIL("Unmatched (");
10337   }
10338  }
10339  else if (!paren && RExC_parse < RExC_end) {
10340   if (*RExC_parse == ')') {
10341    RExC_parse++;
10342    vFAIL("Unmatched )");
10343   }
10344   else
10345    FAIL("Junk on end of regexp"); /* "Can't happen". */
10346   assert(0); /* NOTREACHED */
10347  }
10348
10349  if (RExC_in_lookbehind) {
10350   RExC_in_lookbehind--;
10351  }
10352  if (after_freeze > RExC_npar)
10353   RExC_npar = after_freeze;
10354  return(ret);
10355 }
10356
10357 /*
10358  - regbranch - one alternative of an | operator
10359  *
10360  * Implements the concatenation operator.
10361  *
10362  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10363  * restarted.
10364  */
10365 STATIC regnode *
10366 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10367 {
10368  regnode *ret;
10369  regnode *chain = NULL;
10370  regnode *latest;
10371  I32 flags = 0, c = 0;
10372  GET_RE_DEBUG_FLAGS_DECL;
10373
10374  PERL_ARGS_ASSERT_REGBRANCH;
10375
10376  DEBUG_PARSE("brnc");
10377
10378  if (first)
10379   ret = NULL;
10380  else {
10381   if (!SIZE_ONLY && RExC_extralen)
10382    ret = reganode(pRExC_state, BRANCHJ,0);
10383   else {
10384    ret = reg_node(pRExC_state, BRANCH);
10385    Set_Node_Length(ret, 1);
10386   }
10387  }
10388
10389  if (!first && SIZE_ONLY)
10390   RExC_extralen += 1;   /* BRANCHJ */
10391
10392  *flagp = WORST;   /* Tentatively. */
10393
10394  RExC_parse--;
10395  nextchar(pRExC_state);
10396  while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10397   flags &= ~TRYAGAIN;
10398   latest = regpiece(pRExC_state, &flags,depth+1);
10399   if (latest == NULL) {
10400    if (flags & TRYAGAIN)
10401     continue;
10402    if (flags & RESTART_UTF8) {
10403     *flagp = RESTART_UTF8;
10404     return NULL;
10405    }
10406    FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10407   }
10408   else if (ret == NULL)
10409    ret = latest;
10410   *flagp |= flags&(HASWIDTH|POSTPONED);
10411   if (chain == NULL)  /* First piece. */
10412    *flagp |= flags&SPSTART;
10413   else {
10414    RExC_naughty++;
10415    REGTAIL(pRExC_state, chain, latest);
10416   }
10417   chain = latest;
10418   c++;
10419  }
10420  if (chain == NULL) { /* Loop ran zero times. */
10421   chain = reg_node(pRExC_state, NOTHING);
10422   if (ret == NULL)
10423    ret = chain;
10424  }
10425  if (c == 1) {
10426   *flagp |= flags&SIMPLE;
10427  }
10428
10429  return ret;
10430 }
10431
10432 /*
10433  - regpiece - something followed by possible [*+?]
10434  *
10435  * Note that the branching code sequences used for ? and the general cases
10436  * of * and + are somewhat optimized:  they use the same NOTHING node as
10437  * both the endmarker for their branch list and the body of the last branch.
10438  * It might seem that this node could be dispensed with entirely, but the
10439  * endmarker role is not redundant.
10440  *
10441  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10442  * TRYAGAIN.
10443  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10444  * restarted.
10445  */
10446 STATIC regnode *
10447 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10448 {
10449  regnode *ret;
10450  char op;
10451  char *next;
10452  I32 flags;
10453  const char * const origparse = RExC_parse;
10454  I32 min;
10455  I32 max = REG_INFTY;
10456 #ifdef RE_TRACK_PATTERN_OFFSETS
10457  char *parse_start;
10458 #endif
10459  const char *maxpos = NULL;
10460
10461  /* Save the original in case we change the emitted regop to a FAIL. */
10462  regnode * const orig_emit = RExC_emit;
10463
10464  GET_RE_DEBUG_FLAGS_DECL;
10465
10466  PERL_ARGS_ASSERT_REGPIECE;
10467
10468  DEBUG_PARSE("piec");
10469
10470  ret = regatom(pRExC_state, &flags,depth+1);
10471  if (ret == NULL) {
10472   if (flags & (TRYAGAIN|RESTART_UTF8))
10473    *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10474   else
10475    FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10476   return(NULL);
10477  }
10478
10479  op = *RExC_parse;
10480
10481  if (op == '{' && regcurly(RExC_parse)) {
10482   maxpos = NULL;
10483 #ifdef RE_TRACK_PATTERN_OFFSETS
10484   parse_start = RExC_parse; /* MJD */
10485 #endif
10486   next = RExC_parse + 1;
10487   while (isDIGIT(*next) || *next == ',') {
10488    if (*next == ',') {
10489     if (maxpos)
10490      break;
10491     else
10492      maxpos = next;
10493    }
10494    next++;
10495   }
10496   if (*next == '}') {  /* got one */
10497    if (!maxpos)
10498     maxpos = next;
10499    RExC_parse++;
10500    min = atoi(RExC_parse);
10501    if (*maxpos == ',')
10502     maxpos++;
10503    else
10504     maxpos = RExC_parse;
10505    max = atoi(maxpos);
10506    if (!max && *maxpos != '0')
10507     max = REG_INFTY;  /* meaning "infinity" */
10508    else if (max >= REG_INFTY)
10509     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10510    RExC_parse = next;
10511    nextchar(pRExC_state);
10512    if (max < min) {    /* If can't match, warn and optimize to fail
10513         unconditionally */
10514     if (SIZE_ONLY) {
10515      ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10516
10517      /* We can't back off the size because we have to reserve
10518      * enough space for all the things we are about to throw
10519      * away, but we can shrink it by the ammount we are about
10520      * to re-use here */
10521      RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10522     }
10523     else {
10524      RExC_emit = orig_emit;
10525     }
10526     ret = reg_node(pRExC_state, OPFAIL);
10527     return ret;
10528    }
10529    else if (min == max
10530      && RExC_parse < RExC_end
10531      && (*RExC_parse == '?' || *RExC_parse == '+'))
10532    {
10533     if (SIZE_ONLY) {
10534      ckWARN2reg(RExC_parse + 1,
10535        "Useless use of greediness modifier '%c'",
10536        *RExC_parse);
10537     }
10538     /* Absorb the modifier, so later code doesn't see nor use
10539      * it */
10540     nextchar(pRExC_state);
10541    }
10542
10543   do_curly:
10544    if ((flags&SIMPLE)) {
10545     RExC_naughty += 2 + RExC_naughty / 2;
10546     reginsert(pRExC_state, CURLY, ret, depth+1);
10547     Set_Node_Offset(ret, parse_start+1); /* MJD */
10548     Set_Node_Cur_Length(ret, parse_start);
10549    }
10550    else {
10551     regnode * const w = reg_node(pRExC_state, WHILEM);
10552
10553     w->flags = 0;
10554     REGTAIL(pRExC_state, ret, w);
10555     if (!SIZE_ONLY && RExC_extralen) {
10556      reginsert(pRExC_state, LONGJMP,ret, depth+1);
10557      reginsert(pRExC_state, NOTHING,ret, depth+1);
10558      NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10559     }
10560     reginsert(pRExC_state, CURLYX,ret, depth+1);
10561         /* MJD hk */
10562     Set_Node_Offset(ret, parse_start+1);
10563     Set_Node_Length(ret,
10564         op == '{' ? (RExC_parse - parse_start) : 1);
10565
10566     if (!SIZE_ONLY && RExC_extralen)
10567      NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10568     REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10569     if (SIZE_ONLY)
10570      RExC_whilem_seen++, RExC_extralen += 3;
10571     RExC_naughty += 4 + RExC_naughty; /* compound interest */
10572    }
10573    ret->flags = 0;
10574
10575    if (min > 0)
10576     *flagp = WORST;
10577    if (max > 0)
10578     *flagp |= HASWIDTH;
10579    if (!SIZE_ONLY) {
10580     ARG1_SET(ret, (U16)min);
10581     ARG2_SET(ret, (U16)max);
10582    }
10583    if (max == REG_INFTY)
10584     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10585
10586    goto nest_check;
10587   }
10588  }
10589
10590  if (!ISMULT1(op)) {
10591   *flagp = flags;
10592   return(ret);
10593  }
10594
10595 #if 0    /* Now runtime fix should be reliable. */
10596
10597  /* if this is reinstated, don't forget to put this back into perldiag:
10598
10599    =item Regexp *+ operand could be empty at {#} in regex m/%s/
10600
10601   (F) The part of the regexp subject to either the * or + quantifier
10602   could match an empty string. The {#} shows in the regular
10603   expression about where the problem was discovered.
10604
10605  */
10606
10607  if (!(flags&HASWIDTH) && op != '?')
10608  vFAIL("Regexp *+ operand could be empty");
10609 #endif
10610
10611 #ifdef RE_TRACK_PATTERN_OFFSETS
10612  parse_start = RExC_parse;
10613 #endif
10614  nextchar(pRExC_state);
10615
10616  *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10617
10618  if (op == '*' && (flags&SIMPLE)) {
10619   reginsert(pRExC_state, STAR, ret, depth+1);
10620   ret->flags = 0;
10621   RExC_naughty += 4;
10622   RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10623  }
10624  else if (op == '*') {
10625   min = 0;
10626   goto do_curly;
10627  }
10628  else if (op == '+' && (flags&SIMPLE)) {
10629   reginsert(pRExC_state, PLUS, ret, depth+1);
10630   ret->flags = 0;
10631   RExC_naughty += 3;
10632   RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10633  }
10634  else if (op == '+') {
10635   min = 1;
10636   goto do_curly;
10637  }
10638  else if (op == '?') {
10639   min = 0; max = 1;
10640   goto do_curly;
10641  }
10642   nest_check:
10643  if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10644   SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10645   ckWARN2reg(RExC_parse,
10646     "%"UTF8f" matches null string many times",
10647     UTF8fARG(UTF, (RExC_parse >= origparse
10648         ? RExC_parse - origparse
10649         : 0),
10650     origparse));
10651   (void)ReREFCNT_inc(RExC_rx_sv);
10652  }
10653
10654  if (RExC_parse < RExC_end && *RExC_parse == '?') {
10655   nextchar(pRExC_state);
10656   reginsert(pRExC_state, MINMOD, ret, depth+1);
10657   REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10658  }
10659  else
10660  if (RExC_parse < RExC_end && *RExC_parse == '+') {
10661   regnode *ender;
10662   nextchar(pRExC_state);
10663   ender = reg_node(pRExC_state, SUCCEED);
10664   REGTAIL(pRExC_state, ret, ender);
10665   reginsert(pRExC_state, SUSPEND, ret, depth+1);
10666   ret->flags = 0;
10667   ender = reg_node(pRExC_state, TAIL);
10668   REGTAIL(pRExC_state, ret, ender);
10669  }
10670
10671  if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10672   RExC_parse++;
10673   vFAIL("Nested quantifiers");
10674  }
10675
10676  return(ret);
10677 }
10678
10679 STATIC bool
10680 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10681      UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10682      const bool strict   /* Apply stricter parsing rules? */
10683  )
10684 {
10685
10686  /* This is expected to be called by a parser routine that has recognized '\N'
10687    and needs to handle the rest. RExC_parse is expected to point at the first
10688    char following the N at the time of the call.  On successful return,
10689    RExC_parse has been updated to point to just after the sequence identified
10690    by this routine, and <*flagp> has been updated.
10691
10692    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10693    character class.
10694
10695    \N may begin either a named sequence, or if outside a character class, mean
10696    to match a non-newline.  For non single-quoted regexes, the tokenizer has
10697    attempted to decide which, and in the case of a named sequence, converted it
10698    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10699    where c1... are the characters in the sequence.  For single-quoted regexes,
10700    the tokenizer passes the \N sequence through unchanged; this code will not
10701    attempt to determine this nor expand those, instead raising a syntax error.
10702    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10703    or there is no '}', it signals that this \N occurrence means to match a
10704    non-newline.
10705
10706    Only the \N{U+...} form should occur in a character class, for the same
10707    reason that '.' inside a character class means to just match a period: it
10708    just doesn't make sense.
10709
10710    The function raises an error (via vFAIL), and doesn't return for various
10711    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
10712    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10713    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10714    only possible if node_p is non-NULL.
10715
10716
10717    If <valuep> is non-null, it means the caller can accept an input sequence
10718    consisting of a just a single code point; <*valuep> is set to that value
10719    if the input is such.
10720
10721    If <node_p> is non-null it signifies that the caller can accept any other
10722    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
10723    is set as follows:
10724  1) \N means not-a-NL: points to a newly created REG_ANY node;
10725  2) \N{}:              points to a new NOTHING node;
10726  3) otherwise:         points to a new EXACT node containing the resolved
10727       string.
10728    Note that FALSE is returned for single code point sequences if <valuep> is
10729    null.
10730  */
10731
10732  char * endbrace;    /* '}' following the name */
10733  char* p;
10734  char *endchar; /* Points to '.' or '}' ending cur char in the input
10735       stream */
10736  bool has_multiple_chars; /* true if the input stream contains a sequence of
10737         more than one character */
10738
10739  GET_RE_DEBUG_FLAGS_DECL;
10740
10741  PERL_ARGS_ASSERT_GROK_BSLASH_N;
10742
10743  GET_RE_DEBUG_FLAGS;
10744
10745  assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
10746
10747  /* The [^\n] meaning of \N ignores spaces and comments under the /x
10748  * modifier.  The other meaning does not, so use a temporary until we find
10749  * out which we are being called with */
10750  p = (RExC_flags & RXf_PMf_EXTENDED)
10751   ? regpatws(pRExC_state, RExC_parse,
10752         TRUE) /* means recognize comments */
10753   : RExC_parse;
10754
10755  /* Disambiguate between \N meaning a named character versus \N meaning
10756  * [^\n].  The former is assumed when it can't be the latter. */
10757  if (*p != '{' || regcurly(p)) {
10758   RExC_parse = p;
10759   if (! node_p) {
10760    /* no bare \N allowed in a charclass */
10761    if (in_char_class) {
10762     vFAIL("\\N in a character class must be a named character: \\N{...}");
10763    }
10764    return FALSE;
10765   }
10766   RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
10767       current char */
10768   nextchar(pRExC_state);
10769   *node_p = reg_node(pRExC_state, REG_ANY);
10770   *flagp |= HASWIDTH|SIMPLE;
10771   RExC_naughty++;
10772   Set_Node_Length(*node_p, 1); /* MJD */
10773   return TRUE;
10774  }
10775
10776  /* Here, we have decided it should be a named character or sequence */
10777
10778  /* The test above made sure that the next real character is a '{', but
10779  * under the /x modifier, it could be separated by space (or a comment and
10780  * \n) and this is not allowed (for consistency with \x{...} and the
10781  * tokenizer handling of \N{NAME}). */
10782  if (*RExC_parse != '{') {
10783   vFAIL("Missing braces on \\N{}");
10784  }
10785
10786  RExC_parse++; /* Skip past the '{' */
10787
10788  if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10789   || ! (endbrace == RExC_parse  /* nothing between the {} */
10790    || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below
10791             */
10792     && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10793              */
10794  {
10795   if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
10796   vFAIL("\\N{NAME} must be resolved by the lexer");
10797  }
10798
10799  if (endbrace == RExC_parse) {   /* empty: \N{} */
10800   bool ret = TRUE;
10801   if (node_p) {
10802    *node_p = reg_node(pRExC_state,NOTHING);
10803   }
10804   else if (in_char_class) {
10805    if (SIZE_ONLY && in_char_class) {
10806     if (strict) {
10807      RExC_parse++;   /* Position after the "}" */
10808      vFAIL("Zero length \\N{}");
10809     }
10810     else {
10811      ckWARNreg(RExC_parse,
10812        "Ignoring zero length \\N{} in character class");
10813     }
10814    }
10815    ret = FALSE;
10816   }
10817   else {
10818    return FALSE;
10819   }
10820   nextchar(pRExC_state);
10821   return ret;
10822  }
10823
10824  RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10825  RExC_parse += 2; /* Skip past the 'U+' */
10826
10827  endchar = RExC_parse + strcspn(RExC_parse, ".}");
10828
10829  /* Code points are separated by dots.  If none, there is only one code
10830  * point, and is terminated by the brace */
10831  has_multiple_chars = (endchar < endbrace);
10832
10833  if (valuep && (! has_multiple_chars || in_char_class)) {
10834   /* We only pay attention to the first char of
10835   multichar strings being returned in char classes. I kinda wonder
10836   if this makes sense as it does change the behaviour
10837   from earlier versions, OTOH that behaviour was broken
10838   as well. XXX Solution is to recharacterize as
10839   [rest-of-class]|multi1|multi2... */
10840
10841   STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10842   I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10843    | PERL_SCAN_DISALLOW_PREFIX
10844    | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10845
10846   *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10847
10848   /* The tokenizer should have guaranteed validity, but it's possible to
10849   * bypass it by using single quoting, so check */
10850   if (length_of_hex == 0
10851    || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10852   {
10853    RExC_parse += length_of_hex; /* Includes all the valid */
10854    RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
10855        ? UTF8SKIP(RExC_parse)
10856        : 1;
10857    /* Guard against malformed utf8 */
10858    if (RExC_parse >= endchar) {
10859     RExC_parse = endchar;
10860    }
10861    vFAIL("Invalid hexadecimal number in \\N{U+...}");
10862   }
10863
10864   if (in_char_class && has_multiple_chars) {
10865    if (strict) {
10866     RExC_parse = endbrace;
10867     vFAIL("\\N{} in character class restricted to one character");
10868    }
10869    else {
10870     ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10871    }
10872   }
10873
10874   RExC_parse = endbrace + 1;
10875  }
10876  else if (! node_p || ! has_multiple_chars) {
10877
10878   /* Here, the input is legal, but not according to the caller's
10879   * options.  We fail without advancing the parse, so that the
10880   * caller can try again */
10881   RExC_parse = p;
10882   return FALSE;
10883  }
10884  else {
10885
10886   /* What is done here is to convert this to a sub-pattern of the form
10887   * (?:\x{char1}\x{char2}...)
10888   * and then call reg recursively.  That way, it retains its atomicness,
10889   * while not having to worry about special handling that some code
10890   * points may have.  toke.c has converted the original Unicode values
10891   * to native, so that we can just pass on the hex values unchanged.  We
10892   * do have to set a flag to keep recoding from happening in the
10893   * recursion */
10894
10895   SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10896   STRLEN len;
10897   char *orig_end = RExC_end;
10898   I32 flags;
10899
10900   while (RExC_parse < endbrace) {
10901
10902    /* Convert to notation the rest of the code understands */
10903    sv_catpv(substitute_parse, "\\x{");
10904    sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10905    sv_catpv(substitute_parse, "}");
10906
10907    /* Point to the beginning of the next character in the sequence. */
10908    RExC_parse = endchar + 1;
10909    endchar = RExC_parse + strcspn(RExC_parse, ".}");
10910   }
10911   sv_catpv(substitute_parse, ")");
10912
10913   RExC_parse = SvPV(substitute_parse, len);
10914
10915   /* Don't allow empty number */
10916   if (len < 8) {
10917    vFAIL("Invalid hexadecimal number in \\N{U+...}");
10918   }
10919   RExC_end = RExC_parse + len;
10920
10921   /* The values are Unicode, and therefore not subject to recoding */
10922   RExC_override_recoding = 1;
10923
10924   if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10925    if (flags & RESTART_UTF8) {
10926     *flagp = RESTART_UTF8;
10927     return FALSE;
10928    }
10929    FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10930     (UV) flags);
10931   }
10932   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10933
10934   RExC_parse = endbrace;
10935   RExC_end = orig_end;
10936   RExC_override_recoding = 0;
10937
10938   nextchar(pRExC_state);
10939  }
10940
10941  return TRUE;
10942 }
10943
10944
10945 /*
10946  * reg_recode
10947  *
10948  * It returns the code point in utf8 for the value in *encp.
10949  *    value: a code value in the source encoding
10950  *    encp:  a pointer to an Encode object
10951  *
10952  * If the result from Encode is not a single character,
10953  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10954  */
10955 STATIC UV
10956 S_reg_recode(pTHX_ const char value, SV **encp)
10957 {
10958  STRLEN numlen = 1;
10959  SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10960  const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10961  const STRLEN newlen = SvCUR(sv);
10962  UV uv = UNICODE_REPLACEMENT;
10963
10964  PERL_ARGS_ASSERT_REG_RECODE;
10965
10966  if (newlen)
10967   uv = SvUTF8(sv)
10968    ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10969    : *(U8*)s;
10970
10971  if (!newlen || numlen != newlen) {
10972   uv = UNICODE_REPLACEMENT;
10973   *encp = NULL;
10974  }
10975  return uv;
10976 }
10977
10978 PERL_STATIC_INLINE U8
10979 S_compute_EXACTish(RExC_state_t *pRExC_state)
10980 {
10981  U8 op;
10982
10983  PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10984
10985  if (! FOLD) {
10986   return EXACT;
10987  }
10988
10989  op = get_regex_charset(RExC_flags);
10990  if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10991   op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10992     been, so there is no hole */
10993  }
10994
10995  return op + EXACTF;
10996 }
10997
10998 PERL_STATIC_INLINE void
10999 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11000       regnode *node, I32* flagp, STRLEN len, UV code_point,
11001       bool downgradable)
11002 {
11003  /* This knows the details about sizing an EXACTish node, setting flags for
11004  * it (by setting <*flagp>, and potentially populating it with a single
11005  * character.
11006  *
11007  * If <len> (the length in bytes) is non-zero, this function assumes that
11008  * the node has already been populated, and just does the sizing.  In this
11009  * case <code_point> should be the final code point that has already been
11010  * placed into the node.  This value will be ignored except that under some
11011  * circumstances <*flagp> is set based on it.
11012  *
11013  * If <len> is zero, the function assumes that the node is to contain only
11014  * the single character given by <code_point> and calculates what <len>
11015  * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
11016  * additionally will populate the node's STRING with <code_point> or its
11017  * fold if folding.
11018  *
11019  * In both cases <*flagp> is appropriately set
11020  *
11021  * It knows that under FOLD, the Latin Sharp S and UTF characters above
11022  * 255, must be folded (the former only when the rules indicate it can
11023  * match 'ss')
11024  *
11025  * When it does the populating, it looks at the flag 'downgradable'.  If
11026  * true with a node that folds, it checks if the single code point
11027  * participates in a fold, and if not downgrades the node to an EXACT.
11028  * This helps the optimizer */
11029
11030  bool len_passed_in = cBOOL(len != 0);
11031  U8 character[UTF8_MAXBYTES_CASE+1];
11032
11033  PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11034
11035  /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11036  * sizing difference, and is extra work that is thrown away */
11037  if (downgradable && ! PASS2) {
11038   downgradable = FALSE;
11039  }
11040
11041  if (! len_passed_in) {
11042   if (UTF) {
11043    if (UNI_IS_INVARIANT(code_point)) {
11044     if (LOC || ! FOLD) {    /* /l defers folding until runtime */
11045      *character = (U8) code_point;
11046     }
11047     else { /* Here is /i and not /l (toFOLD() is defined on just
11048       ASCII, which isn't the same thing as INVARIANT on
11049       EBCDIC, but it works there, as the extra invariants
11050       fold to themselves) */
11051      *character = toFOLD((U8) code_point);
11052      if (downgradable
11053       && *character == code_point
11054       && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point))
11055      {
11056       OP(node) = EXACT;
11057      }
11058     }
11059     len = 1;
11060    }
11061    else if (FOLD && (! LOC
11062        || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11063    {   /* Folding, and ok to do so now */
11064     UV folded = _to_uni_fold_flags(
11065         code_point,
11066         character,
11067         &len,
11068         FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11069              ? FOLD_FLAGS_NOMIX_ASCII
11070              : 0));
11071     if (downgradable
11072      && folded == code_point
11073      && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11074     {
11075      OP(node) = EXACT;
11076     }
11077    }
11078    else if (code_point <= MAX_UTF8_TWO_BYTE) {
11079
11080     /* Not folding this cp, and can output it directly */
11081     *character = UTF8_TWO_BYTE_HI(code_point);
11082     *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11083     len = 2;
11084    }
11085    else {
11086     uvchr_to_utf8( character, code_point);
11087     len = UTF8SKIP(character);
11088    }
11089   } /* Else pattern isn't UTF8.  */
11090   else if (! FOLD) {
11091    *character = (U8) code_point;
11092    len = 1;
11093   } /* Else is folded non-UTF8 */
11094   else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11095
11096    /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11097    * comments at join_exact()); */
11098    *character = (U8) code_point;
11099    len = 1;
11100
11101    /* Can turn into an EXACT node if we know the fold at compile time,
11102    * and it folds to itself and doesn't particpate in other folds */
11103    if (downgradable
11104     && ! LOC
11105     && PL_fold_latin1[code_point] == code_point
11106     && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11107      || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11108    {
11109     OP(node) = EXACT;
11110    }
11111   } /* else is Sharp s.  May need to fold it */
11112   else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11113    *character = 's';
11114    *(character + 1) = 's';
11115    len = 2;
11116   }
11117   else {
11118    *character = LATIN_SMALL_LETTER_SHARP_S;
11119    len = 1;
11120   }
11121  }
11122
11123  if (SIZE_ONLY) {
11124   RExC_size += STR_SZ(len);
11125  }
11126  else {
11127   RExC_emit += STR_SZ(len);
11128   STR_LEN(node) = len;
11129   if (! len_passed_in) {
11130    Copy((char *) character, STRING(node), len, char);
11131   }
11132  }
11133
11134  *flagp |= HASWIDTH;
11135
11136  /* A single character node is SIMPLE, except for the special-cased SHARP S
11137  * under /di. */
11138  if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11139   && (code_point != LATIN_SMALL_LETTER_SHARP_S
11140    || ! FOLD || ! DEPENDS_SEMANTICS))
11141  {
11142   *flagp |= SIMPLE;
11143  }
11144
11145  /* The OP may not be well defined in PASS1 */
11146  if (PASS2 && OP(node) == EXACTFL) {
11147   RExC_contains_locale = 1;
11148  }
11149 }
11150
11151
11152 /* return atoi(p), unless it's too big to sensibly be a backref,
11153  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11154
11155 static I32
11156 S_backref_value(char *p)
11157 {
11158  char *q = p;
11159
11160  for (;isDIGIT(*q); q++) {} /* calculate length of num */
11161  if (q - p == 0 || q - p > 9)
11162   return I32_MAX;
11163  return atoi(p);
11164 }
11165
11166
11167 /*
11168  - regatom - the lowest level
11169
11170    Try to identify anything special at the start of the pattern. If there
11171    is, then handle it as required. This may involve generating a single regop,
11172    such as for an assertion; or it may involve recursing, such as to
11173    handle a () structure.
11174
11175    If the string doesn't start with something special then we gobble up
11176    as much literal text as we can.
11177
11178    Once we have been able to handle whatever type of thing started the
11179    sequence, we return.
11180
11181    Note: we have to be careful with escapes, as they can be both literal
11182    and special, and in the case of \10 and friends, context determines which.
11183
11184    A summary of the code structure is:
11185
11186    switch (first_byte) {
11187   cases for each special:
11188    handle this special;
11189    break;
11190   case '\\':
11191    switch (2nd byte) {
11192     cases for each unambiguous special:
11193      handle this special;
11194      break;
11195     cases for each ambigous special/literal:
11196      disambiguate;
11197      if (special)  handle here
11198      else goto defchar;
11199     default: // unambiguously literal:
11200      goto defchar;
11201    }
11202   default:  // is a literal char
11203    // FALL THROUGH
11204   defchar:
11205    create EXACTish node for literal;
11206    while (more input and node isn't full) {
11207     switch (input_byte) {
11208     cases for each special;
11209      make sure parse pointer is set so that the next call to
11210       regatom will see this special first
11211      goto loopdone; // EXACTish node terminated by prev. char
11212     default:
11213      append char to EXACTISH node;
11214     }
11215     get next input byte;
11216    }
11217   loopdone:
11218    }
11219    return the generated node;
11220
11221    Specifically there are two separate switches for handling
11222    escape sequences, with the one for handling literal escapes requiring
11223    a dummy entry for all of the special escapes that are actually handled
11224    by the other.
11225
11226    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11227    TRYAGAIN.
11228    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11229    restarted.
11230    Otherwise does not return NULL.
11231 */
11232
11233 STATIC regnode *
11234 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11235 {
11236  regnode *ret = NULL;
11237  I32 flags = 0;
11238  char *parse_start = RExC_parse;
11239  U8 op;
11240  int invert = 0;
11241  U8 arg;
11242
11243  GET_RE_DEBUG_FLAGS_DECL;
11244
11245  *flagp = WORST;  /* Tentatively. */
11246
11247  DEBUG_PARSE("atom");
11248
11249  PERL_ARGS_ASSERT_REGATOM;
11250
11251 tryagain:
11252  switch ((U8)*RExC_parse) {
11253  case '^':
11254   RExC_seen_zerolen++;
11255   nextchar(pRExC_state);
11256   if (RExC_flags & RXf_PMf_MULTILINE)
11257    ret = reg_node(pRExC_state, MBOL);
11258   else if (RExC_flags & RXf_PMf_SINGLELINE)
11259    ret = reg_node(pRExC_state, SBOL);
11260   else
11261    ret = reg_node(pRExC_state, BOL);
11262   Set_Node_Length(ret, 1); /* MJD */
11263   break;
11264  case '$':
11265   nextchar(pRExC_state);
11266   if (*RExC_parse)
11267    RExC_seen_zerolen++;
11268   if (RExC_flags & RXf_PMf_MULTILINE)
11269    ret = reg_node(pRExC_state, MEOL);
11270   else if (RExC_flags & RXf_PMf_SINGLELINE)
11271    ret = reg_node(pRExC_state, SEOL);
11272   else
11273    ret = reg_node(pRExC_state, EOL);
11274   Set_Node_Length(ret, 1); /* MJD */
11275   break;
11276  case '.':
11277   nextchar(pRExC_state);
11278   if (RExC_flags & RXf_PMf_SINGLELINE)
11279    ret = reg_node(pRExC_state, SANY);
11280   else
11281    ret = reg_node(pRExC_state, REG_ANY);
11282   *flagp |= HASWIDTH|SIMPLE;
11283   RExC_naughty++;
11284   Set_Node_Length(ret, 1); /* MJD */
11285   break;
11286  case '[':
11287  {
11288   char * const oregcomp_parse = ++RExC_parse;
11289   ret = regclass(pRExC_state, flagp,depth+1,
11290      FALSE, /* means parse the whole char class */
11291      TRUE, /* allow multi-char folds */
11292      FALSE, /* don't silence non-portable warnings. */
11293      NULL);
11294   if (*RExC_parse != ']') {
11295    RExC_parse = oregcomp_parse;
11296    vFAIL("Unmatched [");
11297   }
11298   if (ret == NULL) {
11299    if (*flagp & RESTART_UTF8)
11300     return NULL;
11301    FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11302     (UV) *flagp);
11303   }
11304   nextchar(pRExC_state);
11305   Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11306   break;
11307  }
11308  case '(':
11309   nextchar(pRExC_state);
11310   ret = reg(pRExC_state, 2, &flags,depth+1);
11311   if (ret == NULL) {
11312     if (flags & TRYAGAIN) {
11313      if (RExC_parse == RExC_end) {
11314       /* Make parent create an empty node if needed. */
11315       *flagp |= TRYAGAIN;
11316       return(NULL);
11317      }
11318      goto tryagain;
11319     }
11320     if (flags & RESTART_UTF8) {
11321      *flagp = RESTART_UTF8;
11322      return NULL;
11323     }
11324     FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11325                 (UV) flags);
11326   }
11327   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11328   break;
11329  case '|':
11330  case ')':
11331   if (flags & TRYAGAIN) {
11332    *flagp |= TRYAGAIN;
11333    return NULL;
11334   }
11335   vFAIL("Internal urp");
11336         /* Supposed to be caught earlier. */
11337   break;
11338  case '?':
11339  case '+':
11340  case '*':
11341   RExC_parse++;
11342   vFAIL("Quantifier follows nothing");
11343   break;
11344  case '\\':
11345   /* Special Escapes
11346
11347   This switch handles escape sequences that resolve to some kind
11348   of special regop and not to literal text. Escape sequnces that
11349   resolve to literal text are handled below in the switch marked
11350   "Literal Escapes".
11351
11352   Every entry in this switch *must* have a corresponding entry
11353   in the literal escape switch. However, the opposite is not
11354   required, as the default for this switch is to jump to the
11355   literal text handling code.
11356   */
11357   switch ((U8)*++RExC_parse) {
11358   /* Special Escapes */
11359   case 'A':
11360    RExC_seen_zerolen++;
11361    ret = reg_node(pRExC_state, SBOL);
11362    *flagp |= SIMPLE;
11363    goto finish_meta_pat;
11364   case 'G':
11365    ret = reg_node(pRExC_state, GPOS);
11366    RExC_seen |= REG_GPOS_SEEN;
11367    *flagp |= SIMPLE;
11368    goto finish_meta_pat;
11369   case 'K':
11370    RExC_seen_zerolen++;
11371    ret = reg_node(pRExC_state, KEEPS);
11372    *flagp |= SIMPLE;
11373    /* XXX:dmq : disabling in-place substitution seems to
11374    * be necessary here to avoid cases of memory corruption, as
11375    * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11376    */
11377    RExC_seen |= REG_LOOKBEHIND_SEEN;
11378    goto finish_meta_pat;
11379   case 'Z':
11380    ret = reg_node(pRExC_state, SEOL);
11381    *flagp |= SIMPLE;
11382    RExC_seen_zerolen++;  /* Do not optimize RE away */
11383    goto finish_meta_pat;
11384   case 'z':
11385    ret = reg_node(pRExC_state, EOS);
11386    *flagp |= SIMPLE;
11387    RExC_seen_zerolen++;  /* Do not optimize RE away */
11388    goto finish_meta_pat;
11389   case 'C':
11390    ret = reg_node(pRExC_state, CANY);
11391    RExC_seen |= REG_CANY_SEEN;
11392    *flagp |= HASWIDTH|SIMPLE;
11393    if (SIZE_ONLY) {
11394     ckWARNdep(RExC_parse+1, "\\C is deprecated");
11395    }
11396    goto finish_meta_pat;
11397   case 'X':
11398    ret = reg_node(pRExC_state, CLUMP);
11399    *flagp |= HASWIDTH;
11400    goto finish_meta_pat;
11401
11402   case 'W':
11403    invert = 1;
11404    /* FALLTHROUGH */
11405   case 'w':
11406    arg = ANYOF_WORDCHAR;
11407    goto join_posix;
11408
11409   case 'b':
11410    RExC_seen_zerolen++;
11411    RExC_seen |= REG_LOOKBEHIND_SEEN;
11412    op = BOUND + get_regex_charset(RExC_flags);
11413    if (op > BOUNDA) {  /* /aa is same as /a */
11414     op = BOUNDA;
11415    }
11416    else if (op == BOUNDL) {
11417     RExC_contains_locale = 1;
11418    }
11419    ret = reg_node(pRExC_state, op);
11420    FLAGS(ret) = get_regex_charset(RExC_flags);
11421    *flagp |= SIMPLE;
11422    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11423     /* diag_listed_as: Use "%s" instead of "%s" */
11424     vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11425    }
11426    goto finish_meta_pat;
11427   case 'B':
11428    RExC_seen_zerolen++;
11429    RExC_seen |= REG_LOOKBEHIND_SEEN;
11430    op = NBOUND + get_regex_charset(RExC_flags);
11431    if (op > NBOUNDA) { /* /aa is same as /a */
11432     op = NBOUNDA;
11433    }
11434    else if (op == NBOUNDL) {
11435     RExC_contains_locale = 1;
11436    }
11437    ret = reg_node(pRExC_state, op);
11438    FLAGS(ret) = get_regex_charset(RExC_flags);
11439    *flagp |= SIMPLE;
11440    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11441     /* diag_listed_as: Use "%s" instead of "%s" */
11442     vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11443    }
11444    goto finish_meta_pat;
11445
11446   case 'D':
11447    invert = 1;
11448    /* FALLTHROUGH */
11449   case 'd':
11450    arg = ANYOF_DIGIT;
11451    goto join_posix;
11452
11453   case 'R':
11454    ret = reg_node(pRExC_state, LNBREAK);
11455    *flagp |= HASWIDTH|SIMPLE;
11456    goto finish_meta_pat;
11457
11458   case 'H':
11459    invert = 1;
11460    /* FALLTHROUGH */
11461   case 'h':
11462    arg = ANYOF_BLANK;
11463    op = POSIXU;
11464    goto join_posix_op_known;
11465
11466   case 'V':
11467    invert = 1;
11468    /* FALLTHROUGH */
11469   case 'v':
11470    arg = ANYOF_VERTWS;
11471    op = POSIXU;
11472    goto join_posix_op_known;
11473
11474   case 'S':
11475    invert = 1;
11476    /* FALLTHROUGH */
11477   case 's':
11478    arg = ANYOF_SPACE;
11479
11480   join_posix:
11481
11482    op = POSIXD + get_regex_charset(RExC_flags);
11483    if (op > POSIXA) {  /* /aa is same as /a */
11484     op = POSIXA;
11485    }
11486    else if (op == POSIXL) {
11487     RExC_contains_locale = 1;
11488    }
11489
11490   join_posix_op_known:
11491
11492    if (invert) {
11493     op += NPOSIXD - POSIXD;
11494    }
11495
11496    ret = reg_node(pRExC_state, op);
11497    if (! SIZE_ONLY) {
11498     FLAGS(ret) = namedclass_to_classnum(arg);
11499    }
11500
11501    *flagp |= HASWIDTH|SIMPLE;
11502    /* FALLTHROUGH */
11503
11504   finish_meta_pat:
11505    nextchar(pRExC_state);
11506    Set_Node_Length(ret, 2); /* MJD */
11507    break;
11508   case 'p':
11509   case 'P':
11510    {
11511 #ifdef DEBUGGING
11512     char* parse_start = RExC_parse - 2;
11513 #endif
11514
11515     RExC_parse--;
11516
11517     ret = regclass(pRExC_state, flagp,depth+1,
11518        TRUE, /* means just parse this element */
11519        FALSE, /* don't allow multi-char folds */
11520        FALSE, /* don't silence non-portable warnings.
11521           It would be a bug if these returned
11522           non-portables */
11523        NULL);
11524     /* regclass() can only return RESTART_UTF8 if multi-char folds
11525     are allowed.  */
11526     if (!ret)
11527      FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11528       (UV) *flagp);
11529
11530     RExC_parse--;
11531
11532     Set_Node_Offset(ret, parse_start + 2);
11533     Set_Node_Cur_Length(ret, parse_start);
11534     nextchar(pRExC_state);
11535    }
11536    break;
11537   case 'N':
11538    /* Handle \N and \N{NAME} with multiple code points here and not
11539    * below because it can be multicharacter. join_exact() will join
11540    * them up later on.  Also this makes sure that things like
11541    * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11542    * The options to the grok function call causes it to fail if the
11543    * sequence is just a single code point.  We then go treat it as
11544    * just another character in the current EXACT node, and hence it
11545    * gets uniform treatment with all the other characters.  The
11546    * special treatment for quantifiers is not needed for such single
11547    * character sequences */
11548    ++RExC_parse;
11549    if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11550         FALSE /* not strict */ )) {
11551     if (*flagp & RESTART_UTF8)
11552      return NULL;
11553     RExC_parse--;
11554     goto defchar;
11555    }
11556    break;
11557   case 'k':    /* Handle \k<NAME> and \k'NAME' */
11558   parse_named_seq:
11559   {
11560    char ch= RExC_parse[1];
11561    if (ch != '<' && ch != '\'' && ch != '{') {
11562     RExC_parse++;
11563     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11564     vFAIL2("Sequence %.2s... not terminated",parse_start);
11565    } else {
11566     /* this pretty much dupes the code for (?P=...) in reg(), if
11567     you change this make sure you change that */
11568     char* name_start = (RExC_parse += 2);
11569     U32 num = 0;
11570     SV *sv_dat = reg_scan_name(pRExC_state,
11571      SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11572     ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11573     if (RExC_parse == name_start || *RExC_parse != ch)
11574      /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11575      vFAIL2("Sequence %.3s... not terminated",parse_start);
11576
11577     if (!SIZE_ONLY) {
11578      num = add_data( pRExC_state, STR_WITH_LEN("S"));
11579      RExC_rxi->data->data[num]=(void*)sv_dat;
11580      SvREFCNT_inc_simple_void(sv_dat);
11581     }
11582
11583     RExC_sawback = 1;
11584     ret = reganode(pRExC_state,
11585        ((! FOLD)
11586         ? NREF
11587         : (ASCII_FOLD_RESTRICTED)
11588         ? NREFFA
11589         : (AT_LEAST_UNI_SEMANTICS)
11590          ? NREFFU
11591          : (LOC)
11592          ? NREFFL
11593          : NREFF),
11594         num);
11595     *flagp |= HASWIDTH;
11596
11597     /* override incorrect value set in reganode MJD */
11598     Set_Node_Offset(ret, parse_start+1);
11599     Set_Node_Cur_Length(ret, parse_start);
11600     nextchar(pRExC_state);
11601
11602    }
11603    break;
11604   }
11605   case 'g':
11606   case '1': case '2': case '3': case '4':
11607   case '5': case '6': case '7': case '8': case '9':
11608    {
11609     I32 num;
11610     bool hasbrace = 0;
11611
11612     if (*RExC_parse == 'g') {
11613      bool isrel = 0;
11614
11615      RExC_parse++;
11616      if (*RExC_parse == '{') {
11617       RExC_parse++;
11618       hasbrace = 1;
11619      }
11620      if (*RExC_parse == '-') {
11621       RExC_parse++;
11622       isrel = 1;
11623      }
11624      if (hasbrace && !isDIGIT(*RExC_parse)) {
11625       if (isrel) RExC_parse--;
11626       RExC_parse -= 2;
11627       goto parse_named_seq;
11628      }
11629
11630      num = S_backref_value(RExC_parse);
11631      if (num == 0)
11632       vFAIL("Reference to invalid group 0");
11633      else if (num == I32_MAX) {
11634       if (isDIGIT(*RExC_parse))
11635        vFAIL("Reference to nonexistent group");
11636       else
11637        vFAIL("Unterminated \\g... pattern");
11638      }
11639
11640      if (isrel) {
11641       num = RExC_npar - num;
11642       if (num < 1)
11643        vFAIL("Reference to nonexistent or unclosed group");
11644      }
11645     }
11646     else {
11647      num = S_backref_value(RExC_parse);
11648      /* bare \NNN might be backref or octal - if it is larger than or equal
11649      * RExC_npar then it is assumed to be and octal escape.
11650      * Note RExC_npar is +1 from the actual number of parens*/
11651      if (num == I32_MAX || (num > 9 && num >= RExC_npar
11652        && *RExC_parse != '8' && *RExC_parse != '9'))
11653      {
11654       /* Probably a character specified in octal, e.g. \35 */
11655       goto defchar;
11656      }
11657     }
11658
11659     /* at this point RExC_parse definitely points to a backref
11660     * number */
11661     {
11662 #ifdef RE_TRACK_PATTERN_OFFSETS
11663      char * const parse_start = RExC_parse - 1; /* MJD */
11664 #endif
11665      while (isDIGIT(*RExC_parse))
11666       RExC_parse++;
11667      if (hasbrace) {
11668       if (*RExC_parse != '}')
11669        vFAIL("Unterminated \\g{...} pattern");
11670       RExC_parse++;
11671      }
11672      if (!SIZE_ONLY) {
11673       if (num > (I32)RExC_rx->nparens)
11674        vFAIL("Reference to nonexistent group");
11675      }
11676      RExC_sawback = 1;
11677      ret = reganode(pRExC_state,
11678         ((! FOLD)
11679          ? REF
11680          : (ASCII_FOLD_RESTRICTED)
11681          ? REFFA
11682          : (AT_LEAST_UNI_SEMANTICS)
11683           ? REFFU
11684           : (LOC)
11685           ? REFFL
11686           : REFF),
11687          num);
11688      *flagp |= HASWIDTH;
11689
11690      /* override incorrect value set in reganode MJD */
11691      Set_Node_Offset(ret, parse_start+1);
11692      Set_Node_Cur_Length(ret, parse_start);
11693      RExC_parse--;
11694      nextchar(pRExC_state);
11695     }
11696    }
11697    break;
11698   case '\0':
11699    if (RExC_parse >= RExC_end)
11700     FAIL("Trailing \\");
11701    /* FALLTHROUGH */
11702   default:
11703    /* Do not generate "unrecognized" warnings here, we fall
11704    back into the quick-grab loop below */
11705    parse_start--;
11706    goto defchar;
11707   }
11708   break;
11709
11710  case '#':
11711   if (RExC_flags & RXf_PMf_EXTENDED) {
11712    RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
11713    if (RExC_parse < RExC_end)
11714     goto tryagain;
11715   }
11716   /* FALLTHROUGH */
11717
11718  default:
11719
11720    parse_start = RExC_parse - 1;
11721
11722    RExC_parse++;
11723
11724   defchar: {
11725    STRLEN len = 0;
11726    UV ender = 0;
11727    char *p;
11728    char *s;
11729 #define MAX_NODE_STRING_SIZE 127
11730    char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11731    char *s0;
11732    U8 upper_parse = MAX_NODE_STRING_SIZE;
11733    U8 node_type = compute_EXACTish(pRExC_state);
11734    bool next_is_quantifier;
11735    char * oldp = NULL;
11736
11737    /* We can convert EXACTF nodes to EXACTFU if they contain only
11738    * characters that match identically regardless of the target
11739    * string's UTF8ness.  The reason to do this is that EXACTF is not
11740    * trie-able, EXACTFU is.
11741    *
11742    * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11743    * contain only above-Latin1 characters (hence must be in UTF8),
11744    * which don't participate in folds with Latin1-range characters,
11745    * as the latter's folds aren't known until runtime.  (We don't
11746    * need to figure this out until pass 2) */
11747    bool maybe_exactfu = PASS2
11748        && (node_type == EXACTF || node_type == EXACTFL);
11749
11750    /* If a folding node contains only code points that don't
11751    * participate in folds, it can be changed into an EXACT node,
11752    * which allows the optimizer more things to look for */
11753    bool maybe_exact;
11754
11755    ret = reg_node(pRExC_state, node_type);
11756
11757    /* In pass1, folded, we use a temporary buffer instead of the
11758    * actual node, as the node doesn't exist yet */
11759    s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11760
11761    s0 = s;
11762
11763   reparse:
11764
11765    /* We do the EXACTFish to EXACT node only if folding.  (And we
11766    * don't need to figure this out until pass 2) */
11767    maybe_exact = FOLD && PASS2;
11768
11769    /* XXX The node can hold up to 255 bytes, yet this only goes to
11770    * 127.  I (khw) do not know why.  Keeping it somewhat less than
11771    * 255 allows us to not have to worry about overflow due to
11772    * converting to utf8 and fold expansion, but that value is
11773    * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
11774    * split up by this limit into a single one using the real max of
11775    * 255.  Even at 127, this breaks under rare circumstances.  If
11776    * folding, we do not want to split a node at a character that is a
11777    * non-final in a multi-char fold, as an input string could just
11778    * happen to want to match across the node boundary.  The join
11779    * would solve that problem if the join actually happens.  But a
11780    * series of more than two nodes in a row each of 127 would cause
11781    * the first join to succeed to get to 254, but then there wouldn't
11782    * be room for the next one, which could at be one of those split
11783    * multi-char folds.  I don't know of any fool-proof solution.  One
11784    * could back off to end with only a code point that isn't such a
11785    * non-final, but it is possible for there not to be any in the
11786    * entire node. */
11787    for (p = RExC_parse - 1;
11788     len < upper_parse && p < RExC_end;
11789     len++)
11790    {
11791     oldp = p;
11792
11793     if (RExC_flags & RXf_PMf_EXTENDED)
11794      p = regpatws(pRExC_state, p,
11795           TRUE); /* means recognize comments */
11796     switch ((U8)*p) {
11797     case '^':
11798     case '$':
11799     case '.':
11800     case '[':
11801     case '(':
11802     case ')':
11803     case '|':
11804      goto loopdone;
11805     case '\\':
11806      /* Literal Escapes Switch
11807
11808      This switch is meant to handle escape sequences that
11809      resolve to a literal character.
11810
11811      Every escape sequence that represents something
11812      else, like an assertion or a char class, is handled
11813      in the switch marked 'Special Escapes' above in this
11814      routine, but also has an entry here as anything that
11815      isn't explicitly mentioned here will be treated as
11816      an unescaped equivalent literal.
11817      */
11818
11819      switch ((U8)*++p) {
11820      /* These are all the special escapes. */
11821      case 'A':             /* Start assertion */
11822      case 'b': case 'B':   /* Word-boundary assertion*/
11823      case 'C':             /* Single char !DANGEROUS! */
11824      case 'd': case 'D':   /* digit class */
11825      case 'g': case 'G':   /* generic-backref, pos assertion */
11826      case 'h': case 'H':   /* HORIZWS */
11827      case 'k': case 'K':   /* named backref, keep marker */
11828      case 'p': case 'P':   /* Unicode property */
11829        case 'R':   /* LNBREAK */
11830      case 's': case 'S':   /* space class */
11831      case 'v': case 'V':   /* VERTWS */
11832      case 'w': case 'W':   /* word class */
11833      case 'X':             /* eXtended Unicode "combining
11834            character sequence" */
11835      case 'z': case 'Z':   /* End of line/string assertion */
11836       --p;
11837       goto loopdone;
11838
11839      /* Anything after here is an escape that resolves to a
11840      literal. (Except digits, which may or may not)
11841      */
11842      case 'n':
11843       ender = '\n';
11844       p++;
11845       break;
11846      case 'N': /* Handle a single-code point named character. */
11847       /* The options cause it to fail if a multiple code
11848       * point sequence.  Handle those in the switch() above
11849       * */
11850       RExC_parse = p + 1;
11851       if (! grok_bslash_N(pRExC_state, NULL, &ender,
11852            flagp, depth, FALSE,
11853            FALSE /* not strict */ ))
11854       {
11855        if (*flagp & RESTART_UTF8)
11856         FAIL("panic: grok_bslash_N set RESTART_UTF8");
11857        RExC_parse = p = oldp;
11858        goto loopdone;
11859       }
11860       p = RExC_parse;
11861       if (ender > 0xff) {
11862        REQUIRE_UTF8;
11863       }
11864       break;
11865      case 'r':
11866       ender = '\r';
11867       p++;
11868       break;
11869      case 't':
11870       ender = '\t';
11871       p++;
11872       break;
11873      case 'f':
11874       ender = '\f';
11875       p++;
11876       break;
11877      case 'e':
11878       ender = ASCII_TO_NATIVE('\033');
11879       p++;
11880       break;
11881      case 'a':
11882       ender = '\a';
11883       p++;
11884       break;
11885      case 'o':
11886       {
11887        UV result;
11888        const char* error_msg;
11889
11890        bool valid = grok_bslash_o(&p,
11891              &result,
11892              &error_msg,
11893              TRUE, /* out warnings */
11894              FALSE, /* not strict */
11895              TRUE, /* Output warnings
11896                 for non-
11897                 portables */
11898              UTF);
11899        if (! valid) {
11900         RExC_parse = p; /* going to die anyway; point
11901             to exact spot of failure */
11902         vFAIL(error_msg);
11903        }
11904        ender = result;
11905        if (PL_encoding && ender < 0x100) {
11906         goto recode_encoding;
11907        }
11908        if (ender > 0xff) {
11909         REQUIRE_UTF8;
11910        }
11911        break;
11912       }
11913      case 'x':
11914       {
11915        UV result = UV_MAX; /* initialize to erroneous
11916             value */
11917        const char* error_msg;
11918
11919        bool valid = grok_bslash_x(&p,
11920              &result,
11921              &error_msg,
11922              TRUE, /* out warnings */
11923              FALSE, /* not strict */
11924              TRUE, /* Output warnings
11925                 for non-
11926                 portables */
11927              UTF);
11928        if (! valid) {
11929         RExC_parse = p; /* going to die anyway; point
11930             to exact spot of failure */
11931         vFAIL(error_msg);
11932        }
11933        ender = result;
11934
11935        if (PL_encoding && ender < 0x100) {
11936         goto recode_encoding;
11937        }
11938        if (ender > 0xff) {
11939         REQUIRE_UTF8;
11940        }
11941        break;
11942       }
11943      case 'c':
11944       p++;
11945       ender = grok_bslash_c(*p++, SIZE_ONLY);
11946       break;
11947      case '8': case '9': /* must be a backreference */
11948       --p;
11949       goto loopdone;
11950      case '1': case '2': case '3':case '4':
11951      case '5': case '6': case '7':
11952       /* When we parse backslash escapes there is ambiguity
11953       * between backreferences and octal escapes. Any escape
11954       * from \1 - \9 is a backreference, any multi-digit
11955       * escape which does not start with 0 and which when
11956       * evaluated as decimal could refer to an already
11957       * parsed capture buffer is a backslash. Anything else
11958       * is octal.
11959       *
11960       * Note this implies that \118 could be interpreted as
11961       * 118 OR as "\11" . "8" depending on whether there
11962       * were 118 capture buffers defined already in the
11963       * pattern.  */
11964
11965       /* NOTE, RExC_npar is 1 more than the actual number of
11966       * parens we have seen so far, hence the < RExC_npar below. */
11967
11968       if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
11969       {  /* Not to be treated as an octal constant, go
11970         find backref */
11971        --p;
11972        goto loopdone;
11973       }
11974       /* FALLTHROUGH */
11975      case '0':
11976       {
11977        I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11978        STRLEN numlen = 3;
11979        ender = grok_oct(p, &numlen, &flags, NULL);
11980        if (ender > 0xff) {
11981         REQUIRE_UTF8;
11982        }
11983        p += numlen;
11984        if (SIZE_ONLY   /* like \08, \178 */
11985         && numlen < 3
11986         && p < RExC_end
11987         && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11988        {
11989         reg_warn_non_literal_string(
11990           p + 1,
11991           form_short_octal_warning(p, numlen));
11992        }
11993       }
11994       if (PL_encoding && ender < 0x100)
11995        goto recode_encoding;
11996       break;
11997      recode_encoding:
11998       if (! RExC_override_recoding) {
11999        SV* enc = PL_encoding;
12000        ender = reg_recode((const char)(U8)ender, &enc);
12001        if (!enc && SIZE_ONLY)
12002         ckWARNreg(p, "Invalid escape in the specified encoding");
12003        REQUIRE_UTF8;
12004       }
12005       break;
12006      case '\0':
12007       if (p >= RExC_end)
12008        FAIL("Trailing \\");
12009       /* FALLTHROUGH */
12010      default:
12011       if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12012        /* Include any { following the alpha to emphasize
12013        * that it could be part of an escape at some point
12014        * in the future */
12015        int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12016        ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12017       }
12018       goto normal_default;
12019      } /* End of switch on '\' */
12020      break;
12021     case '{':
12022      /* Currently we don't warn when the lbrace is at the start
12023      * of a construct.  This catches it in the middle of a
12024      * literal string, or when its the first thing after
12025      * something like "\b" */
12026      if (! SIZE_ONLY
12027       && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12028      {
12029       ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12030      }
12031      /*FALLTHROUGH*/
12032     default:    /* A literal character */
12033     normal_default:
12034      if (UTF8_IS_START(*p) && UTF) {
12035       STRLEN numlen;
12036       ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12037            &numlen, UTF8_ALLOW_DEFAULT);
12038       p += numlen;
12039      }
12040      else
12041       ender = (U8) *p++;
12042      break;
12043     } /* End of switch on the literal */
12044
12045     /* Here, have looked at the literal character and <ender>
12046     * contains its ordinal, <p> points to the character after it
12047     */
12048
12049     if ( RExC_flags & RXf_PMf_EXTENDED)
12050      p = regpatws(pRExC_state, p,
12051           TRUE); /* means recognize comments */
12052
12053     /* If the next thing is a quantifier, it applies to this
12054     * character only, which means that this character has to be in
12055     * its own node and can't just be appended to the string in an
12056     * existing node, so if there are already other characters in
12057     * the node, close the node with just them, and set up to do
12058     * this character again next time through, when it will be the
12059     * only thing in its new node */
12060     if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12061     {
12062      p = oldp;
12063      goto loopdone;
12064     }
12065
12066     if (! FOLD   /* The simple case, just append the literal */
12067      || (LOC  /* Also don't fold for tricky chars under /l */
12068       && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12069     {
12070      if (UTF) {
12071       const STRLEN unilen = reguni(pRExC_state, ender, s);
12072       if (unilen > 0) {
12073       s   += unilen;
12074       len += unilen;
12075       }
12076
12077       /* The loop increments <len> each time, as all but this
12078       * path (and one other) through it add a single byte to
12079       * the EXACTish node.  But this one has changed len to
12080       * be the correct final value, so subtract one to
12081       * cancel out the increment that follows */
12082       len--;
12083      }
12084      else {
12085       REGC((char)ender, s++);
12086      }
12087
12088      /* Can get here if folding only if is one of the /l
12089      * characters whose fold depends on the locale.  The
12090      * occurrence of any of these indicate that we can't
12091      * simplify things */
12092      if (FOLD) {
12093       maybe_exact = FALSE;
12094       maybe_exactfu = FALSE;
12095      }
12096     }
12097     else             /* FOLD */
12098      if (! ( UTF
12099       /* See comments for join_exact() as to why we fold this
12100       * non-UTF at compile time */
12101       || (node_type == EXACTFU
12102        && ender == LATIN_SMALL_LETTER_SHARP_S)))
12103     {
12104      /* Here, are folding and are not UTF-8 encoded; therefore
12105      * the character must be in the range 0-255, and is not /l
12106      * (Not /l because we already handled these under /l in
12107      * is_PROBLEMATIC_LOCALE_FOLD_cp */
12108      if (IS_IN_SOME_FOLD_L1(ender)) {
12109       maybe_exact = FALSE;
12110
12111       /* See if the character's fold differs between /d and
12112       * /u.  This includes the multi-char fold SHARP S to
12113       * 'ss' */
12114       if (maybe_exactfu
12115        && (PL_fold[ender] != PL_fold_latin1[ender]
12116         || ender == LATIN_SMALL_LETTER_SHARP_S
12117         || (len > 0
12118         && isARG2_lower_or_UPPER_ARG1('s', ender)
12119         && isARG2_lower_or_UPPER_ARG1('s',
12120                 *(s-1)))))
12121       {
12122        maybe_exactfu = FALSE;
12123       }
12124      }
12125
12126      /* Even when folding, we store just the input character, as
12127      * we have an array that finds its fold quickly */
12128      *(s++) = (char) ender;
12129     }
12130     else {  /* FOLD and UTF */
12131      /* Unlike the non-fold case, we do actually have to
12132      * calculate the results here in pass 1.  This is for two
12133      * reasons, the folded length may be longer than the
12134      * unfolded, and we have to calculate how many EXACTish
12135      * nodes it will take; and we may run out of room in a node
12136      * in the middle of a potential multi-char fold, and have
12137      * to back off accordingly.  (Hence we can't use REGC for
12138      * the simple case just below.) */
12139
12140      UV folded;
12141      if (isASCII(ender)) {
12142       folded = toFOLD(ender);
12143       *(s)++ = (U8) folded;
12144      }
12145      else {
12146       STRLEN foldlen;
12147
12148       folded = _to_uni_fold_flags(
12149          ender,
12150          (U8 *) s,
12151          &foldlen,
12152          FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12153               ? FOLD_FLAGS_NOMIX_ASCII
12154               : 0));
12155       s += foldlen;
12156
12157       /* The loop increments <len> each time, as all but this
12158       * path (and one other) through it add a single byte to
12159       * the EXACTish node.  But this one has changed len to
12160       * be the correct final value, so subtract one to
12161       * cancel out the increment that follows */
12162       len += foldlen - 1;
12163      }
12164      /* If this node only contains non-folding code points so
12165      * far, see if this new one is also non-folding */
12166      if (maybe_exact) {
12167       if (folded != ender) {
12168        maybe_exact = FALSE;
12169       }
12170       else {
12171        /* Here the fold is the original; we have to check
12172        * further to see if anything folds to it */
12173        if (_invlist_contains_cp(PL_utf8_foldable,
12174               ender))
12175        {
12176         maybe_exact = FALSE;
12177        }
12178       }
12179      }
12180      ender = folded;
12181     }
12182
12183     if (next_is_quantifier) {
12184
12185      /* Here, the next input is a quantifier, and to get here,
12186      * the current character is the only one in the node.
12187      * Also, here <len> doesn't include the final byte for this
12188      * character */
12189      len++;
12190      goto loopdone;
12191     }
12192
12193    } /* End of loop through literal characters */
12194
12195    /* Here we have either exhausted the input or ran out of room in
12196    * the node.  (If we encountered a character that can't be in the
12197    * node, transfer is made directly to <loopdone>, and so we
12198    * wouldn't have fallen off the end of the loop.)  In the latter
12199    * case, we artificially have to split the node into two, because
12200    * we just don't have enough space to hold everything.  This
12201    * creates a problem if the final character participates in a
12202    * multi-character fold in the non-final position, as a match that
12203    * should have occurred won't, due to the way nodes are matched,
12204    * and our artificial boundary.  So back off until we find a non-
12205    * problematic character -- one that isn't at the beginning or
12206    * middle of such a fold.  (Either it doesn't participate in any
12207    * folds, or appears only in the final position of all the folds it
12208    * does participate in.)  A better solution with far fewer false
12209    * positives, and that would fill the nodes more completely, would
12210    * be to actually have available all the multi-character folds to
12211    * test against, and to back-off only far enough to be sure that
12212    * this node isn't ending with a partial one.  <upper_parse> is set
12213    * further below (if we need to reparse the node) to include just
12214    * up through that final non-problematic character that this code
12215    * identifies, so when it is set to less than the full node, we can
12216    * skip the rest of this */
12217    if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12218
12219     const STRLEN full_len = len;
12220
12221     assert(len >= MAX_NODE_STRING_SIZE);
12222
12223     /* Here, <s> points to the final byte of the final character.
12224     * Look backwards through the string until find a non-
12225     * problematic character */
12226
12227     if (! UTF) {
12228
12229      /* This has no multi-char folds to non-UTF characters */
12230      if (ASCII_FOLD_RESTRICTED) {
12231       goto loopdone;
12232      }
12233
12234      while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12235      len = s - s0 + 1;
12236     }
12237     else {
12238      if (!  PL_NonL1NonFinalFold) {
12239       PL_NonL1NonFinalFold = _new_invlist_C_array(
12240           NonL1_Perl_Non_Final_Folds_invlist);
12241      }
12242
12243      /* Point to the first byte of the final character */
12244      s = (char *) utf8_hop((U8 *) s, -1);
12245
12246      while (s >= s0) {   /* Search backwards until find
12247           non-problematic char */
12248       if (UTF8_IS_INVARIANT(*s)) {
12249
12250        /* There are no ascii characters that participate
12251        * in multi-char folds under /aa.  In EBCDIC, the
12252        * non-ascii invariants are all control characters,
12253        * so don't ever participate in any folds. */
12254        if (ASCII_FOLD_RESTRICTED
12255         || ! IS_NON_FINAL_FOLD(*s))
12256        {
12257         break;
12258        }
12259       }
12260       else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12261        if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12262                 *s, *(s+1))))
12263        {
12264         break;
12265        }
12266       }
12267       else if (! _invlist_contains_cp(
12268           PL_NonL1NonFinalFold,
12269           valid_utf8_to_uvchr((U8 *) s, NULL)))
12270       {
12271        break;
12272       }
12273
12274       /* Here, the current character is problematic in that
12275       * it does occur in the non-final position of some
12276       * fold, so try the character before it, but have to
12277       * special case the very first byte in the string, so
12278       * we don't read outside the string */
12279       s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12280      } /* End of loop backwards through the string */
12281
12282      /* If there were only problematic characters in the string,
12283      * <s> will point to before s0, in which case the length
12284      * should be 0, otherwise include the length of the
12285      * non-problematic character just found */
12286      len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12287     }
12288
12289     /* Here, have found the final character, if any, that is
12290     * non-problematic as far as ending the node without splitting
12291     * it across a potential multi-char fold.  <len> contains the
12292     * number of bytes in the node up-to and including that
12293     * character, or is 0 if there is no such character, meaning
12294     * the whole node contains only problematic characters.  In
12295     * this case, give up and just take the node as-is.  We can't
12296     * do any better */
12297     if (len == 0) {
12298      len = full_len;
12299
12300      /* If the node ends in an 's' we make sure it stays EXACTF,
12301      * as if it turns into an EXACTFU, it could later get
12302      * joined with another 's' that would then wrongly match
12303      * the sharp s */
12304      if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
12305      {
12306       maybe_exactfu = FALSE;
12307      }
12308     } else {
12309
12310      /* Here, the node does contain some characters that aren't
12311      * problematic.  If one such is the final character in the
12312      * node, we are done */
12313      if (len == full_len) {
12314       goto loopdone;
12315      }
12316      else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12317
12318       /* If the final character is problematic, but the
12319       * penultimate is not, back-off that last character to
12320       * later start a new node with it */
12321       p = oldp;
12322       goto loopdone;
12323      }
12324
12325      /* Here, the final non-problematic character is earlier
12326      * in the input than the penultimate character.  What we do
12327      * is reparse from the beginning, going up only as far as
12328      * this final ok one, thus guaranteeing that the node ends
12329      * in an acceptable character.  The reason we reparse is
12330      * that we know how far in the character is, but we don't
12331      * know how to correlate its position with the input parse.
12332      * An alternate implementation would be to build that
12333      * correlation as we go along during the original parse,
12334      * but that would entail extra work for every node, whereas
12335      * this code gets executed only when the string is too
12336      * large for the node, and the final two characters are
12337      * problematic, an infrequent occurrence.  Yet another
12338      * possible strategy would be to save the tail of the
12339      * string, and the next time regatom is called, initialize
12340      * with that.  The problem with this is that unless you
12341      * back off one more character, you won't be guaranteed
12342      * regatom will get called again, unless regbranch,
12343      * regpiece ... are also changed.  If you do back off that
12344      * extra character, so that there is input guaranteed to
12345      * force calling regatom, you can't handle the case where
12346      * just the first character in the node is acceptable.  I
12347      * (khw) decided to try this method which doesn't have that
12348      * pitfall; if performance issues are found, we can do a
12349      * combination of the current approach plus that one */
12350      upper_parse = len;
12351      len = 0;
12352      s = s0;
12353      goto reparse;
12354     }
12355    }   /* End of verifying node ends with an appropriate char */
12356
12357   loopdone:   /* Jumped to when encounters something that shouldn't be in
12358      the node */
12359
12360    /* I (khw) don't know if you can get here with zero length, but the
12361    * old code handled this situation by creating a zero-length EXACT
12362    * node.  Might as well be NOTHING instead */
12363    if (len == 0) {
12364     OP(ret) = NOTHING;
12365    }
12366    else {
12367     if (FOLD) {
12368      /* If 'maybe_exact' is still set here, means there are no
12369      * code points in the node that participate in folds;
12370      * similarly for 'maybe_exactfu' and code points that match
12371      * differently depending on UTF8ness of the target string
12372      * (for /u), or depending on locale for /l */
12373      if (maybe_exact) {
12374       OP(ret) = EXACT;
12375      }
12376      else if (maybe_exactfu) {
12377       OP(ret) = EXACTFU;
12378      }
12379     }
12380     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12381           FALSE /* Don't look to see if could
12382              be turned into an EXACT
12383              node, as we have already
12384              computed that */
12385           );
12386    }
12387
12388    RExC_parse = p - 1;
12389    Set_Node_Cur_Length(ret, parse_start);
12390    nextchar(pRExC_state);
12391    {
12392     /* len is STRLEN which is unsigned, need to copy to signed */
12393     IV iv = len;
12394     if (iv < 0)
12395      vFAIL("Internal disaster");
12396    }
12397
12398   } /* End of label 'defchar:' */
12399   break;
12400  } /* End of giant switch on input character */
12401
12402  return(ret);
12403 }
12404
12405 STATIC char *
12406 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12407 {
12408  /* Returns the next non-pattern-white space, non-comment character (the
12409  * latter only if 'recognize_comment is true) in the string p, which is
12410  * ended by RExC_end.  See also reg_skipcomment */
12411  const char *e = RExC_end;
12412
12413  PERL_ARGS_ASSERT_REGPATWS;
12414
12415  while (p < e) {
12416   STRLEN len;
12417   if ((len = is_PATWS_safe(p, e, UTF))) {
12418    p += len;
12419   }
12420   else if (recognize_comment && *p == '#') {
12421    p = reg_skipcomment(pRExC_state, p);
12422   }
12423   else
12424    break;
12425  }
12426  return p;
12427 }
12428
12429 STATIC void
12430 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12431 {
12432  /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12433  * sets up the bitmap and any flags, removing those code points from the
12434  * inversion list, setting it to NULL should it become completely empty */
12435
12436  PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12437  assert(PL_regkind[OP(node)] == ANYOF);
12438
12439  ANYOF_BITMAP_ZERO(node);
12440  if (*invlist_ptr) {
12441
12442   /* This gets set if we actually need to modify things */
12443   bool change_invlist = FALSE;
12444
12445   UV start, end;
12446
12447   /* Start looking through *invlist_ptr */
12448   invlist_iterinit(*invlist_ptr);
12449   while (invlist_iternext(*invlist_ptr, &start, &end)) {
12450    UV high;
12451    int i;
12452
12453    if (end == UV_MAX && start <= 256) {
12454     ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12455    }
12456    else if (end >= 256) {
12457     ANYOF_FLAGS(node) |= ANYOF_UTF8;
12458    }
12459
12460    /* Quit if are above what we should change */
12461    if (start > 255) {
12462     break;
12463    }
12464
12465    change_invlist = TRUE;
12466
12467    /* Set all the bits in the range, up to the max that we are doing */
12468    high = (end < 255) ? end : 255;
12469    for (i = start; i <= (int) high; i++) {
12470     if (! ANYOF_BITMAP_TEST(node, i)) {
12471      ANYOF_BITMAP_SET(node, i);
12472     }
12473    }
12474   }
12475   invlist_iterfinish(*invlist_ptr);
12476
12477   /* Done with loop; remove any code points that are in the bitmap from
12478   * *invlist_ptr; similarly for code points above latin1 if we have a
12479   * flag to match all of them anyways */
12480   if (change_invlist) {
12481    _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12482   }
12483   if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12484    _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12485   }
12486
12487   /* If have completely emptied it, remove it completely */
12488   if (_invlist_len(*invlist_ptr) == 0) {
12489    SvREFCNT_dec_NN(*invlist_ptr);
12490    *invlist_ptr = NULL;
12491   }
12492  }
12493 }
12494
12495 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12496    Character classes ([:foo:]) can also be negated ([:^foo:]).
12497    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12498    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12499    but trigger failures because they are currently unimplemented. */
12500
12501 #define POSIXCC_DONE(c)   ((c) == ':')
12502 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12503 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12504
12505 PERL_STATIC_INLINE I32
12506 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12507 {
12508  I32 namedclass = OOB_NAMEDCLASS;
12509
12510  PERL_ARGS_ASSERT_REGPPOSIXCC;
12511
12512  if (value == '[' && RExC_parse + 1 < RExC_end &&
12513   /* I smell either [: or [= or [. -- POSIX has been here, right? */
12514   POSIXCC(UCHARAT(RExC_parse)))
12515  {
12516   const char c = UCHARAT(RExC_parse);
12517   char* const s = RExC_parse++;
12518
12519   while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12520    RExC_parse++;
12521   if (RExC_parse == RExC_end) {
12522    if (strict) {
12523
12524     /* Try to give a better location for the error (than the end of
12525     * the string) by looking for the matching ']' */
12526     RExC_parse = s;
12527     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12528      RExC_parse++;
12529     }
12530     vFAIL2("Unmatched '%c' in POSIX class", c);
12531    }
12532    /* Grandfather lone [:, [=, [. */
12533    RExC_parse = s;
12534   }
12535   else {
12536    const char* const t = RExC_parse++; /* skip over the c */
12537    assert(*t == c);
12538
12539    if (UCHARAT(RExC_parse) == ']') {
12540     const char *posixcc = s + 1;
12541     RExC_parse++; /* skip over the ending ] */
12542
12543     if (*s == ':') {
12544      const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12545      const I32 skip = t - posixcc;
12546
12547      /* Initially switch on the length of the name.  */
12548      switch (skip) {
12549      case 4:
12550       if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12551               this is the Perl \w
12552               */
12553        namedclass = ANYOF_WORDCHAR;
12554       break;
12555      case 5:
12556       /* Names all of length 5.  */
12557       /* alnum alpha ascii blank cntrl digit graph lower
12558       print punct space upper  */
12559       /* Offset 4 gives the best switch position.  */
12560       switch (posixcc[4]) {
12561       case 'a':
12562        if (memEQ(posixcc, "alph", 4)) /* alpha */
12563         namedclass = ANYOF_ALPHA;
12564        break;
12565       case 'e':
12566        if (memEQ(posixcc, "spac", 4)) /* space */
12567         namedclass = ANYOF_PSXSPC;
12568        break;
12569       case 'h':
12570        if (memEQ(posixcc, "grap", 4)) /* graph */
12571         namedclass = ANYOF_GRAPH;
12572        break;
12573       case 'i':
12574        if (memEQ(posixcc, "asci", 4)) /* ascii */
12575         namedclass = ANYOF_ASCII;
12576        break;
12577       case 'k':
12578        if (memEQ(posixcc, "blan", 4)) /* blank */
12579         namedclass = ANYOF_BLANK;
12580        break;
12581       case 'l':
12582        if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12583         namedclass = ANYOF_CNTRL;
12584        break;
12585       case 'm':
12586        if (memEQ(posixcc, "alnu", 4)) /* alnum */
12587         namedclass = ANYOF_ALPHANUMERIC;
12588        break;
12589       case 'r':
12590        if (memEQ(posixcc, "lowe", 4)) /* lower */
12591         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12592        else if (memEQ(posixcc, "uppe", 4)) /* upper */
12593         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12594        break;
12595       case 't':
12596        if (memEQ(posixcc, "digi", 4)) /* digit */
12597         namedclass = ANYOF_DIGIT;
12598        else if (memEQ(posixcc, "prin", 4)) /* print */
12599         namedclass = ANYOF_PRINT;
12600        else if (memEQ(posixcc, "punc", 4)) /* punct */
12601         namedclass = ANYOF_PUNCT;
12602        break;
12603       }
12604       break;
12605      case 6:
12606       if (memEQ(posixcc, "xdigit", 6))
12607        namedclass = ANYOF_XDIGIT;
12608       break;
12609      }
12610
12611      if (namedclass == OOB_NAMEDCLASS)
12612       vFAIL2utf8f(
12613        "POSIX class [:%"UTF8f":] unknown",
12614        UTF8fARG(UTF, t - s - 1, s + 1));
12615
12616      /* The #defines are structured so each complement is +1 to
12617      * the normal one */
12618      if (complement) {
12619       namedclass++;
12620      }
12621      assert (posixcc[skip] == ':');
12622      assert (posixcc[skip+1] == ']');
12623     } else if (!SIZE_ONLY) {
12624      /* [[=foo=]] and [[.foo.]] are still future. */
12625
12626      /* adjust RExC_parse so the warning shows after
12627      the class closes */
12628      while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12629       RExC_parse++;
12630      vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12631     }
12632    } else {
12633     /* Maternal grandfather:
12634     * "[:" ending in ":" but not in ":]" */
12635     if (strict) {
12636      vFAIL("Unmatched '[' in POSIX class");
12637     }
12638
12639     /* Grandfather lone [:, [=, [. */
12640     RExC_parse = s;
12641    }
12642   }
12643  }
12644
12645  return namedclass;
12646 }
12647
12648 STATIC bool
12649 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
12650 {
12651  /* This applies some heuristics at the current parse position (which should
12652  * be at a '[') to see if what follows might be intended to be a [:posix:]
12653  * class.  It returns true if it really is a posix class, of course, but it
12654  * also can return true if it thinks that what was intended was a posix
12655  * class that didn't quite make it.
12656  *
12657  * It will return true for
12658  *      [:alphanumerics:
12659  *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
12660  *                         ')' indicating the end of the (?[
12661  *      [:any garbage including %^&$ punctuation:]
12662  *
12663  * This is designed to be called only from S_handle_regex_sets; it could be
12664  * easily adapted to be called from the spot at the beginning of regclass()
12665  * that checks to see in a normal bracketed class if the surrounding []
12666  * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
12667  * change long-standing behavior, so I (khw) didn't do that */
12668  char* p = RExC_parse + 1;
12669  char first_char = *p;
12670
12671  PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12672
12673  assert(*(p - 1) == '[');
12674
12675  if (! POSIXCC(first_char)) {
12676   return FALSE;
12677  }
12678
12679  p++;
12680  while (p < RExC_end && isWORDCHAR(*p)) p++;
12681
12682  if (p >= RExC_end) {
12683   return FALSE;
12684  }
12685
12686  if (p - RExC_parse > 2    /* Got at least 1 word character */
12687   && (*p == first_char
12688    || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12689  {
12690   return TRUE;
12691  }
12692
12693  p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12694
12695  return (p
12696    && p - RExC_parse > 2 /* [:] evaluates to colon;
12697          [::] is a bad posix class. */
12698    && first_char == *(p - 1));
12699 }
12700
12701 STATIC regnode *
12702 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12703      I32 *flagp, U32 depth,
12704      char * const oregcomp_parse)
12705 {
12706  /* Handle the (?[...]) construct to do set operations */
12707
12708  U8 curchar;
12709  UV start, end; /* End points of code point ranges */
12710  SV* result_string;
12711  char *save_end, *save_parse;
12712  SV* final;
12713  STRLEN len;
12714  regnode* node;
12715  AV* stack;
12716  const bool save_fold = FOLD;
12717
12718  GET_RE_DEBUG_FLAGS_DECL;
12719
12720  PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12721
12722  if (LOC) {
12723   vFAIL("(?[...]) not valid in locale");
12724  }
12725  RExC_uni_semantics = 1;
12726
12727  /* This will return only an ANYOF regnode, or (unlikely) something smaller
12728  * (such as EXACT).  Thus we can skip most everything if just sizing.  We
12729  * call regclass to handle '[]' so as to not have to reinvent its parsing
12730  * rules here (throwing away the size it computes each time).  And, we exit
12731  * upon an unescaped ']' that isn't one ending a regclass.  To do both
12732  * these things, we need to realize that something preceded by a backslash
12733  * is escaped, so we have to keep track of backslashes */
12734  if (SIZE_ONLY) {
12735   UV depth = 0; /* how many nested (?[...]) constructs */
12736
12737   Perl_ck_warner_d(aTHX_
12738    packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12739    "The regex_sets feature is experimental" REPORT_LOCATION,
12740     UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12741     UTF8fARG(UTF,
12742       RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12743       RExC_precomp + (RExC_parse - RExC_precomp)));
12744
12745   while (RExC_parse < RExC_end) {
12746    SV* current = NULL;
12747    RExC_parse = regpatws(pRExC_state, RExC_parse,
12748           TRUE); /* means recognize comments */
12749    switch (*RExC_parse) {
12750     case '?':
12751      if (RExC_parse[1] == '[') depth++, RExC_parse++;
12752      /* FALLTHROUGH */
12753     default:
12754      break;
12755     case '\\':
12756      /* Skip the next byte (which could cause us to end up in
12757      * the middle of a UTF-8 character, but since none of those
12758      * are confusable with anything we currently handle in this
12759      * switch (invariants all), it's safe.  We'll just hit the
12760      * default: case next time and keep on incrementing until
12761      * we find one of the invariants we do handle. */
12762      RExC_parse++;
12763      break;
12764     case '[':
12765     {
12766      /* If this looks like it is a [:posix:] class, leave the
12767      * parse pointer at the '[' to fool regclass() into
12768      * thinking it is part of a '[[:posix:]]'.  That function
12769      * will use strict checking to force a syntax error if it
12770      * doesn't work out to a legitimate class */
12771      bool is_posix_class
12772          = could_it_be_a_POSIX_class(pRExC_state);
12773      if (! is_posix_class) {
12774       RExC_parse++;
12775      }
12776
12777      /* regclass() can only return RESTART_UTF8 if multi-char
12778      folds are allowed.  */
12779      if (!regclass(pRExC_state, flagp,depth+1,
12780         is_posix_class, /* parse the whole char
12781              class only if not a
12782              posix class */
12783         FALSE, /* don't allow multi-char folds */
12784         TRUE, /* silence non-portable warnings. */
12785         &current))
12786       FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12787        (UV) *flagp);
12788
12789      /* function call leaves parse pointing to the ']', except
12790      * if we faked it */
12791      if (is_posix_class) {
12792       RExC_parse--;
12793      }
12794
12795      SvREFCNT_dec(current);   /* In case it returned something */
12796      break;
12797     }
12798
12799     case ']':
12800      if (depth--) break;
12801      RExC_parse++;
12802      if (RExC_parse < RExC_end
12803       && *RExC_parse == ')')
12804      {
12805       node = reganode(pRExC_state, ANYOF, 0);
12806       RExC_size += ANYOF_SKIP;
12807       nextchar(pRExC_state);
12808       Set_Node_Length(node,
12809         RExC_parse - oregcomp_parse + 1); /* MJD */
12810       return node;
12811      }
12812      goto no_close;
12813    }
12814    RExC_parse++;
12815   }
12816
12817   no_close:
12818   FAIL("Syntax error in (?[...])");
12819  }
12820
12821  /* Pass 2 only after this.  Everything in this construct is a
12822  * metacharacter.  Operands begin with either a '\' (for an escape
12823  * sequence), or a '[' for a bracketed character class.  Any other
12824  * character should be an operator, or parenthesis for grouping.  Both
12825  * types of operands are handled by calling regclass() to parse them.  It
12826  * is called with a parameter to indicate to return the computed inversion
12827  * list.  The parsing here is implemented via a stack.  Each entry on the
12828  * stack is a single character representing one of the operators, or the
12829  * '('; or else a pointer to an operand inversion list. */
12830
12831 #define IS_OPERAND(a)  (! SvIOK(a))
12832
12833  /* The stack starts empty.  It is a syntax error if the first thing parsed
12834  * is a binary operator; everything else is pushed on the stack.  When an
12835  * operand is parsed, the top of the stack is examined.  If it is a binary
12836  * operator, the item before it should be an operand, and both are replaced
12837  * by the result of doing that operation on the new operand and the one on
12838  * the stack.   Thus a sequence of binary operands is reduced to a single
12839  * one before the next one is parsed.
12840  *
12841  * A unary operator may immediately follow a binary in the input, for
12842  * example
12843  *      [a] + ! [b]
12844  * When an operand is parsed and the top of the stack is a unary operator,
12845  * the operation is performed, and then the stack is rechecked to see if
12846  * this new operand is part of a binary operation; if so, it is handled as
12847  * above.
12848  *
12849  * A '(' is simply pushed on the stack; it is valid only if the stack is
12850  * empty, or the top element of the stack is an operator or another '('
12851  * (for which the parenthesized expression will become an operand).  By the
12852  * time the corresponding ')' is parsed everything in between should have
12853  * been parsed and evaluated to a single operand (or else is a syntax
12854  * error), and is handled as a regular operand */
12855
12856  sv_2mortal((SV *)(stack = newAV()));
12857
12858  while (RExC_parse < RExC_end) {
12859   I32 top_index = av_tindex(stack);
12860   SV** top_ptr;
12861   SV* current = NULL;
12862
12863   /* Skip white space */
12864   RExC_parse = regpatws(pRExC_state, RExC_parse,
12865           TRUE /* means recognize comments */ );
12866   if (RExC_parse >= RExC_end) {
12867    Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12868   }
12869   if ((curchar = UCHARAT(RExC_parse)) == ']') {
12870    break;
12871   }
12872
12873   switch (curchar) {
12874
12875    case '?':
12876     if (av_tindex(stack) >= 0   /* This makes sure that we can
12877            safely subtract 1 from
12878            RExC_parse in the next clause.
12879            If we have something on the
12880            stack, we have parsed something
12881            */
12882      && UCHARAT(RExC_parse - 1) == '('
12883      && RExC_parse < RExC_end)
12884     {
12885      /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12886      * This happens when we have some thing like
12887      *
12888      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12889      *   ...
12890      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
12891      *
12892      * Here we would be handling the interpolated
12893      * '$thai_or_lao'.  We handle this by a recursive call to
12894      * ourselves which returns the inversion list the
12895      * interpolated expression evaluates to.  We use the flags
12896      * from the interpolated pattern. */
12897      U32 save_flags = RExC_flags;
12898      const char * const save_parse = ++RExC_parse;
12899
12900      parse_lparen_question_flags(pRExC_state);
12901
12902      if (RExC_parse == save_parse  /* Makes sure there was at
12903              least one flag (or this
12904              embedding wasn't compiled)
12905             */
12906       || RExC_parse >= RExC_end - 4
12907       || UCHARAT(RExC_parse) != ':'
12908       || UCHARAT(++RExC_parse) != '('
12909       || UCHARAT(++RExC_parse) != '?'
12910       || UCHARAT(++RExC_parse) != '[')
12911      {
12912
12913       /* In combination with the above, this moves the
12914       * pointer to the point just after the first erroneous
12915       * character (or if there are no flags, to where they
12916       * should have been) */
12917       if (RExC_parse >= RExC_end - 4) {
12918        RExC_parse = RExC_end;
12919       }
12920       else if (RExC_parse != save_parse) {
12921        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12922       }
12923       vFAIL("Expecting '(?flags:(?[...'");
12924      }
12925      RExC_parse++;
12926      (void) handle_regex_sets(pRExC_state, &current, flagp,
12927              depth+1, oregcomp_parse);
12928
12929      /* Here, 'current' contains the embedded expression's
12930      * inversion list, and RExC_parse points to the trailing
12931      * ']'; the next character should be the ')' which will be
12932      * paired with the '(' that has been put on the stack, so
12933      * the whole embedded expression reduces to '(operand)' */
12934      RExC_parse++;
12935
12936      RExC_flags = save_flags;
12937      goto handle_operand;
12938     }
12939     /* FALLTHROUGH */
12940
12941    default:
12942     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12943     vFAIL("Unexpected character");
12944
12945    case '\\':
12946     /* regclass() can only return RESTART_UTF8 if multi-char
12947     folds are allowed.  */
12948     if (!regclass(pRExC_state, flagp,depth+1,
12949        TRUE, /* means parse just the next thing */
12950        FALSE, /* don't allow multi-char folds */
12951        FALSE, /* don't silence non-portable warnings.  */
12952        &current))
12953      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12954       (UV) *flagp);
12955     /* regclass() will return with parsing just the \ sequence,
12956     * leaving the parse pointer at the next thing to parse */
12957     RExC_parse--;
12958     goto handle_operand;
12959
12960    case '[':   /* Is a bracketed character class */
12961    {
12962     bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12963
12964     if (! is_posix_class) {
12965      RExC_parse++;
12966     }
12967
12968     /* regclass() can only return RESTART_UTF8 if multi-char
12969     folds are allowed.  */
12970     if(!regclass(pRExC_state, flagp,depth+1,
12971        is_posix_class, /* parse the whole char class
12972             only if not a posix class */
12973        FALSE, /* don't allow multi-char folds */
12974        FALSE, /* don't silence non-portable warnings.  */
12975        &current))
12976      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12977       (UV) *flagp);
12978     /* function call leaves parse pointing to the ']', except if we
12979     * faked it */
12980     if (is_posix_class) {
12981      RExC_parse--;
12982     }
12983
12984     goto handle_operand;
12985    }
12986
12987    case '&':
12988    case '|':
12989    case '+':
12990    case '-':
12991    case '^':
12992     if (top_index < 0
12993      || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12994      || ! IS_OPERAND(*top_ptr))
12995     {
12996      RExC_parse++;
12997      vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12998     }
12999     av_push(stack, newSVuv(curchar));
13000     break;
13001
13002    case '!':
13003     av_push(stack, newSVuv(curchar));
13004     break;
13005
13006    case '(':
13007     if (top_index >= 0) {
13008      top_ptr = av_fetch(stack, top_index, FALSE);
13009      assert(top_ptr);
13010      if (IS_OPERAND(*top_ptr)) {
13011       RExC_parse++;
13012       vFAIL("Unexpected '(' with no preceding operator");
13013      }
13014     }
13015     av_push(stack, newSVuv(curchar));
13016     break;
13017
13018    case ')':
13019    {
13020     SV* lparen;
13021     if (top_index < 1
13022      || ! (current = av_pop(stack))
13023      || ! IS_OPERAND(current)
13024      || ! (lparen = av_pop(stack))
13025      || IS_OPERAND(lparen)
13026      || SvUV(lparen) != '(')
13027     {
13028      SvREFCNT_dec(current);
13029      RExC_parse++;
13030      vFAIL("Unexpected ')'");
13031     }
13032     top_index -= 2;
13033     SvREFCNT_dec_NN(lparen);
13034
13035     /* FALLTHROUGH */
13036    }
13037
13038    handle_operand:
13039
13040     /* Here, we have an operand to process, in 'current' */
13041
13042     if (top_index < 0) {    /* Just push if stack is empty */
13043      av_push(stack, current);
13044     }
13045     else {
13046      SV* top = av_pop(stack);
13047      SV *prev = NULL;
13048      char current_operator;
13049
13050      if (IS_OPERAND(top)) {
13051       SvREFCNT_dec_NN(top);
13052       SvREFCNT_dec_NN(current);
13053       vFAIL("Operand with no preceding operator");
13054      }
13055      current_operator = (char) SvUV(top);
13056      switch (current_operator) {
13057       case '(':   /* Push the '(' back on followed by the new
13058          operand */
13059        av_push(stack, top);
13060        av_push(stack, current);
13061        SvREFCNT_inc(top);  /* Counters the '_dec' done
13062             just after the 'break', so
13063             it doesn't get wrongly freed
13064             */
13065        break;
13066
13067       case '!':
13068        _invlist_invert(current);
13069
13070        /* Unlike binary operators, the top of the stack,
13071        * now that this unary one has been popped off, may
13072        * legally be an operator, and we now have operand
13073        * for it. */
13074        top_index--;
13075        SvREFCNT_dec_NN(top);
13076        goto handle_operand;
13077
13078       case '&':
13079        prev = av_pop(stack);
13080        _invlist_intersection(prev,
13081             current,
13082             &current);
13083        av_push(stack, current);
13084        break;
13085
13086       case '|':
13087       case '+':
13088        prev = av_pop(stack);
13089        _invlist_union(prev, current, &current);
13090        av_push(stack, current);
13091        break;
13092
13093       case '-':
13094        prev = av_pop(stack);;
13095        _invlist_subtract(prev, current, &current);
13096        av_push(stack, current);
13097        break;
13098
13099       case '^':   /* The union minus the intersection */
13100       {
13101        SV* i = NULL;
13102        SV* u = NULL;
13103        SV* element;
13104
13105        prev = av_pop(stack);
13106        _invlist_union(prev, current, &u);
13107        _invlist_intersection(prev, current, &i);
13108        /* _invlist_subtract will overwrite current
13109         without freeing what it already contains */
13110        element = current;
13111        _invlist_subtract(u, i, &current);
13112        av_push(stack, current);
13113        SvREFCNT_dec_NN(i);
13114        SvREFCNT_dec_NN(u);
13115        SvREFCNT_dec_NN(element);
13116        break;
13117       }
13118
13119       default:
13120        Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13121     }
13122     SvREFCNT_dec_NN(top);
13123     SvREFCNT_dec(prev);
13124    }
13125   }
13126
13127   RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13128  }
13129
13130  if (av_tindex(stack) < 0   /* Was empty */
13131   || ((final = av_pop(stack)) == NULL)
13132   || ! IS_OPERAND(final)
13133   || av_tindex(stack) >= 0)  /* More left on stack */
13134  {
13135   vFAIL("Incomplete expression within '(?[ ])'");
13136  }
13137
13138  /* Here, 'final' is the resultant inversion list from evaluating the
13139  * expression.  Return it if so requested */
13140  if (return_invlist) {
13141   *return_invlist = final;
13142   return END;
13143  }
13144
13145  /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13146  * expecting a string of ranges and individual code points */
13147  invlist_iterinit(final);
13148  result_string = newSVpvs("");
13149  while (invlist_iternext(final, &start, &end)) {
13150   if (start == end) {
13151    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13152   }
13153   else {
13154    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13155              start,          end);
13156   }
13157  }
13158
13159  save_parse = RExC_parse;
13160  RExC_parse = SvPV(result_string, len);
13161  save_end = RExC_end;
13162  RExC_end = RExC_parse + len;
13163
13164  /* We turn off folding around the call, as the class we have constructed
13165  * already has all folding taken into consideration, and we don't want
13166  * regclass() to add to that */
13167  RExC_flags &= ~RXf_PMf_FOLD;
13168  /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13169  */
13170  node = regclass(pRExC_state, flagp,depth+1,
13171      FALSE, /* means parse the whole char class */
13172      FALSE, /* don't allow multi-char folds */
13173      TRUE, /* silence non-portable warnings.  The above may very
13174        well have generated non-portable code points, but
13175        they're valid on this machine */
13176      NULL);
13177  if (!node)
13178   FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13179      PTR2UV(flagp));
13180  if (save_fold) {
13181   RExC_flags |= RXf_PMf_FOLD;
13182  }
13183  RExC_parse = save_parse + 1;
13184  RExC_end = save_end;
13185  SvREFCNT_dec_NN(final);
13186  SvREFCNT_dec_NN(result_string);
13187
13188  nextchar(pRExC_state);
13189  Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13190  return node;
13191 }
13192 #undef IS_OPERAND
13193
13194 STATIC void
13195 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13196 {
13197  /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13198  * innocent-looking character class, like /[ks]/i won't have to go out to
13199  * disk to find the possible matches.
13200  *
13201  * This should be called only for a Latin1-range code points, cp, which is
13202  * known to be involved in a simple fold with other code points above
13203  * Latin1.  It would give false results if /aa has been specified.
13204  * Multi-char folds are outside the scope of this, and must be handled
13205  * specially.
13206  *
13207  * XXX It would be better to generate these via regen, in case a new
13208  * version of the Unicode standard adds new mappings, though that is not
13209  * really likely, and may be caught by the default: case of the switch
13210  * below. */
13211
13212  PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13213
13214  assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
13215
13216  switch (cp) {
13217   case 'k':
13218   case 'K':
13219   *invlist =
13220    add_cp_to_invlist(*invlist, KELVIN_SIGN);
13221    break;
13222   case 's':
13223   case 'S':
13224   *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13225    break;
13226   case MICRO_SIGN:
13227   *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13228   *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13229    break;
13230   case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13231   case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13232   *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13233    break;
13234   case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13235   *invlist = add_cp_to_invlist(*invlist,
13236           LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13237    break;
13238   case LATIN_SMALL_LETTER_SHARP_S:
13239   *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13240    break;
13241   default:
13242    /* Use deprecated warning to increase the chances of this being
13243    * output */
13244    ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13245    break;
13246  }
13247 }
13248
13249 /* The names of properties whose definitions are not known at compile time are
13250  * stored in this SV, after a constant heading.  So if the length has been
13251  * changed since initialization, then there is a run-time definition. */
13252 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13253           (SvCUR(listsv) != initial_listsv_len)
13254
13255 STATIC regnode *
13256 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13257     const bool stop_at_1,  /* Just parse the next thing, don't
13258           look for a full character class */
13259     bool allow_multi_folds,
13260     const bool silence_non_portable,   /* Don't output warnings
13261              about too large
13262              characters */
13263     SV** ret_invlist)  /* Return an inversion list, not a node */
13264 {
13265  /* parse a bracketed class specification.  Most of these will produce an
13266  * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13267  * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13268  * under /i with multi-character folds: it will be rewritten following the
13269  * paradigm of this example, where the <multi-fold>s are characters which
13270  * fold to multiple character sequences:
13271  *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13272  * gets effectively rewritten as:
13273  *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13274  * reg() gets called (recursively) on the rewritten version, and this
13275  * function will return what it constructs.  (Actually the <multi-fold>s
13276  * aren't physically removed from the [abcdefghi], it's just that they are
13277  * ignored in the recursion by means of a flag:
13278  * <RExC_in_multi_char_class>.)
13279  *
13280  * ANYOF nodes contain a bit map for the first 256 characters, with the
13281  * corresponding bit set if that character is in the list.  For characters
13282  * above 255, a range list or swash is used.  There are extra bits for \w,
13283  * etc. in locale ANYOFs, as what these match is not determinable at
13284  * compile time
13285  *
13286  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13287  * to be restarted.  This can only happen if ret_invlist is non-NULL.
13288  */
13289
13290  UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13291  IV range = 0;
13292  UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13293  regnode *ret;
13294  STRLEN numlen;
13295  IV namedclass = OOB_NAMEDCLASS;
13296  char *rangebegin = NULL;
13297  bool need_class = 0;
13298  SV *listsv = NULL;
13299  STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13300          than just initialized.  */
13301  SV* properties = NULL;    /* Code points that match \p{} \P{} */
13302  SV* posixes = NULL;     /* Code points that match classes like [:word:],
13303        extended beyond the Latin1 range.  These have to
13304        be kept separate from other code points for much
13305        of this function because their handling  is
13306        different under /i, and for most classes under
13307        /d as well */
13308  SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13309        separate for a while from the non-complemented
13310        versions because of complications with /d
13311        matching */
13312  UV element_count = 0;   /* Number of distinct elements in the class.
13313        Optimizations may be possible if this is tiny */
13314  AV * multi_char_matches = NULL; /* Code points that fold to more than one
13315          character; used under /i */
13316  UV n;
13317  char * stop_ptr = RExC_end;    /* where to stop parsing */
13318  const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13319             space? */
13320  const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13321
13322  /* Unicode properties are stored in a swash; this holds the current one
13323  * being parsed.  If this swash is the only above-latin1 component of the
13324  * character class, an optimization is to pass it directly on to the
13325  * execution engine.  Otherwise, it is set to NULL to indicate that there
13326  * are other things in the class that have to be dealt with at execution
13327  * time */
13328  SV* swash = NULL;  /* Code points that match \p{} \P{} */
13329
13330  /* Set if a component of this character class is user-defined; just passed
13331  * on to the engine */
13332  bool has_user_defined_property = FALSE;
13333
13334  /* inversion list of code points this node matches only when the target
13335  * string is in UTF-8.  (Because is under /d) */
13336  SV* depends_list = NULL;
13337
13338  /* Inversion list of code points this node matches regardless of things
13339  * like locale, folding, utf8ness of the target string */
13340  SV* cp_list = NULL;
13341
13342  /* Like cp_list, but code points on this list need to be checked for things
13343  * that fold to/from them under /i */
13344  SV* cp_foldable_list = NULL;
13345
13346  /* Like cp_list, but code points on this list are valid only when the
13347  * runtime locale is UTF-8 */
13348  SV* only_utf8_locale_list = NULL;
13349
13350 #ifdef EBCDIC
13351  /* In a range, counts how many 0-2 of the ends of it came from literals,
13352  * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13353  UV literal_endpoint = 0;
13354 #endif
13355  bool invert = FALSE;    /* Is this class to be complemented */
13356
13357  bool warn_super = ALWAYS_WARN_SUPER;
13358
13359  regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13360   case we need to change the emitted regop to an EXACT. */
13361  const char * orig_parse = RExC_parse;
13362  const SSize_t orig_size = RExC_size;
13363  bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13364  GET_RE_DEBUG_FLAGS_DECL;
13365
13366  PERL_ARGS_ASSERT_REGCLASS;
13367 #ifndef DEBUGGING
13368  PERL_UNUSED_ARG(depth);
13369 #endif
13370
13371  DEBUG_PARSE("clas");
13372
13373  /* Assume we are going to generate an ANYOF node. */
13374  ret = reganode(pRExC_state, ANYOF, 0);
13375
13376  if (SIZE_ONLY) {
13377   RExC_size += ANYOF_SKIP;
13378   listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13379  }
13380  else {
13381   ANYOF_FLAGS(ret) = 0;
13382
13383   RExC_emit += ANYOF_SKIP;
13384   listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13385   initial_listsv_len = SvCUR(listsv);
13386   SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13387  }
13388
13389  if (skip_white) {
13390   RExC_parse = regpatws(pRExC_state, RExC_parse,
13391        FALSE /* means don't recognize comments */ );
13392  }
13393
13394  if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
13395   RExC_parse++;
13396   invert = TRUE;
13397   allow_multi_folds = FALSE;
13398   RExC_naughty++;
13399   if (skip_white) {
13400    RExC_parse = regpatws(pRExC_state, RExC_parse,
13401         FALSE /* means don't recognize comments */ );
13402   }
13403  }
13404
13405  /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13406  if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13407   const char *s = RExC_parse;
13408   const char  c = *s++;
13409
13410   while (isWORDCHAR(*s))
13411    s++;
13412   if (*s && c == *s && s[1] == ']') {
13413    SAVEFREESV(RExC_rx_sv);
13414    ckWARN3reg(s+2,
13415      "POSIX syntax [%c %c] belongs inside character classes",
13416      c, c);
13417    (void)ReREFCNT_inc(RExC_rx_sv);
13418   }
13419  }
13420
13421  /* If the caller wants us to just parse a single element, accomplish this
13422  * by faking the loop ending condition */
13423  if (stop_at_1 && RExC_end > RExC_parse) {
13424   stop_ptr = RExC_parse + 1;
13425  }
13426
13427  /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13428  if (UCHARAT(RExC_parse) == ']')
13429   goto charclassloop;
13430
13431 parseit:
13432  while (1) {
13433   if  (RExC_parse >= stop_ptr) {
13434    break;
13435   }
13436
13437   if (skip_white) {
13438    RExC_parse = regpatws(pRExC_state, RExC_parse,
13439         FALSE /* means don't recognize comments */ );
13440   }
13441
13442   if  (UCHARAT(RExC_parse) == ']') {
13443    break;
13444   }
13445
13446  charclassloop:
13447
13448   namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13449   save_value = value;
13450   save_prevvalue = prevvalue;
13451
13452   if (!range) {
13453    rangebegin = RExC_parse;
13454    element_count++;
13455   }
13456   if (UTF) {
13457    value = utf8n_to_uvchr((U8*)RExC_parse,
13458         RExC_end - RExC_parse,
13459         &numlen, UTF8_ALLOW_DEFAULT);
13460    RExC_parse += numlen;
13461   }
13462   else
13463    value = UCHARAT(RExC_parse++);
13464
13465   if (value == '['
13466    && RExC_parse < RExC_end
13467    && POSIXCC(UCHARAT(RExC_parse)))
13468   {
13469    namedclass = regpposixcc(pRExC_state, value, strict);
13470   }
13471   else if (value == '\\') {
13472    if (UTF) {
13473     value = utf8n_to_uvchr((U8*)RExC_parse,
13474         RExC_end - RExC_parse,
13475         &numlen, UTF8_ALLOW_DEFAULT);
13476     RExC_parse += numlen;
13477    }
13478    else
13479     value = UCHARAT(RExC_parse++);
13480
13481    /* Some compilers cannot handle switching on 64-bit integer
13482    * values, therefore value cannot be an UV.  Yes, this will
13483    * be a problem later if we want switch on Unicode.
13484    * A similar issue a little bit later when switching on
13485    * namedclass. --jhi */
13486
13487    /* If the \ is escaping white space when white space is being
13488    * skipped, it means that that white space is wanted literally, and
13489    * is already in 'value'.  Otherwise, need to translate the escape
13490    * into what it signifies. */
13491    if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13492
13493    case 'w': namedclass = ANYOF_WORDCHAR; break;
13494    case 'W': namedclass = ANYOF_NWORDCHAR; break;
13495    case 's': namedclass = ANYOF_SPACE; break;
13496    case 'S': namedclass = ANYOF_NSPACE; break;
13497    case 'd': namedclass = ANYOF_DIGIT; break;
13498    case 'D': namedclass = ANYOF_NDIGIT; break;
13499    case 'v': namedclass = ANYOF_VERTWS; break;
13500    case 'V': namedclass = ANYOF_NVERTWS; break;
13501    case 'h': namedclass = ANYOF_HORIZWS; break;
13502    case 'H': namedclass = ANYOF_NHORIZWS; break;
13503    case 'N':  /* Handle \N{NAME} in class */
13504     {
13505      /* We only pay attention to the first char of
13506      multichar strings being returned. I kinda wonder
13507      if this makes sense as it does change the behaviour
13508      from earlier versions, OTOH that behaviour was broken
13509      as well. */
13510      if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13511          TRUE, /* => charclass */
13512          strict))
13513      {
13514       if (*flagp & RESTART_UTF8)
13515        FAIL("panic: grok_bslash_N set RESTART_UTF8");
13516       goto parseit;
13517      }
13518     }
13519     break;
13520    case 'p':
13521    case 'P':
13522     {
13523     char *e;
13524
13525     /* We will handle any undefined properties ourselves */
13526     U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13527          /* And we actually would prefer to get
13528           * the straight inversion list of the
13529           * swash, since we will be accessing it
13530           * anyway, to save a little time */
13531          |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13532
13533     if (RExC_parse >= RExC_end)
13534      vFAIL2("Empty \\%c{}", (U8)value);
13535     if (*RExC_parse == '{') {
13536      const U8 c = (U8)value;
13537      e = strchr(RExC_parse++, '}');
13538      if (!e)
13539       vFAIL2("Missing right brace on \\%c{}", c);
13540      while (isSPACE(*RExC_parse))
13541       RExC_parse++;
13542      if (e == RExC_parse)
13543       vFAIL2("Empty \\%c{}", c);
13544      n = e - RExC_parse;
13545      while (isSPACE(*(RExC_parse + n - 1)))
13546       n--;
13547     }
13548     else {
13549      e = RExC_parse;
13550      n = 1;
13551     }
13552     if (!SIZE_ONLY) {
13553      SV* invlist;
13554      char* name;
13555
13556      if (UCHARAT(RExC_parse) == '^') {
13557       RExC_parse++;
13558       n--;
13559       /* toggle.  (The rhs xor gets the single bit that
13560       * differs between P and p; the other xor inverts just
13561       * that bit) */
13562       value ^= 'P' ^ 'p';
13563
13564       while (isSPACE(*RExC_parse)) {
13565        RExC_parse++;
13566        n--;
13567       }
13568      }
13569      /* Try to get the definition of the property into
13570      * <invlist>.  If /i is in effect, the effective property
13571      * will have its name be <__NAME_i>.  The design is
13572      * discussed in commit
13573      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13574      name = savepv(Perl_form(aTHX_
13575           "%s%.*s%s\n",
13576           (FOLD) ? "__" : "",
13577           (int)n,
13578           RExC_parse,
13579           (FOLD) ? "_i" : ""
13580         ));
13581
13582      /* Look up the property name, and get its swash and
13583      * inversion list, if the property is found  */
13584      if (swash) {
13585       SvREFCNT_dec_NN(swash);
13586      }
13587      swash = _core_swash_init("utf8", name, &PL_sv_undef,
13588            1, /* binary */
13589            0, /* not tr/// */
13590            NULL, /* No inversion list */
13591            &swash_init_flags
13592            );
13593      if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13594       HV* curpkg = (IN_PERL_COMPILETIME)
13595          ? PL_curstash
13596          : CopSTASH(PL_curcop);
13597       if (swash) {
13598        SvREFCNT_dec_NN(swash);
13599        swash = NULL;
13600       }
13601
13602       /* Here didn't find it.  It could be a user-defined
13603       * property that will be available at run-time.  If we
13604       * accept only compile-time properties, is an error;
13605       * otherwise add it to the list for run-time look up */
13606       if (ret_invlist) {
13607        RExC_parse = e + 1;
13608        vFAIL2utf8f(
13609         "Property '%"UTF8f"' is unknown",
13610         UTF8fARG(UTF, n, name));
13611       }
13612
13613       /* If the property name doesn't already have a package
13614       * name, add the current one to it so that it can be
13615       * referred to outside it. [perl #121777] */
13616       if (curpkg && ! instr(name, "::")) {
13617        char* pkgname = HvNAME(curpkg);
13618        if (strNE(pkgname, "main")) {
13619         char* full_name = Perl_form(aTHX_
13620                "%s::%s",
13621                pkgname,
13622                name);
13623         n = strlen(full_name);
13624         Safefree(name);
13625         name = savepvn(full_name, n);
13626        }
13627       }
13628       Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13629           (value == 'p' ? '+' : '!'),
13630           UTF8fARG(UTF, n, name));
13631       has_user_defined_property = TRUE;
13632
13633       /* We don't know yet, so have to assume that the
13634       * property could match something in the Latin1 range,
13635       * hence something that isn't utf8.  Note that this
13636       * would cause things in <depends_list> to match
13637       * inappropriately, except that any \p{}, including
13638       * this one forces Unicode semantics, which means there
13639       * is no <depends_list> */
13640       ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13641      }
13642      else {
13643
13644       /* Here, did get the swash and its inversion list.  If
13645       * the swash is from a user-defined property, then this
13646       * whole character class should be regarded as such */
13647       if (swash_init_flags
13648        & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13649       {
13650        has_user_defined_property = TRUE;
13651       }
13652       else if
13653        /* We warn on matching an above-Unicode code point
13654        * if the match would return true, except don't
13655        * warn for \p{All}, which has exactly one element
13656        * = 0 */
13657        (_invlist_contains_cp(invlist, 0x110000)
13658         && (! (_invlist_len(invlist) == 1
13659          && *invlist_array(invlist) == 0)))
13660       {
13661        warn_super = TRUE;
13662       }
13663
13664
13665       /* Invert if asking for the complement */
13666       if (value == 'P') {
13667        _invlist_union_complement_2nd(properties,
13668               invlist,
13669               &properties);
13670
13671        /* The swash can't be used as-is, because we've
13672        * inverted things; delay removing it to here after
13673        * have copied its invlist above */
13674        SvREFCNT_dec_NN(swash);
13675        swash = NULL;
13676       }
13677       else {
13678        _invlist_union(properties, invlist, &properties);
13679       }
13680      }
13681      Safefree(name);
13682     }
13683     RExC_parse = e + 1;
13684     namedclass = ANYOF_UNIPROP;  /* no official name, but it's
13685             named */
13686
13687     /* \p means they want Unicode semantics */
13688     RExC_uni_semantics = 1;
13689     }
13690     break;
13691    case 'n': value = '\n';   break;
13692    case 'r': value = '\r';   break;
13693    case 't': value = '\t';   break;
13694    case 'f': value = '\f';   break;
13695    case 'b': value = '\b';   break;
13696    case 'e': value = ASCII_TO_NATIVE('\033');break;
13697    case 'a': value = '\a';                   break;
13698    case 'o':
13699     RExC_parse--; /* function expects to be pointed at the 'o' */
13700     {
13701      const char* error_msg;
13702      bool valid = grok_bslash_o(&RExC_parse,
13703            &value,
13704            &error_msg,
13705            SIZE_ONLY,   /* warnings in pass
13706                1 only */
13707            strict,
13708            silence_non_portable,
13709            UTF);
13710      if (! valid) {
13711       vFAIL(error_msg);
13712      }
13713     }
13714     if (PL_encoding && value < 0x100) {
13715      goto recode_encoding;
13716     }
13717     break;
13718    case 'x':
13719     RExC_parse--; /* function expects to be pointed at the 'x' */
13720     {
13721      const char* error_msg;
13722      bool valid = grok_bslash_x(&RExC_parse,
13723            &value,
13724            &error_msg,
13725            TRUE, /* Output warnings */
13726            strict,
13727            silence_non_portable,
13728            UTF);
13729      if (! valid) {
13730       vFAIL(error_msg);
13731      }
13732     }
13733     if (PL_encoding && value < 0x100)
13734      goto recode_encoding;
13735     break;
13736    case 'c':
13737     value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
13738     break;
13739    case '0': case '1': case '2': case '3': case '4':
13740    case '5': case '6': case '7':
13741     {
13742      /* Take 1-3 octal digits */
13743      I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13744      numlen = (strict) ? 4 : 3;
13745      value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13746      RExC_parse += numlen;
13747      if (numlen != 3) {
13748       if (strict) {
13749        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13750        vFAIL("Need exactly 3 octal digits");
13751       }
13752       else if (! SIZE_ONLY /* like \08, \178 */
13753         && numlen < 3
13754         && RExC_parse < RExC_end
13755         && isDIGIT(*RExC_parse)
13756         && ckWARN(WARN_REGEXP))
13757       {
13758        SAVEFREESV(RExC_rx_sv);
13759        reg_warn_non_literal_string(
13760         RExC_parse + 1,
13761         form_short_octal_warning(RExC_parse, numlen));
13762        (void)ReREFCNT_inc(RExC_rx_sv);
13763       }
13764      }
13765      if (PL_encoding && value < 0x100)
13766       goto recode_encoding;
13767      break;
13768     }
13769    recode_encoding:
13770     if (! RExC_override_recoding) {
13771      SV* enc = PL_encoding;
13772      value = reg_recode((const char)(U8)value, &enc);
13773      if (!enc) {
13774       if (strict) {
13775        vFAIL("Invalid escape in the specified encoding");
13776       }
13777       else if (SIZE_ONLY) {
13778        ckWARNreg(RExC_parse,
13779         "Invalid escape in the specified encoding");
13780       }
13781      }
13782      break;
13783     }
13784    default:
13785     /* Allow \_ to not give an error */
13786     if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13787      if (strict) {
13788       vFAIL2("Unrecognized escape \\%c in character class",
13789        (int)value);
13790      }
13791      else {
13792       SAVEFREESV(RExC_rx_sv);
13793       ckWARN2reg(RExC_parse,
13794        "Unrecognized escape \\%c in character class passed through",
13795        (int)value);
13796       (void)ReREFCNT_inc(RExC_rx_sv);
13797      }
13798     }
13799     break;
13800    }   /* End of switch on char following backslash */
13801   } /* end of handling backslash escape sequences */
13802 #ifdef EBCDIC
13803   else
13804    literal_endpoint++;
13805 #endif
13806
13807   /* Here, we have the current token in 'value' */
13808
13809   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13810    U8 classnum;
13811
13812    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
13813    * literal, as is the character that began the false range, i.e.
13814    * the 'a' in the examples */
13815    if (range) {
13816     if (!SIZE_ONLY) {
13817      const int w = (RExC_parse >= rangebegin)
13818         ? RExC_parse - rangebegin
13819         : 0;
13820      if (strict) {
13821       vFAIL2utf8f(
13822        "False [] range \"%"UTF8f"\"",
13823        UTF8fARG(UTF, w, rangebegin));
13824      }
13825      else {
13826       SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13827       ckWARN2reg(RExC_parse,
13828        "False [] range \"%"UTF8f"\"",
13829        UTF8fARG(UTF, w, rangebegin));
13830       (void)ReREFCNT_inc(RExC_rx_sv);
13831       cp_list = add_cp_to_invlist(cp_list, '-');
13832       cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
13833                prevvalue);
13834      }
13835     }
13836
13837     range = 0; /* this was not a true range */
13838     element_count += 2; /* So counts for three values */
13839    }
13840
13841    classnum = namedclass_to_classnum(namedclass);
13842
13843    if (LOC && namedclass < ANYOF_POSIXL_MAX
13844 #ifndef HAS_ISASCII
13845     && classnum != _CC_ASCII
13846 #endif
13847    ) {
13848     /* What the Posix classes (like \w, [:space:]) match in locale
13849     * isn't knowable under locale until actual match time.  Room
13850     * must be reserved (one time per outer bracketed class) to
13851     * store such classes.  The space will contain a bit for each
13852     * named class that is to be matched against.  This isn't
13853     * needed for \p{} and pseudo-classes, as they are not affected
13854     * by locale, and hence are dealt with separately */
13855     if (! need_class) {
13856      need_class = 1;
13857      if (SIZE_ONLY) {
13858       RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13859      }
13860      else {
13861       RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13862      }
13863      ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13864      ANYOF_POSIXL_ZERO(ret);
13865     }
13866
13867     /* Coverity thinks it is possible for this to be negative; both
13868     * jhi and khw think it's not, but be safer */
13869     assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13870      || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
13871
13872     /* See if it already matches the complement of this POSIX
13873     * class */
13874     if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13875      && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13876                ? -1
13877                : 1)))
13878     {
13879      posixl_matches_all = TRUE;
13880      break;  /* No need to continue.  Since it matches both
13881        e.g., \w and \W, it matches everything, and the
13882        bracketed class can be optimized into qr/./s */
13883     }
13884
13885     /* Add this class to those that should be checked at runtime */
13886     ANYOF_POSIXL_SET(ret, namedclass);
13887
13888     /* The above-Latin1 characters are not subject to locale rules.
13889     * Just add them, in the second pass, to the
13890     * unconditionally-matched list */
13891     if (! SIZE_ONLY) {
13892      SV* scratch_list = NULL;
13893
13894      /* Get the list of the above-Latin1 code points this
13895      * matches */
13896      _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
13897           PL_XPosix_ptrs[classnum],
13898
13899           /* Odd numbers are complements, like
13900           * NDIGIT, NASCII, ... */
13901           namedclass % 2 != 0,
13902           &scratch_list);
13903      /* Checking if 'cp_list' is NULL first saves an extra
13904      * clone.  Its reference count will be decremented at the
13905      * next union, etc, or if this is the only instance, at the
13906      * end of the routine */
13907      if (! cp_list) {
13908       cp_list = scratch_list;
13909      }
13910      else {
13911       _invlist_union(cp_list, scratch_list, &cp_list);
13912       SvREFCNT_dec_NN(scratch_list);
13913      }
13914      continue;   /* Go get next character */
13915     }
13916    }
13917    else if (! SIZE_ONLY) {
13918
13919     /* Here, not in pass1 (in that pass we skip calculating the
13920     * contents of this class), and is /l, or is a POSIX class for
13921     * which /l doesn't matter (or is a Unicode property, which is
13922     * skipped here). */
13923     if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
13924      if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13925
13926       /* Here, should be \h, \H, \v, or \V.  None of /d, /i
13927       * nor /l make a difference in what these match,
13928       * therefore we just add what they match to cp_list. */
13929       if (classnum != _CC_VERTSPACE) {
13930        assert(   namedclass == ANYOF_HORIZWS
13931         || namedclass == ANYOF_NHORIZWS);
13932
13933        /* It turns out that \h is just a synonym for
13934        * XPosixBlank */
13935        classnum = _CC_BLANK;
13936       }
13937
13938       _invlist_union_maybe_complement_2nd(
13939         cp_list,
13940         PL_XPosix_ptrs[classnum],
13941         namedclass % 2 != 0,    /* Complement if odd
13942               (NHORIZWS, NVERTWS)
13943               */
13944         &cp_list);
13945      }
13946     }
13947     else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
13948       complement and use nposixes */
13949      SV** posixes_ptr = namedclass % 2 == 0
13950          ? &posixes
13951          : &nposixes;
13952      SV** source_ptr = &PL_XPosix_ptrs[classnum];
13953      _invlist_union_maybe_complement_2nd(
13954              *posixes_ptr,
13955              *source_ptr,
13956              namedclass % 2 != 0,
13957              posixes_ptr);
13958     }
13959     continue;   /* Go get next character */
13960    }
13961   } /* end of namedclass \blah */
13962
13963   /* Here, we have a single value.  If 'range' is set, it is the ending
13964   * of a range--check its validity.  Later, we will handle each
13965   * individual code point in the range.  If 'range' isn't set, this
13966   * could be the beginning of a range, so check for that by looking
13967   * ahead to see if the next real character to be processed is the range
13968   * indicator--the minus sign */
13969
13970   if (skip_white) {
13971    RExC_parse = regpatws(pRExC_state, RExC_parse,
13972         FALSE /* means don't recognize comments */ );
13973   }
13974
13975   if (range) {
13976    if (prevvalue > value) /* b-a */ {
13977     const int w = RExC_parse - rangebegin;
13978     vFAIL2utf8f(
13979      "Invalid [] range \"%"UTF8f"\"",
13980      UTF8fARG(UTF, w, rangebegin));
13981     range = 0; /* not a valid range */
13982    }
13983   }
13984   else {
13985    prevvalue = value; /* save the beginning of the potential range */
13986    if (! stop_at_1     /* Can't be a range if parsing just one thing */
13987     && *RExC_parse == '-')
13988    {
13989     char* next_char_ptr = RExC_parse + 1;
13990     if (skip_white) {   /* Get the next real char after the '-' */
13991      next_char_ptr = regpatws(pRExC_state,
13992            RExC_parse + 1,
13993            FALSE); /* means don't recognize
13994               comments */
13995     }
13996
13997     /* If the '-' is at the end of the class (just before the ']',
13998     * it is a literal minus; otherwise it is a range */
13999     if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14000      RExC_parse = next_char_ptr;
14001
14002      /* a bad range like \w-, [:word:]- ? */
14003      if (namedclass > OOB_NAMEDCLASS) {
14004       if (strict || ckWARN(WARN_REGEXP)) {
14005        const int w =
14006         RExC_parse >= rangebegin ?
14007         RExC_parse - rangebegin : 0;
14008        if (strict) {
14009         vFAIL4("False [] range \"%*.*s\"",
14010          w, w, rangebegin);
14011        }
14012        else {
14013         vWARN4(RExC_parse,
14014          "False [] range \"%*.*s\"",
14015          w, w, rangebegin);
14016        }
14017       }
14018       if (!SIZE_ONLY) {
14019        cp_list = add_cp_to_invlist(cp_list, '-');
14020       }
14021       element_count++;
14022      } else
14023       range = 1; /* yeah, it's a range! */
14024      continue; /* but do it the next time */
14025     }
14026    }
14027   }
14028
14029   /* Here, <prevvalue> is the beginning of the range, if any; or <value>
14030   * if not */
14031
14032   /* non-Latin1 code point implies unicode semantics.  Must be set in
14033   * pass1 so is there for the whole of pass 2 */
14034   if (value > 255) {
14035    RExC_uni_semantics = 1;
14036   }
14037
14038   /* Ready to process either the single value, or the completed range.
14039   * For single-valued non-inverted ranges, we consider the possibility
14040   * of multi-char folds.  (We made a conscious decision to not do this
14041   * for the other cases because it can often lead to non-intuitive
14042   * results.  For example, you have the peculiar case that:
14043   *  "s s" =~ /^[^\xDF]+$/i => Y
14044   *  "ss"  =~ /^[^\xDF]+$/i => N
14045   *
14046   * See [perl #89750] */
14047   if (FOLD && allow_multi_folds && value == prevvalue) {
14048    if (value == LATIN_SMALL_LETTER_SHARP_S
14049     || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14050               value)))
14051    {
14052     /* Here <value> is indeed a multi-char fold.  Get what it is */
14053
14054     U8 foldbuf[UTF8_MAXBYTES_CASE];
14055     STRLEN foldlen;
14056
14057     UV folded = _to_uni_fold_flags(
14058         value,
14059         foldbuf,
14060         &foldlen,
14061         FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14062             ? FOLD_FLAGS_NOMIX_ASCII
14063             : 0)
14064         );
14065
14066     /* Here, <folded> should be the first character of the
14067     * multi-char fold of <value>, with <foldbuf> containing the
14068     * whole thing.  But, if this fold is not allowed (because of
14069     * the flags), <fold> will be the same as <value>, and should
14070     * be processed like any other character, so skip the special
14071     * handling */
14072     if (folded != value) {
14073
14074      /* Skip if we are recursed, currently parsing the class
14075      * again.  Otherwise add this character to the list of
14076      * multi-char folds. */
14077      if (! RExC_in_multi_char_class) {
14078       AV** this_array_ptr;
14079       AV* this_array;
14080       STRLEN cp_count = utf8_length(foldbuf,
14081              foldbuf + foldlen);
14082       SV* multi_fold = sv_2mortal(newSVpvs(""));
14083
14084       Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14085
14086
14087       if (! multi_char_matches) {
14088        multi_char_matches = newAV();
14089       }
14090
14091       /* <multi_char_matches> is actually an array of arrays.
14092       * There will be one or two top-level elements: [2],
14093       * and/or [3].  The [2] element is an array, each
14094       * element thereof is a character which folds to TWO
14095       * characters; [3] is for folds to THREE characters.
14096       * (Unicode guarantees a maximum of 3 characters in any
14097       * fold.)  When we rewrite the character class below,
14098       * we will do so such that the longest folds are
14099       * written first, so that it prefers the longest
14100       * matching strings first.  This is done even if it
14101       * turns out that any quantifier is non-greedy, out of
14102       * programmer laziness.  Tom Christiansen has agreed
14103       * that this is ok.  This makes the test for the
14104       * ligature 'ffi' come before the test for 'ff' */
14105       if (av_exists(multi_char_matches, cp_count)) {
14106        this_array_ptr = (AV**) av_fetch(multi_char_matches,
14107                cp_count, FALSE);
14108        this_array = *this_array_ptr;
14109       }
14110       else {
14111        this_array = newAV();
14112        av_store(multi_char_matches, cp_count,
14113          (SV*) this_array);
14114       }
14115       av_push(this_array, multi_fold);
14116      }
14117
14118      /* This element should not be processed further in this
14119      * class */
14120      element_count--;
14121      value = save_value;
14122      prevvalue = save_prevvalue;
14123      continue;
14124     }
14125    }
14126   }
14127
14128   /* Deal with this element of the class */
14129   if (! SIZE_ONLY) {
14130 #ifndef EBCDIC
14131    cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14132              prevvalue, value);
14133 #else
14134    SV* this_range = _new_invlist(1);
14135    _append_range_to_invlist(this_range, prevvalue, value);
14136
14137    /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14138    * If this range was specified using something like 'i-j', we want
14139    * to include only the 'i' and the 'j', and not anything in
14140    * between, so exclude non-ASCII, non-alphabetics from it.
14141    * However, if the range was specified with something like
14142    * [\x89-\x91] or [\x89-j], all code points within it should be
14143    * included.  literal_endpoint==2 means both ends of the range used
14144    * a literal character, not \x{foo} */
14145    if (literal_endpoint == 2
14146     && ((prevvalue >= 'a' && value <= 'z')
14147      || (prevvalue >= 'A' && value <= 'Z')))
14148    {
14149     _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
14150          &this_range);
14151
14152     /* Since this above only contains ascii, the intersection of it
14153     * with anything will still yield only ascii */
14154     _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14155          &this_range);
14156    }
14157    _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14158    literal_endpoint = 0;
14159 #endif
14160   }
14161
14162   range = 0; /* this range (if it was one) is done now */
14163  } /* End of loop through all the text within the brackets */
14164
14165  /* If anything in the class expands to more than one character, we have to
14166  * deal with them by building up a substitute parse string, and recursively
14167  * calling reg() on it, instead of proceeding */
14168  if (multi_char_matches) {
14169   SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14170   I32 cp_count;
14171   STRLEN len;
14172   char *save_end = RExC_end;
14173   char *save_parse = RExC_parse;
14174   bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14175          a "|" */
14176   I32 reg_flags;
14177
14178   assert(! invert);
14179 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14180   because too confusing */
14181   if (invert) {
14182    sv_catpv(substitute_parse, "(?:");
14183   }
14184 #endif
14185
14186   /* Look at the longest folds first */
14187   for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14188
14189    if (av_exists(multi_char_matches, cp_count)) {
14190     AV** this_array_ptr;
14191     SV* this_sequence;
14192
14193     this_array_ptr = (AV**) av_fetch(multi_char_matches,
14194             cp_count, FALSE);
14195     while ((this_sequence = av_pop(*this_array_ptr)) !=
14196                 &PL_sv_undef)
14197     {
14198      if (! first_time) {
14199       sv_catpv(substitute_parse, "|");
14200      }
14201      first_time = FALSE;
14202
14203      sv_catpv(substitute_parse, SvPVX(this_sequence));
14204     }
14205    }
14206   }
14207
14208   /* If the character class contains anything else besides these
14209   * multi-character folds, have to include it in recursive parsing */
14210   if (element_count) {
14211    sv_catpv(substitute_parse, "|[");
14212    sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14213    sv_catpv(substitute_parse, "]");
14214   }
14215
14216   sv_catpv(substitute_parse, ")");
14217 #if 0
14218   if (invert) {
14219    /* This is a way to get the parse to skip forward a whole named
14220    * sequence instead of matching the 2nd character when it fails the
14221    * first */
14222    sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14223   }
14224 #endif
14225
14226   RExC_parse = SvPV(substitute_parse, len);
14227   RExC_end = RExC_parse + len;
14228   RExC_in_multi_char_class = 1;
14229   RExC_emit = (regnode *)orig_emit;
14230
14231   ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14232
14233   *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14234
14235   RExC_parse = save_parse;
14236   RExC_end = save_end;
14237   RExC_in_multi_char_class = 0;
14238   SvREFCNT_dec_NN(multi_char_matches);
14239   return ret;
14240  }
14241
14242  /* Here, we've gone through the entire class and dealt with multi-char
14243  * folds.  We are now in a position that we can do some checks to see if we
14244  * can optimize this ANYOF node into a simpler one, even in Pass 1.
14245  * Currently we only do two checks:
14246  * 1) is in the unlikely event that the user has specified both, eg. \w and
14247  *    \W under /l, then the class matches everything.  (This optimization
14248  *    is done only to make the optimizer code run later work.)
14249  * 2) if the character class contains only a single element (including a
14250  *    single range), we see if there is an equivalent node for it.
14251  * Other checks are possible */
14252  if (! ret_invlist   /* Can't optimize if returning the constructed
14253       inversion list */
14254   && (UNLIKELY(posixl_matches_all) || element_count == 1))
14255  {
14256   U8 op = END;
14257   U8 arg = 0;
14258
14259   if (UNLIKELY(posixl_matches_all)) {
14260    op = SANY;
14261   }
14262   else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14263             \w or [:digit:] or \p{foo}
14264             */
14265
14266    /* All named classes are mapped into POSIXish nodes, with its FLAG
14267    * argument giving which class it is */
14268    switch ((I32)namedclass) {
14269     case ANYOF_UNIPROP:
14270      break;
14271
14272     /* These don't depend on the charset modifiers.  They always
14273     * match under /u rules */
14274     case ANYOF_NHORIZWS:
14275     case ANYOF_HORIZWS:
14276      namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14277      /* FALLTHROUGH */
14278
14279     case ANYOF_NVERTWS:
14280     case ANYOF_VERTWS:
14281      op = POSIXU;
14282      goto join_posix;
14283
14284     /* The actual POSIXish node for all the rest depends on the
14285     * charset modifier.  The ones in the first set depend only on
14286     * ASCII or, if available on this platform, locale */
14287     case ANYOF_ASCII:
14288     case ANYOF_NASCII:
14289 #ifdef HAS_ISASCII
14290      op = (LOC) ? POSIXL : POSIXA;
14291 #else
14292      op = POSIXA;
14293 #endif
14294      goto join_posix;
14295
14296     case ANYOF_NCASED:
14297     case ANYOF_LOWER:
14298     case ANYOF_NLOWER:
14299     case ANYOF_UPPER:
14300     case ANYOF_NUPPER:
14301      /* under /a could be alpha */
14302      if (FOLD) {
14303       if (ASCII_RESTRICTED) {
14304        namedclass = ANYOF_ALPHA + (namedclass % 2);
14305       }
14306       else if (! LOC) {
14307        break;
14308       }
14309      }
14310      /* FALLTHROUGH */
14311
14312     /* The rest have more possibilities depending on the charset.
14313     * We take advantage of the enum ordering of the charset
14314     * modifiers to get the exact node type, */
14315     default:
14316      op = POSIXD + get_regex_charset(RExC_flags);
14317      if (op > POSIXA) { /* /aa is same as /a */
14318       op = POSIXA;
14319      }
14320
14321     join_posix:
14322      /* The odd numbered ones are the complements of the
14323      * next-lower even number one */
14324      if (namedclass % 2 == 1) {
14325       invert = ! invert;
14326       namedclass--;
14327      }
14328      arg = namedclass_to_classnum(namedclass);
14329      break;
14330    }
14331   }
14332   else if (value == prevvalue) {
14333
14334    /* Here, the class consists of just a single code point */
14335
14336    if (invert) {
14337     if (! LOC && value == '\n') {
14338      op = REG_ANY; /* Optimize [^\n] */
14339      *flagp |= HASWIDTH|SIMPLE;
14340      RExC_naughty++;
14341     }
14342    }
14343    else if (value < 256 || UTF) {
14344
14345     /* Optimize a single value into an EXACTish node, but not if it
14346     * would require converting the pattern to UTF-8. */
14347     op = compute_EXACTish(pRExC_state);
14348    }
14349   } /* Otherwise is a range */
14350   else if (! LOC) {   /* locale could vary these */
14351    if (prevvalue == '0') {
14352     if (value == '9') {
14353      arg = _CC_DIGIT;
14354      op = POSIXA;
14355     }
14356    }
14357    else if (prevvalue == 'A') {
14358     if (value == 'Z'
14359 #ifdef EBCDIC
14360      && literal_endpoint == 2
14361 #endif
14362     ) {
14363      arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14364      op = POSIXA;
14365     }
14366    }
14367    else if (prevvalue == 'a') {
14368     if (value == 'z'
14369 #ifdef EBCDIC
14370      && literal_endpoint == 2
14371 #endif
14372     ) {
14373      arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14374      op = POSIXA;
14375     }
14376    }
14377   }
14378
14379   /* Here, we have changed <op> away from its initial value iff we found
14380   * an optimization */
14381   if (op != END) {
14382
14383    /* Throw away this ANYOF regnode, and emit the calculated one,
14384    * which should correspond to the beginning, not current, state of
14385    * the parse */
14386    const char * cur_parse = RExC_parse;
14387    RExC_parse = (char *)orig_parse;
14388    if ( SIZE_ONLY) {
14389     if (! LOC) {
14390
14391      /* To get locale nodes to not use the full ANYOF size would
14392      * require moving the code above that writes the portions
14393      * of it that aren't in other nodes to after this point.
14394      * e.g.  ANYOF_POSIXL_SET */
14395      RExC_size = orig_size;
14396     }
14397    }
14398    else {
14399     RExC_emit = (regnode *)orig_emit;
14400     if (PL_regkind[op] == POSIXD) {
14401      if (op == POSIXL) {
14402       RExC_contains_locale = 1;
14403      }
14404      if (invert) {
14405       op += NPOSIXD - POSIXD;
14406      }
14407     }
14408    }
14409
14410    ret = reg_node(pRExC_state, op);
14411
14412    if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14413     if (! SIZE_ONLY) {
14414      FLAGS(ret) = arg;
14415     }
14416     *flagp |= HASWIDTH|SIMPLE;
14417    }
14418    else if (PL_regkind[op] == EXACT) {
14419     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14420           TRUE /* downgradable to EXACT */
14421           );
14422    }
14423
14424    RExC_parse = (char *) cur_parse;
14425
14426    SvREFCNT_dec(posixes);
14427    SvREFCNT_dec(nposixes);
14428    SvREFCNT_dec(cp_list);
14429    SvREFCNT_dec(cp_foldable_list);
14430    return ret;
14431   }
14432  }
14433
14434  if (SIZE_ONLY)
14435   return ret;
14436  /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14437
14438  /* If folding, we calculate all characters that could fold to or from the
14439  * ones already on the list */
14440  if (cp_foldable_list) {
14441   if (FOLD) {
14442    UV start, end; /* End points of code point ranges */
14443
14444    SV* fold_intersection = NULL;
14445    SV** use_list;
14446
14447    /* Our calculated list will be for Unicode rules.  For locale
14448    * matching, we have to keep a separate list that is consulted at
14449    * runtime only when the locale indicates Unicode rules.  For
14450    * non-locale, we just use to the general list */
14451    if (LOC) {
14452     use_list = &only_utf8_locale_list;
14453    }
14454    else {
14455     use_list = &cp_list;
14456    }
14457
14458    /* Only the characters in this class that participate in folds need
14459    * be checked.  Get the intersection of this class and all the
14460    * possible characters that are foldable.  This can quickly narrow
14461    * down a large class */
14462    _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14463         &fold_intersection);
14464
14465    /* The folds for all the Latin1 characters are hard-coded into this
14466    * program, but we have to go out to disk to get the others. */
14467    if (invlist_highest(cp_foldable_list) >= 256) {
14468
14469     /* This is a hash that for a particular fold gives all
14470     * characters that are involved in it */
14471     if (! PL_utf8_foldclosures) {
14472      _load_PL_utf8_foldclosures();
14473     }
14474    }
14475
14476    /* Now look at the foldable characters in this class individually */
14477    invlist_iterinit(fold_intersection);
14478    while (invlist_iternext(fold_intersection, &start, &end)) {
14479     UV j;
14480
14481     /* Look at every character in the range */
14482     for (j = start; j <= end; j++) {
14483      U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14484      STRLEN foldlen;
14485      SV** listp;
14486
14487      if (j < 256) {
14488
14489       if (IS_IN_SOME_FOLD_L1(j)) {
14490
14491        /* ASCII is always matched; non-ASCII is matched
14492        * only under Unicode rules (which could happen
14493        * under /l if the locale is a UTF-8 one */
14494        if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14495         *use_list = add_cp_to_invlist(*use_list,
14496                PL_fold_latin1[j]);
14497        }
14498        else {
14499         depends_list =
14500         add_cp_to_invlist(depends_list,
14501             PL_fold_latin1[j]);
14502        }
14503       }
14504
14505       if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14506        && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14507       {
14508        add_above_Latin1_folds(pRExC_state,
14509             (U8) j,
14510             use_list);
14511       }
14512       continue;
14513      }
14514
14515      /* Here is an above Latin1 character.  We don't have the
14516      * rules hard-coded for it.  First, get its fold.  This is
14517      * the simple fold, as the multi-character folds have been
14518      * handled earlier and separated out */
14519      _to_uni_fold_flags(j, foldbuf, &foldlen,
14520               (ASCII_FOLD_RESTRICTED)
14521               ? FOLD_FLAGS_NOMIX_ASCII
14522               : 0);
14523
14524      /* Single character fold of above Latin1.  Add everything in
14525      * its fold closure to the list that this node should match.
14526      * The fold closures data structure is a hash with the keys
14527      * being the UTF-8 of every character that is folded to, like
14528      * 'k', and the values each an array of all code points that
14529      * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14530      * Multi-character folds are not included */
14531      if ((listp = hv_fetch(PL_utf8_foldclosures,
14532           (char *) foldbuf, foldlen, FALSE)))
14533      {
14534       AV* list = (AV*) *listp;
14535       IV k;
14536       for (k = 0; k <= av_tindex(list); k++) {
14537        SV** c_p = av_fetch(list, k, FALSE);
14538        UV c;
14539        assert(c_p);
14540
14541        c = SvUV(*c_p);
14542
14543        /* /aa doesn't allow folds between ASCII and non- */
14544        if ((ASCII_FOLD_RESTRICTED
14545         && (isASCII(c) != isASCII(j))))
14546        {
14547         continue;
14548        }
14549
14550        /* Folds under /l which cross the 255/256 boundary
14551        * are added to a separate list.  (These are valid
14552        * only when the locale is UTF-8.) */
14553        if (c < 256 && LOC) {
14554         *use_list = add_cp_to_invlist(*use_list, c);
14555         continue;
14556        }
14557
14558        if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14559        {
14560         cp_list = add_cp_to_invlist(cp_list, c);
14561        }
14562        else {
14563         /* Similarly folds involving non-ascii Latin1
14564         * characters under /d are added to their list */
14565         depends_list = add_cp_to_invlist(depends_list,
14566                 c);
14567        }
14568       }
14569      }
14570     }
14571    }
14572    SvREFCNT_dec_NN(fold_intersection);
14573   }
14574
14575   /* Now that we have finished adding all the folds, there is no reason
14576   * to keep the foldable list separate */
14577   _invlist_union(cp_list, cp_foldable_list, &cp_list);
14578   SvREFCNT_dec_NN(cp_foldable_list);
14579  }
14580
14581  /* And combine the result (if any) with any inversion list from posix
14582  * classes.  The lists are kept separate up to now because we don't want to
14583  * fold the classes (folding of those is automatically handled by the swash
14584  * fetching code) */
14585  if (posixes || nposixes) {
14586   if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14587    /* Under /a and /aa, nothing above ASCII matches these */
14588    _invlist_intersection(posixes,
14589         PL_XPosix_ptrs[_CC_ASCII],
14590         &posixes);
14591   }
14592   if (nposixes) {
14593    if (DEPENDS_SEMANTICS) {
14594     /* Under /d, everything in the upper half of the Latin1 range
14595     * matches these complements */
14596     ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
14597    }
14598    else if (AT_LEAST_ASCII_RESTRICTED) {
14599     /* Under /a and /aa, everything above ASCII matches these
14600     * complements */
14601     _invlist_union_complement_2nd(nposixes,
14602            PL_XPosix_ptrs[_CC_ASCII],
14603            &nposixes);
14604    }
14605    if (posixes) {
14606     _invlist_union(posixes, nposixes, &posixes);
14607     SvREFCNT_dec_NN(nposixes);
14608    }
14609    else {
14610     posixes = nposixes;
14611    }
14612   }
14613   if (! DEPENDS_SEMANTICS) {
14614    if (cp_list) {
14615     _invlist_union(cp_list, posixes, &cp_list);
14616     SvREFCNT_dec_NN(posixes);
14617    }
14618    else {
14619     cp_list = posixes;
14620    }
14621   }
14622   else {
14623    /* Under /d, we put into a separate list the Latin1 things that
14624    * match only when the target string is utf8 */
14625    SV* nonascii_but_latin1_properties = NULL;
14626    _invlist_intersection(posixes, PL_UpperLatin1,
14627         &nonascii_but_latin1_properties);
14628    _invlist_subtract(posixes, nonascii_but_latin1_properties,
14629        &posixes);
14630    if (cp_list) {
14631     _invlist_union(cp_list, posixes, &cp_list);
14632     SvREFCNT_dec_NN(posixes);
14633    }
14634    else {
14635     cp_list = posixes;
14636    }
14637
14638    if (depends_list) {
14639     _invlist_union(depends_list, nonascii_but_latin1_properties,
14640        &depends_list);
14641     SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14642    }
14643    else {
14644     depends_list = nonascii_but_latin1_properties;
14645    }
14646   }
14647  }
14648
14649  /* And combine the result (if any) with any inversion list from properties.
14650  * The lists are kept separate up to now so that we can distinguish the two
14651  * in regards to matching above-Unicode.  A run-time warning is generated
14652  * if a Unicode property is matched against a non-Unicode code point. But,
14653  * we allow user-defined properties to match anything, without any warning,
14654  * and we also suppress the warning if there is a portion of the character
14655  * class that isn't a Unicode property, and which matches above Unicode, \W
14656  * or [\x{110000}] for example.
14657  * (Note that in this case, unlike the Posix one above, there is no
14658  * <depends_list>, because having a Unicode property forces Unicode
14659  * semantics */
14660  if (properties) {
14661   if (cp_list) {
14662
14663    /* If it matters to the final outcome, see if a non-property
14664    * component of the class matches above Unicode.  If so, the
14665    * warning gets suppressed.  This is true even if just a single
14666    * such code point is specified, as though not strictly correct if
14667    * another such code point is matched against, the fact that they
14668    * are using above-Unicode code points indicates they should know
14669    * the issues involved */
14670    if (warn_super) {
14671     warn_super = ! (invert
14672        ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14673    }
14674
14675    _invlist_union(properties, cp_list, &cp_list);
14676    SvREFCNT_dec_NN(properties);
14677   }
14678   else {
14679    cp_list = properties;
14680   }
14681
14682   if (warn_super) {
14683    ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14684   }
14685  }
14686
14687  /* Here, we have calculated what code points should be in the character
14688  * class.
14689  *
14690  * Now we can see about various optimizations.  Fold calculation (which we
14691  * did above) needs to take place before inversion.  Otherwise /[^k]/i
14692  * would invert to include K, which under /i would match k, which it
14693  * shouldn't.  Therefore we can't invert folded locale now, as it won't be
14694  * folded until runtime */
14695
14696  /* If we didn't do folding, it's because some information isn't available
14697  * until runtime; set the run-time fold flag for these.  (We don't have to
14698  * worry about properties folding, as that is taken care of by the swash
14699  * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
14700  * locales, or the class matches at least one 0-255 range code point */
14701  if (LOC && FOLD) {
14702   if (only_utf8_locale_list) {
14703    ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14704   }
14705   else if (cp_list) { /* Look to see if there a 0-255 code point is in
14706        the list */
14707    UV start, end;
14708    invlist_iterinit(cp_list);
14709    if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14710     ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14711    }
14712    invlist_iterfinish(cp_list);
14713   }
14714  }
14715
14716  /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14717  * at compile time.  Besides not inverting folded locale now, we can't
14718  * invert if there are things such as \w, which aren't known until runtime
14719  * */
14720  if (cp_list
14721   && invert
14722   && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14723   && ! depends_list
14724   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14725  {
14726   _invlist_invert(cp_list);
14727
14728   /* Any swash can't be used as-is, because we've inverted things */
14729   if (swash) {
14730    SvREFCNT_dec_NN(swash);
14731    swash = NULL;
14732   }
14733
14734   /* Clear the invert flag since have just done it here */
14735   invert = FALSE;
14736  }
14737
14738  if (ret_invlist) {
14739   *ret_invlist = cp_list;
14740   SvREFCNT_dec(swash);
14741
14742   /* Discard the generated node */
14743   if (SIZE_ONLY) {
14744    RExC_size = orig_size;
14745   }
14746   else {
14747    RExC_emit = orig_emit;
14748   }
14749   return orig_emit;
14750  }
14751
14752  /* Some character classes are equivalent to other nodes.  Such nodes take
14753  * up less room and generally fewer operations to execute than ANYOF nodes.
14754  * Above, we checked for and optimized into some such equivalents for
14755  * certain common classes that are easy to test.  Getting to this point in
14756  * the code means that the class didn't get optimized there.  Since this
14757  * code is only executed in Pass 2, it is too late to save space--it has
14758  * been allocated in Pass 1, and currently isn't given back.  But turning
14759  * things into an EXACTish node can allow the optimizer to join it to any
14760  * adjacent such nodes.  And if the class is equivalent to things like /./,
14761  * expensive run-time swashes can be avoided.  Now that we have more
14762  * complete information, we can find things necessarily missed by the
14763  * earlier code.  I (khw) am not sure how much to look for here.  It would
14764  * be easy, but perhaps too slow, to check any candidates against all the
14765  * node types they could possibly match using _invlistEQ(). */
14766
14767  if (cp_list
14768   && ! invert
14769   && ! depends_list
14770   && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14771   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14772
14773   /* We don't optimize if we are supposed to make sure all non-Unicode
14774    * code points raise a warning, as only ANYOF nodes have this check.
14775    * */
14776   && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14777  {
14778   UV start, end;
14779   U8 op = END;  /* The optimzation node-type */
14780   const char * cur_parse= RExC_parse;
14781
14782   invlist_iterinit(cp_list);
14783   if (! invlist_iternext(cp_list, &start, &end)) {
14784
14785    /* Here, the list is empty.  This happens, for example, when a
14786    * Unicode property is the only thing in the character class, and
14787    * it doesn't match anything.  (perluniprops.pod notes such
14788    * properties) */
14789    op = OPFAIL;
14790    *flagp |= HASWIDTH|SIMPLE;
14791   }
14792   else if (start == end) {    /* The range is a single code point */
14793    if (! invlist_iternext(cp_list, &start, &end)
14794
14795      /* Don't do this optimization if it would require changing
14796      * the pattern to UTF-8 */
14797     && (start < 256 || UTF))
14798    {
14799     /* Here, the list contains a single code point.  Can optimize
14800     * into an EXACTish node */
14801
14802     value = start;
14803
14804     if (! FOLD) {
14805      op = EXACT;
14806     }
14807     else if (LOC) {
14808
14809      /* A locale node under folding with one code point can be
14810      * an EXACTFL, as its fold won't be calculated until
14811      * runtime */
14812      op = EXACTFL;
14813     }
14814     else {
14815
14816      /* Here, we are generally folding, but there is only one
14817      * code point to match.  If we have to, we use an EXACT
14818      * node, but it would be better for joining with adjacent
14819      * nodes in the optimization pass if we used the same
14820      * EXACTFish node that any such are likely to be.  We can
14821      * do this iff the code point doesn't participate in any
14822      * folds.  For example, an EXACTF of a colon is the same as
14823      * an EXACT one, since nothing folds to or from a colon. */
14824      if (value < 256) {
14825       if (IS_IN_SOME_FOLD_L1(value)) {
14826        op = EXACT;
14827       }
14828      }
14829      else {
14830       if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14831        op = EXACT;
14832       }
14833      }
14834
14835      /* If we haven't found the node type, above, it means we
14836      * can use the prevailing one */
14837      if (op == END) {
14838       op = compute_EXACTish(pRExC_state);
14839      }
14840     }
14841    }
14842   }
14843   else if (start == 0) {
14844    if (end == UV_MAX) {
14845     op = SANY;
14846     *flagp |= HASWIDTH|SIMPLE;
14847     RExC_naughty++;
14848    }
14849    else if (end == '\n' - 1
14850      && invlist_iternext(cp_list, &start, &end)
14851      && start == '\n' + 1 && end == UV_MAX)
14852    {
14853     op = REG_ANY;
14854     *flagp |= HASWIDTH|SIMPLE;
14855     RExC_naughty++;
14856    }
14857   }
14858   invlist_iterfinish(cp_list);
14859
14860   if (op != END) {
14861    RExC_parse = (char *)orig_parse;
14862    RExC_emit = (regnode *)orig_emit;
14863
14864    ret = reg_node(pRExC_state, op);
14865
14866    RExC_parse = (char *)cur_parse;
14867
14868    if (PL_regkind[op] == EXACT) {
14869     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14870           TRUE /* downgradable to EXACT */
14871           );
14872    }
14873
14874    SvREFCNT_dec_NN(cp_list);
14875    return ret;
14876   }
14877  }
14878
14879  /* Here, <cp_list> contains all the code points we can determine at
14880  * compile time that match under all conditions.  Go through it, and
14881  * for things that belong in the bitmap, put them there, and delete from
14882  * <cp_list>.  While we are at it, see if everything above 255 is in the
14883  * list, and if so, set a flag to speed up execution */
14884
14885  populate_ANYOF_from_invlist(ret, &cp_list);
14886
14887  if (invert) {
14888   ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14889  }
14890
14891  /* Here, the bitmap has been populated with all the Latin1 code points that
14892  * always match.  Can now add to the overall list those that match only
14893  * when the target string is UTF-8 (<depends_list>). */
14894  if (depends_list) {
14895   if (cp_list) {
14896    _invlist_union(cp_list, depends_list, &cp_list);
14897    SvREFCNT_dec_NN(depends_list);
14898   }
14899   else {
14900    cp_list = depends_list;
14901   }
14902   ANYOF_FLAGS(ret) |= ANYOF_UTF8;
14903  }
14904
14905  /* If there is a swash and more than one element, we can't use the swash in
14906  * the optimization below. */
14907  if (swash && element_count > 1) {
14908   SvREFCNT_dec_NN(swash);
14909   swash = NULL;
14910  }
14911
14912  set_ANYOF_arg(pRExC_state, ret, cp_list,
14913     (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14914     ? listsv : NULL,
14915     only_utf8_locale_list,
14916     swash, has_user_defined_property);
14917
14918  *flagp |= HASWIDTH|SIMPLE;
14919
14920  if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
14921   RExC_contains_locale = 1;
14922  }
14923
14924  return ret;
14925 }
14926
14927 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14928
14929 STATIC void
14930 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14931     regnode* const node,
14932     SV* const cp_list,
14933     SV* const runtime_defns,
14934     SV* const only_utf8_locale_list,
14935     SV* const swash,
14936     const bool has_user_defined_property)
14937 {
14938  /* Sets the arg field of an ANYOF-type node 'node', using information about
14939  * the node passed-in.  If there is nothing outside the node's bitmap, the
14940  * arg is set to ANYOF_NONBITMAP_EMPTY.  Otherwise, it sets the argument to
14941  * the count returned by add_data(), having allocated and stored an array,
14942  * av, that that count references, as follows:
14943  *  av[0] stores the character class description in its textual form.
14944  *        This is used later (regexec.c:Perl_regclass_swash()) to
14945  *        initialize the appropriate swash, and is also useful for dumping
14946  *        the regnode.  This is set to &PL_sv_undef if the textual
14947  *        description is not needed at run-time (as happens if the other
14948  *        elements completely define the class)
14949  *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14950  *        computed from av[0].  But if no further computation need be done,
14951  *        the swash is stored here now (and av[0] is &PL_sv_undef).
14952  *  av[2] stores the inversion list of code points that match only if the
14953  *        current locale is UTF-8
14954  *  av[3] stores the cp_list inversion list for use in addition or instead
14955  *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14956  *        (Otherwise everything needed is already in av[0] and av[1])
14957  *  av[4] is set if any component of the class is from a user-defined
14958  *        property; used only if av[3] exists */
14959
14960  UV n;
14961
14962  PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14963
14964  if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
14965   assert(! (ANYOF_FLAGS(node)
14966      & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
14967   ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14968  }
14969  else {
14970   AV * const av = newAV();
14971   SV *rv;
14972
14973   assert(ANYOF_FLAGS(node)
14974      & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
14975
14976   av_store(av, 0, (runtime_defns)
14977       ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14978   if (swash) {
14979    assert(cp_list);
14980    av_store(av, 1, swash);
14981    SvREFCNT_dec_NN(cp_list);
14982   }
14983   else {
14984    av_store(av, 1, &PL_sv_undef);
14985    if (cp_list) {
14986     av_store(av, 3, cp_list);
14987     av_store(av, 4, newSVuv(has_user_defined_property));
14988    }
14989   }
14990
14991   if (only_utf8_locale_list) {
14992    av_store(av, 2, only_utf8_locale_list);
14993   }
14994   else {
14995    av_store(av, 2, &PL_sv_undef);
14996   }
14997
14998   rv = newRV_noinc(MUTABLE_SV(av));
14999   n = add_data(pRExC_state, STR_WITH_LEN("s"));
15000   RExC_rxi->data->data[n] = (void*)rv;
15001   ARG_SET(node, n);
15002  }
15003 }
15004
15005
15006 /* reg_skipcomment()
15007
15008    Absorbs an /x style # comment from the input stream,
15009    returning a pointer to the first character beyond the comment, or if the
15010    comment terminates the pattern without anything following it, this returns
15011    one past the final character of the pattern (in other words, RExC_end) and
15012    sets the REG_RUN_ON_COMMENT_SEEN flag.
15013
15014    Note it's the callers responsibility to ensure that we are
15015    actually in /x mode
15016
15017 */
15018
15019 PERL_STATIC_INLINE char*
15020 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15021 {
15022  PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15023
15024  assert(*p == '#');
15025
15026  while (p < RExC_end) {
15027   if (*(++p) == '\n') {
15028    return p+1;
15029   }
15030  }
15031
15032  /* we ran off the end of the pattern without ending the comment, so we have
15033  * to add an \n when wrapping */
15034  RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15035  return p;
15036 }
15037
15038 /* nextchar()
15039
15040    Advances the parse position, and optionally absorbs
15041    "whitespace" from the inputstream.
15042
15043    Without /x "whitespace" means (?#...) style comments only,
15044    with /x this means (?#...) and # comments and whitespace proper.
15045
15046    Returns the RExC_parse point from BEFORE the scan occurs.
15047
15048    This is the /x friendly way of saying RExC_parse++.
15049 */
15050
15051 STATIC char*
15052 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15053 {
15054  char* const retval = RExC_parse++;
15055
15056  PERL_ARGS_ASSERT_NEXTCHAR;
15057
15058  for (;;) {
15059   if (RExC_end - RExC_parse >= 3
15060    && *RExC_parse == '('
15061    && RExC_parse[1] == '?'
15062    && RExC_parse[2] == '#')
15063   {
15064    while (*RExC_parse != ')') {
15065     if (RExC_parse == RExC_end)
15066      FAIL("Sequence (?#... not terminated");
15067     RExC_parse++;
15068    }
15069    RExC_parse++;
15070    continue;
15071   }
15072   if (RExC_flags & RXf_PMf_EXTENDED) {
15073    char * p = regpatws(pRExC_state, RExC_parse,
15074           TRUE); /* means recognize comments */
15075    if (p != RExC_parse) {
15076     RExC_parse = p;
15077     continue;
15078    }
15079   }
15080   return retval;
15081  }
15082 }
15083
15084 /*
15085 - reg_node - emit a node
15086 */
15087 STATIC regnode *   /* Location. */
15088 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15089 {
15090  regnode *ptr;
15091  regnode * const ret = RExC_emit;
15092  GET_RE_DEBUG_FLAGS_DECL;
15093
15094  PERL_ARGS_ASSERT_REG_NODE;
15095
15096  if (SIZE_ONLY) {
15097   SIZE_ALIGN(RExC_size);
15098   RExC_size += 1;
15099   return(ret);
15100  }
15101  if (RExC_emit >= RExC_emit_bound)
15102   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15103     op, (void*)RExC_emit, (void*)RExC_emit_bound);
15104
15105  NODE_ALIGN_FILL(ret);
15106  ptr = ret;
15107  FILL_ADVANCE_NODE(ptr, op);
15108  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
15109 #ifdef RE_TRACK_PATTERN_OFFSETS
15110  if (RExC_offsets) {         /* MJD */
15111   MJD_OFFSET_DEBUG(
15112    ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15113    "reg_node", __LINE__,
15114    PL_reg_name[op],
15115    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15116     ? "Overwriting end of array!\n" : "OK",
15117    (UV)(RExC_emit - RExC_emit_start),
15118    (UV)(RExC_parse - RExC_start),
15119    (UV)RExC_offsets[0]));
15120   Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15121  }
15122 #endif
15123  RExC_emit = ptr;
15124  return(ret);
15125 }
15126
15127 /*
15128 - reganode - emit a node with an argument
15129 */
15130 STATIC regnode *   /* Location. */
15131 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15132 {
15133  regnode *ptr;
15134  regnode * const ret = RExC_emit;
15135  GET_RE_DEBUG_FLAGS_DECL;
15136
15137  PERL_ARGS_ASSERT_REGANODE;
15138
15139  if (SIZE_ONLY) {
15140   SIZE_ALIGN(RExC_size);
15141   RExC_size += 2;
15142   /*
15143   We can't do this:
15144
15145   assert(2==regarglen[op]+1);
15146
15147   Anything larger than this has to allocate the extra amount.
15148   If we changed this to be:
15149
15150   RExC_size += (1 + regarglen[op]);
15151
15152   then it wouldn't matter. Its not clear what side effect
15153   might come from that so its not done so far.
15154   -- dmq
15155   */
15156   return(ret);
15157  }
15158  if (RExC_emit >= RExC_emit_bound)
15159   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15160     op, (void*)RExC_emit, (void*)RExC_emit_bound);
15161
15162  NODE_ALIGN_FILL(ret);
15163  ptr = ret;
15164  FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15165  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
15166 #ifdef RE_TRACK_PATTERN_OFFSETS
15167  if (RExC_offsets) {         /* MJD */
15168   MJD_OFFSET_DEBUG(
15169    ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15170    "reganode",
15171    __LINE__,
15172    PL_reg_name[op],
15173    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15174    "Overwriting end of array!\n" : "OK",
15175    (UV)(RExC_emit - RExC_emit_start),
15176    (UV)(RExC_parse - RExC_start),
15177    (UV)RExC_offsets[0]));
15178   Set_Cur_Node_Offset;
15179  }
15180 #endif
15181  RExC_emit = ptr;
15182  return(ret);
15183 }
15184
15185 /*
15186 - reguni - emit (if appropriate) a Unicode character
15187 */
15188 PERL_STATIC_INLINE STRLEN
15189 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15190 {
15191  PERL_ARGS_ASSERT_REGUNI;
15192
15193  return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15194 }
15195
15196 /*
15197 - reginsert - insert an operator in front of already-emitted operand
15198 *
15199 * Means relocating the operand.
15200 */
15201 STATIC void
15202 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15203 {
15204  regnode *src;
15205  regnode *dst;
15206  regnode *place;
15207  const int offset = regarglen[(U8)op];
15208  const int size = NODE_STEP_REGNODE + offset;
15209  GET_RE_DEBUG_FLAGS_DECL;
15210
15211  PERL_ARGS_ASSERT_REGINSERT;
15212  PERL_UNUSED_CONTEXT;
15213  PERL_UNUSED_ARG(depth);
15214 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15215  DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15216  if (SIZE_ONLY) {
15217   RExC_size += size;
15218   return;
15219  }
15220
15221  src = RExC_emit;
15222  RExC_emit += size;
15223  dst = RExC_emit;
15224  if (RExC_open_parens) {
15225   int paren;
15226   /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15227   for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15228    if ( RExC_open_parens[paren] >= opnd ) {
15229     /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15230     RExC_open_parens[paren] += size;
15231    } else {
15232     /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15233    }
15234    if ( RExC_close_parens[paren] >= opnd ) {
15235     /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15236     RExC_close_parens[paren] += size;
15237    } else {
15238     /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15239    }
15240   }
15241  }
15242
15243  while (src > opnd) {
15244   StructCopy(--src, --dst, regnode);
15245 #ifdef RE_TRACK_PATTERN_OFFSETS
15246   if (RExC_offsets) {     /* MJD 20010112 */
15247    MJD_OFFSET_DEBUG(
15248     ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15249     "reg_insert",
15250     __LINE__,
15251     PL_reg_name[op],
15252     (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15253      ? "Overwriting end of array!\n" : "OK",
15254     (UV)(src - RExC_emit_start),
15255     (UV)(dst - RExC_emit_start),
15256     (UV)RExC_offsets[0]));
15257    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15258    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15259   }
15260 #endif
15261  }
15262
15263
15264  place = opnd;  /* Op node, where operand used to be. */
15265 #ifdef RE_TRACK_PATTERN_OFFSETS
15266  if (RExC_offsets) {         /* MJD */
15267   MJD_OFFSET_DEBUG(
15268    ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15269    "reginsert",
15270    __LINE__,
15271    PL_reg_name[op],
15272    (UV)(place - RExC_emit_start) > RExC_offsets[0]
15273    ? "Overwriting end of array!\n" : "OK",
15274    (UV)(place - RExC_emit_start),
15275    (UV)(RExC_parse - RExC_start),
15276    (UV)RExC_offsets[0]));
15277   Set_Node_Offset(place, RExC_parse);
15278   Set_Node_Length(place, 1);
15279  }
15280 #endif
15281  src = NEXTOPER(place);
15282  FILL_ADVANCE_NODE(place, op);
15283  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
15284  Zero(src, offset, regnode);
15285 }
15286
15287 /*
15288 - regtail - set the next-pointer at the end of a node chain of p to val.
15289 - SEE ALSO: regtail_study
15290 */
15291 /* TODO: All three parms should be const */
15292 STATIC void
15293 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15294     const regnode *val,U32 depth)
15295 {
15296  regnode *scan;
15297  GET_RE_DEBUG_FLAGS_DECL;
15298
15299  PERL_ARGS_ASSERT_REGTAIL;
15300 #ifndef DEBUGGING
15301  PERL_UNUSED_ARG(depth);
15302 #endif
15303
15304  if (SIZE_ONLY)
15305   return;
15306
15307  /* Find last node. */
15308  scan = p;
15309  for (;;) {
15310   regnode * const temp = regnext(scan);
15311   DEBUG_PARSE_r({
15312    SV * const mysv=sv_newmortal();
15313    DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15314    regprop(RExC_rx, mysv, scan, NULL);
15315    PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15316     SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15317      (temp == NULL ? "->" : ""),
15318      (temp == NULL ? PL_reg_name[OP(val)] : "")
15319    );
15320   });
15321   if (temp == NULL)
15322    break;
15323   scan = temp;
15324  }
15325
15326  if (reg_off_by_arg[OP(scan)]) {
15327   ARG_SET(scan, val - scan);
15328  }
15329  else {
15330   NEXT_OFF(scan) = val - scan;
15331  }
15332 }
15333
15334 #ifdef DEBUGGING
15335 /*
15336 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15337 - Look for optimizable sequences at the same time.
15338 - currently only looks for EXACT chains.
15339
15340 This is experimental code. The idea is to use this routine to perform
15341 in place optimizations on branches and groups as they are constructed,
15342 with the long term intention of removing optimization from study_chunk so
15343 that it is purely analytical.
15344
15345 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15346 to control which is which.
15347
15348 */
15349 /* TODO: All four parms should be const */
15350
15351 STATIC U8
15352 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15353      const regnode *val,U32 depth)
15354 {
15355  dVAR;
15356  regnode *scan;
15357  U8 exact = PSEUDO;
15358 #ifdef EXPERIMENTAL_INPLACESCAN
15359  I32 min = 0;
15360 #endif
15361  GET_RE_DEBUG_FLAGS_DECL;
15362
15363  PERL_ARGS_ASSERT_REGTAIL_STUDY;
15364
15365
15366  if (SIZE_ONLY)
15367   return exact;
15368
15369  /* Find last node. */
15370
15371  scan = p;
15372  for (;;) {
15373   regnode * const temp = regnext(scan);
15374 #ifdef EXPERIMENTAL_INPLACESCAN
15375   if (PL_regkind[OP(scan)] == EXACT) {
15376    bool unfolded_multi_char; /* Unexamined in this routine */
15377    if (join_exact(pRExC_state, scan, &min,
15378       &unfolded_multi_char, 1, val, depth+1))
15379     return EXACT;
15380   }
15381 #endif
15382   if ( exact ) {
15383    switch (OP(scan)) {
15384     case EXACT:
15385     case EXACTF:
15386     case EXACTFA_NO_TRIE:
15387     case EXACTFA:
15388     case EXACTFU:
15389     case EXACTFU_SS:
15390     case EXACTFL:
15391       if( exact == PSEUDO )
15392        exact= OP(scan);
15393       else if ( exact != OP(scan) )
15394        exact= 0;
15395     case NOTHING:
15396      break;
15397     default:
15398      exact= 0;
15399    }
15400   }
15401   DEBUG_PARSE_r({
15402    SV * const mysv=sv_newmortal();
15403    DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15404    regprop(RExC_rx, mysv, scan, NULL);
15405    PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15406     SvPV_nolen_const(mysv),
15407     REG_NODE_NUM(scan),
15408     PL_reg_name[exact]);
15409   });
15410   if (temp == NULL)
15411    break;
15412   scan = temp;
15413  }
15414  DEBUG_PARSE_r({
15415   SV * const mysv_val=sv_newmortal();
15416   DEBUG_PARSE_MSG("");
15417   regprop(RExC_rx, mysv_val, val, NULL);
15418   PerlIO_printf(Perl_debug_log,
15419      "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15420      SvPV_nolen_const(mysv_val),
15421      (IV)REG_NODE_NUM(val),
15422      (IV)(val - scan)
15423   );
15424  });
15425  if (reg_off_by_arg[OP(scan)]) {
15426   ARG_SET(scan, val - scan);
15427  }
15428  else {
15429   NEXT_OFF(scan) = val - scan;
15430  }
15431
15432  return exact;
15433 }
15434 #endif
15435
15436 /*
15437  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15438  */
15439 #ifdef DEBUGGING
15440
15441 static void
15442 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15443 {
15444  int bit;
15445  int set=0;
15446
15447  ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15448
15449  for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15450   if (flags & (1<<bit)) {
15451    if (!set++ && lead)
15452     PerlIO_printf(Perl_debug_log, "%s",lead);
15453    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15454   }
15455  }
15456  if (lead)  {
15457   if (set)
15458    PerlIO_printf(Perl_debug_log, "\n");
15459   else
15460    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15461  }
15462 }
15463
15464 static void
15465 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15466 {
15467  int bit;
15468  int set=0;
15469  regex_charset cs;
15470
15471  ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15472
15473  for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15474   if (flags & (1<<bit)) {
15475    if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
15476     continue;
15477    }
15478    if (!set++ && lead)
15479     PerlIO_printf(Perl_debug_log, "%s",lead);
15480    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15481   }
15482  }
15483  if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15484    if (!set++ && lead) {
15485     PerlIO_printf(Perl_debug_log, "%s",lead);
15486    }
15487    switch (cs) {
15488     case REGEX_UNICODE_CHARSET:
15489      PerlIO_printf(Perl_debug_log, "UNICODE");
15490      break;
15491     case REGEX_LOCALE_CHARSET:
15492      PerlIO_printf(Perl_debug_log, "LOCALE");
15493      break;
15494     case REGEX_ASCII_RESTRICTED_CHARSET:
15495      PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15496      break;
15497     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15498      PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15499      break;
15500     default:
15501      PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15502      break;
15503    }
15504  }
15505  if (lead)  {
15506   if (set)
15507    PerlIO_printf(Perl_debug_log, "\n");
15508   else
15509    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15510  }
15511 }
15512 #endif
15513
15514 void
15515 Perl_regdump(pTHX_ const regexp *r)
15516 {
15517 #ifdef DEBUGGING
15518  dVAR;
15519  SV * const sv = sv_newmortal();
15520  SV *dsv= sv_newmortal();
15521  RXi_GET_DECL(r,ri);
15522  GET_RE_DEBUG_FLAGS_DECL;
15523
15524  PERL_ARGS_ASSERT_REGDUMP;
15525
15526  (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15527
15528  /* Header fields of interest. */
15529  if (r->anchored_substr) {
15530   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15531    RE_SV_DUMPLEN(r->anchored_substr), 30);
15532   PerlIO_printf(Perl_debug_log,
15533      "anchored %s%s at %"IVdf" ",
15534      s, RE_SV_TAIL(r->anchored_substr),
15535      (IV)r->anchored_offset);
15536  } else if (r->anchored_utf8) {
15537   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15538    RE_SV_DUMPLEN(r->anchored_utf8), 30);
15539   PerlIO_printf(Perl_debug_log,
15540      "anchored utf8 %s%s at %"IVdf" ",
15541      s, RE_SV_TAIL(r->anchored_utf8),
15542      (IV)r->anchored_offset);
15543  }
15544  if (r->float_substr) {
15545   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15546    RE_SV_DUMPLEN(r->float_substr), 30);
15547   PerlIO_printf(Perl_debug_log,
15548      "floating %s%s at %"IVdf"..%"UVuf" ",
15549      s, RE_SV_TAIL(r->float_substr),
15550      (IV)r->float_min_offset, (UV)r->float_max_offset);
15551  } else if (r->float_utf8) {
15552   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15553    RE_SV_DUMPLEN(r->float_utf8), 30);
15554   PerlIO_printf(Perl_debug_log,
15555      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15556      s, RE_SV_TAIL(r->float_utf8),
15557      (IV)r->float_min_offset, (UV)r->float_max_offset);
15558  }
15559  if (r->check_substr || r->check_utf8)
15560   PerlIO_printf(Perl_debug_log,
15561      (const char *)
15562      (r->check_substr == r->float_substr
15563      && r->check_utf8 == r->float_utf8
15564      ? "(checking floating" : "(checking anchored"));
15565  if (r->intflags & PREGf_NOSCAN)
15566   PerlIO_printf(Perl_debug_log, " noscan");
15567  if (r->extflags & RXf_CHECK_ALL)
15568   PerlIO_printf(Perl_debug_log, " isall");
15569  if (r->check_substr || r->check_utf8)
15570   PerlIO_printf(Perl_debug_log, ") ");
15571
15572  if (ri->regstclass) {
15573   regprop(r, sv, ri->regstclass, NULL);
15574   PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15575  }
15576  if (r->intflags & PREGf_ANCH) {
15577   PerlIO_printf(Perl_debug_log, "anchored");
15578   if (r->intflags & PREGf_ANCH_BOL)
15579    PerlIO_printf(Perl_debug_log, "(BOL)");
15580   if (r->intflags & PREGf_ANCH_MBOL)
15581    PerlIO_printf(Perl_debug_log, "(MBOL)");
15582   if (r->intflags & PREGf_ANCH_SBOL)
15583    PerlIO_printf(Perl_debug_log, "(SBOL)");
15584   if (r->intflags & PREGf_ANCH_GPOS)
15585    PerlIO_printf(Perl_debug_log, "(GPOS)");
15586   PerlIO_putc(Perl_debug_log, ' ');
15587  }
15588  if (r->intflags & PREGf_GPOS_SEEN)
15589   PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15590  if (r->intflags & PREGf_SKIP)
15591   PerlIO_printf(Perl_debug_log, "plus ");
15592  if (r->intflags & PREGf_IMPLICIT)
15593   PerlIO_printf(Perl_debug_log, "implicit ");
15594  PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15595  if (r->extflags & RXf_EVAL_SEEN)
15596   PerlIO_printf(Perl_debug_log, "with eval ");
15597  PerlIO_printf(Perl_debug_log, "\n");
15598  DEBUG_FLAGS_r({
15599   regdump_extflags("r->extflags: ",r->extflags);
15600   regdump_intflags("r->intflags: ",r->intflags);
15601  });
15602 #else
15603  PERL_ARGS_ASSERT_REGDUMP;
15604  PERL_UNUSED_CONTEXT;
15605  PERL_UNUSED_ARG(r);
15606 #endif /* DEBUGGING */
15607 }
15608
15609 /*
15610 - regprop - printable representation of opcode, with run time support
15611 */
15612
15613 void
15614 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
15615 {
15616 #ifdef DEBUGGING
15617  dVAR;
15618  int k;
15619
15620  /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15621  static const char * const anyofs[] = {
15622 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15623  || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
15624  || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
15625  || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
15626  || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
15627  || _CC_VERTSPACE != 16
15628   #error Need to adjust order of anyofs[]
15629 #endif
15630   "\\w",
15631   "\\W",
15632   "\\d",
15633   "\\D",
15634   "[:alpha:]",
15635   "[:^alpha:]",
15636   "[:lower:]",
15637   "[:^lower:]",
15638   "[:upper:]",
15639   "[:^upper:]",
15640   "[:punct:]",
15641   "[:^punct:]",
15642   "[:print:]",
15643   "[:^print:]",
15644   "[:alnum:]",
15645   "[:^alnum:]",
15646   "[:graph:]",
15647   "[:^graph:]",
15648   "[:cased:]",
15649   "[:^cased:]",
15650   "\\s",
15651   "\\S",
15652   "[:blank:]",
15653   "[:^blank:]",
15654   "[:xdigit:]",
15655   "[:^xdigit:]",
15656   "[:space:]",
15657   "[:^space:]",
15658   "[:cntrl:]",
15659   "[:^cntrl:]",
15660   "[:ascii:]",
15661   "[:^ascii:]",
15662   "\\v",
15663   "\\V"
15664  };
15665  RXi_GET_DECL(prog,progi);
15666  GET_RE_DEBUG_FLAGS_DECL;
15667
15668  PERL_ARGS_ASSERT_REGPROP;
15669
15670  sv_setpvs(sv, "");
15671
15672  if (OP(o) > REGNODE_MAX)  /* regnode.type is unsigned */
15673   /* It would be nice to FAIL() here, but this may be called from
15674   regexec.c, and it would be hard to supply pRExC_state. */
15675   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15676            (int)OP(o), (int)REGNODE_MAX);
15677  sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15678
15679  k = PL_regkind[OP(o)];
15680
15681  if (k == EXACT) {
15682   sv_catpvs(sv, " ");
15683   /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15684   * is a crude hack but it may be the best for now since
15685   * we have no flag "this EXACTish node was UTF-8"
15686   * --jhi */
15687   pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15688     PERL_PV_ESCAPE_UNI_DETECT |
15689     PERL_PV_ESCAPE_NONASCII   |
15690     PERL_PV_PRETTY_ELLIPSES   |
15691     PERL_PV_PRETTY_LTGT       |
15692     PERL_PV_PRETTY_NOCLEAR
15693     );
15694  } else if (k == TRIE) {
15695   /* print the details of the trie in dumpuntil instead, as
15696   * progi->data isn't available here */
15697   const char op = OP(o);
15698   const U32 n = ARG(o);
15699   const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15700    (reg_ac_data *)progi->data->data[n] :
15701    NULL;
15702   const reg_trie_data * const trie
15703    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15704
15705   Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15706   DEBUG_TRIE_COMPILE_r(
15707   Perl_sv_catpvf(aTHX_ sv,
15708    "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15709    (UV)trie->startstate,
15710    (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15711    (UV)trie->wordcount,
15712    (UV)trie->minlen,
15713    (UV)trie->maxlen,
15714    (UV)TRIE_CHARCOUNT(trie),
15715    (UV)trie->uniquecharcount
15716   );
15717   );
15718   if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15719    sv_catpvs(sv, "[");
15720    (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15721             ? ANYOF_BITMAP(o)
15722             : TRIE_BITMAP(trie));
15723    sv_catpvs(sv, "]");
15724   }
15725
15726  } else if (k == CURLY) {
15727   if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15728    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15729   Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15730  }
15731  else if (k == WHILEM && o->flags)   /* Ordinal/of */
15732   Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15733  else if (k == REF || k == OPEN || k == CLOSE
15734    || k == GROUPP || OP(o)==ACCEPT)
15735  {
15736   Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
15737   if ( RXp_PAREN_NAMES(prog) ) {
15738    if ( k != REF || (OP(o) < NREF)) {
15739     AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15740     SV **name= av_fetch(list, ARG(o), 0 );
15741     if (name)
15742      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15743    }
15744    else {
15745     AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15746     SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15747     I32 *nums=(I32*)SvPVX(sv_dat);
15748     SV **name= av_fetch(list, nums[0], 0 );
15749     I32 n;
15750     if (name) {
15751      for ( n=0; n<SvIVX(sv_dat); n++ ) {
15752       Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15753          (n ? "," : ""), (IV)nums[n]);
15754      }
15755      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15756     }
15757    }
15758   }
15759   if ( k == REF && reginfo) {
15760    U32 n = ARG(o);  /* which paren pair */
15761    I32 ln = prog->offs[n].start;
15762    if (prog->lastparen < n || ln == -1)
15763     Perl_sv_catpvf(aTHX_ sv, ": FAIL");
15764    else if (ln == prog->offs[n].end)
15765     Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
15766    else {
15767     const char *s = reginfo->strbeg + ln;
15768     Perl_sv_catpvf(aTHX_ sv, ": ");
15769     Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
15770      PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
15771    }
15772   }
15773  } else if (k == GOSUB)
15774   /* Paren and offset */
15775   Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
15776  else if (k == VERB) {
15777   if (!o->flags)
15778    Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15779       SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15780  } else if (k == LOGICAL)
15781   /* 2: embedded, otherwise 1 */
15782   Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
15783  else if (k == ANYOF) {
15784   const U8 flags = ANYOF_FLAGS(o);
15785   int do_sep = 0;
15786
15787
15788   if (flags & ANYOF_LOCALE_FLAGS)
15789    sv_catpvs(sv, "{loc}");
15790   if (flags & ANYOF_LOC_FOLD)
15791    sv_catpvs(sv, "{i}");
15792   Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15793   if (flags & ANYOF_INVERT)
15794    sv_catpvs(sv, "^");
15795
15796   /* output what the standard cp 0-255 bitmap matches */
15797   do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15798
15799   /* output any special charclass tests (used entirely under use
15800   * locale) * */
15801   if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15802    int i;
15803    for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15804     if (ANYOF_POSIXL_TEST(o,i)) {
15805      sv_catpv(sv, anyofs[i]);
15806      do_sep = 1;
15807     }
15808    }
15809   }
15810
15811   if ((flags & (ANYOF_ABOVE_LATIN1_ALL
15812      |ANYOF_UTF8
15813      |ANYOF_NONBITMAP_NON_UTF8
15814      |ANYOF_LOC_FOLD)))
15815   {
15816    if (do_sep) {
15817     Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15818     if (flags & ANYOF_INVERT)
15819      /*make sure the invert info is in each */
15820      sv_catpvs(sv, "^");
15821    }
15822
15823    if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
15824     sv_catpvs(sv, "{non-utf8-latin1-all}");
15825    }
15826
15827    /* output information about the unicode matching */
15828    if (flags & ANYOF_ABOVE_LATIN1_ALL)
15829     sv_catpvs(sv, "{unicode_all}");
15830    else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
15831     SV *lv; /* Set if there is something outside the bit map. */
15832     bool byte_output = FALSE;   /* If something in the bitmap has
15833            been output */
15834     SV *only_utf8_locale;
15835
15836     /* Get the stuff that wasn't in the bitmap */
15837     (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
15838              &lv, &only_utf8_locale);
15839     if (lv && lv != &PL_sv_undef) {
15840      char *s = savesvpv(lv);
15841      char * const origs = s;
15842
15843      while (*s && *s != '\n')
15844       s++;
15845
15846      if (*s == '\n') {
15847       const char * const t = ++s;
15848
15849       if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15850        sv_catpvs(sv, "{outside bitmap}");
15851       }
15852       else {
15853        sv_catpvs(sv, "{utf8}");
15854       }
15855
15856       if (byte_output) {
15857        sv_catpvs(sv, " ");
15858       }
15859
15860       while (*s) {
15861        if (*s == '\n') {
15862
15863         /* Truncate very long output */
15864         if (s - origs > 256) {
15865          Perl_sv_catpvf(aTHX_ sv,
15866             "%.*s...",
15867             (int) (s - origs - 1),
15868             t);
15869          goto out_dump;
15870         }
15871         *s = ' ';
15872        }
15873        else if (*s == '\t') {
15874         *s = '-';
15875        }
15876        s++;
15877       }
15878       if (s[-1] == ' ')
15879        s[-1] = 0;
15880
15881       sv_catpv(sv, t);
15882      }
15883
15884     out_dump:
15885
15886      Safefree(origs);
15887      SvREFCNT_dec_NN(lv);
15888     }
15889
15890     if ((flags & ANYOF_LOC_FOLD)
15891      && only_utf8_locale
15892      && only_utf8_locale != &PL_sv_undef)
15893     {
15894      UV start, end;
15895      int max_entries = 256;
15896
15897      sv_catpvs(sv, "{utf8 locale}");
15898      invlist_iterinit(only_utf8_locale);
15899      while (invlist_iternext(only_utf8_locale,
15900            &start, &end)) {
15901       put_range(sv, start, end);
15902       max_entries --;
15903       if (max_entries < 0) {
15904        sv_catpvs(sv, "...");
15905        break;
15906       }
15907      }
15908      invlist_iterfinish(only_utf8_locale);
15909     }
15910    }
15911   }
15912
15913   Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15914  }
15915  else if (k == POSIXD || k == NPOSIXD) {
15916   U8 index = FLAGS(o) * 2;
15917   if (index < C_ARRAY_LENGTH(anyofs)) {
15918    if (*anyofs[index] != '[')  {
15919     sv_catpv(sv, "[");
15920    }
15921    sv_catpv(sv, anyofs[index]);
15922    if (*anyofs[index] != '[')  {
15923     sv_catpv(sv, "]");
15924    }
15925   }
15926   else {
15927    Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15928   }
15929  }
15930  else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15931   Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15932 #else
15933  PERL_UNUSED_CONTEXT;
15934  PERL_UNUSED_ARG(sv);
15935  PERL_UNUSED_ARG(o);
15936  PERL_UNUSED_ARG(prog);
15937  PERL_UNUSED_ARG(reginfo);
15938 #endif /* DEBUGGING */
15939 }
15940
15941
15942
15943 SV *
15944 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15945 {    /* Assume that RE_INTUIT is set */
15946  struct regexp *const prog = ReANY(r);
15947  GET_RE_DEBUG_FLAGS_DECL;
15948
15949  PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15950  PERL_UNUSED_CONTEXT;
15951
15952  DEBUG_COMPILE_r(
15953   {
15954    const char * const s = SvPV_nolen_const(prog->check_substr
15955      ? prog->check_substr : prog->check_utf8);
15956
15957    if (!PL_colorset) reginitcolors();
15958    PerlIO_printf(Perl_debug_log,
15959      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15960      PL_colors[4],
15961      prog->check_substr ? "" : "utf8 ",
15962      PL_colors[5],PL_colors[0],
15963      s,
15964      PL_colors[1],
15965      (strlen(s) > 60 ? "..." : ""));
15966   } );
15967
15968  return prog->check_substr ? prog->check_substr : prog->check_utf8;
15969 }
15970
15971 /*
15972    pregfree()
15973
15974    handles refcounting and freeing the perl core regexp structure. When
15975    it is necessary to actually free the structure the first thing it
15976    does is call the 'free' method of the regexp_engine associated to
15977    the regexp, allowing the handling of the void *pprivate; member
15978    first. (This routine is not overridable by extensions, which is why
15979    the extensions free is called first.)
15980
15981    See regdupe and regdupe_internal if you change anything here.
15982 */
15983 #ifndef PERL_IN_XSUB_RE
15984 void
15985 Perl_pregfree(pTHX_ REGEXP *r)
15986 {
15987  SvREFCNT_dec(r);
15988 }
15989
15990 void
15991 Perl_pregfree2(pTHX_ REGEXP *rx)
15992 {
15993  struct regexp *const r = ReANY(rx);
15994  GET_RE_DEBUG_FLAGS_DECL;
15995
15996  PERL_ARGS_ASSERT_PREGFREE2;
15997
15998  if (r->mother_re) {
15999   ReREFCNT_dec(r->mother_re);
16000  } else {
16001   CALLREGFREE_PVT(rx); /* free the private data */
16002   SvREFCNT_dec(RXp_PAREN_NAMES(r));
16003   Safefree(r->xpv_len_u.xpvlenu_pv);
16004  }
16005  if (r->substrs) {
16006   SvREFCNT_dec(r->anchored_substr);
16007   SvREFCNT_dec(r->anchored_utf8);
16008   SvREFCNT_dec(r->float_substr);
16009   SvREFCNT_dec(r->float_utf8);
16010   Safefree(r->substrs);
16011  }
16012  RX_MATCH_COPY_FREE(rx);
16013 #ifdef PERL_ANY_COW
16014  SvREFCNT_dec(r->saved_copy);
16015 #endif
16016  Safefree(r->offs);
16017  SvREFCNT_dec(r->qr_anoncv);
16018  rx->sv_u.svu_rx = 0;
16019 }
16020
16021 /*  reg_temp_copy()
16022
16023  This is a hacky workaround to the structural issue of match results
16024  being stored in the regexp structure which is in turn stored in
16025  PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16026  could be PL_curpm in multiple contexts, and could require multiple
16027  result sets being associated with the pattern simultaneously, such
16028  as when doing a recursive match with (??{$qr})
16029
16030  The solution is to make a lightweight copy of the regexp structure
16031  when a qr// is returned from the code executed by (??{$qr}) this
16032  lightweight copy doesn't actually own any of its data except for
16033  the starp/end and the actual regexp structure itself.
16034
16035 */
16036
16037
16038 REGEXP *
16039 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16040 {
16041  struct regexp *ret;
16042  struct regexp *const r = ReANY(rx);
16043  const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16044
16045  PERL_ARGS_ASSERT_REG_TEMP_COPY;
16046
16047  if (!ret_x)
16048   ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16049  else {
16050   SvOK_off((SV *)ret_x);
16051   if (islv) {
16052    /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16053    to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
16054    made both spots point to the same regexp body.) */
16055    REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16056    assert(!SvPVX(ret_x));
16057    ret_x->sv_u.svu_rx = temp->sv_any;
16058    temp->sv_any = NULL;
16059    SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16060    SvREFCNT_dec_NN(temp);
16061    /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16062    ing below will not set it. */
16063    SvCUR_set(ret_x, SvCUR(rx));
16064   }
16065  }
16066  /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16067  sv_force_normal(sv) is called.  */
16068  SvFAKE_on(ret_x);
16069  ret = ReANY(ret_x);
16070
16071  SvFLAGS(ret_x) |= SvUTF8(rx);
16072  /* We share the same string buffer as the original regexp, on which we
16073  hold a reference count, incremented when mother_re is set below.
16074  The string pointer is copied here, being part of the regexp struct.
16075  */
16076  memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16077   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16078  if (r->offs) {
16079   const I32 npar = r->nparens+1;
16080   Newx(ret->offs, npar, regexp_paren_pair);
16081   Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16082  }
16083  if (r->substrs) {
16084   Newx(ret->substrs, 1, struct reg_substr_data);
16085   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16086
16087   SvREFCNT_inc_void(ret->anchored_substr);
16088   SvREFCNT_inc_void(ret->anchored_utf8);
16089   SvREFCNT_inc_void(ret->float_substr);
16090   SvREFCNT_inc_void(ret->float_utf8);
16091
16092   /* check_substr and check_utf8, if non-NULL, point to either their
16093   anchored or float namesakes, and don't hold a second reference.  */
16094  }
16095  RX_MATCH_COPIED_off(ret_x);
16096 #ifdef PERL_ANY_COW
16097  ret->saved_copy = NULL;
16098 #endif
16099  ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16100  SvREFCNT_inc_void(ret->qr_anoncv);
16101
16102  return ret_x;
16103 }
16104 #endif
16105
16106 /* regfree_internal()
16107
16108    Free the private data in a regexp. This is overloadable by
16109    extensions. Perl takes care of the regexp structure in pregfree(),
16110    this covers the *pprivate pointer which technically perl doesn't
16111    know about, however of course we have to handle the
16112    regexp_internal structure when no extension is in use.
16113
16114    Note this is called before freeing anything in the regexp
16115    structure.
16116  */
16117
16118 void
16119 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16120 {
16121  struct regexp *const r = ReANY(rx);
16122  RXi_GET_DECL(r,ri);
16123  GET_RE_DEBUG_FLAGS_DECL;
16124
16125  PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16126
16127  DEBUG_COMPILE_r({
16128   if (!PL_colorset)
16129    reginitcolors();
16130   {
16131    SV *dsv= sv_newmortal();
16132    RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16133     dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16134    PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16135     PL_colors[4],PL_colors[5],s);
16136   }
16137  });
16138 #ifdef RE_TRACK_PATTERN_OFFSETS
16139  if (ri->u.offsets)
16140   Safefree(ri->u.offsets);             /* 20010421 MJD */
16141 #endif
16142  if (ri->code_blocks) {
16143   int n;
16144   for (n = 0; n < ri->num_code_blocks; n++)
16145    SvREFCNT_dec(ri->code_blocks[n].src_regex);
16146   Safefree(ri->code_blocks);
16147  }
16148
16149  if (ri->data) {
16150   int n = ri->data->count;
16151
16152   while (--n >= 0) {
16153   /* If you add a ->what type here, update the comment in regcomp.h */
16154    switch (ri->data->what[n]) {
16155    case 'a':
16156    case 'r':
16157    case 's':
16158    case 'S':
16159    case 'u':
16160     SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16161     break;
16162    case 'f':
16163     Safefree(ri->data->data[n]);
16164     break;
16165    case 'l':
16166    case 'L':
16167     break;
16168    case 'T':
16169     { /* Aho Corasick add-on structure for a trie node.
16170      Used in stclass optimization only */
16171      U32 refcount;
16172      reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16173 #ifdef USE_ITHREADS
16174      dVAR;
16175 #endif
16176      OP_REFCNT_LOCK;
16177      refcount = --aho->refcount;
16178      OP_REFCNT_UNLOCK;
16179      if ( !refcount ) {
16180       PerlMemShared_free(aho->states);
16181       PerlMemShared_free(aho->fail);
16182       /* do this last!!!! */
16183       PerlMemShared_free(ri->data->data[n]);
16184       /* we should only ever get called once, so
16185       * assert as much, and also guard the free
16186       * which /might/ happen twice. At the least
16187       * it will make code anlyzers happy and it
16188       * doesn't cost much. - Yves */
16189       assert(ri->regstclass);
16190       if (ri->regstclass) {
16191        PerlMemShared_free(ri->regstclass);
16192        ri->regstclass = 0;
16193       }
16194      }
16195     }
16196     break;
16197    case 't':
16198     {
16199      /* trie structure. */
16200      U32 refcount;
16201      reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16202 #ifdef USE_ITHREADS
16203      dVAR;
16204 #endif
16205      OP_REFCNT_LOCK;
16206      refcount = --trie->refcount;
16207      OP_REFCNT_UNLOCK;
16208      if ( !refcount ) {
16209       PerlMemShared_free(trie->charmap);
16210       PerlMemShared_free(trie->states);
16211       PerlMemShared_free(trie->trans);
16212       if (trie->bitmap)
16213        PerlMemShared_free(trie->bitmap);
16214       if (trie->jump)
16215        PerlMemShared_free(trie->jump);
16216       PerlMemShared_free(trie->wordinfo);
16217       /* do this last!!!! */
16218       PerlMemShared_free(ri->data->data[n]);
16219      }
16220     }
16221     break;
16222    default:
16223     Perl_croak(aTHX_ "panic: regfree data code '%c'",
16224              ri->data->what[n]);
16225    }
16226   }
16227   Safefree(ri->data->what);
16228   Safefree(ri->data);
16229  }
16230
16231  Safefree(ri);
16232 }
16233
16234 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16235 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16236 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
16237
16238 /*
16239    re_dup - duplicate a regexp.
16240
16241    This routine is expected to clone a given regexp structure. It is only
16242    compiled under USE_ITHREADS.
16243
16244    After all of the core data stored in struct regexp is duplicated
16245    the regexp_engine.dupe method is used to copy any private data
16246    stored in the *pprivate pointer. This allows extensions to handle
16247    any duplication it needs to do.
16248
16249    See pregfree() and regfree_internal() if you change anything here.
16250 */
16251 #if defined(USE_ITHREADS)
16252 #ifndef PERL_IN_XSUB_RE
16253 void
16254 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16255 {
16256  dVAR;
16257  I32 npar;
16258  const struct regexp *r = ReANY(sstr);
16259  struct regexp *ret = ReANY(dstr);
16260
16261  PERL_ARGS_ASSERT_RE_DUP_GUTS;
16262
16263  npar = r->nparens+1;
16264  Newx(ret->offs, npar, regexp_paren_pair);
16265  Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16266
16267  if (ret->substrs) {
16268   /* Do it this way to avoid reading from *r after the StructCopy().
16269   That way, if any of the sv_dup_inc()s dislodge *r from the L1
16270   cache, it doesn't matter.  */
16271   const bool anchored = r->check_substr
16272    ? r->check_substr == r->anchored_substr
16273    : r->check_utf8 == r->anchored_utf8;
16274   Newx(ret->substrs, 1, struct reg_substr_data);
16275   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16276
16277   ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16278   ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16279   ret->float_substr = sv_dup_inc(ret->float_substr, param);
16280   ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16281
16282   /* check_substr and check_utf8, if non-NULL, point to either their
16283   anchored or float namesakes, and don't hold a second reference.  */
16284
16285   if (ret->check_substr) {
16286    if (anchored) {
16287     assert(r->check_utf8 == r->anchored_utf8);
16288     ret->check_substr = ret->anchored_substr;
16289     ret->check_utf8 = ret->anchored_utf8;
16290    } else {
16291     assert(r->check_substr == r->float_substr);
16292     assert(r->check_utf8 == r->float_utf8);
16293     ret->check_substr = ret->float_substr;
16294     ret->check_utf8 = ret->float_utf8;
16295    }
16296   } else if (ret->check_utf8) {
16297    if (anchored) {
16298     ret->check_utf8 = ret->anchored_utf8;
16299    } else {
16300     ret->check_utf8 = ret->float_utf8;
16301    }
16302   }
16303  }
16304
16305  RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16306  ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16307
16308  if (ret->pprivate)
16309   RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16310
16311  if (RX_MATCH_COPIED(dstr))
16312   ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
16313  else
16314   ret->subbeg = NULL;
16315 #ifdef PERL_ANY_COW
16316  ret->saved_copy = NULL;
16317 #endif
16318
16319  /* Whether mother_re be set or no, we need to copy the string.  We
16320  cannot refrain from copying it when the storage points directly to
16321  our mother regexp, because that's
16322    1: a buffer in a different thread
16323    2: something we no longer hold a reference on
16324    so we need to copy it locally.  */
16325  RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16326  ret->mother_re   = NULL;
16327 }
16328 #endif /* PERL_IN_XSUB_RE */
16329
16330 /*
16331    regdupe_internal()
16332
16333    This is the internal complement to regdupe() which is used to copy
16334    the structure pointed to by the *pprivate pointer in the regexp.
16335    This is the core version of the extension overridable cloning hook.
16336    The regexp structure being duplicated will be copied by perl prior
16337    to this and will be provided as the regexp *r argument, however
16338    with the /old/ structures pprivate pointer value. Thus this routine
16339    may override any copying normally done by perl.
16340
16341    It returns a pointer to the new regexp_internal structure.
16342 */
16343
16344 void *
16345 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16346 {
16347  dVAR;
16348  struct regexp *const r = ReANY(rx);
16349  regexp_internal *reti;
16350  int len;
16351  RXi_GET_DECL(r,ri);
16352
16353  PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16354
16355  len = ProgLen(ri);
16356
16357  Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16358   char, regexp_internal);
16359  Copy(ri->program, reti->program, len+1, regnode);
16360
16361  reti->num_code_blocks = ri->num_code_blocks;
16362  if (ri->code_blocks) {
16363   int n;
16364   Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16365     struct reg_code_block);
16366   Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16367     struct reg_code_block);
16368   for (n = 0; n < ri->num_code_blocks; n++)
16369    reti->code_blocks[n].src_regex = (REGEXP*)
16370      sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16371  }
16372  else
16373   reti->code_blocks = NULL;
16374
16375  reti->regstclass = NULL;
16376
16377  if (ri->data) {
16378   struct reg_data *d;
16379   const int count = ri->data->count;
16380   int i;
16381
16382   Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16383     char, struct reg_data);
16384   Newx(d->what, count, U8);
16385
16386   d->count = count;
16387   for (i = 0; i < count; i++) {
16388    d->what[i] = ri->data->what[i];
16389    switch (d->what[i]) {
16390     /* see also regcomp.h and regfree_internal() */
16391    case 'a': /* actually an AV, but the dup function is identical.  */
16392    case 'r':
16393    case 's':
16394    case 'S':
16395    case 'u': /* actually an HV, but the dup function is identical.  */
16396     d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16397     break;
16398    case 'f':
16399     /* This is cheating. */
16400     Newx(d->data[i], 1, regnode_ssc);
16401     StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16402     reti->regstclass = (regnode*)d->data[i];
16403     break;
16404    case 'T':
16405     /* Trie stclasses are readonly and can thus be shared
16406     * without duplication. We free the stclass in pregfree
16407     * when the corresponding reg_ac_data struct is freed.
16408     */
16409     reti->regstclass= ri->regstclass;
16410     /* FALLTHROUGH */
16411    case 't':
16412     OP_REFCNT_LOCK;
16413     ((reg_trie_data*)ri->data->data[i])->refcount++;
16414     OP_REFCNT_UNLOCK;
16415     /* FALLTHROUGH */
16416    case 'l':
16417    case 'L':
16418     d->data[i] = ri->data->data[i];
16419     break;
16420    default:
16421     Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16422               ri->data->what[i]);
16423    }
16424   }
16425
16426   reti->data = d;
16427  }
16428  else
16429   reti->data = NULL;
16430
16431  reti->name_list_idx = ri->name_list_idx;
16432
16433 #ifdef RE_TRACK_PATTERN_OFFSETS
16434  if (ri->u.offsets) {
16435   Newx(reti->u.offsets, 2*len+1, U32);
16436   Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16437  }
16438 #else
16439  SetProgLen(reti,len);
16440 #endif
16441
16442  return (void*)reti;
16443 }
16444
16445 #endif    /* USE_ITHREADS */
16446
16447 #ifndef PERL_IN_XSUB_RE
16448
16449 /*
16450  - regnext - dig the "next" pointer out of a node
16451  */
16452 regnode *
16453 Perl_regnext(pTHX_ regnode *p)
16454 {
16455  I32 offset;
16456
16457  if (!p)
16458   return(NULL);
16459
16460  if (OP(p) > REGNODE_MAX) {  /* regnode.type is unsigned */
16461   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16462             (int)OP(p), (int)REGNODE_MAX);
16463  }
16464
16465  offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16466  if (offset == 0)
16467   return(NULL);
16468
16469  return(p+offset);
16470 }
16471 #endif
16472
16473 STATIC void
16474 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16475 {
16476  va_list args;
16477  STRLEN l1 = strlen(pat1);
16478  STRLEN l2 = strlen(pat2);
16479  char buf[512];
16480  SV *msv;
16481  const char *message;
16482
16483  PERL_ARGS_ASSERT_RE_CROAK2;
16484
16485  if (l1 > 510)
16486   l1 = 510;
16487  if (l1 + l2 > 510)
16488   l2 = 510 - l1;
16489  Copy(pat1, buf, l1 , char);
16490  Copy(pat2, buf + l1, l2 , char);
16491  buf[l1 + l2] = '\n';
16492  buf[l1 + l2 + 1] = '\0';
16493  va_start(args, pat2);
16494  msv = vmess(buf, &args);
16495  va_end(args);
16496  message = SvPV_const(msv,l1);
16497  if (l1 > 512)
16498   l1 = 512;
16499  Copy(message, buf, l1 , char);
16500  /* l1-1 to avoid \n */
16501  Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16502 }
16503
16504 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
16505
16506 #ifndef PERL_IN_XSUB_RE
16507 void
16508 Perl_save_re_context(pTHX)
16509 {
16510  /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16511  if (PL_curpm) {
16512   const REGEXP * const rx = PM_GETRE(PL_curpm);
16513   if (rx) {
16514    U32 i;
16515    for (i = 1; i <= RX_NPARENS(rx); i++) {
16516     char digits[TYPE_CHARS(long)];
16517     const STRLEN len = my_snprintf(digits, sizeof(digits),
16518            "%lu", (long)i);
16519     GV *const *const gvp
16520      = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16521
16522     if (gvp) {
16523      GV * const gv = *gvp;
16524      if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16525       save_scalar(gv);
16526     }
16527    }
16528   }
16529  }
16530 }
16531 #endif
16532
16533 #ifdef DEBUGGING
16534
16535 STATIC void
16536 S_put_byte(pTHX_ SV *sv, int c)
16537 {
16538  PERL_ARGS_ASSERT_PUT_BYTE;
16539
16540  if (!isPRINT(c)) {
16541   switch (c) {
16542    case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16543    case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16544    case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16545    case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16546    case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16547
16548    default:
16549     Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16550     break;
16551   }
16552  }
16553  else {
16554   const char string = c;
16555   if (c == '-' || c == ']' || c == '\\' || c == '^')
16556    sv_catpvs(sv, "\\");
16557   sv_catpvn(sv, &string, 1);
16558  }
16559 }
16560
16561 STATIC void
16562 S_put_range(pTHX_ SV *sv, UV start, UV end)
16563 {
16564
16565  /* Appends to 'sv' a displayable version of the range of code points from
16566  * 'start' to 'end'.  It assumes that only ASCII printables are displayable
16567  * as-is (though some of these will be escaped by put_byte()).  For the
16568  * time being, this subroutine only works for latin1 (< 256) code points */
16569
16570  assert(start <= end);
16571
16572  PERL_ARGS_ASSERT_PUT_RANGE;
16573
16574  while (start <= end) {
16575   if (end - start < 3) {  /* Individual chars in short ranges */
16576    for (; start <= end; start++) {
16577     put_byte(sv, start);
16578    }
16579    break;
16580   }
16581
16582   /* For small ranges that include printable ASCII characters, it's more
16583   * legible to print those characters rather than hex values.  For
16584   * larger ranges that include more than printables, it's probably
16585   * clearer to just give the start and end points of the range in hex,
16586   * and that's all we can do if there aren't any printables within the
16587   * range
16588   *
16589   * On ASCII platforms the range of printables is contiguous.  If the
16590   * entire range is printable, we print each character as such.  If the
16591   * range is partially printable and partially not, it's less likely
16592   * that the individual printables are meaningful, especially if all or
16593   * almost all of them are in the range.  But we err on the side of the
16594   * individual printables being meaningful by using the hex only if the
16595   * range contains all but 2 of the printables.
16596   *
16597   * On EBCDIC platforms, the printables are scattered around so that the
16598   * maximum range length containing only them is about 10.  Anything
16599   * longer we treat as hex; otherwise we examine the range character by
16600   * character to see */
16601 #ifdef EBCDIC
16602   if (start < 256 && (((end < 255) ? end : 255) - start <= 10))
16603 #else
16604   if ((isPRINT_A(start) && isPRINT_A(end))
16605    || (end >= 0x7F && (isPRINT_A(start) && start > 0x21))
16606    || ((end < 0x7D && isPRINT_A(end)) && start < 0x20))
16607 #endif
16608   {
16609    /* If the range beginning isn't an ASCII printable, we find the
16610    * last such in the range, then split the output, so all the
16611    * non-printables are in one subrange; then process the remaining
16612    * portion as usual.  If the entire range isn't printables, we
16613    * don't split, but drop down to print as hex */
16614    if (! isPRINT_A(start)) {
16615     UV temp_end = start + 1;
16616     while (temp_end <= end && ! isPRINT_A(temp_end)) {
16617      temp_end++;
16618     }
16619     if (temp_end <= end) {
16620      put_range(sv, start, temp_end - 1);
16621      start = temp_end;
16622      continue;
16623     }
16624    }
16625
16626    /* If the range beginning is a digit, output a subrange of just the
16627    * digits, then process the remaining portion as usual */
16628    if (isDIGIT_A(start)) {
16629     put_byte(sv, start);
16630     sv_catpvs(sv, "-");
16631     while (start <= end && isDIGIT_A(start)) start++;
16632     put_byte(sv, start - 1);
16633     continue;
16634    }
16635
16636    /* Similarly for alphabetics.  Because in both ASCII and EBCDIC,
16637    * the code points for upper and lower A-Z and a-z aren't
16638    * intermixed, the resulting subrange will consist solely of either
16639    * upper- or lower- alphabetics */
16640    if (isALPHA_A(start)) {
16641     put_byte(sv, start);
16642     sv_catpvs(sv, "-");
16643     while (start <= end && isALPHA_A(start)) start++;
16644     put_byte(sv, start - 1);
16645     continue;
16646    }
16647
16648    /* We output any remaining printables as individual characters */
16649    if (isPUNCT_A(start) || isSPACE_A(start)) {
16650     while (start <= end && (isPUNCT_A(start) || isSPACE_A(start))) {
16651      put_byte(sv, start);
16652      start++;
16653     }
16654     continue;
16655    }
16656   }
16657
16658   /* Here is a control or non-ascii.  Output the range or subrange as
16659   * hex. */
16660   Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
16661      start,
16662      (end < 256) ? end : 255);
16663   break;
16664  }
16665 }
16666
16667 STATIC bool
16668 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16669 {
16670  /* Appends to 'sv' a displayable version of the innards of the bracketed
16671  * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
16672  * output anything */
16673
16674  int i;
16675  bool has_output_anything = FALSE;
16676
16677  PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16678
16679  for (i = 0; i < 256; i++) {
16680   if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
16681
16682    /* The character at index i should be output.  Find the next
16683    * character that should NOT be output */
16684    int j;
16685    for (j = i + 1; j <= 256; j++) {
16686     if (! BITMAP_TEST((U8 *) bitmap, j)) {
16687      break;
16688     }
16689    }
16690
16691    /* Everything between them is a single range that should be output
16692    * */
16693    put_range(sv, i, j - 1);
16694    has_output_anything = TRUE;
16695    i = j;
16696   }
16697  }
16698
16699  return has_output_anything;
16700 }
16701
16702 #define CLEAR_OPTSTART \
16703  if (optstart) STMT_START {                                               \
16704   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
16705        " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16706   optstart=NULL;                                                       \
16707  } STMT_END
16708
16709 #define DUMPUNTIL(b,e)                                                       \
16710      CLEAR_OPTSTART;                                          \
16711      node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16712
16713 STATIC const regnode *
16714 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16715    const regnode *last, const regnode *plast,
16716    SV* sv, I32 indent, U32 depth)
16717 {
16718  dVAR;
16719  U8 op = PSEUDO; /* Arbitrary non-END op. */
16720  const regnode *next;
16721  const regnode *optstart= NULL;
16722
16723  RXi_GET_DECL(r,ri);
16724  GET_RE_DEBUG_FLAGS_DECL;
16725
16726  PERL_ARGS_ASSERT_DUMPUNTIL;
16727
16728 #ifdef DEBUG_DUMPUNTIL
16729  PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16730   last ? last-start : 0,plast ? plast-start : 0);
16731 #endif
16732
16733  if (plast && plast < last)
16734   last= plast;
16735
16736  while (PL_regkind[op] != END && (!last || node < last)) {
16737   assert(node);
16738   /* While that wasn't END last time... */
16739   NODE_ALIGN(node);
16740   op = OP(node);
16741   if (op == CLOSE || op == WHILEM)
16742    indent--;
16743   next = regnext((regnode *)node);
16744
16745   /* Where, what. */
16746   if (OP(node) == OPTIMIZED) {
16747    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16748     optstart = node;
16749    else
16750     goto after_print;
16751   } else
16752    CLEAR_OPTSTART;
16753
16754   regprop(r, sv, node, NULL);
16755   PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16756      (int)(2*indent + 1), "", SvPVX_const(sv));
16757
16758   if (OP(node) != OPTIMIZED) {
16759    if (next == NULL)  /* Next ptr. */
16760     PerlIO_printf(Perl_debug_log, " (0)");
16761    else if (PL_regkind[(U8)op] == BRANCH
16762      && PL_regkind[OP(next)] != BRANCH )
16763     PerlIO_printf(Perl_debug_log, " (FAIL)");
16764    else
16765     PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16766    (void)PerlIO_putc(Perl_debug_log, '\n');
16767   }
16768
16769  after_print:
16770   if (PL_regkind[(U8)op] == BRANCHJ) {
16771    assert(next);
16772    {
16773     const regnode *nnode = (OP(next) == LONGJMP
16774          ? regnext((regnode *)next)
16775          : next);
16776     if (last && nnode > last)
16777      nnode = last;
16778     DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16779    }
16780   }
16781   else if (PL_regkind[(U8)op] == BRANCH) {
16782    assert(next);
16783    DUMPUNTIL(NEXTOPER(node), next);
16784   }
16785   else if ( PL_regkind[(U8)op]  == TRIE ) {
16786    const regnode *this_trie = node;
16787    const char op = OP(node);
16788    const U32 n = ARG(node);
16789    const reg_ac_data * const ac = op>=AHOCORASICK ?
16790    (reg_ac_data *)ri->data->data[n] :
16791    NULL;
16792    const reg_trie_data * const trie =
16793     (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16794 #ifdef DEBUGGING
16795    AV *const trie_words
16796       = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16797 #endif
16798    const regnode *nextbranch= NULL;
16799    I32 word_idx;
16800    sv_setpvs(sv, "");
16801    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16802     SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16803
16804     PerlIO_printf(Perl_debug_log, "%*s%s ",
16805     (int)(2*(indent+3)), "",
16806      elem_ptr
16807      ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
16808         SvCUR(*elem_ptr), 60,
16809         PL_colors[0], PL_colors[1],
16810         (SvUTF8(*elem_ptr)
16811         ? PERL_PV_ESCAPE_UNI
16812         : 0)
16813         | PERL_PV_PRETTY_ELLIPSES
16814         | PERL_PV_PRETTY_LTGT
16815        )
16816      : "???"
16817     );
16818     if (trie->jump) {
16819      U16 dist= trie->jump[word_idx+1];
16820      PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16821        (UV)((dist ? this_trie + dist : next) - start));
16822      if (dist) {
16823       if (!nextbranch)
16824        nextbranch= this_trie + trie->jump[0];
16825       DUMPUNTIL(this_trie + dist, nextbranch);
16826      }
16827      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16828       nextbranch= regnext((regnode *)nextbranch);
16829     } else {
16830      PerlIO_printf(Perl_debug_log, "\n");
16831     }
16832    }
16833    if (last && next > last)
16834     node= last;
16835    else
16836     node= next;
16837   }
16838   else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
16839    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16840      NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16841   }
16842   else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16843    assert(next);
16844    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16845   }
16846   else if ( op == PLUS || op == STAR) {
16847    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16848   }
16849   else if (PL_regkind[(U8)op] == ANYOF) {
16850    /* arglen 1 + class block */
16851    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16852       ? ANYOF_POSIXL_SKIP
16853       : ANYOF_SKIP);
16854    node = NEXTOPER(node);
16855   }
16856   else if (PL_regkind[(U8)op] == EXACT) {
16857    /* Literal string, where present. */
16858    node += NODE_SZ_STR(node) - 1;
16859    node = NEXTOPER(node);
16860   }
16861   else {
16862    node = NEXTOPER(node);
16863    node += regarglen[(U8)op];
16864   }
16865   if (op == CURLYX || op == OPEN)
16866    indent++;
16867  }
16868  CLEAR_OPTSTART;
16869 #ifdef DEBUG_DUMPUNTIL
16870  PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16871 #endif
16872  return node;
16873 }
16874
16875 #endif /* DEBUGGING */
16876
16877 /*
16878  * Local variables:
16879  * c-indentation-style: bsd
16880  * c-basic-offset: 4
16881  * indent-tabs-mode: nil
16882  * End:
16883  *
16884  * ex: set ts=8 sts=4 sw=4 et:
16885  */