src/5021001/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  * Copyright (c) 1986 by University of Toronto.
  42  * Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  * Permission is granted to anyone to use this software for any
  45  * purpose on any computer system, and to redistribute it freely,
  46  * subject to the following restrictions:
  47  *
  48  * 1. The author is not responsible for the consequences of use of
  49  *  this software, no matter how awful, even if they arise
  50  *  from defects in it.
  51  *
  52  * 2. The origin of this software must not be misrepresented, either
  53  *  by explicit claim or by omission.
  54  *
  55  * 3. Altered versions must be plainly marked as such, and must not
  56  *  be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  95  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
  97  _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 100
 101 #ifndef STATIC
 102 #define STATIC static
 103 #endif
 104
 105
 106 struct RExC_state_t {
 107  U32  flags;   /* RXf_* are we folding, multilining? */
 108  U32  pm_flags;  /* PMf_* stuff from the calling PMOP */
 109  char *precomp;  /* uncompiled string. */
 110  REGEXP *rx_sv;   /* The SV that is the regexp. */
 111  regexp *rx;                    /* perl core regexp structure */
 112  regexp_internal *rxi;           /* internal data for regexp object
 113           pprivate field */
 114  char *start;   /* Start of input for compile */
 115  char *end;   /* End of input for compile */
 116  char *parse;   /* Input-scan pointer. */
 117  SSize_t whilem_seen;  /* number of WHILEM in this expr */
 118  regnode *emit_start;  /* Start of emitted-code area */
 119  regnode *emit_bound;  /* First regnode outside of the
 120           allocated space */
 121  regnode *emit;   /* Code-emit pointer; if = &emit_dummy,
 122           implies compiling, so don't emit */
 123  regnode_ssc emit_dummy;  /* placeholder for emit to point to;
 124           large enough for the largest
 125           non-EXACTish node, so can use it as
 126           scratch in pass1 */
 127  I32  naughty;  /* How bad is this pattern? */
 128  I32  sawback;  /* Did we see \1, ...? */
 129  U32  seen;
 130  SSize_t size;   /* Code size. */
 131  I32                npar;            /* Capture buffer count, (OPEN) plus
 132           one. ("par" 0 is the whole
 133           pattern)*/
 134  I32  nestroot;  /* root parens we are in - used by
 135           accept */
 136  I32  extralen;
 137  I32  seen_zerolen;
 138  regnode **open_parens;  /* pointers to open parens */
 139  regnode **close_parens;  /* pointers to close parens */
 140  regnode *opend;   /* END node in program */
 141  I32  utf8;  /* whether the pattern is utf8 or not */
 142  I32  orig_utf8; /* whether the pattern was originally in utf8 */
 143         /* XXX use this for future optimisation of case
 144         * where pattern must be upgraded to utf8. */
 145  I32  uni_semantics; /* If a d charset modifier should use unicode
 146         rules, even if the pattern is not in
 147         utf8 */
 148  HV  *paren_names;  /* Paren names */
 149
 150  regnode **recurse;  /* Recurse regops */
 151  I32  recurse_count;  /* Number of recurse regops */
 152  U8          *study_chunk_recursed;  /* bitmap of which parens we have moved
 153           through */
 154  U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 155  I32  in_lookbehind;
 156  I32  contains_locale;
 157  I32  contains_i;
 158  I32  override_recoding;
 159  I32  in_multi_char_class;
 160  struct reg_code_block *code_blocks; /* positions of literal (?{})
 161            within pattern */
 162  int  num_code_blocks; /* size of code_blocks[] */
 163  int  code_index;  /* next code_blocks[] slot */
 164  SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 165 #ifdef ADD_TO_REGEXEC
 166  char  *starttry;  /* -Dr: where regtry was called. */
 167 #define RExC_starttry (pRExC_state->starttry)
 168 #endif
 169  SV  *runtime_code_qr; /* qr with the runtime code blocks */
 170 #ifdef DEBUGGING
 171  const char  *lastparse;
 172  I32         lastnum;
 173  AV          *paren_name_list;       /* idx -> name */
 174 #define RExC_lastparse (pRExC_state->lastparse)
 175 #define RExC_lastnum (pRExC_state->lastnum)
 176 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 177 #endif
 178 };
 179
 180 #define RExC_flags (pRExC_state->flags)
 181 #define RExC_pm_flags (pRExC_state->pm_flags)
 182 #define RExC_precomp (pRExC_state->precomp)
 183 #define RExC_rx_sv (pRExC_state->rx_sv)
 184 #define RExC_rx  (pRExC_state->rx)
 185 #define RExC_rxi (pRExC_state->rxi)
 186 #define RExC_start (pRExC_state->start)
 187 #define RExC_end (pRExC_state->end)
 188 #define RExC_parse (pRExC_state->parse)
 189 #define RExC_whilem_seen (pRExC_state->whilem_seen)
 190 #ifdef RE_TRACK_PATTERN_OFFSETS
 191 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
 192               others */
 193 #endif
 194 #define RExC_emit (pRExC_state->emit)
 195 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 196 #define RExC_emit_start (pRExC_state->emit_start)
 197 #define RExC_emit_bound (pRExC_state->emit_bound)
 198 #define RExC_naughty (pRExC_state->naughty)
 199 #define RExC_sawback (pRExC_state->sawback)
 200 #define RExC_seen (pRExC_state->seen)
 201 #define RExC_size (pRExC_state->size)
 202 #define RExC_maxlen        (pRExC_state->maxlen)
 203 #define RExC_npar (pRExC_state->npar)
 204 #define RExC_nestroot   (pRExC_state->nestroot)
 205 #define RExC_extralen (pRExC_state->extralen)
 206 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
 207 #define RExC_utf8 (pRExC_state->utf8)
 208 #define RExC_uni_semantics (pRExC_state->uni_semantics)
 209 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
 210 #define RExC_open_parens (pRExC_state->open_parens)
 211 #define RExC_close_parens (pRExC_state->close_parens)
 212 #define RExC_opend (pRExC_state->opend)
 213 #define RExC_paren_names (pRExC_state->paren_names)
 214 #define RExC_recurse (pRExC_state->recurse)
 215 #define RExC_recurse_count (pRExC_state->recurse_count)
 216 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 217 #define RExC_study_chunk_recursed_bytes  \
 218         (pRExC_state->study_chunk_recursed_bytes)
 219 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
 220 #define RExC_contains_locale (pRExC_state->contains_locale)
 221 #define RExC_contains_i (pRExC_state->contains_i)
 222 #define RExC_override_recoding (pRExC_state->override_recoding)
 223 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 224
 225
 226 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
 227 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 228   ((*s) == '{' && regcurly(s)))
 229
 230 /*
 231  * Flags to be passed up and down.
 232  */
 233 #define WORST  0 /* Worst case. */
 234 #define HASWIDTH 0x01 /* Known to match non-null strings. */
 235
 236 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 237  * character.  (There needs to be a case: in the switch statement in regexec.c
 238  * for any node marked SIMPLE.)  Note that this is not the same thing as
 239  * REGNODE_SIMPLE */
 240 #define SIMPLE  0x02
 241 #define SPSTART  0x04 /* Starts with * or + */
 242 #define POSTPONED 0x08    /* (?1),(?&name), (??{...}) or similar */
 243 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
 244 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 245
 246 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 247
 248 /* whether trie related optimizations are enabled */
 249 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 250 #define TRIE_STUDY_OPT
 251 #define FULL_TRIE_STUDY
 252 #define TRIE_STCLASS
 253 #endif
 254
 255
 256
 257 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 258 #define PBITVAL(paren) (1 << ((paren) & 7))
 259 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 260 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 261 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 262
 263 #define REQUIRE_UTF8 STMT_START {                                       \
 264          if (!UTF) {                           \
 265           *flagp = RESTART_UTF8;            \
 266           return NULL;                      \
 267          }                                     \
 268       } STMT_END
 269
 270 /* This converts the named class defined in regcomp.h to its equivalent class
 271  * number defined in handy.h. */
 272 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 273 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 274
 275 #define _invlist_union_complement_2nd(a, b, output) \
 276       _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 277 #define _invlist_intersection_complement_2nd(a, b, output) \
 278     _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 279
 280 /* About scan_data_t.
 281
 282   During optimisation we recurse through the regexp program performing
 283   various inplace (keyhole style) optimisations. In addition study_chunk
 284   and scan_commit populate this data structure with information about
 285   what strings MUST appear in the pattern. We look for the longest
 286   string that must appear at a fixed location, and we look for the
 287   longest string that may appear at a floating location. So for instance
 288   in the pattern:
 289
 290  /FOO[xX]A.*B[xX]BAR/
 291
 292   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 293   strings (because they follow a .* construct). study_chunk will identify
 294   both FOO and BAR as being the longest fixed and floating strings respectively.
 295
 296   The strings can be composites, for instance
 297
 298  /(f)(o)(o)/
 299
 300   will result in a composite fixed substring 'foo'.
 301
 302   For each string some basic information is maintained:
 303
 304   - offset or min_offset
 305  This is the position the string must appear at, or not before.
 306  It also implicitly (when combined with minlenp) tells us how many
 307  characters must match before the string we are searching for.
 308  Likewise when combined with minlenp and the length of the string it
 309  tells us how many characters must appear after the string we have
 310  found.
 311
 312   - max_offset
 313  Only used for floating strings. This is the rightmost point that
 314  the string can appear at. If set to SSize_t_MAX it indicates that the
 315  string can occur infinitely far to the right.
 316
 317   - minlenp
 318  A pointer to the minimum number of characters of the pattern that the
 319  string was found inside. This is important as in the case of positive
 320  lookahead or positive lookbehind we can have multiple patterns
 321  involved. Consider
 322
 323  /(?=FOO).*F/
 324
 325  The minimum length of the pattern overall is 3, the minimum length
 326  of the lookahead part is 3, but the minimum length of the part that
 327  will actually match is 1. So 'FOO's minimum length is 3, but the
 328  minimum length for the F is 1. This is important as the minimum length
 329  is used to determine offsets in front of and behind the string being
 330  looked for.  Since strings can be composites this is the length of the
 331  pattern at the time it was committed with a scan_commit. Note that
 332  the length is calculated by study_chunk, so that the minimum lengths
 333  are not known until the full pattern has been compiled, thus the
 334  pointer to the value.
 335
 336   - lookbehind
 337
 338  In the case of lookbehind the string being searched for can be
 339  offset past the start point of the final matching string.
 340  If this value was just blithely removed from the min_offset it would
 341  invalidate some of the calculations for how many chars must match
 342  before or after (as they are derived from min_offset and minlen and
 343  the length of the string being searched for).
 344  When the final pattern is compiled and the data is moved from the
 345  scan_data_t structure into the regexp structure the information
 346  about lookbehind is factored in, with the information that would
 347  have been lost precalculated in the end_shift field for the
 348  associated string.
 349
 350   The fields pos_min and pos_delta are used to store the minimum offset
 351   and the delta to the maximum offset at the current point in the pattern.
 352
 353 */
 354
 355 typedef struct scan_data_t {
 356  /*I32 len_min;      unused */
 357  /*I32 len_delta;    unused */
 358  SSize_t pos_min;
 359  SSize_t pos_delta;
 360  SV *last_found;
 361  SSize_t last_end;     /* min value, <0 unless valid. */
 362  SSize_t last_start_min;
 363  SSize_t last_start_max;
 364  SV **longest;     /* Either &l_fixed, or &l_float. */
 365  SV *longest_fixed;      /* longest fixed string found in pattern */
 366  SSize_t offset_fixed;   /* offset where it starts */
 367  SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 368  I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 369  SV *longest_float;      /* longest floating string found in pattern */
 370  SSize_t offset_float_min; /* earliest point in string it can appear */
 371  SSize_t offset_float_max; /* latest point in string it can appear */
 372  SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 373  SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 374  I32 flags;
 375  I32 whilem_c;
 376  SSize_t *last_closep;
 377  regnode_ssc *start_class;
 378 } scan_data_t;
 379
 380 /* The below is perhaps overboard, but this allows us to save a test at the
 381  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 382  * and 'a' differ by a single bit; the same with the upper and lower case of
 383  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 384  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 385  * then inverts it to form a mask, with just a single 0, in the bit position
 386  * where the upper- and lowercase differ.  XXX There are about 40 other
 387  * instances in the Perl core where this micro-optimization could be used.
 388  * Should decide if maintenance cost is worse, before changing those
 389  *
 390  * Returns a boolean as to whether or not 'v' is either a lowercase or
 391  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 392  * compile-time constant, the generated code is better than some optimizing
 393  * compilers figure out, amounting to a mask and test.  The results are
 394  * meaningless if 'c' is not one of [A-Za-z] */
 395 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 396        (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 397
 398 /*
 399  * Forward declarations for pregcomp()'s friends.
 400  */
 401
 402 static const scan_data_t zero_scan_data =
 403   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 404
 405 #define SF_BEFORE_EOL  (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 406 #define SF_BEFORE_SEOL  0x0001
 407 #define SF_BEFORE_MEOL  0x0002
 408 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 409 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 410
 411 #define SF_FIX_SHIFT_EOL (+2)
 412 #define SF_FL_SHIFT_EOL  (+4)
 413
 414 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 415 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 416
 417 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 418 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 419 #define SF_IS_INF  0x0040
 420 #define SF_HAS_PAR  0x0080
 421 #define SF_IN_PAR  0x0100
 422 #define SF_HAS_EVAL  0x0200
 423 #define SCF_DO_SUBSTR  0x0400
 424 #define SCF_DO_STCLASS_AND 0x0800
 425 #define SCF_DO_STCLASS_OR 0x1000
 426 #define SCF_DO_STCLASS  (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 427 #define SCF_WHILEM_VISITED_POS 0x2000
 428
 429 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 430 #define SCF_SEEN_ACCEPT         0x8000
 431 #define SCF_TRIE_DOING_RESTUDY 0x10000
 432
 433 #define UTF cBOOL(RExC_utf8)
 434
 435 /* The enums for all these are ordered so things work out correctly */
 436 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 437 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 438              == REGEX_DEPENDS_CHARSET)
 439 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 440 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 441              >= REGEX_UNICODE_CHARSET)
 442 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 443            == REGEX_ASCII_RESTRICTED_CHARSET)
 444 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 445            >= REGEX_ASCII_RESTRICTED_CHARSET)
 446 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 447           == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 448
 449 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 450
 451 /* For programs that want to be strictly Unicode compatible by dying if any
 452  * attempt is made to match a non-Unicode code point against a Unicode
 453  * property.  */
 454 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 455
 456 #define OOB_NAMEDCLASS  -1
 457
 458 /* There is no code point that is out-of-bounds, so this is problematic.  But
 459  * its only current use is to initialize a variable that is always set before
 460  * looked at. */
 461 #define OOB_UNICODE  0xDEADBEEF
 462
 463 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 464 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 465
 466
 467 /* length of regex to show in messages that don't mark a position within */
 468 #define RegexLengthToShowInErrorMessages 127
 469
 470 /*
 471  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 472  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 473  * op/pragma/warn/regcomp.
 474  */
 475 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 476 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 477
 478 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 479       " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 480
 481 #define REPORT_LOCATION_ARGS(offset)            \
 482     UTF8fARG(UTF, offset, RExC_precomp), \
 483     UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 484
 485 /*
 486  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 487  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 488  * "...".
 489  */
 490 #define _FAIL(code) STMT_START {     \
 491  const char *ellipses = "";      \
 492  IV len = RExC_end - RExC_precomp;     \
 493                   \
 494  if (!SIZE_ONLY)       \
 495   SAVEFREESV(RExC_rx_sv);      \
 496  if (len > RegexLengthToShowInErrorMessages) {   \
 497   /* chop 10 shorter than the max, to ensure meaning of "..." */ \
 498   len = RegexLengthToShowInErrorMessages - 10;   \
 499   ellipses = "...";      \
 500  }         \
 501  code;                                                               \
 502 } STMT_END
 503
 504 #define FAIL(msg) _FAIL(       \
 505  Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",     \
 506    msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 507
 508 #define FAIL2(msg,arg) _FAIL(       \
 509  Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",     \
 510    arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 511
 512 /*
 513  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 514  */
 515 #define Simple_vFAIL(m) STMT_START {     \
 516  const IV offset = RExC_parse - RExC_precomp;   \
 517  Perl_croak(aTHX_ "%s" REPORT_LOCATION,    \
 518    m, REPORT_LOCATION_ARGS(offset)); \
 519 } STMT_END
 520
 521 /*
 522  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 523  */
 524 #define vFAIL(m) STMT_START {    \
 525  if (!SIZE_ONLY)     \
 526   SAVEFREESV(RExC_rx_sv);    \
 527  Simple_vFAIL(m);     \
 528 } STMT_END
 529
 530 /*
 531  * Like Simple_vFAIL(), but accepts two arguments.
 532  */
 533 #define Simple_vFAIL2(m,a1) STMT_START {   \
 534  const IV offset = RExC_parse - RExC_precomp;   \
 535  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,   \
 536      REPORT_LOCATION_ARGS(offset)); \
 537 } STMT_END
 538
 539 /*
 540  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 541  */
 542 #define vFAIL2(m,a1) STMT_START {   \
 543  if (!SIZE_ONLY)     \
 544   SAVEFREESV(RExC_rx_sv);    \
 545  Simple_vFAIL2(m, a1);    \
 546 } STMT_END
 547
 548
 549 /*
 550  * Like Simple_vFAIL(), but accepts three arguments.
 551  */
 552 #define Simple_vFAIL3(m, a1, a2) STMT_START {   \
 553  const IV offset = RExC_parse - RExC_precomp;  \
 554  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
 555    REPORT_LOCATION_ARGS(offset)); \
 556 } STMT_END
 557
 558 /*
 559  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 560  */
 561 #define vFAIL3(m,a1,a2) STMT_START {   \
 562  if (!SIZE_ONLY)     \
 563   SAVEFREESV(RExC_rx_sv);    \
 564  Simple_vFAIL3(m, a1, a2);    \
 565 } STMT_END
 566
 567 /*
 568  * Like Simple_vFAIL(), but accepts four arguments.
 569  */
 570 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {  \
 571  const IV offset = RExC_parse - RExC_precomp;  \
 572  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,  \
 573    REPORT_LOCATION_ARGS(offset)); \
 574 } STMT_END
 575
 576 #define vFAIL4(m,a1,a2,a3) STMT_START {   \
 577  if (!SIZE_ONLY)     \
 578   SAVEFREESV(RExC_rx_sv);    \
 579  Simple_vFAIL4(m, a1, a2, a3);   \
 580 } STMT_END
 581
 582 /* A specialized version of vFAIL2 that works with UTF8f */
 583 #define vFAIL2utf8f(m, a1) STMT_START { \
 584  const IV offset = RExC_parse - RExC_precomp;   \
 585  if (!SIZE_ONLY)                                \
 586   SAVEFREESV(RExC_rx_sv);                    \
 587  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 588    REPORT_LOCATION_ARGS(offset));         \
 589 } STMT_END
 590
 591
 592 /* m is not necessarily a "literal string", in this macro */
 593 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 594  const IV offset = loc - RExC_precomp;                               \
 595  Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 596    m, REPORT_LOCATION_ARGS(offset));       \
 597 } STMT_END
 598
 599 #define ckWARNreg(loc,m) STMT_START {     \
 600  const IV offset = loc - RExC_precomp;    \
 601  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 602    REPORT_LOCATION_ARGS(offset));  \
 603 } STMT_END
 604
 605 #define vWARN_dep(loc, m) STMT_START {            \
 606  const IV offset = loc - RExC_precomp;    \
 607  Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
 608    REPORT_LOCATION_ARGS(offset));         \
 609 } STMT_END
 610
 611 #define ckWARNdep(loc,m) STMT_START {            \
 612  const IV offset = loc - RExC_precomp;    \
 613  Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                 \
 614    m REPORT_LOCATION,      \
 615    REPORT_LOCATION_ARGS(offset));  \
 616 } STMT_END
 617
 618 #define ckWARNregdep(loc,m) STMT_START {    \
 619  const IV offset = loc - RExC_precomp;    \
 620  Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 621    m REPORT_LOCATION,      \
 622    REPORT_LOCATION_ARGS(offset));  \
 623 } STMT_END
 624
 625 #define ckWARN2reg_d(loc,m, a1) STMT_START {    \
 626  const IV offset = loc - RExC_precomp;    \
 627  Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),   \
 628    m REPORT_LOCATION,      \
 629    a1, REPORT_LOCATION_ARGS(offset)); \
 630 } STMT_END
 631
 632 #define ckWARN2reg(loc, m, a1) STMT_START {    \
 633  const IV offset = loc - RExC_precomp;    \
 634  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 635    a1, REPORT_LOCATION_ARGS(offset)); \
 636 } STMT_END
 637
 638 #define vWARN3(loc, m, a1, a2) STMT_START {    \
 639  const IV offset = loc - RExC_precomp;    \
 640  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 641    a1, a2, REPORT_LOCATION_ARGS(offset)); \
 642 } STMT_END
 643
 644 #define ckWARN3reg(loc, m, a1, a2) STMT_START {    \
 645  const IV offset = loc - RExC_precomp;    \
 646  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 647    a1, a2, REPORT_LOCATION_ARGS(offset)); \
 648 } STMT_END
 649
 650 #define vWARN4(loc, m, a1, a2, a3) STMT_START {    \
 651  const IV offset = loc - RExC_precomp;    \
 652  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 653    a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 654 } STMT_END
 655
 656 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {   \
 657  const IV offset = loc - RExC_precomp;    \
 658  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 659    a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 660 } STMT_END
 661
 662 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {   \
 663  const IV offset = loc - RExC_precomp;    \
 664  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 665    a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 666 } STMT_END
 667
 668
 669 /* Allow for side effects in s */
 670 #define REGC(c,s) STMT_START {   \
 671  if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 672 } STMT_END
 673
 674 /* Macros for recording node offsets.   20001227 mjd@plover.com
 675  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 676  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 677  * Element 0 holds the number n.
 678  * Position is 1 indexed.
 679  */
 680 #ifndef RE_TRACK_PATTERN_OFFSETS
 681 #define Set_Node_Offset_To_R(node,byte)
 682 #define Set_Node_Offset(node,byte)
 683 #define Set_Cur_Node_Offset
 684 #define Set_Node_Length_To_R(node,len)
 685 #define Set_Node_Length(node,len)
 686 #define Set_Node_Cur_Length(node,start)
 687 #define Node_Offset(n)
 688 #define Node_Length(n)
 689 #define Set_Node_Offset_Length(node,offset,len)
 690 #define ProgLen(ri) ri->u.proglen
 691 #define SetProgLen(ri,x) ri->u.proglen = x
 692 #else
 693 #define ProgLen(ri) ri->u.offsets[0]
 694 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 695 #define Set_Node_Offset_To_R(node,byte) STMT_START {   \
 696  if (! SIZE_ONLY) {       \
 697   MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",  \
 698      __LINE__, (int)(node), (int)(byte)));  \
 699   if((node) < 0) {      \
 700    Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 701           (int)(node));                  \
 702   } else {       \
 703    RExC_offsets[2*(node)-1] = (byte);    \
 704   }        \
 705  }         \
 706 } STMT_END
 707
 708 #define Set_Node_Offset(node,byte) \
 709  Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 710 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 711
 712 #define Set_Node_Length_To_R(node,len) STMT_START {   \
 713  if (! SIZE_ONLY) {       \
 714   MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",  \
 715     __LINE__, (int)(node), (int)(len)));   \
 716   if((node) < 0) {      \
 717    Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 718           (int)(node));                  \
 719   } else {       \
 720    RExC_offsets[2*(node)] = (len);    \
 721   }        \
 722  }         \
 723 } STMT_END
 724
 725 #define Set_Node_Length(node,len) \
 726  Set_Node_Length_To_R((node)-RExC_emit_start, len)
 727 #define Set_Node_Cur_Length(node, start)                \
 728  Set_Node_Length(node, RExC_parse - start)
 729
 730 /* Get offsets and lengths */
 731 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 732 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 733
 734 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
 735  Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
 736  Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
 737 } STMT_END
 738 #endif
 739
 740 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 741 #define EXPERIMENTAL_INPLACESCAN
 742 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 743
 744 #define DEBUG_RExC_seen() \
 745   DEBUG_OPTIMISE_MORE_r({                                             \
 746    PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 747                    \
 748    if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 749     PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 750                    \
 751    if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 752     PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 753                    \
 754    if (RExC_seen & REG_GPOS_SEEN)                                  \
 755     PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 756                    \
 757    if (RExC_seen & REG_CANY_SEEN)                                  \
 758     PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 759                    \
 760    if (RExC_seen & REG_RECURSE_SEEN)                               \
 761     PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 762                    \
 763    if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 764     PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 765                    \
 766    if (RExC_seen & REG_VERBARG_SEEN)                               \
 767     PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 768                    \
 769    if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 770     PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 771                    \
 772    if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 773     PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 774                    \
 775    if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 776     PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 777                    \
 778    if (RExC_seen & REG_GOSTART_SEEN)                               \
 779     PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 780                    \
 781    if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 782     PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 783                    \
 784    PerlIO_printf(Perl_debug_log,"\n");                             \
 785   });
 786
 787 #define DEBUG_STUDYDATA(str,data,depth)                              \
 788 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 789  PerlIO_printf(Perl_debug_log,                                    \
 790   "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 791   " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 792   (int)(depth)*2, "",                                          \
 793   (IV)((data)->pos_min),                                       \
 794   (IV)((data)->pos_delta),                                     \
 795   (UV)((data)->flags),                                         \
 796   (IV)((data)->whilem_c),                                      \
 797   (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 798   is_inf ? "INF " : ""                                         \
 799  );                                                               \
 800  if ((data)->last_found)                                          \
 801   PerlIO_printf(Perl_debug_log,                                \
 802    "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 803    " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 804    SvPVX_const((data)->last_found),                         \
 805    (IV)((data)->last_end),                                  \
 806    (IV)((data)->last_start_min),                            \
 807    (IV)((data)->last_start_max),                            \
 808    ((data)->longest &&                                      \
 809    (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 810    SvPVX_const((data)->longest_fixed),                      \
 811    (IV)((data)->offset_fixed),                              \
 812    ((data)->longest &&                                      \
 813    (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 814    SvPVX_const((data)->longest_float),                      \
 815    (IV)((data)->offset_float_min),                          \
 816    (IV)((data)->offset_float_max)                           \
 817   );                                                           \
 818  PerlIO_printf(Perl_debug_log,"\n");                              \
 819 });
 820
 821 /* Mark that we cannot extend a found fixed substring at this point.
 822    Update the longest found anchored substring and the longest found
 823    floating substrings if needed. */
 824
 825 STATIC void
 826 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 827      SSize_t *minlenp, int is_inf)
 828 {
 829  const STRLEN l = CHR_SVLEN(data->last_found);
 830  const STRLEN old_l = CHR_SVLEN(*data->longest);
 831  GET_RE_DEBUG_FLAGS_DECL;
 832
 833  PERL_ARGS_ASSERT_SCAN_COMMIT;
 834
 835  if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 836   SvSetMagicSV(*data->longest, data->last_found);
 837   if (*data->longest == data->longest_fixed) {
 838    data->offset_fixed = l ? data->last_start_min : data->pos_min;
 839    if (data->flags & SF_BEFORE_EOL)
 840     data->flags
 841      |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 842    else
 843     data->flags &= ~SF_FIX_BEFORE_EOL;
 844    data->minlen_fixed=minlenp;
 845    data->lookbehind_fixed=0;
 846   }
 847   else { /* *data->longest == data->longest_float */
 848    data->offset_float_min = l ? data->last_start_min : data->pos_min;
 849    data->offset_float_max = (l
 850          ? data->last_start_max
 851          : (data->pos_delta == SSize_t_MAX
 852           ? SSize_t_MAX
 853           : data->pos_min + data->pos_delta));
 854    if (is_inf
 855     || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 856     data->offset_float_max = SSize_t_MAX;
 857    if (data->flags & SF_BEFORE_EOL)
 858     data->flags
 859      |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 860    else
 861     data->flags &= ~SF_FL_BEFORE_EOL;
 862    data->minlen_float=minlenp;
 863    data->lookbehind_float=0;
 864   }
 865  }
 866  SvCUR_set(data->last_found, 0);
 867  {
 868   SV * const sv = data->last_found;
 869   if (SvUTF8(sv) && SvMAGICAL(sv)) {
 870    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 871    if (mg)
 872     mg->mg_len = 0;
 873   }
 874  }
 875  data->last_end = -1;
 876  data->flags &= ~SF_BEFORE_EOL;
 877  DEBUG_STUDYDATA("commit: ",data,0);
 878 }
 879
 880 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 881  * list that describes which code points it matches */
 882
 883 STATIC void
 884 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 885 {
 886  /* Set the SSC 'ssc' to match an empty string or any code point */
 887
 888  PERL_ARGS_ASSERT_SSC_ANYTHING;
 889
 890  assert(is_ANYOF_SYNTHETIC(ssc));
 891
 892  ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 893  _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 894  ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING;    /* Plus match empty string */
 895 }
 896
 897 STATIC int
 898 S_ssc_is_anything(const regnode_ssc *ssc)
 899 {
 900  /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 901  * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 902  * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 903  * in any way, so there's no point in using it */
 904
 905  UV start, end;
 906  bool ret;
 907
 908  PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 909
 910  assert(is_ANYOF_SYNTHETIC(ssc));
 911
 912  if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
 913   return FALSE;
 914  }
 915
 916  /* See if the list consists solely of the range 0 - Infinity */
 917  invlist_iterinit(ssc->invlist);
 918  ret = invlist_iternext(ssc->invlist, &start, &end)
 919   && start == 0
 920   && end == UV_MAX;
 921
 922  invlist_iterfinish(ssc->invlist);
 923
 924  if (ret) {
 925   return TRUE;
 926  }
 927
 928  /* If e.g., both \w and \W are set, matches everything */
 929  if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 930   int i;
 931   for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 932    if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 933     return TRUE;
 934    }
 935   }
 936  }
 937
 938  return FALSE;
 939 }
 940
 941 STATIC void
 942 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 943 {
 944  /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 945  * string, any code point, or any posix class under locale */
 946
 947  PERL_ARGS_ASSERT_SSC_INIT;
 948
 949  Zero(ssc, 1, regnode_ssc);
 950  set_ANYOF_SYNTHETIC(ssc);
 951  ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
 952  ssc_anything(ssc);
 953
 954  /* If any portion of the regex is to operate under locale rules,
 955  * initialization includes it.  The reason this isn't done for all regexes
 956  * is that the optimizer was written under the assumption that locale was
 957  * all-or-nothing.  Given the complexity and lack of documentation in the
 958  * optimizer, and that there are inadequate test cases for locale, many
 959  * parts of it may not work properly, it is safest to avoid locale unless
 960  * necessary. */
 961  if (RExC_contains_locale) {
 962   ANYOF_POSIXL_SETALL(ssc);
 963  }
 964  else {
 965   ANYOF_POSIXL_ZERO(ssc);
 966  }
 967 }
 968
 969 STATIC int
 970 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
 971       const regnode_ssc *ssc)
 972 {
 973  /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
 974  * to the list of code points matched, and locale posix classes; hence does
 975  * not check its flags) */
 976
 977  UV start, end;
 978  bool ret;
 979
 980  PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
 981
 982  assert(is_ANYOF_SYNTHETIC(ssc));
 983
 984  invlist_iterinit(ssc->invlist);
 985  ret = invlist_iternext(ssc->invlist, &start, &end)
 986   && start == 0
 987   && end == UV_MAX;
 988
 989  invlist_iterfinish(ssc->invlist);
 990
 991  if (! ret) {
 992   return FALSE;
 993  }
 994
 995  if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
 996   return FALSE;
 997  }
 998
 999  return TRUE;
1000 }
1001
1002 STATIC SV*
1003 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1004        const regnode_charclass* const node)
1005 {
1006  /* Returns a mortal inversion list defining which code points are matched
1007  * by 'node', which is of type ANYOF.  Handles complementing the result if
1008  * appropriate.  If some code points aren't knowable at this time, the
1009  * returned list must, and will, contain every code point that is a
1010  * possibility. */
1011
1012  SV* invlist = sv_2mortal(_new_invlist(0));
1013  SV* only_utf8_locale_invlist = NULL;
1014  unsigned int i;
1015  const U32 n = ARG(node);
1016  bool new_node_has_latin1 = FALSE;
1017
1018  PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1019
1020  /* Look at the data structure created by S_set_ANYOF_arg() */
1021  if (n != ANYOF_NONBITMAP_EMPTY) {
1022   SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1023   AV * const av = MUTABLE_AV(SvRV(rv));
1024   SV **const ary = AvARRAY(av);
1025   assert(RExC_rxi->data->what[n] == 's');
1026
1027   if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1028    invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1029   }
1030   else if (ary[0] && ary[0] != &PL_sv_undef) {
1031
1032    /* Here, no compile-time swash, and there are things that won't be
1033    * known until runtime -- we have to assume it could be anything */
1034    return _add_range_to_invlist(invlist, 0, UV_MAX);
1035   }
1036   else if (ary[3] && ary[3] != &PL_sv_undef) {
1037
1038    /* Here no compile-time swash, and no run-time only data.  Use the
1039    * node's inversion list */
1040    invlist = sv_2mortal(invlist_clone(ary[3]));
1041   }
1042
1043   /* Get the code points valid only under UTF-8 locales */
1044   if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1045    && ary[2] && ary[2] != &PL_sv_undef)
1046   {
1047    only_utf8_locale_invlist = ary[2];
1048   }
1049  }
1050
1051  /* An ANYOF node contains a bitmap for the first 256 code points, and an
1052  * inversion list for the others, but if there are code points that should
1053  * match only conditionally on the target string being UTF-8, those are
1054  * placed in the inversion list, and not the bitmap.  Since there are
1055  * circumstances under which they could match, they are included in the
1056  * SSC.  But if the ANYOF node is to be inverted, we have to exclude them
1057  * here, so that when we invert below, the end result actually does include
1058  * them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We have to do this here
1059  * before we add the unconditionally matched code points */
1060  if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1061   _invlist_intersection_complement_2nd(invlist,
1062            PL_UpperLatin1,
1063            &invlist);
1064  }
1065
1066  /* Add in the points from the bit map */
1067  for (i = 0; i < 256; i++) {
1068   if (ANYOF_BITMAP_TEST(node, i)) {
1069    invlist = add_cp_to_invlist(invlist, i);
1070    new_node_has_latin1 = TRUE;
1071   }
1072  }
1073
1074  /* If this can match all upper Latin1 code points, have to add them
1075  * as well */
1076  if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
1077   _invlist_union(invlist, PL_UpperLatin1, &invlist);
1078  }
1079
1080  /* Similarly for these */
1081  if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1082   invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1083  }
1084
1085  if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1086   _invlist_invert(invlist);
1087  }
1088  else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1089
1090   /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1091   * locale.  We can skip this if there are no 0-255 at all. */
1092   _invlist_union(invlist, PL_Latin1, &invlist);
1093  }
1094
1095  /* Similarly add the UTF-8 locale possible matches.  These have to be
1096  * deferred until after the non-UTF-8 locale ones are taken care of just
1097  * above, or it leads to wrong results under ANYOF_INVERT */
1098  if (only_utf8_locale_invlist) {
1099   _invlist_union_maybe_complement_2nd(invlist,
1100            only_utf8_locale_invlist,
1101            ANYOF_FLAGS(node) & ANYOF_INVERT,
1102            &invlist);
1103  }
1104
1105  return invlist;
1106 }
1107
1108 /* These two functions currently do the exact same thing */
1109 #define ssc_init_zero  ssc_init
1110
1111 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1112 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1113
1114 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1115  * should not be inverted.  'and_with->flags & ANYOF_POSIXL' should be 0 if
1116  * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1117
1118 STATIC void
1119 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1120     const regnode_charclass *and_with)
1121 {
1122  /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1123  * another SSC or a regular ANYOF class.  Can create false positives. */
1124
1125  SV* anded_cp_list;
1126  U8  anded_flags;
1127
1128  PERL_ARGS_ASSERT_SSC_AND;
1129
1130  assert(is_ANYOF_SYNTHETIC(ssc));
1131
1132  /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1133  * the code point inversion list and just the relevant flags */
1134  if (is_ANYOF_SYNTHETIC(and_with)) {
1135   anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1136   anded_flags = ANYOF_FLAGS(and_with);
1137
1138   /* XXX This is a kludge around what appears to be deficiencies in the
1139   * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1140   * there are paths through the optimizer where it doesn't get weeded
1141   * out when it should.  And if we don't make some extra provision for
1142   * it like the code just below, it doesn't get added when it should.
1143   * This solution is to add it only when AND'ing, which is here, and
1144   * only when what is being AND'ed is the pristine, original node
1145   * matching anything.  Thus it is like adding it to ssc_anything() but
1146   * only when the result is to be AND'ed.  Probably the same solution
1147   * could be adopted for the same problem we have with /l matching,
1148   * which is solved differently in S_ssc_init(), and that would lead to
1149   * fewer false positives than that solution has.  But if this solution
1150   * creates bugs, the consequences are only that a warning isn't raised
1151   * that should be; while the consequences for having /l bugs is
1152   * incorrect matches */
1153   if (ssc_is_anything((regnode_ssc *)and_with)) {
1154    anded_flags |= ANYOF_WARN_SUPER;
1155   }
1156  }
1157  else {
1158   anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1159   anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1160  }
1161
1162  ANYOF_FLAGS(ssc) &= anded_flags;
1163
1164  /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1165  * C2 is the list of code points in 'and-with'; P2, its posix classes.
1166  * 'and_with' may be inverted.  When not inverted, we have the situation of
1167  * computing:
1168  *  (C1 | P1) & (C2 | P2)
1169  *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1170  *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1171  *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1172  *                    <=  ((C1 & C2) | P1 | P2)
1173  * Alternatively, the last few steps could be:
1174  *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1175  *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1176  *                    <=  (C1 | C2 | (P1 & P2))
1177  * We favor the second approach if either P1 or P2 is non-empty.  This is
1178  * because these components are a barrier to doing optimizations, as what
1179  * they match cannot be known until the moment of matching as they are
1180  * dependent on the current locale, 'AND"ing them likely will reduce or
1181  * eliminate them.
1182  * But we can do better if we know that C1,P1 are in their initial state (a
1183  * frequent occurrence), each matching everything:
1184  *  (<everything>) & (C2 | P2) =  C2 | P2
1185  * Similarly, if C2,P2 are in their initial state (again a frequent
1186  * occurrence), the result is a no-op
1187  *  (C1 | P1) & (<everything>) =  C1 | P1
1188  *
1189  * Inverted, we have
1190  *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1191  *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1192  *                         <=  (C1 & ~C2) | (P1 & ~P2)
1193  * */
1194
1195  if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1196   && ! is_ANYOF_SYNTHETIC(and_with))
1197  {
1198   unsigned int i;
1199
1200   ssc_intersection(ssc,
1201       anded_cp_list,
1202       FALSE /* Has already been inverted */
1203       );
1204
1205   /* If either P1 or P2 is empty, the intersection will be also; can skip
1206   * the loop */
1207   if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1208    ANYOF_POSIXL_ZERO(ssc);
1209   }
1210   else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1211
1212    /* Note that the Posix class component P from 'and_with' actually
1213    * looks like:
1214    *      P = Pa | Pb | ... | Pn
1215    * where each component is one posix class, such as in [\w\s].
1216    * Thus
1217    *      ~P = ~(Pa | Pb | ... | Pn)
1218    *         = ~Pa & ~Pb & ... & ~Pn
1219    *        <= ~Pa | ~Pb | ... | ~Pn
1220    * The last is something we can easily calculate, but unfortunately
1221    * is likely to have many false positives.  We could do better
1222    * in some (but certainly not all) instances if two classes in
1223    * P have known relationships.  For example
1224    *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1225    * So
1226    *      :lower: & :print: = :lower:
1227    * And similarly for classes that must be disjoint.  For example,
1228    * since \s and \w can have no elements in common based on rules in
1229    * the POSIX standard,
1230    *      \w & ^\S = nothing
1231    * Unfortunately, some vendor locales do not meet the Posix
1232    * standard, in particular almost everything by Microsoft.
1233    * The loop below just changes e.g., \w into \W and vice versa */
1234
1235    regnode_charclass_posixl temp;
1236    int add = 1;    /* To calculate the index of the complement */
1237
1238    ANYOF_POSIXL_ZERO(&temp);
1239    for (i = 0; i < ANYOF_MAX; i++) {
1240     assert(i % 2 != 0
1241      || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1242      || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1243
1244     if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1245      ANYOF_POSIXL_SET(&temp, i + add);
1246     }
1247     add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1248    }
1249    ANYOF_POSIXL_AND(&temp, ssc);
1250
1251   } /* else ssc already has no posixes */
1252  } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1253   in its initial state */
1254  else if (! is_ANYOF_SYNTHETIC(and_with)
1255    || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1256  {
1257   /* But if 'ssc' is in its initial state, the result is just 'and_with';
1258   * copy it over 'ssc' */
1259   if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1260    if (is_ANYOF_SYNTHETIC(and_with)) {
1261     StructCopy(and_with, ssc, regnode_ssc);
1262    }
1263    else {
1264     ssc->invlist = anded_cp_list;
1265     ANYOF_POSIXL_ZERO(ssc);
1266     if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1267      ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1268     }
1269    }
1270   }
1271   else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1272     || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1273   {
1274    /* One or the other of P1, P2 is non-empty. */
1275    if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1276     ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1277    }
1278    ssc_union(ssc, anded_cp_list, FALSE);
1279   }
1280   else { /* P1 = P2 = empty */
1281    ssc_intersection(ssc, anded_cp_list, FALSE);
1282   }
1283  }
1284 }
1285
1286 STATIC void
1287 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1288    const regnode_charclass *or_with)
1289 {
1290  /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1291  * another SSC or a regular ANYOF class.  Can create false positives if
1292  * 'or_with' is to be inverted. */
1293
1294  SV* ored_cp_list;
1295  U8 ored_flags;
1296
1297  PERL_ARGS_ASSERT_SSC_OR;
1298
1299  assert(is_ANYOF_SYNTHETIC(ssc));
1300
1301  /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1302  * the code point inversion list and just the relevant flags */
1303  if (is_ANYOF_SYNTHETIC(or_with)) {
1304   ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1305   ored_flags = ANYOF_FLAGS(or_with);
1306  }
1307  else {
1308   ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1309   ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1310  }
1311
1312  ANYOF_FLAGS(ssc) |= ored_flags;
1313
1314  /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1315  * C2 is the list of code points in 'or-with'; P2, its posix classes.
1316  * 'or_with' may be inverted.  When not inverted, we have the simple
1317  * situation of computing:
1318  *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1319  * If P1|P2 yields a situation with both a class and its complement are
1320  * set, like having both \w and \W, this matches all code points, and we
1321  * can delete these from the P component of the ssc going forward.  XXX We
1322  * might be able to delete all the P components, but I (khw) am not certain
1323  * about this, and it is better to be safe.
1324  *
1325  * Inverted, we have
1326  *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1327  *                         <=  (C1 | P1) | ~C2
1328  *                         <=  (C1 | ~C2) | P1
1329  * (which results in actually simpler code than the non-inverted case)
1330  * */
1331
1332  if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1333   && ! is_ANYOF_SYNTHETIC(or_with))
1334  {
1335   /* We ignore P2, leaving P1 going forward */
1336  }   /* else  Not inverted */
1337  else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
1338   ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1339   if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1340    unsigned int i;
1341    for (i = 0; i < ANYOF_MAX; i += 2) {
1342     if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1343     {
1344      ssc_match_all_cp(ssc);
1345      ANYOF_POSIXL_CLEAR(ssc, i);
1346      ANYOF_POSIXL_CLEAR(ssc, i+1);
1347     }
1348    }
1349   }
1350  }
1351
1352  ssc_union(ssc,
1353    ored_cp_list,
1354    FALSE /* Already has been inverted */
1355    );
1356 }
1357
1358 PERL_STATIC_INLINE void
1359 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1360 {
1361  PERL_ARGS_ASSERT_SSC_UNION;
1362
1363  assert(is_ANYOF_SYNTHETIC(ssc));
1364
1365  _invlist_union_maybe_complement_2nd(ssc->invlist,
1366           invlist,
1367           invert2nd,
1368           &ssc->invlist);
1369 }
1370
1371 PERL_STATIC_INLINE void
1372 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1373       SV* const invlist,
1374       const bool invert2nd)
1375 {
1376  PERL_ARGS_ASSERT_SSC_INTERSECTION;
1377
1378  assert(is_ANYOF_SYNTHETIC(ssc));
1379
1380  _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1381            invlist,
1382            invert2nd,
1383            &ssc->invlist);
1384 }
1385
1386 PERL_STATIC_INLINE void
1387 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1388 {
1389  PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1390
1391  assert(is_ANYOF_SYNTHETIC(ssc));
1392
1393  ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1394 }
1395
1396 PERL_STATIC_INLINE void
1397 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1398 {
1399  /* AND just the single code point 'cp' into the SSC 'ssc' */
1400
1401  SV* cp_list = _new_invlist(2);
1402
1403  PERL_ARGS_ASSERT_SSC_CP_AND;
1404
1405  assert(is_ANYOF_SYNTHETIC(ssc));
1406
1407  cp_list = add_cp_to_invlist(cp_list, cp);
1408  ssc_intersection(ssc, cp_list,
1409      FALSE /* Not inverted */
1410      );
1411  SvREFCNT_dec_NN(cp_list);
1412 }
1413
1414 PERL_STATIC_INLINE void
1415 S_ssc_clear_locale(regnode_ssc *ssc)
1416 {
1417  /* Set the SSC 'ssc' to not match any locale things */
1418  PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1419
1420  assert(is_ANYOF_SYNTHETIC(ssc));
1421
1422  ANYOF_POSIXL_ZERO(ssc);
1423  ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1424 }
1425
1426 STATIC void
1427 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1428 {
1429  /* The inversion list in the SSC is marked mortal; now we need a more
1430  * permanent copy, which is stored the same way that is done in a regular
1431  * ANYOF node, with the first 256 code points in a bit map */
1432
1433  SV* invlist = invlist_clone(ssc->invlist);
1434
1435  PERL_ARGS_ASSERT_SSC_FINALIZE;
1436
1437  assert(is_ANYOF_SYNTHETIC(ssc));
1438
1439  /* The code in this file assumes that all but these flags aren't relevant
1440  * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1441  * time we reach here */
1442  assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1443
1444  populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1445
1446  set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1447         NULL, NULL, NULL, FALSE);
1448
1449  /* Make sure is clone-safe */
1450  ssc->invlist = NULL;
1451
1452  if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1453   ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
1454  }
1455
1456  assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1457 }
1458
1459 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1460 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1461 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1462 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1463        ? (TRIE_LIST_CUR( idx ) - 1)           \
1464        : 0 )
1465
1466
1467 #ifdef DEBUGGING
1468 /*
1469    dump_trie(trie,widecharmap,revcharmap)
1470    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1471    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1472
1473    These routines dump out a trie in a somewhat readable format.
1474    The _interim_ variants are used for debugging the interim
1475    tables that are used to generate the final compressed
1476    representation which is what dump_trie expects.
1477
1478    Part of the reason for their existence is to provide a form
1479    of documentation as to how the different representations function.
1480
1481 */
1482
1483 /*
1484   Dumps the final compressed table form of the trie to Perl_debug_log.
1485   Used for debugging make_trie().
1486 */
1487
1488 STATIC void
1489 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1490    AV *revcharmap, U32 depth)
1491 {
1492  U32 state;
1493  SV *sv=sv_newmortal();
1494  int colwidth= widecharmap ? 6 : 4;
1495  U16 word;
1496  GET_RE_DEBUG_FLAGS_DECL;
1497
1498  PERL_ARGS_ASSERT_DUMP_TRIE;
1499
1500  PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1501   (int)depth * 2 + 2,"",
1502   "Match","Base","Ofs" );
1503
1504  for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1505   SV ** const tmp = av_fetch( revcharmap, state, 0);
1506   if ( tmp ) {
1507    PerlIO_printf( Perl_debug_log, "%*s",
1508     colwidth,
1509     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1510        PL_colors[0], PL_colors[1],
1511        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1512        PERL_PV_ESCAPE_FIRSTCHAR
1513     )
1514    );
1515   }
1516  }
1517  PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1518   (int)depth * 2 + 2,"");
1519
1520  for( state = 0 ; state < trie->uniquecharcount ; state++ )
1521   PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1522  PerlIO_printf( Perl_debug_log, "\n");
1523
1524  for( state = 1 ; state < trie->statecount ; state++ ) {
1525   const U32 base = trie->states[ state ].trans.base;
1526
1527   PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1528          (int)depth * 2 + 2,"", (UV)state);
1529
1530   if ( trie->states[ state ].wordnum ) {
1531    PerlIO_printf( Perl_debug_log, " W%4X",
1532           trie->states[ state ].wordnum );
1533   } else {
1534    PerlIO_printf( Perl_debug_log, "%6s", "" );
1535   }
1536
1537   PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1538
1539   if ( base ) {
1540    U32 ofs = 0;
1541
1542    while( ( base + ofs  < trie->uniquecharcount ) ||
1543     ( base + ofs - trie->uniquecharcount < trie->lasttrans
1544      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1545                  != state))
1546      ofs++;
1547
1548    PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1549
1550    for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1551     if ( ( base + ofs >= trie->uniquecharcount )
1552       && ( base + ofs - trie->uniquecharcount
1553               < trie->lasttrans )
1554       && trie->trans[ base + ofs
1555          - trie->uniquecharcount ].check == state )
1556     {
1557     PerlIO_printf( Perl_debug_log, "%*"UVXf,
1558      colwidth,
1559      (UV)trie->trans[ base + ofs
1560            - trie->uniquecharcount ].next );
1561     } else {
1562      PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1563     }
1564    }
1565
1566    PerlIO_printf( Perl_debug_log, "]");
1567
1568   }
1569   PerlIO_printf( Perl_debug_log, "\n" );
1570  }
1571  PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1572         (int)depth*2, "");
1573  for (word=1; word <= trie->wordcount; word++) {
1574   PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1575    (int)word, (int)(trie->wordinfo[word].prev),
1576    (int)(trie->wordinfo[word].len));
1577  }
1578  PerlIO_printf(Perl_debug_log, "\n" );
1579 }
1580 /*
1581   Dumps a fully constructed but uncompressed trie in list form.
1582   List tries normally only are used for construction when the number of
1583   possible chars (trie->uniquecharcount) is very high.
1584   Used for debugging make_trie().
1585 */
1586 STATIC void
1587 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1588       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1589       U32 depth)
1590 {
1591  U32 state;
1592  SV *sv=sv_newmortal();
1593  int colwidth= widecharmap ? 6 : 4;
1594  GET_RE_DEBUG_FLAGS_DECL;
1595
1596  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1597
1598  /* print out the table precompression.  */
1599  PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1600   (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1601   "------:-----+-----------------\n" );
1602
1603  for( state=1 ; state < next_alloc ; state ++ ) {
1604   U16 charid;
1605
1606   PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1607    (int)depth * 2 + 2,"", (UV)state  );
1608   if ( ! trie->states[ state ].wordnum ) {
1609    PerlIO_printf( Perl_debug_log, "%5s| ","");
1610   } else {
1611    PerlIO_printf( Perl_debug_log, "W%4x| ",
1612     trie->states[ state ].wordnum
1613    );
1614   }
1615   for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1616    SV ** const tmp = av_fetch( revcharmap,
1617           TRIE_LIST_ITEM(state,charid).forid, 0);
1618    if ( tmp ) {
1619     PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1620      colwidth,
1621      pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1622        colwidth,
1623        PL_colors[0], PL_colors[1],
1624        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1625        | PERL_PV_ESCAPE_FIRSTCHAR
1626      ) ,
1627      TRIE_LIST_ITEM(state,charid).forid,
1628      (UV)TRIE_LIST_ITEM(state,charid).newstate
1629     );
1630     if (!(charid % 10))
1631      PerlIO_printf(Perl_debug_log, "\n%*s| ",
1632       (int)((depth * 2) + 14), "");
1633    }
1634   }
1635   PerlIO_printf( Perl_debug_log, "\n");
1636  }
1637 }
1638
1639 /*
1640   Dumps a fully constructed but uncompressed trie in table form.
1641   This is the normal DFA style state transition table, with a few
1642   twists to facilitate compression later.
1643   Used for debugging make_trie().
1644 */
1645 STATIC void
1646 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1647       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1648       U32 depth)
1649 {
1650  U32 state;
1651  U16 charid;
1652  SV *sv=sv_newmortal();
1653  int colwidth= widecharmap ? 6 : 4;
1654  GET_RE_DEBUG_FLAGS_DECL;
1655
1656  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1657
1658  /*
1659  print out the table precompression so that we can do a visual check
1660  that they are identical.
1661  */
1662
1663  PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1664
1665  for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1666   SV ** const tmp = av_fetch( revcharmap, charid, 0);
1667   if ( tmp ) {
1668    PerlIO_printf( Perl_debug_log, "%*s",
1669     colwidth,
1670     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1671        PL_colors[0], PL_colors[1],
1672        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1673        PERL_PV_ESCAPE_FIRSTCHAR
1674     )
1675    );
1676   }
1677  }
1678
1679  PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1680
1681  for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1682   PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1683  }
1684
1685  PerlIO_printf( Perl_debug_log, "\n" );
1686
1687  for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1688
1689   PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1690    (int)depth * 2 + 2,"",
1691    (UV)TRIE_NODENUM( state ) );
1692
1693   for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1694    UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1695    if (v)
1696     PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1697    else
1698     PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1699   }
1700   if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1701    PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1702            (UV)trie->trans[ state ].check );
1703   } else {
1704    PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1705            (UV)trie->trans[ state ].check,
1706    trie->states[ TRIE_NODENUM( state ) ].wordnum );
1707   }
1708  }
1709 }
1710
1711 #endif
1712
1713
1714 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1715   startbranch: the first branch in the whole branch sequence
1716   first      : start branch of sequence of branch-exact nodes.
1717    May be the same as startbranch
1718   last       : Thing following the last branch.
1719    May be the same as tail.
1720   tail       : item following the branch sequence
1721   count      : words in the sequence
1722   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
1723   depth      : indent depth
1724
1725 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1726
1727 A trie is an N'ary tree where the branches are determined by digital
1728 decomposition of the key. IE, at the root node you look up the 1st character and
1729 follow that branch repeat until you find the end of the branches. Nodes can be
1730 marked as "accepting" meaning they represent a complete word. Eg:
1731
1732   /he|she|his|hers/
1733
1734 would convert into the following structure. Numbers represent states, letters
1735 following numbers represent valid transitions on the letter from that state, if
1736 the number is in square brackets it represents an accepting state, otherwise it
1737 will be in parenthesis.
1738
1739  +-h->+-e->[3]-+-r->(8)-+-s->[9]
1740  |    |
1741  |   (2)
1742  |    |
1743  (1)   +-i->(6)-+-s->[7]
1744  |
1745  +-s->(3)-+-h->(4)-+-e->[5]
1746
1747  Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1748
1749 This shows that when matching against the string 'hers' we will begin at state 1
1750 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1751 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1752 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1753 single traverse. We store a mapping from accepting to state to which word was
1754 matched, and then when we have multiple possibilities we try to complete the
1755 rest of the regex in the order in which they occured in the alternation.
1756
1757 The only prior NFA like behaviour that would be changed by the TRIE support is
1758 the silent ignoring of duplicate alternations which are of the form:
1759
1760  / (DUPE|DUPE) X? (?{ ... }) Y /x
1761
1762 Thus EVAL blocks following a trie may be called a different number of times with
1763 and without the optimisation. With the optimisations dupes will be silently
1764 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1765 the following demonstrates:
1766
1767  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1768
1769 which prints out 'word' three times, but
1770
1771  'words'=~/(word|word|word)(?{ print $1 })S/
1772
1773 which doesnt print it out at all. This is due to other optimisations kicking in.
1774
1775 Example of what happens on a structural level:
1776
1777 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1778
1779    1: CURLYM[1] {1,32767}(18)
1780    5:   BRANCH(8)
1781    6:     EXACT <ac>(16)
1782    8:   BRANCH(11)
1783    9:     EXACT <ad>(16)
1784   11:   BRANCH(14)
1785   12:     EXACT <ab>(16)
1786   16:   SUCCEED(0)
1787   17:   NOTHING(18)
1788   18: END(0)
1789
1790 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1791 and should turn into:
1792
1793    1: CURLYM[1] {1,32767}(18)
1794    5:   TRIE(16)
1795   [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1796   <ac>
1797   <ad>
1798   <ab>
1799   16:   SUCCEED(0)
1800   17:   NOTHING(18)
1801   18: END(0)
1802
1803 Cases where tail != last would be like /(?foo|bar)baz/:
1804
1805    1: BRANCH(4)
1806    2:   EXACT <foo>(8)
1807    4: BRANCH(7)
1808    5:   EXACT <bar>(8)
1809    7: TAIL(8)
1810    8: EXACT <baz>(10)
1811   10: END(0)
1812
1813 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1814 and would end up looking like:
1815
1816  1: TRIE(8)
1817  [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1818   <foo>
1819   <bar>
1820    7: TAIL(8)
1821    8: EXACT <baz>(10)
1822   10: END(0)
1823
1824  d = uvchr_to_utf8_flags(d, uv, 0);
1825
1826 is the recommended Unicode-aware way of saying
1827
1828  *(d++) = uv;
1829 */
1830
1831 #define TRIE_STORE_REVCHAR(val)                                            \
1832  STMT_START {                                                           \
1833   if (UTF) {          \
1834    SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1835    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);    \
1836    unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1837    SvCUR_set(zlopp, kapow - flrbbbbb);       \
1838    SvPOK_on(zlopp);         \
1839    SvUTF8_on(zlopp);         \
1840    av_push(revcharmap, zlopp);        \
1841   } else {          \
1842    char ooooff = (char)val;                                           \
1843    av_push(revcharmap, newSVpvn(&ooooff, 1));      \
1844   }           \
1845   } STMT_END
1846
1847 /* This gets the next character from the input, folding it if not already
1848  * folded. */
1849 #define TRIE_READ_CHAR STMT_START {                                           \
1850  wordlen++;                                                                \
1851  if ( UTF ) {                                                              \
1852   /* if it is UTF then it is either already folded, or does not need    \
1853   * folding */                                                         \
1854   uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
1855  }                                                                         \
1856  else if (folder == PL_fold_latin1) {                                      \
1857   /* This folder implies Unicode rules, which in the range expressible  \
1858   *  by not UTF is the lower case, with the two exceptions, one of     \
1859   *  which should have been taken care of before calling this */       \
1860   assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
1861   uvc = toLOWER_L1(*uc);                                                \
1862   if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
1863   len = 1;                                                              \
1864  } else {                                                                  \
1865   /* raw data, will be folded later if needed */                        \
1866   uvc = (U32)*uc;                                                       \
1867   len = 1;                                                              \
1868  }                                                                         \
1869 } STMT_END
1870
1871
1872
1873 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1874  if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1875   U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1876   Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1877  }                                                           \
1878  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1879  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1880  TRIE_LIST_CUR( state )++;                                   \
1881 } STMT_END
1882
1883 #define TRIE_LIST_NEW(state) STMT_START {                       \
1884  Newxz( trie->states[ state ].trans.list,               \
1885   4, reg_trie_trans_le );                                 \
1886  TRIE_LIST_CUR( state ) = 1;                                \
1887  TRIE_LIST_LEN( state ) = 4;                                \
1888 } STMT_END
1889
1890 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1891  U16 dupe= trie->states[ state ].wordnum;                    \
1892  regnode * const noper_next = regnext( noper );              \
1893                 \
1894  DEBUG_r({                                                   \
1895   /* store the word for dumping */                        \
1896   SV* tmp;                                                \
1897   if (OP(noper) != NOTHING)                               \
1898    tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1899   else                                                    \
1900    tmp = newSVpvn_utf8( "", 0, UTF );   \
1901   av_push( trie_words, tmp );                             \
1902  });                                                         \
1903                 \
1904  curword++;                                                  \
1905  trie->wordinfo[curword].prev   = 0;                         \
1906  trie->wordinfo[curword].len    = wordlen;                   \
1907  trie->wordinfo[curword].accept = state;                     \
1908                 \
1909  if ( noper_next < tail ) {                                  \
1910   if (!trie->jump)                                        \
1911    trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1912             sizeof(U16) ); \
1913   trie->jump[curword] = (U16)(noper_next - convert);      \
1914   if (!jumper)                                            \
1915    jumper = noper_next;                                \
1916   if (!nextbranch)                                        \
1917    nextbranch= regnext(cur);                           \
1918  }                                                           \
1919                 \
1920  if ( dupe ) {                                               \
1921   /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1922   /* chain, so that when the bits of chain are later    */\
1923   /* linked together, the dups appear in the chain      */\
1924   trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1925   trie->wordinfo[dupe].prev = curword;                    \
1926  } else {                                                    \
1927   /* we haven't inserted this word yet.                */ \
1928   trie->states[ state ].wordnum = curword;                \
1929  }                                                           \
1930 } STMT_END
1931
1932
1933 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)  \
1934  ( ( base + charid >=  ucharcount     \
1935   && base + charid < ubound     \
1936   && state == trie->trans[ base - ucharcount + charid ].check \
1937   && trie->trans[ base - ucharcount + charid ].next )  \
1938   ? trie->trans[ base - ucharcount + charid ].next  \
1939   : ( state==1 ? special : 0 )     \
1940  )
1941
1942 #define MADE_TRIE       1
1943 #define MADE_JUMP_TRIE  2
1944 #define MADE_EXACT_TRIE 4
1945
1946 STATIC I32
1947 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1948     regnode *first, regnode *last, regnode *tail,
1949     U32 word_count, U32 flags, U32 depth)
1950 {
1951  dVAR;
1952  /* first pass, loop through and scan words */
1953  reg_trie_data *trie;
1954  HV *widecharmap = NULL;
1955  AV *revcharmap = newAV();
1956  regnode *cur;
1957  STRLEN len = 0;
1958  UV uvc = 0;
1959  U16 curword = 0;
1960  U32 next_alloc = 0;
1961  regnode *jumper = NULL;
1962  regnode *nextbranch = NULL;
1963  regnode *convert = NULL;
1964  U32 *prev_states; /* temp array mapping each state to previous one */
1965  /* we just use folder as a flag in utf8 */
1966  const U8 * folder = NULL;
1967
1968 #ifdef DEBUGGING
1969  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1970  AV *trie_words = NULL;
1971  /* along with revcharmap, this only used during construction but both are
1972  * useful during debugging so we store them in the struct when debugging.
1973  */
1974 #else
1975  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1976  STRLEN trie_charcount=0;
1977 #endif
1978  SV *re_trie_maxbuff;
1979  GET_RE_DEBUG_FLAGS_DECL;
1980
1981  PERL_ARGS_ASSERT_MAKE_TRIE;
1982 #ifndef DEBUGGING
1983  PERL_UNUSED_ARG(depth);
1984 #endif
1985
1986  switch (flags) {
1987   case EXACT: break;
1988   case EXACTFA:
1989   case EXACTFU_SS:
1990   case EXACTFU: folder = PL_fold_latin1; break;
1991   case EXACTF:  folder = PL_fold; break;
1992   default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1993  }
1994
1995  trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1996  trie->refcount = 1;
1997  trie->startstate = 1;
1998  trie->wordcount = word_count;
1999  RExC_rxi->data->data[ data_slot ] = (void*)trie;
2000  trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2001  if (flags == EXACT)
2002   trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2003  trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2004      trie->wordcount+1, sizeof(reg_trie_wordinfo));
2005
2006  DEBUG_r({
2007   trie_words = newAV();
2008  });
2009
2010  re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2011  assert(re_trie_maxbuff);
2012  if (!SvIOK(re_trie_maxbuff)) {
2013   sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2014  }
2015  DEBUG_TRIE_COMPILE_r({
2016   PerlIO_printf( Perl_debug_log,
2017   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2018   (int)depth * 2 + 2, "",
2019   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2020   REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2021  });
2022
2023    /* Find the node we are going to overwrite */
2024  if ( first == startbranch && OP( last ) != BRANCH ) {
2025   /* whole branch chain */
2026   convert = first;
2027  } else {
2028   /* branch sub-chain */
2029   convert = NEXTOPER( first );
2030  }
2031
2032  /*  -- First loop and Setup --
2033
2034  We first traverse the branches and scan each word to determine if it
2035  contains widechars, and how many unique chars there are, this is
2036  important as we have to build a table with at least as many columns as we
2037  have unique chars.
2038
2039  We use an array of integers to represent the character codes 0..255
2040  (trie->charmap) and we use a an HV* to store Unicode characters. We use
2041  the native representation of the character value as the key and IV's for
2042  the coded index.
2043
2044  *TODO* If we keep track of how many times each character is used we can
2045  remap the columns so that the table compression later on is more
2046  efficient in terms of memory by ensuring the most common value is in the
2047  middle and the least common are on the outside.  IMO this would be better
2048  than a most to least common mapping as theres a decent chance the most
2049  common letter will share a node with the least common, meaning the node
2050  will not be compressible. With a middle is most common approach the worst
2051  case is when we have the least common nodes twice.
2052
2053  */
2054
2055  for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2056   regnode *noper = NEXTOPER( cur );
2057   const U8 *uc = (U8*)STRING( noper );
2058   const U8 *e  = uc + STR_LEN( noper );
2059   int foldlen = 0;
2060   U32 wordlen      = 0;         /* required init */
2061   STRLEN minchars = 0;
2062   STRLEN maxchars = 0;
2063   bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2064            bitmap?*/
2065
2066   if (OP(noper) == NOTHING) {
2067    regnode *noper_next= regnext(noper);
2068    if (noper_next != tail && OP(noper_next) == flags) {
2069     noper = noper_next;
2070     uc= (U8*)STRING(noper);
2071     e= uc + STR_LEN(noper);
2072     trie->minlen= STR_LEN(noper);
2073    } else {
2074     trie->minlen= 0;
2075     continue;
2076    }
2077   }
2078
2079   if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2080    TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2081           regardless of encoding */
2082    if (OP( noper ) == EXACTFU_SS) {
2083     /* false positives are ok, so just set this */
2084     TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2085    }
2086   }
2087   for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2088           branch */
2089    TRIE_CHARCOUNT(trie)++;
2090    TRIE_READ_CHAR;
2091
2092    /* TRIE_READ_CHAR returns the current character, or its fold if /i
2093    * is in effect.  Under /i, this character can match itself, or
2094    * anything that folds to it.  If not under /i, it can match just
2095    * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2096    * all fold to k, and all are single characters.   But some folds
2097    * expand to more than one character, so for example LATIN SMALL
2098    * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2099    * the string beginning at 'uc' is 'ffi', it could be matched by
2100    * three characters, or just by the one ligature character. (It
2101    * could also be matched by two characters: LATIN SMALL LIGATURE FF
2102    * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2103    * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2104    * match.)  The trie needs to know the minimum and maximum number
2105    * of characters that could match so that it can use size alone to
2106    * quickly reject many match attempts.  The max is simple: it is
2107    * the number of folded characters in this branch (since a fold is
2108    * never shorter than what folds to it. */
2109
2110    maxchars++;
2111
2112    /* And the min is equal to the max if not under /i (indicated by
2113    * 'folder' being NULL), or there are no multi-character folds.  If
2114    * there is a multi-character fold, the min is incremented just
2115    * once, for the character that folds to the sequence.  Each
2116    * character in the sequence needs to be added to the list below of
2117    * characters in the trie, but we count only the first towards the
2118    * min number of characters needed.  This is done through the
2119    * variable 'foldlen', which is returned by the macros that look
2120    * for these sequences as the number of bytes the sequence
2121    * occupies.  Each time through the loop, we decrement 'foldlen' by
2122    * how many bytes the current char occupies.  Only when it reaches
2123    * 0 do we increment 'minchars' or look for another multi-character
2124    * sequence. */
2125    if (folder == NULL) {
2126     minchars++;
2127    }
2128    else if (foldlen > 0) {
2129     foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2130    }
2131    else {
2132     minchars++;
2133
2134     /* See if *uc is the beginning of a multi-character fold.  If
2135     * so, we decrement the length remaining to look at, to account
2136     * for the current character this iteration.  (We can use 'uc'
2137     * instead of the fold returned by TRIE_READ_CHAR because for
2138     * non-UTF, the latin1_safe macro is smart enough to account
2139     * for all the unfolded characters, and because for UTF, the
2140     * string will already have been folded earlier in the
2141     * compilation process */
2142     if (UTF) {
2143      if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2144       foldlen -= UTF8SKIP(uc);
2145      }
2146     }
2147     else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2148      foldlen--;
2149     }
2150    }
2151
2152    /* The current character (and any potential folds) should be added
2153    * to the possible matching characters for this position in this
2154    * branch */
2155    if ( uvc < 256 ) {
2156     if ( folder ) {
2157      U8 folded= folder[ (U8) uvc ];
2158      if ( !trie->charmap[ folded ] ) {
2159       trie->charmap[ folded ]=( ++trie->uniquecharcount );
2160       TRIE_STORE_REVCHAR( folded );
2161      }
2162     }
2163     if ( !trie->charmap[ uvc ] ) {
2164      trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2165      TRIE_STORE_REVCHAR( uvc );
2166     }
2167     if ( set_bit ) {
2168      /* store the codepoint in the bitmap, and its folded
2169      * equivalent. */
2170      TRIE_BITMAP_SET(trie, uvc);
2171
2172      /* store the folded codepoint */
2173      if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2174
2175      if ( !UTF ) {
2176       /* store first byte of utf8 representation of
2177       variant codepoints */
2178       if (! UVCHR_IS_INVARIANT(uvc)) {
2179        TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2180       }
2181      }
2182      set_bit = 0; /* We've done our bit :-) */
2183     }
2184    } else {
2185
2186     /* XXX We could come up with the list of code points that fold
2187     * to this using PL_utf8_foldclosures, except not for
2188     * multi-char folds, as there may be multiple combinations
2189     * there that could work, which needs to wait until runtime to
2190     * resolve (The comment about LIGATURE FFI above is such an
2191     * example */
2192
2193     SV** svpp;
2194     if ( !widecharmap )
2195      widecharmap = newHV();
2196
2197     svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2198
2199     if ( !svpp )
2200      Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2201
2202     if ( !SvTRUE( *svpp ) ) {
2203      sv_setiv( *svpp, ++trie->uniquecharcount );
2204      TRIE_STORE_REVCHAR(uvc);
2205     }
2206    }
2207   } /* end loop through characters in this branch of the trie */
2208
2209   /* We take the min and max for this branch and combine to find the min
2210   * and max for all branches processed so far */
2211   if( cur == first ) {
2212    trie->minlen = minchars;
2213    trie->maxlen = maxchars;
2214   } else if (minchars < trie->minlen) {
2215    trie->minlen = minchars;
2216   } else if (maxchars > trie->maxlen) {
2217    trie->maxlen = maxchars;
2218   }
2219  } /* end first pass */
2220  DEBUG_TRIE_COMPILE_r(
2221   PerlIO_printf( Perl_debug_log,
2222     "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2223     (int)depth * 2 + 2,"",
2224     ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2225     (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2226     (int)trie->minlen, (int)trie->maxlen )
2227  );
2228
2229  /*
2230   We now know what we are dealing with in terms of unique chars and
2231   string sizes so we can calculate how much memory a naive
2232   representation using a flat table  will take. If it's over a reasonable
2233   limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2234   conservative but potentially much slower representation using an array
2235   of lists.
2236
2237   At the end we convert both representations into the same compressed
2238   form that will be used in regexec.c for matching with. The latter
2239   is a form that cannot be used to construct with but has memory
2240   properties similar to the list form and access properties similar
2241   to the table form making it both suitable for fast searches and
2242   small enough that its feasable to store for the duration of a program.
2243
2244   See the comment in the code where the compressed table is produced
2245   inplace from the flat tabe representation for an explanation of how
2246   the compression works.
2247
2248  */
2249
2250
2251  Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2252  prev_states[1] = 0;
2253
2254  if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2255              > SvIV(re_trie_maxbuff) )
2256  {
2257   /*
2258    Second Pass -- Array Of Lists Representation
2259
2260    Each state will be represented by a list of charid:state records
2261    (reg_trie_trans_le) the first such element holds the CUR and LEN
2262    points of the allocated array. (See defines above).
2263
2264    We build the initial structure using the lists, and then convert
2265    it into the compressed table form which allows faster lookups
2266    (but cant be modified once converted).
2267   */
2268
2269   STRLEN transcount = 1;
2270
2271   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2272    "%*sCompiling trie using list compiler\n",
2273    (int)depth * 2 + 2, ""));
2274
2275   trie->states = (reg_trie_state *)
2276    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2277         sizeof(reg_trie_state) );
2278   TRIE_LIST_NEW(1);
2279   next_alloc = 2;
2280
2281   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2282
2283    regnode *noper   = NEXTOPER( cur );
2284    U8 *uc           = (U8*)STRING( noper );
2285    const U8 *e      = uc + STR_LEN( noper );
2286    U32 state        = 1;         /* required init */
2287    U16 charid       = 0;         /* sanity init */
2288    U32 wordlen      = 0;         /* required init */
2289
2290    if (OP(noper) == NOTHING) {
2291     regnode *noper_next= regnext(noper);
2292     if (noper_next != tail && OP(noper_next) == flags) {
2293      noper = noper_next;
2294      uc= (U8*)STRING(noper);
2295      e= uc + STR_LEN(noper);
2296     }
2297    }
2298
2299    if (OP(noper) != NOTHING) {
2300     for ( ; uc < e ; uc += len ) {
2301
2302      TRIE_READ_CHAR;
2303
2304      if ( uvc < 256 ) {
2305       charid = trie->charmap[ uvc ];
2306      } else {
2307       SV** const svpp = hv_fetch( widecharmap,
2308              (char*)&uvc,
2309              sizeof( UV ),
2310              0);
2311       if ( !svpp ) {
2312        charid = 0;
2313       } else {
2314        charid=(U16)SvIV( *svpp );
2315       }
2316      }
2317      /* charid is now 0 if we dont know the char read, or
2318      * nonzero if we do */
2319      if ( charid ) {
2320
2321       U16 check;
2322       U32 newstate = 0;
2323
2324       charid--;
2325       if ( !trie->states[ state ].trans.list ) {
2326        TRIE_LIST_NEW( state );
2327       }
2328       for ( check = 1;
2329        check <= TRIE_LIST_USED( state );
2330        check++ )
2331       {
2332        if ( TRIE_LIST_ITEM( state, check ).forid
2333                  == charid )
2334        {
2335         newstate = TRIE_LIST_ITEM( state, check ).newstate;
2336         break;
2337        }
2338       }
2339       if ( ! newstate ) {
2340        newstate = next_alloc++;
2341        prev_states[newstate] = state;
2342        TRIE_LIST_PUSH( state, charid, newstate );
2343        transcount++;
2344       }
2345       state = newstate;
2346      } else {
2347       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2348      }
2349     }
2350    }
2351    TRIE_HANDLE_WORD(state);
2352
2353   } /* end second pass */
2354
2355   /* next alloc is the NEXT state to be allocated */
2356   trie->statecount = next_alloc;
2357   trie->states = (reg_trie_state *)
2358    PerlMemShared_realloc( trie->states,
2359         next_alloc
2360         * sizeof(reg_trie_state) );
2361
2362   /* and now dump it out before we compress it */
2363   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2364               revcharmap, next_alloc,
2365               depth+1)
2366   );
2367
2368   trie->trans = (reg_trie_trans *)
2369    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2370   {
2371    U32 state;
2372    U32 tp = 0;
2373    U32 zp = 0;
2374
2375
2376    for( state=1 ; state < next_alloc ; state ++ ) {
2377     U32 base=0;
2378
2379     /*
2380     DEBUG_TRIE_COMPILE_MORE_r(
2381      PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2382     );
2383     */
2384
2385     if (trie->states[state].trans.list) {
2386      U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2387      U16 maxid=minid;
2388      U16 idx;
2389
2390      for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2391       const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2392       if ( forid < minid ) {
2393        minid=forid;
2394       } else if ( forid > maxid ) {
2395        maxid=forid;
2396       }
2397      }
2398      if ( transcount < tp + maxid - minid + 1) {
2399       transcount *= 2;
2400       trie->trans = (reg_trie_trans *)
2401        PerlMemShared_realloc( trie->trans,
2402              transcount
2403              * sizeof(reg_trie_trans) );
2404       Zero( trie->trans + (transcount / 2),
2405        transcount / 2,
2406        reg_trie_trans );
2407      }
2408      base = trie->uniquecharcount + tp - minid;
2409      if ( maxid == minid ) {
2410       U32 set = 0;
2411       for ( ; zp < tp ; zp++ ) {
2412        if ( ! trie->trans[ zp ].next ) {
2413         base = trie->uniquecharcount + zp - minid;
2414         trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2415                 1).newstate;
2416         trie->trans[ zp ].check = state;
2417         set = 1;
2418         break;
2419        }
2420       }
2421       if ( !set ) {
2422        trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2423                 1).newstate;
2424        trie->trans[ tp ].check = state;
2425        tp++;
2426        zp = tp;
2427       }
2428      } else {
2429       for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2430        const U32 tid = base
2431           - trie->uniquecharcount
2432           + TRIE_LIST_ITEM( state, idx ).forid;
2433        trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2434                 idx ).newstate;
2435        trie->trans[ tid ].check = state;
2436       }
2437       tp += ( maxid - minid + 1 );
2438      }
2439      Safefree(trie->states[ state ].trans.list);
2440     }
2441     /*
2442     DEBUG_TRIE_COMPILE_MORE_r(
2443      PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2444     );
2445     */
2446     trie->states[ state ].trans.base=base;
2447    }
2448    trie->lasttrans = tp + 1;
2449   }
2450  } else {
2451   /*
2452   Second Pass -- Flat Table Representation.
2453
2454   we dont use the 0 slot of either trans[] or states[] so we add 1 to
2455   each.  We know that we will need Charcount+1 trans at most to store
2456   the data (one row per char at worst case) So we preallocate both
2457   structures assuming worst case.
2458
2459   We then construct the trie using only the .next slots of the entry
2460   structs.
2461
2462   We use the .check field of the first entry of the node temporarily
2463   to make compression both faster and easier by keeping track of how
2464   many non zero fields are in the node.
2465
2466   Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2467   transition.
2468
2469   There are two terms at use here: state as a TRIE_NODEIDX() which is
2470   a number representing the first entry of the node, and state as a
2471   TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2472   and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2473   if there are 2 entrys per node. eg:
2474
2475    A B       A B
2476   1. 2 4    1. 3 7
2477   2. 0 3    3. 0 5
2478   3. 0 0    5. 0 0
2479   4. 0 0    7. 0 0
2480
2481   The table is internally in the right hand, idx form. However as we
2482   also have to deal with the states array which is indexed by nodenum
2483   we have to use TRIE_NODENUM() to convert.
2484
2485   */
2486   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2487    "%*sCompiling trie using table compiler\n",
2488    (int)depth * 2 + 2, ""));
2489
2490   trie->trans = (reg_trie_trans *)
2491    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2492         * trie->uniquecharcount + 1,
2493         sizeof(reg_trie_trans) );
2494   trie->states = (reg_trie_state *)
2495    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2496         sizeof(reg_trie_state) );
2497   next_alloc = trie->uniquecharcount + 1;
2498
2499
2500   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2501
2502    regnode *noper   = NEXTOPER( cur );
2503    const U8 *uc     = (U8*)STRING( noper );
2504    const U8 *e      = uc + STR_LEN( noper );
2505
2506    U32 state        = 1;         /* required init */
2507
2508    U16 charid       = 0;         /* sanity init */
2509    U32 accept_state = 0;         /* sanity init */
2510
2511    U32 wordlen      = 0;         /* required init */
2512
2513    if (OP(noper) == NOTHING) {
2514     regnode *noper_next= regnext(noper);
2515     if (noper_next != tail && OP(noper_next) == flags) {
2516      noper = noper_next;
2517      uc= (U8*)STRING(noper);
2518      e= uc + STR_LEN(noper);
2519     }
2520    }
2521
2522    if ( OP(noper) != NOTHING ) {
2523     for ( ; uc < e ; uc += len ) {
2524
2525      TRIE_READ_CHAR;
2526
2527      if ( uvc < 256 ) {
2528       charid = trie->charmap[ uvc ];
2529      } else {
2530       SV* const * const svpp = hv_fetch( widecharmap,
2531               (char*)&uvc,
2532               sizeof( UV ),
2533               0);
2534       charid = svpp ? (U16)SvIV(*svpp) : 0;
2535      }
2536      if ( charid ) {
2537       charid--;
2538       if ( !trie->trans[ state + charid ].next ) {
2539        trie->trans[ state + charid ].next = next_alloc;
2540        trie->trans[ state ].check++;
2541        prev_states[TRIE_NODENUM(next_alloc)]
2542          = TRIE_NODENUM(state);
2543        next_alloc += trie->uniquecharcount;
2544       }
2545       state = trie->trans[ state + charid ].next;
2546      } else {
2547       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2548      }
2549      /* charid is now 0 if we dont know the char read, or
2550      * nonzero if we do */
2551     }
2552    }
2553    accept_state = TRIE_NODENUM( state );
2554    TRIE_HANDLE_WORD(accept_state);
2555
2556   } /* end second pass */
2557
2558   /* and now dump it out before we compress it */
2559   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2560               revcharmap,
2561               next_alloc, depth+1));
2562
2563   {
2564   /*
2565   * Inplace compress the table.*
2566
2567   For sparse data sets the table constructed by the trie algorithm will
2568   be mostly 0/FAIL transitions or to put it another way mostly empty.
2569   (Note that leaf nodes will not contain any transitions.)
2570
2571   This algorithm compresses the tables by eliminating most such
2572   transitions, at the cost of a modest bit of extra work during lookup:
2573
2574   - Each states[] entry contains a .base field which indicates the
2575   index in the state[] array wheres its transition data is stored.
2576
2577   - If .base is 0 there are no valid transitions from that node.
2578
2579   - If .base is nonzero then charid is added to it to find an entry in
2580   the trans array.
2581
2582   -If trans[states[state].base+charid].check!=state then the
2583   transition is taken to be a 0/Fail transition. Thus if there are fail
2584   transitions at the front of the node then the .base offset will point
2585   somewhere inside the previous nodes data (or maybe even into a node
2586   even earlier), but the .check field determines if the transition is
2587   valid.
2588
2589   XXX - wrong maybe?
2590   The following process inplace converts the table to the compressed
2591   table: We first do not compress the root node 1,and mark all its
2592   .check pointers as 1 and set its .base pointer as 1 as well. This
2593   allows us to do a DFA construction from the compressed table later,
2594   and ensures that any .base pointers we calculate later are greater
2595   than 0.
2596
2597   - We set 'pos' to indicate the first entry of the second node.
2598
2599   - We then iterate over the columns of the node, finding the first and
2600   last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2601   and set the .check pointers accordingly, and advance pos
2602   appropriately and repreat for the next node. Note that when we copy
2603   the next pointers we have to convert them from the original
2604   NODEIDX form to NODENUM form as the former is not valid post
2605   compression.
2606
2607   - If a node has no transitions used we mark its base as 0 and do not
2608   advance the pos pointer.
2609
2610   - If a node only has one transition we use a second pointer into the
2611   structure to fill in allocated fail transitions from other states.
2612   This pointer is independent of the main pointer and scans forward
2613   looking for null transitions that are allocated to a state. When it
2614   finds one it writes the single transition into the "hole".  If the
2615   pointer doesnt find one the single transition is appended as normal.
2616
2617   - Once compressed we can Renew/realloc the structures to release the
2618   excess space.
2619
2620   See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2621   specifically Fig 3.47 and the associated pseudocode.
2622
2623   demq
2624   */
2625   const U32 laststate = TRIE_NODENUM( next_alloc );
2626   U32 state, charid;
2627   U32 pos = 0, zp=0;
2628   trie->statecount = laststate;
2629
2630   for ( state = 1 ; state < laststate ; state++ ) {
2631    U8 flag = 0;
2632    const U32 stateidx = TRIE_NODEIDX( state );
2633    const U32 o_used = trie->trans[ stateidx ].check;
2634    U32 used = trie->trans[ stateidx ].check;
2635    trie->trans[ stateidx ].check = 0;
2636
2637    for ( charid = 0;
2638     used && charid < trie->uniquecharcount;
2639     charid++ )
2640    {
2641     if ( flag || trie->trans[ stateidx + charid ].next ) {
2642      if ( trie->trans[ stateidx + charid ].next ) {
2643       if (o_used == 1) {
2644        for ( ; zp < pos ; zp++ ) {
2645         if ( ! trie->trans[ zp ].next ) {
2646          break;
2647         }
2648        }
2649        trie->states[ state ].trans.base
2650              = zp
2651              + trie->uniquecharcount
2652              - charid ;
2653        trie->trans[ zp ].next
2654         = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2655                + charid ].next );
2656        trie->trans[ zp ].check = state;
2657        if ( ++zp > pos ) pos = zp;
2658        break;
2659       }
2660       used--;
2661      }
2662      if ( !flag ) {
2663       flag = 1;
2664       trie->states[ state ].trans.base
2665          = pos + trie->uniquecharcount - charid ;
2666      }
2667      trie->trans[ pos ].next
2668       = SAFE_TRIE_NODENUM(
2669          trie->trans[ stateidx + charid ].next );
2670      trie->trans[ pos ].check = state;
2671      pos++;
2672     }
2673    }
2674   }
2675   trie->lasttrans = pos + 1;
2676   trie->states = (reg_trie_state *)
2677    PerlMemShared_realloc( trie->states, laststate
2678         * sizeof(reg_trie_state) );
2679   DEBUG_TRIE_COMPILE_MORE_r(
2680    PerlIO_printf( Perl_debug_log,
2681     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2682     (int)depth * 2 + 2,"",
2683     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2684      + 1 ),
2685     (IV)next_alloc,
2686     (IV)pos,
2687     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2688    );
2689
2690   } /* end table compress */
2691  }
2692  DEBUG_TRIE_COMPILE_MORE_r(
2693    PerlIO_printf(Perl_debug_log,
2694     "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2695     (int)depth * 2 + 2, "",
2696     (UV)trie->statecount,
2697     (UV)trie->lasttrans)
2698  );
2699  /* resize the trans array to remove unused space */
2700  trie->trans = (reg_trie_trans *)
2701   PerlMemShared_realloc( trie->trans, trie->lasttrans
2702        * sizeof(reg_trie_trans) );
2703
2704  {   /* Modify the program and insert the new TRIE node */
2705   U8 nodetype =(U8)(flags & 0xFF);
2706   char *str=NULL;
2707
2708 #ifdef DEBUGGING
2709   regnode *optimize = NULL;
2710 #ifdef RE_TRACK_PATTERN_OFFSETS
2711
2712   U32 mjd_offset = 0;
2713   U32 mjd_nodelen = 0;
2714 #endif /* RE_TRACK_PATTERN_OFFSETS */
2715 #endif /* DEBUGGING */
2716   /*
2717   This means we convert either the first branch or the first Exact,
2718   depending on whether the thing following (in 'last') is a branch
2719   or not and whther first is the startbranch (ie is it a sub part of
2720   the alternation or is it the whole thing.)
2721   Assuming its a sub part we convert the EXACT otherwise we convert
2722   the whole branch sequence, including the first.
2723   */
2724   /* Find the node we are going to overwrite */
2725   if ( first != startbranch || OP( last ) == BRANCH ) {
2726    /* branch sub-chain */
2727    NEXT_OFF( first ) = (U16)(last - first);
2728 #ifdef RE_TRACK_PATTERN_OFFSETS
2729    DEBUG_r({
2730     mjd_offset= Node_Offset((convert));
2731     mjd_nodelen= Node_Length((convert));
2732    });
2733 #endif
2734    /* whole branch chain */
2735   }
2736 #ifdef RE_TRACK_PATTERN_OFFSETS
2737   else {
2738    DEBUG_r({
2739     const  regnode *nop = NEXTOPER( convert );
2740     mjd_offset= Node_Offset((nop));
2741     mjd_nodelen= Node_Length((nop));
2742    });
2743   }
2744   DEBUG_OPTIMISE_r(
2745    PerlIO_printf(Perl_debug_log,
2746     "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2747     (int)depth * 2 + 2, "",
2748     (UV)mjd_offset, (UV)mjd_nodelen)
2749   );
2750 #endif
2751   /* But first we check to see if there is a common prefix we can
2752   split out as an EXACT and put in front of the TRIE node.  */
2753   trie->startstate= 1;
2754   if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2755    U32 state;
2756    for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2757     U32 ofs = 0;
2758     I32 idx = -1;
2759     U32 count = 0;
2760     const U32 base = trie->states[ state ].trans.base;
2761
2762     if ( trie->states[state].wordnum )
2763       count = 1;
2764
2765     for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2766      if ( ( base + ofs >= trie->uniquecharcount ) &&
2767       ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2768       trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2769      {
2770       if ( ++count > 1 ) {
2771        SV **tmp = av_fetch( revcharmap, ofs, 0);
2772        const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2773        if ( state == 1 ) break;
2774        if ( count == 2 ) {
2775         Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2776         DEBUG_OPTIMISE_r(
2777          PerlIO_printf(Perl_debug_log,
2778           "%*sNew Start State=%"UVuf" Class: [",
2779           (int)depth * 2 + 2, "",
2780           (UV)state));
2781         if (idx >= 0) {
2782          SV ** const tmp = av_fetch( revcharmap, idx, 0);
2783          const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2784
2785          TRIE_BITMAP_SET(trie,*ch);
2786          if ( folder )
2787           TRIE_BITMAP_SET(trie, folder[ *ch ]);
2788          DEBUG_OPTIMISE_r(
2789           PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2790          );
2791         }
2792        }
2793        TRIE_BITMAP_SET(trie,*ch);
2794        if ( folder )
2795         TRIE_BITMAP_SET(trie,folder[ *ch ]);
2796        DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2797       }
2798       idx = ofs;
2799      }
2800     }
2801     if ( count == 1 ) {
2802      SV **tmp = av_fetch( revcharmap, idx, 0);
2803      STRLEN len;
2804      char *ch = SvPV( *tmp, len );
2805      DEBUG_OPTIMISE_r({
2806       SV *sv=sv_newmortal();
2807       PerlIO_printf( Perl_debug_log,
2808        "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2809        (int)depth * 2 + 2, "",
2810        (UV)state, (UV)idx,
2811        pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2812         PL_colors[0], PL_colors[1],
2813         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2814         PERL_PV_ESCAPE_FIRSTCHAR
2815        )
2816       );
2817      });
2818      if ( state==1 ) {
2819       OP( convert ) = nodetype;
2820       str=STRING(convert);
2821       STR_LEN(convert)=0;
2822      }
2823      STR_LEN(convert) += len;
2824      while (len--)
2825       *str++ = *ch++;
2826     } else {
2827 #ifdef DEBUGGING
2828      if (state>1)
2829       DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2830 #endif
2831      break;
2832     }
2833    }
2834    trie->prefixlen = (state-1);
2835    if (str) {
2836     regnode *n = convert+NODE_SZ_STR(convert);
2837     NEXT_OFF(convert) = NODE_SZ_STR(convert);
2838     trie->startstate = state;
2839     trie->minlen -= (state - 1);
2840     trie->maxlen -= (state - 1);
2841 #ifdef DEBUGGING
2842    /* At least the UNICOS C compiler choked on this
2843     * being argument to DEBUG_r(), so let's just have
2844     * it right here. */
2845    if (
2846 #ifdef PERL_EXT_RE_BUILD
2847     1
2848 #else
2849     DEBUG_r_TEST
2850 #endif
2851     ) {
2852     regnode *fix = convert;
2853     U32 word = trie->wordcount;
2854     mjd_nodelen++;
2855     Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2856     while( ++fix < n ) {
2857      Set_Node_Offset_Length(fix, 0, 0);
2858     }
2859     while (word--) {
2860      SV ** const tmp = av_fetch( trie_words, word, 0 );
2861      if (tmp) {
2862       if ( STR_LEN(convert) <= SvCUR(*tmp) )
2863        sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2864       else
2865        sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2866      }
2867     }
2868    }
2869 #endif
2870     if (trie->maxlen) {
2871      convert = n;
2872     } else {
2873      NEXT_OFF(convert) = (U16)(tail - convert);
2874      DEBUG_r(optimize= n);
2875     }
2876    }
2877   }
2878   if (!jumper)
2879    jumper = last;
2880   if ( trie->maxlen ) {
2881    NEXT_OFF( convert ) = (U16)(tail - convert);
2882    ARG_SET( convert, data_slot );
2883    /* Store the offset to the first unabsorbed branch in
2884    jump[0], which is otherwise unused by the jump logic.
2885    We use this when dumping a trie and during optimisation. */
2886    if (trie->jump)
2887     trie->jump[0] = (U16)(nextbranch - convert);
2888
2889    /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2890    *   and there is a bitmap
2891    *   and the first "jump target" node we found leaves enough room
2892    * then convert the TRIE node into a TRIEC node, with the bitmap
2893    * embedded inline in the opcode - this is hypothetically faster.
2894    */
2895    if ( !trie->states[trie->startstate].wordnum
2896     && trie->bitmap
2897     && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2898    {
2899     OP( convert ) = TRIEC;
2900     Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2901     PerlMemShared_free(trie->bitmap);
2902     trie->bitmap= NULL;
2903    } else
2904     OP( convert ) = TRIE;
2905
2906    /* store the type in the flags */
2907    convert->flags = nodetype;
2908    DEBUG_r({
2909    optimize = convert
2910      + NODE_STEP_REGNODE
2911      + regarglen[ OP( convert ) ];
2912    });
2913    /* XXX We really should free up the resource in trie now,
2914     as we won't use them - (which resources?) dmq */
2915   }
2916   /* needed for dumping*/
2917   DEBUG_r(if (optimize) {
2918    regnode *opt = convert;
2919
2920    while ( ++opt < optimize) {
2921     Set_Node_Offset_Length(opt,0,0);
2922    }
2923    /*
2924     Try to clean up some of the debris left after the
2925     optimisation.
2926    */
2927    while( optimize < jumper ) {
2928     mjd_nodelen += Node_Length((optimize));
2929     OP( optimize ) = OPTIMIZED;
2930     Set_Node_Offset_Length(optimize,0,0);
2931     optimize++;
2932    }
2933    Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2934   });
2935  } /* end node insert */
2936  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2937
2938  /*  Finish populating the prev field of the wordinfo array.  Walk back
2939  *  from each accept state until we find another accept state, and if
2940  *  so, point the first word's .prev field at the second word. If the
2941  *  second already has a .prev field set, stop now. This will be the
2942  *  case either if we've already processed that word's accept state,
2943  *  or that state had multiple words, and the overspill words were
2944  *  already linked up earlier.
2945  */
2946  {
2947   U16 word;
2948   U32 state;
2949   U16 prev;
2950
2951   for (word=1; word <= trie->wordcount; word++) {
2952    prev = 0;
2953    if (trie->wordinfo[word].prev)
2954     continue;
2955    state = trie->wordinfo[word].accept;
2956    while (state) {
2957     state = prev_states[state];
2958     if (!state)
2959      break;
2960     prev = trie->states[state].wordnum;
2961     if (prev)
2962      break;
2963    }
2964    trie->wordinfo[word].prev = prev;
2965   }
2966   Safefree(prev_states);
2967  }
2968
2969
2970  /* and now dump out the compressed format */
2971  DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2972
2973  RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2974 #ifdef DEBUGGING
2975  RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2976  RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2977 #else
2978  SvREFCNT_dec_NN(revcharmap);
2979 #endif
2980  return trie->jump
2981   ? MADE_JUMP_TRIE
2982   : trie->startstate>1
2983    ? MADE_EXACT_TRIE
2984    : MADE_TRIE;
2985 }
2986
2987 STATIC regnode *
2988 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
2989 {
2990 /* The Trie is constructed and compressed now so we can build a fail array if
2991  * it's needed
2992
2993    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2994    3.32 in the
2995    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2996    Ullman 1985/88
2997    ISBN 0-201-10088-6
2998
2999    We find the fail state for each state in the trie, this state is the longest
3000    proper suffix of the current state's 'word' that is also a proper prefix of
3001    another word in our trie. State 1 represents the word '' and is thus the
3002    default fail state. This allows the DFA not to have to restart after its
3003    tried and failed a word at a given point, it simply continues as though it
3004    had been matching the other word in the first place.
3005    Consider
3006  'abcdgu'=~/abcdefg|cdgu/
3007    When we get to 'd' we are still matching the first word, we would encounter
3008    'g' which would fail, which would bring us to the state representing 'd' in
3009    the second word where we would try 'g' and succeed, proceeding to match
3010    'cdgu'.
3011  */
3012  /* add a fail transition */
3013  const U32 trie_offset = ARG(source);
3014  reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3015  U32 *q;
3016  const U32 ucharcount = trie->uniquecharcount;
3017  const U32 numstates = trie->statecount;
3018  const U32 ubound = trie->lasttrans + ucharcount;
3019  U32 q_read = 0;
3020  U32 q_write = 0;
3021  U32 charid;
3022  U32 base = trie->states[ 1 ].trans.base;
3023  U32 *fail;
3024  reg_ac_data *aho;
3025  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3026  regnode *stclass;
3027  GET_RE_DEBUG_FLAGS_DECL;
3028
3029  PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3030  PERL_UNUSED_CONTEXT;
3031 #ifndef DEBUGGING
3032  PERL_UNUSED_ARG(depth);
3033 #endif
3034
3035  if ( OP(source) == TRIE ) {
3036   struct regnode_1 *op = (struct regnode_1 *)
3037    PerlMemShared_calloc(1, sizeof(struct regnode_1));
3038   StructCopy(source,op,struct regnode_1);
3039   stclass = (regnode *)op;
3040  } else {
3041   struct regnode_charclass *op = (struct regnode_charclass *)
3042    PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3043   StructCopy(source,op,struct regnode_charclass);
3044   stclass = (regnode *)op;
3045  }
3046  OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
3047
3048  ARG_SET( stclass, data_slot );
3049  aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3050  RExC_rxi->data->data[ data_slot ] = (void*)aho;
3051  aho->trie=trie_offset;
3052  aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3053  Copy( trie->states, aho->states, numstates, reg_trie_state );
3054  Newxz( q, numstates, U32);
3055  aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3056  aho->refcount = 1;
3057  fail = aho->fail;
3058  /* initialize fail[0..1] to be 1 so that we always have
3059  a valid final fail state */
3060  fail[ 0 ] = fail[ 1 ] = 1;
3061
3062  for ( charid = 0; charid < ucharcount ; charid++ ) {
3063   const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3064   if ( newstate ) {
3065    q[ q_write ] = newstate;
3066    /* set to point at the root */
3067    fail[ q[ q_write++ ] ]=1;
3068   }
3069  }
3070  while ( q_read < q_write) {
3071   const U32 cur = q[ q_read++ % numstates ];
3072   base = trie->states[ cur ].trans.base;
3073
3074   for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3075    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3076    if (ch_state) {
3077     U32 fail_state = cur;
3078     U32 fail_base;
3079     do {
3080      fail_state = fail[ fail_state ];
3081      fail_base = aho->states[ fail_state ].trans.base;
3082     } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3083
3084     fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3085     fail[ ch_state ] = fail_state;
3086     if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3087     {
3088       aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3089     }
3090     q[ q_write++ % numstates] = ch_state;
3091    }
3092   }
3093  }
3094  /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3095  when we fail in state 1, this allows us to use the
3096  charclass scan to find a valid start char. This is based on the principle
3097  that theres a good chance the string being searched contains lots of stuff
3098  that cant be a start char.
3099  */
3100  fail[ 0 ] = fail[ 1 ] = 0;
3101  DEBUG_TRIE_COMPILE_r({
3102   PerlIO_printf(Perl_debug_log,
3103      "%*sStclass Failtable (%"UVuf" states): 0",
3104      (int)(depth * 2), "", (UV)numstates
3105   );
3106   for( q_read=1; q_read<numstates; q_read++ ) {
3107    PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3108   }
3109   PerlIO_printf(Perl_debug_log, "\n");
3110  });
3111  Safefree(q);
3112  /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3113  return stclass;
3114 }
3115
3116
3117 #define DEBUG_PEEP(str,scan,depth) \
3118  DEBUG_OPTIMISE_r({if (scan){ \
3119  SV * const mysv=sv_newmortal(); \
3120  regnode *Next = regnext(scan); \
3121  regprop(RExC_rx, mysv, scan, NULL); \
3122  PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3123  (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3124  Next ? (REG_NODE_NUM(Next)) : 0 ); \
3125    }});
3126
3127
3128 /* The below joins as many adjacent EXACTish nodes as possible into a single
3129  * one.  The regop may be changed if the node(s) contain certain sequences that
3130  * require special handling.  The joining is only done if:
3131  * 1) there is room in the current conglomerated node to entirely contain the
3132  *    next one.
3133  * 2) they are the exact same node type
3134  *
3135  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3136  * these get optimized out
3137  *
3138  * If a node is to match under /i (folded), the number of characters it matches
3139  * can be different than its character length if it contains a multi-character
3140  * fold.  *min_subtract is set to the total delta number of characters of the
3141  * input nodes.
3142  *
3143  * And *unfolded_multi_char is set to indicate whether or not the node contains
3144  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3145  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3146  * SMALL LETTER SHARP S, as only if the target string being matched against
3147  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3148  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3149  * whose components are all above the Latin1 range are not run-time locale
3150  * dependent, and have already been folded by the time this function is
3151  * called.)
3152  *
3153  * This is as good a place as any to discuss the design of handling these
3154  * multi-character fold sequences.  It's been wrong in Perl for a very long
3155  * time.  There are three code points in Unicode whose multi-character folds
3156  * were long ago discovered to mess things up.  The previous designs for
3157  * dealing with these involved assigning a special node for them.  This
3158  * approach doesn't always work, as evidenced by this example:
3159  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3160  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3161  * would match just the \xDF, it won't be able to handle the case where a
3162  * successful match would have to cross the node's boundary.  The new approach
3163  * that hopefully generally solves the problem generates an EXACTFU_SS node
3164  * that is "sss" in this case.
3165  *
3166  * It turns out that there are problems with all multi-character folds, and not
3167  * just these three.  Now the code is general, for all such cases.  The
3168  * approach taken is:
3169  * 1)   This routine examines each EXACTFish node that could contain multi-
3170  *      character folded sequences.  Since a single character can fold into
3171  *      such a sequence, the minimum match length for this node is less than
3172  *      the number of characters in the node.  This routine returns in
3173  *      *min_subtract how many characters to subtract from the the actual
3174  *      length of the string to get a real minimum match length; it is 0 if
3175  *      there are no multi-char foldeds.  This delta is used by the caller to
3176  *      adjust the min length of the match, and the delta between min and max,
3177  *      so that the optimizer doesn't reject these possibilities based on size
3178  *      constraints.
3179  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3180  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3181  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3182  *      there is a possible fold length change.  That means that a regular
3183  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3184  *      with length changes, and so can be processed faster.  regexec.c takes
3185  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3186  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3187  *      known until runtime).  This saves effort in regex matching.  However,
3188  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3189  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3190  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3191  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3192  *      possibilities for the non-UTF8 patterns are quite simple, except for
3193  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3194  *      members of a fold-pair, and arrays are set up for all of them so that
3195  *      the other member of the pair can be found quickly.  Code elsewhere in
3196  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3197  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3198  *      described in the next item.
3199  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3200  *      validity of the fold won't be known until runtime, and so must remain
3201  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3202  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3203  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3204  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3205  *      The reason this is a problem is that the optimizer part of regexec.c
3206  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3207  *      that a character in the pattern corresponds to at most a single
3208  *      character in the target string.  (And I do mean character, and not byte
3209  *      here, unlike other parts of the documentation that have never been
3210  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3211  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3212  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3213  *      nodes, violate the assumption, and they are the only instances where it
3214  *      is violated.  I'm reluctant to try to change the assumption, as the
3215  *      code involved is impenetrable to me (khw), so instead the code here
3216  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3217  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3218  *      boolean indicating whether or not the node contains such a fold.  When
3219  *      it is true, the caller sets a flag that later causes the optimizer in
3220  *      this file to not set values for the floating and fixed string lengths,
3221  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3222  *      assumption.  Thus, there is no optimization based on string lengths for
3223  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3224  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3225  *      assumption is wrong only in these cases is that all other non-UTF-8
3226  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3227  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3228  *      EXACTF nodes because we don't know at compile time if it actually
3229  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3230  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3231  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3232  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3233  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3234  *      string would require the pattern to be forced into UTF-8, the overhead
3235  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3236  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3237  *      locale.)
3238  *
3239  *      Similarly, the code that generates tries doesn't currently handle
3240  *      not-already-folded multi-char folds, and it looks like a pain to change
3241  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3242  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3243  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3244  *      using /iaa matching will be doing so almost entirely with ASCII
3245  *      strings, so this should rarely be encountered in practice */
3246
3247 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3248  if (PL_regkind[OP(scan)] == EXACT) \
3249   join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3250
3251 STATIC U32
3252 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3253     UV *min_subtract, bool *unfolded_multi_char,
3254     U32 flags,regnode *val, U32 depth)
3255 {
3256  /* Merge several consecutive EXACTish nodes into one. */
3257  regnode *n = regnext(scan);
3258  U32 stringok = 1;
3259  regnode *next = scan + NODE_SZ_STR(scan);
3260  U32 merged = 0;
3261  U32 stopnow = 0;
3262 #ifdef DEBUGGING
3263  regnode *stop = scan;
3264  GET_RE_DEBUG_FLAGS_DECL;
3265 #else
3266  PERL_UNUSED_ARG(depth);
3267 #endif
3268
3269  PERL_ARGS_ASSERT_JOIN_EXACT;
3270 #ifndef EXPERIMENTAL_INPLACESCAN
3271  PERL_UNUSED_ARG(flags);
3272  PERL_UNUSED_ARG(val);
3273 #endif
3274  DEBUG_PEEP("join",scan,depth);
3275
3276  /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3277  * EXACT ones that are mergeable to the current one. */
3278  while (n
3279   && (PL_regkind[OP(n)] == NOTHING
3280    || (stringok && OP(n) == OP(scan)))
3281   && NEXT_OFF(n)
3282   && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3283  {
3284
3285   if (OP(n) == TAIL || n > next)
3286    stringok = 0;
3287   if (PL_regkind[OP(n)] == NOTHING) {
3288    DEBUG_PEEP("skip:",n,depth);
3289    NEXT_OFF(scan) += NEXT_OFF(n);
3290    next = n + NODE_STEP_REGNODE;
3291 #ifdef DEBUGGING
3292    if (stringok)
3293     stop = n;
3294 #endif
3295    n = regnext(n);
3296   }
3297   else if (stringok) {
3298    const unsigned int oldl = STR_LEN(scan);
3299    regnode * const nnext = regnext(n);
3300
3301    /* XXX I (khw) kind of doubt that this works on platforms (should
3302    * Perl ever run on one) where U8_MAX is above 255 because of lots
3303    * of other assumptions */
3304    /* Don't join if the sum can't fit into a single node */
3305    if (oldl + STR_LEN(n) > U8_MAX)
3306     break;
3307
3308    DEBUG_PEEP("merg",n,depth);
3309    merged++;
3310
3311    NEXT_OFF(scan) += NEXT_OFF(n);
3312    STR_LEN(scan) += STR_LEN(n);
3313    next = n + NODE_SZ_STR(n);
3314    /* Now we can overwrite *n : */
3315    Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3316 #ifdef DEBUGGING
3317    stop = next - 1;
3318 #endif
3319    n = nnext;
3320    if (stopnow) break;
3321   }
3322
3323 #ifdef EXPERIMENTAL_INPLACESCAN
3324   if (flags && !NEXT_OFF(n)) {
3325    DEBUG_PEEP("atch", val, depth);
3326    if (reg_off_by_arg[OP(n)]) {
3327     ARG_SET(n, val - n);
3328    }
3329    else {
3330     NEXT_OFF(n) = val - n;
3331    }
3332    stopnow = 1;
3333   }
3334 #endif
3335  }
3336
3337  *min_subtract = 0;
3338  *unfolded_multi_char = FALSE;
3339
3340  /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3341  * can now analyze for sequences of problematic code points.  (Prior to
3342  * this final joining, sequences could have been split over boundaries, and
3343  * hence missed).  The sequences only happen in folding, hence for any
3344  * non-EXACT EXACTish node */
3345  if (OP(scan) != EXACT) {
3346   U8* s0 = (U8*) STRING(scan);
3347   U8* s = s0;
3348   U8* s_end = s0 + STR_LEN(scan);
3349
3350   int total_count_delta = 0;  /* Total delta number of characters that
3351          multi-char folds expand to */
3352
3353   /* One pass is made over the node's string looking for all the
3354   * possibilities.  To avoid some tests in the loop, there are two main
3355   * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3356   * non-UTF-8 */
3357   if (UTF) {
3358    U8* folded = NULL;
3359
3360    if (OP(scan) == EXACTFL) {
3361     U8 *d;
3362
3363     /* An EXACTFL node would already have been changed to another
3364     * node type unless there is at least one character in it that
3365     * is problematic; likely a character whose fold definition
3366     * won't be known until runtime, and so has yet to be folded.
3367     * For all but the UTF-8 locale, folds are 1-1 in length, but
3368     * to handle the UTF-8 case, we need to create a temporary
3369     * folded copy using UTF-8 locale rules in order to analyze it.
3370     * This is because our macros that look to see if a sequence is
3371     * a multi-char fold assume everything is folded (otherwise the
3372     * tests in those macros would be too complicated and slow).
3373     * Note that here, the non-problematic folds will have already
3374     * been done, so we can just copy such characters.  We actually
3375     * don't completely fold the EXACTFL string.  We skip the
3376     * unfolded multi-char folds, as that would just create work
3377     * below to figure out the size they already are */
3378
3379     Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3380     d = folded;
3381     while (s < s_end) {
3382      STRLEN s_len = UTF8SKIP(s);
3383      if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3384       Copy(s, d, s_len, U8);
3385       d += s_len;
3386      }
3387      else if (is_FOLDS_TO_MULTI_utf8(s)) {
3388       *unfolded_multi_char = TRUE;
3389       Copy(s, d, s_len, U8);
3390       d += s_len;
3391      }
3392      else if (isASCII(*s)) {
3393       *(d++) = toFOLD(*s);
3394      }
3395      else {
3396       STRLEN len;
3397       _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3398       d += len;
3399      }
3400      s += s_len;
3401     }
3402
3403     /* Point the remainder of the routine to look at our temporary
3404     * folded copy */
3405     s = folded;
3406     s_end = d;
3407    } /* End of creating folded copy of EXACTFL string */
3408
3409    /* Examine the string for a multi-character fold sequence.  UTF-8
3410    * patterns have all characters pre-folded by the time this code is
3411    * executed */
3412    while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3413          length sequence we are looking for is 2 */
3414    {
3415     int count = 0;  /* How many characters in a multi-char fold */
3416     int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3417     if (! len) {    /* Not a multi-char fold: get next char */
3418      s += UTF8SKIP(s);
3419      continue;
3420     }
3421
3422     /* Nodes with 'ss' require special handling, except for
3423     * EXACTFA-ish for which there is no multi-char fold to this */
3424     if (len == 2 && *s == 's' && *(s+1) == 's'
3425      && OP(scan) != EXACTFA
3426      && OP(scan) != EXACTFA_NO_TRIE)
3427     {
3428      count = 2;
3429      if (OP(scan) != EXACTFL) {
3430       OP(scan) = EXACTFU_SS;
3431      }
3432      s += 2;
3433     }
3434     else { /* Here is a generic multi-char fold. */
3435      U8* multi_end  = s + len;
3436
3437      /* Count how many characters are in it.  In the case of
3438      * /aa, no folds which contain ASCII code points are
3439      * allowed, so check for those, and skip if found. */
3440      if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3441       count = utf8_length(s, multi_end);
3442       s = multi_end;
3443      }
3444      else {
3445       while (s < multi_end) {
3446        if (isASCII(*s)) {
3447         s++;
3448         goto next_iteration;
3449        }
3450        else {
3451         s += UTF8SKIP(s);
3452        }
3453        count++;
3454       }
3455      }
3456     }
3457
3458     /* The delta is how long the sequence is minus 1 (1 is how long
3459     * the character that folds to the sequence is) */
3460     total_count_delta += count - 1;
3461    next_iteration: ;
3462    }
3463
3464    /* We created a temporary folded copy of the string in EXACTFL
3465    * nodes.  Therefore we need to be sure it doesn't go below zero,
3466    * as the real string could be shorter */
3467    if (OP(scan) == EXACTFL) {
3468     int total_chars = utf8_length((U8*) STRING(scan),
3469           (U8*) STRING(scan) + STR_LEN(scan));
3470     if (total_count_delta > total_chars) {
3471      total_count_delta = total_chars;
3472     }
3473    }
3474
3475    *min_subtract += total_count_delta;
3476    Safefree(folded);
3477   }
3478   else if (OP(scan) == EXACTFA) {
3479
3480    /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3481    * fold to the ASCII range (and there are no existing ones in the
3482    * upper latin1 range).  But, as outlined in the comments preceding
3483    * this function, we need to flag any occurrences of the sharp s.
3484    * This character forbids trie formation (because of added
3485    * complexity) */
3486    while (s < s_end) {
3487     if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3488      OP(scan) = EXACTFA_NO_TRIE;
3489      *unfolded_multi_char = TRUE;
3490      break;
3491     }
3492     s++;
3493     continue;
3494    }
3495   }
3496   else {
3497
3498    /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3499    * folds that are all Latin1.  As explained in the comments
3500    * preceding this function, we look also for the sharp s in EXACTF
3501    * and EXACTFL nodes; it can be in the final position.  Otherwise
3502    * we can stop looking 1 byte earlier because have to find at least
3503    * two characters for a multi-fold */
3504    const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3505        ? s_end
3506        : s_end -1;
3507
3508    while (s < upper) {
3509     int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3510     if (! len) {    /* Not a multi-char fold. */
3511      if (*s == LATIN_SMALL_LETTER_SHARP_S
3512       && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3513      {
3514       *unfolded_multi_char = TRUE;
3515      }
3516      s++;
3517      continue;
3518     }
3519
3520     if (len == 2
3521      && isARG2_lower_or_UPPER_ARG1('s', *s)
3522      && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3523     {
3524
3525      /* EXACTF nodes need to know that the minimum length
3526      * changed so that a sharp s in the string can match this
3527      * ss in the pattern, but they remain EXACTF nodes, as they
3528      * won't match this unless the target string is is UTF-8,
3529      * which we don't know until runtime.  EXACTFL nodes can't
3530      * transform into EXACTFU nodes */
3531      if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3532       OP(scan) = EXACTFU_SS;
3533      }
3534     }
3535
3536     *min_subtract += len - 1;
3537     s += len;
3538    }
3539   }
3540  }
3541
3542 #ifdef DEBUGGING
3543  /* Allow dumping but overwriting the collection of skipped
3544  * ops and/or strings with fake optimized ops */
3545  n = scan + NODE_SZ_STR(scan);
3546  while (n <= stop) {
3547   OP(n) = OPTIMIZED;
3548   FLAGS(n) = 0;
3549   NEXT_OFF(n) = 0;
3550   n++;
3551  }
3552 #endif
3553  DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3554  return stopnow;
3555 }
3556
3557 /* REx optimizer.  Converts nodes into quicker variants "in place".
3558    Finds fixed substrings.  */
3559
3560 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3561    to the position after last scanned or to NULL. */
3562
3563 #define INIT_AND_WITHP \
3564  assert(!and_withp); \
3565  Newx(and_withp,1, regnode_ssc); \
3566  SAVEFREEPV(and_withp)
3567
3568 /* this is a chain of data about sub patterns we are processing that
3569    need to be handled separately/specially in study_chunk. Its so
3570    we can simulate recursion without losing state.  */
3571 struct scan_frame;
3572 typedef struct scan_frame {
3573  regnode *last;  /* last node to process in this frame */
3574  regnode *next;  /* next node to process when last is reached */
3575  struct scan_frame *prev; /*previous frame*/
3576  U32 prev_recursed_depth;
3577  I32 stop; /* what stopparen do we use */
3578 } scan_frame;
3579
3580
3581 STATIC SSize_t
3582 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3583       SSize_t *minlenp, SSize_t *deltap,
3584       regnode *last,
3585       scan_data_t *data,
3586       I32 stopparen,
3587       U32 recursed_depth,
3588       regnode_ssc *and_withp,
3589       U32 flags, U32 depth)
3590       /* scanp: Start here (read-write). */
3591       /* deltap: Write maxlen-minlen here. */
3592       /* last: Stop before this one. */
3593       /* data: string data about the pattern */
3594       /* stopparen: treat close N as END */
3595       /* recursed: which subroutines have we recursed into */
3596       /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3597 {
3598  dVAR;
3599  /* There must be at least this number of characters to match */
3600  SSize_t min = 0;
3601  I32 pars = 0, code;
3602  regnode *scan = *scanp, *next;
3603  SSize_t delta = 0;
3604  int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3605  int is_inf_internal = 0;  /* The studied chunk is infinite */
3606  I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3607  scan_data_t data_fake;
3608  SV *re_trie_maxbuff = NULL;
3609  regnode *first_non_open = scan;
3610  SSize_t stopmin = SSize_t_MAX;
3611  scan_frame *frame = NULL;
3612  GET_RE_DEBUG_FLAGS_DECL;
3613
3614  PERL_ARGS_ASSERT_STUDY_CHUNK;
3615
3616 #ifdef DEBUGGING
3617  StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3618 #endif
3619  if ( depth == 0 ) {
3620   while (first_non_open && OP(first_non_open) == OPEN)
3621    first_non_open=regnext(first_non_open);
3622  }
3623
3624
3625   fake_study_recurse:
3626  while ( scan && OP(scan) != END && scan < last ){
3627   UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3628         node length to get a real minimum (because
3629         the folded version may be shorter) */
3630   bool unfolded_multi_char = FALSE;
3631   /* Peephole optimizer: */
3632   DEBUG_OPTIMISE_MORE_r(
3633   {
3634    PerlIO_printf(Perl_debug_log,
3635     "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3636     ((int) depth*2), "", (long)stopparen,
3637     (unsigned long)depth, (unsigned long)recursed_depth);
3638    if (recursed_depth) {
3639     U32 i;
3640     U32 j;
3641     for ( j = 0 ; j < recursed_depth ; j++ ) {
3642      PerlIO_printf(Perl_debug_log,"[");
3643      for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3644       PerlIO_printf(Perl_debug_log,"%d",
3645        PAREN_TEST(RExC_study_chunk_recursed +
3646          (j * RExC_study_chunk_recursed_bytes), i)
3647        ? 1 : 0
3648       );
3649      PerlIO_printf(Perl_debug_log,"]");
3650     }
3651    }
3652    PerlIO_printf(Perl_debug_log,"\n");
3653   }
3654   );
3655   DEBUG_STUDYDATA("Peep:", data, depth);
3656   DEBUG_PEEP("Peep", scan, depth);
3657
3658
3659   /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3660   * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3661   * by a different invocation of reg() -- Yves
3662   */
3663   JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3664
3665   /* Follow the next-chain of the current node and optimize
3666   away all the NOTHINGs from it.  */
3667   if (OP(scan) != CURLYX) {
3668    const int max = (reg_off_by_arg[OP(scan)]
3669      ? I32_MAX
3670      /* I32 may be smaller than U16 on CRAYs! */
3671      : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3672    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3673    int noff;
3674    regnode *n = scan;
3675
3676    /* Skip NOTHING and LONGJMP. */
3677    while ((n = regnext(n))
3678     && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3679      || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3680     && off + noff < max)
3681     off += noff;
3682    if (reg_off_by_arg[OP(scan)])
3683     ARG(scan) = off;
3684    else
3685     NEXT_OFF(scan) = off;
3686   }
3687
3688
3689
3690   /* The principal pseudo-switch.  Cannot be a switch, since we
3691   look into several different things.  */
3692   if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3693     || OP(scan) == IFTHEN) {
3694    next = regnext(scan);
3695    code = OP(scan);
3696    /* demq: the op(next)==code check is to see if we have
3697    * "branch-branch" AFAICT */
3698
3699    if (OP(next) == code || code == IFTHEN) {
3700     /* NOTE - There is similar code to this block below for
3701     * handling TRIE nodes on a re-study.  If you change stuff here
3702     * check there too. */
3703     SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3704     regnode_ssc accum;
3705     regnode * const startbranch=scan;
3706
3707     if (flags & SCF_DO_SUBSTR) {
3708      /* Cannot merge strings after this. */
3709      scan_commit(pRExC_state, data, minlenp, is_inf);
3710     }
3711
3712     if (flags & SCF_DO_STCLASS)
3713      ssc_init_zero(pRExC_state, &accum);
3714
3715     while (OP(scan) == code) {
3716      SSize_t deltanext, minnext, fake;
3717      I32 f = 0;
3718      regnode_ssc this_class;
3719
3720      num++;
3721      data_fake.flags = 0;
3722      if (data) {
3723       data_fake.whilem_c = data->whilem_c;
3724       data_fake.last_closep = data->last_closep;
3725      }
3726      else
3727       data_fake.last_closep = &fake;
3728
3729      data_fake.pos_delta = delta;
3730      next = regnext(scan);
3731      scan = NEXTOPER(scan);
3732      if (code != BRANCH)
3733       scan = NEXTOPER(scan);
3734      if (flags & SCF_DO_STCLASS) {
3735       ssc_init(pRExC_state, &this_class);
3736       data_fake.start_class = &this_class;
3737       f = SCF_DO_STCLASS_AND;
3738      }
3739      if (flags & SCF_WHILEM_VISITED_POS)
3740       f |= SCF_WHILEM_VISITED_POS;
3741
3742      /* we suppose the run is continuous, last=next...*/
3743      minnext = study_chunk(pRExC_state, &scan, minlenp,
3744          &deltanext, next, &data_fake, stopparen,
3745          recursed_depth, NULL, f,depth+1);
3746      if (min1 > minnext)
3747       min1 = minnext;
3748      if (deltanext == SSize_t_MAX) {
3749       is_inf = is_inf_internal = 1;
3750       max1 = SSize_t_MAX;
3751      } else if (max1 < minnext + deltanext)
3752       max1 = minnext + deltanext;
3753      scan = next;
3754      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3755       pars++;
3756      if (data_fake.flags & SCF_SEEN_ACCEPT) {
3757       if ( stopmin > minnext)
3758        stopmin = min + min1;
3759       flags &= ~SCF_DO_SUBSTR;
3760       if (data)
3761        data->flags |= SCF_SEEN_ACCEPT;
3762      }
3763      if (data) {
3764       if (data_fake.flags & SF_HAS_EVAL)
3765        data->flags |= SF_HAS_EVAL;
3766       data->whilem_c = data_fake.whilem_c;
3767      }
3768      if (flags & SCF_DO_STCLASS)
3769       ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3770     }
3771     if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3772      min1 = 0;
3773     if (flags & SCF_DO_SUBSTR) {
3774      data->pos_min += min1;
3775      if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3776       data->pos_delta = SSize_t_MAX;
3777      else
3778       data->pos_delta += max1 - min1;
3779      if (max1 != min1 || is_inf)
3780       data->longest = &(data->longest_float);
3781     }
3782     min += min1;
3783     if (delta == SSize_t_MAX
3784     || SSize_t_MAX - delta - (max1 - min1) < 0)
3785      delta = SSize_t_MAX;
3786     else
3787      delta += max1 - min1;
3788     if (flags & SCF_DO_STCLASS_OR) {
3789      ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3790      if (min1) {
3791       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3792       flags &= ~SCF_DO_STCLASS;
3793      }
3794     }
3795     else if (flags & SCF_DO_STCLASS_AND) {
3796      if (min1) {
3797       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3798       flags &= ~SCF_DO_STCLASS;
3799      }
3800      else {
3801       /* Switch to OR mode: cache the old value of
3802       * data->start_class */
3803       INIT_AND_WITHP;
3804       StructCopy(data->start_class, and_withp, regnode_ssc);
3805       flags &= ~SCF_DO_STCLASS_AND;
3806       StructCopy(&accum, data->start_class, regnode_ssc);
3807       flags |= SCF_DO_STCLASS_OR;
3808      }
3809     }
3810
3811     if (PERL_ENABLE_TRIE_OPTIMISATION &&
3812       OP( startbranch ) == BRANCH )
3813     {
3814     /* demq.
3815
3816     Assuming this was/is a branch we are dealing with: 'scan'
3817     now points at the item that follows the branch sequence,
3818     whatever it is. We now start at the beginning of the
3819     sequence and look for subsequences of
3820
3821     BRANCH->EXACT=>x1
3822     BRANCH->EXACT=>x2
3823     tail
3824
3825     which would be constructed from a pattern like
3826     /A|LIST|OF|WORDS/
3827
3828     If we can find such a subsequence we need to turn the first
3829     element into a trie and then add the subsequent branch exact
3830     strings to the trie.
3831
3832     We have two cases
3833
3834      1. patterns where the whole set of branches can be
3835       converted.
3836
3837      2. patterns where only a subset can be converted.
3838
3839     In case 1 we can replace the whole set with a single regop
3840     for the trie. In case 2 we need to keep the start and end
3841     branches so
3842
3843      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3844      becomes BRANCH TRIE; BRANCH X;
3845
3846     There is an additional case, that being where there is a
3847     common prefix, which gets split out into an EXACT like node
3848     preceding the TRIE node.
3849
3850     If x(1..n)==tail then we can do a simple trie, if not we make
3851     a "jump" trie, such that when we match the appropriate word
3852     we "jump" to the appropriate tail node. Essentially we turn
3853     a nested if into a case structure of sorts.
3854
3855     */
3856
3857      int made=0;
3858      if (!re_trie_maxbuff) {
3859       re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3860       if (!SvIOK(re_trie_maxbuff))
3861        sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3862      }
3863      if ( SvIV(re_trie_maxbuff)>=0  ) {
3864       regnode *cur;
3865       regnode *first = (regnode *)NULL;
3866       regnode *last = (regnode *)NULL;
3867       regnode *tail = scan;
3868       U8 trietype = 0;
3869       U32 count=0;
3870
3871 #ifdef DEBUGGING
3872       SV * const mysv = sv_newmortal();   /* for dumping */
3873 #endif
3874       /* var tail is used because there may be a TAIL
3875       regop in the way. Ie, the exacts will point to the
3876       thing following the TAIL, but the last branch will
3877       point at the TAIL. So we advance tail. If we
3878       have nested (?:) we may have to move through several
3879       tails.
3880       */
3881
3882       while ( OP( tail ) == TAIL ) {
3883        /* this is the TAIL generated by (?:) */
3884        tail = regnext( tail );
3885       }
3886
3887
3888       DEBUG_TRIE_COMPILE_r({
3889        regprop(RExC_rx, mysv, tail, NULL);
3890        PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3891        (int)depth * 2 + 2, "",
3892        "Looking for TRIE'able sequences. Tail node is: ",
3893        SvPV_nolen_const( mysv )
3894        );
3895       });
3896
3897       /*
3898
3899        Step through the branches
3900         cur represents each branch,
3901         noper is the first thing to be matched as part
3902          of that branch
3903         noper_next is the regnext() of that node.
3904
3905        We normally handle a case like this
3906        /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3907        support building with NOJUMPTRIE, which restricts
3908        the trie logic to structures like /FOO|BAR/.
3909
3910        If noper is a trieable nodetype then the branch is
3911        a possible optimization target. If we are building
3912        under NOJUMPTRIE then we require that noper_next is
3913        the same as scan (our current position in the regex
3914        program).
3915
3916        Once we have two or more consecutive such branches
3917        we can create a trie of the EXACT's contents and
3918        stitch it in place into the program.
3919
3920        If the sequence represents all of the branches in
3921        the alternation we replace the entire thing with a
3922        single TRIE node.
3923
3924        Otherwise when it is a subsequence we need to
3925        stitch it in place and replace only the relevant
3926        branches. This means the first branch has to remain
3927        as it is used by the alternation logic, and its
3928        next pointer, and needs to be repointed at the item
3929        on the branch chain following the last branch we
3930        have optimized away.
3931
3932        This could be either a BRANCH, in which case the
3933        subsequence is internal, or it could be the item
3934        following the branch sequence in which case the
3935        subsequence is at the end (which does not
3936        necessarily mean the first node is the start of the
3937        alternation).
3938
3939        TRIE_TYPE(X) is a define which maps the optype to a
3940        trietype.
3941
3942         optype          |  trietype
3943         ----------------+-----------
3944         NOTHING         | NOTHING
3945         EXACT           | EXACT
3946         EXACTFU         | EXACTFU
3947         EXACTFU_SS      | EXACTFU
3948         EXACTFA         | EXACTFA
3949
3950
3951       */
3952 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3953      ( EXACT == (X) )   ? EXACT :        \
3954      ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
3955      ( EXACTFA == (X) ) ? EXACTFA :        \
3956      0 )
3957
3958       /* dont use tail as the end marker for this traverse */
3959       for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3960        regnode * const noper = NEXTOPER( cur );
3961        U8 noper_type = OP( noper );
3962        U8 noper_trietype = TRIE_TYPE( noper_type );
3963 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3964        regnode * const noper_next = regnext( noper );
3965        U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3966        U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3967 #endif
3968
3969        DEBUG_TRIE_COMPILE_r({
3970         regprop(RExC_rx, mysv, cur, NULL);
3971         PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3972         (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3973
3974         regprop(RExC_rx, mysv, noper, NULL);
3975         PerlIO_printf( Perl_debug_log, " -> %s",
3976          SvPV_nolen_const(mysv));
3977
3978         if ( noper_next ) {
3979         regprop(RExC_rx, mysv, noper_next, NULL);
3980         PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3981          SvPV_nolen_const(mysv));
3982         }
3983         PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3984         REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3985         PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3986         );
3987        });
3988
3989        /* Is noper a trieable nodetype that can be merged
3990        * with the current trie (if there is one)? */
3991        if ( noper_trietype
3992         &&
3993         (
3994           ( noper_trietype == NOTHING)
3995           || ( trietype == NOTHING )
3996           || ( trietype == noper_trietype )
3997         )
3998 #ifdef NOJUMPTRIE
3999         && noper_next == tail
4000 #endif
4001         && count < U16_MAX)
4002        {
4003         /* Handle mergable triable node Either we are
4004         * the first node in a new trieable sequence,
4005         * in which case we do some bookkeeping,
4006         * otherwise we update the end pointer. */
4007         if ( !first ) {
4008          first = cur;
4009          if ( noper_trietype == NOTHING ) {
4010 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4011           regnode * const noper_next = regnext( noper );
4012           U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4013           U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4014 #endif
4015
4016           if ( noper_next_trietype ) {
4017            trietype = noper_next_trietype;
4018           } else if (noper_next_type)  {
4019            /* a NOTHING regop is 1 regop wide.
4020            * We need at least two for a trie
4021            * so we can't merge this in */
4022            first = NULL;
4023           }
4024          } else {
4025           trietype = noper_trietype;
4026          }
4027         } else {
4028          if ( trietype == NOTHING )
4029           trietype = noper_trietype;
4030          last = cur;
4031         }
4032         if (first)
4033          count++;
4034        } /* end handle mergable triable node */
4035        else {
4036         /* handle unmergable node -
4037         * noper may either be a triable node which can
4038         * not be tried together with the current trie,
4039         * or a non triable node */
4040         if ( last ) {
4041          /* If last is set and trietype is not
4042          * NOTHING then we have found at least two
4043          * triable branch sequences in a row of a
4044          * similar trietype so we can turn them
4045          * into a trie. If/when we allow NOTHING to
4046          * start a trie sequence this condition
4047          * will be required, and it isn't expensive
4048          * so we leave it in for now. */
4049          if ( trietype && trietype != NOTHING )
4050           make_trie( pRExC_state,
4051             startbranch, first, cur, tail,
4052             count, trietype, depth+1 );
4053          last = NULL; /* note: we clear/update
4054              first, trietype etc below,
4055              so we dont do it here */
4056         }
4057         if ( noper_trietype
4058 #ifdef NOJUMPTRIE
4059          && noper_next == tail
4060 #endif
4061         ){
4062          /* noper is triable, so we can start a new
4063          * trie sequence */
4064          count = 1;
4065          first = cur;
4066          trietype = noper_trietype;
4067         } else if (first) {
4068          /* if we already saw a first but the
4069          * current node is not triable then we have
4070          * to reset the first information. */
4071          count = 0;
4072          first = NULL;
4073          trietype = 0;
4074         }
4075        } /* end handle unmergable node */
4076       } /* loop over branches */
4077       DEBUG_TRIE_COMPILE_r({
4078        regprop(RExC_rx, mysv, cur, NULL);
4079        PerlIO_printf( Perl_debug_log,
4080        "%*s- %s (%d) <SCAN FINISHED>\n",
4081        (int)depth * 2 + 2,
4082        "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4083
4084       });
4085       if ( last && trietype ) {
4086        if ( trietype != NOTHING ) {
4087         /* the last branch of the sequence was part of
4088         * a trie, so we have to construct it here
4089         * outside of the loop */
4090         made= make_trie( pRExC_state, startbranch,
4091             first, scan, tail, count,
4092             trietype, depth+1 );
4093 #ifdef TRIE_STUDY_OPT
4094         if ( ((made == MADE_EXACT_TRIE &&
4095          startbranch == first)
4096          || ( first_non_open == first )) &&
4097          depth==0 ) {
4098          flags |= SCF_TRIE_RESTUDY;
4099          if ( startbranch == first
4100           && scan == tail )
4101          {
4102           RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4103          }
4104         }
4105 #endif
4106        } else {
4107         /* at this point we know whatever we have is a
4108         * NOTHING sequence/branch AND if 'startbranch'
4109         * is 'first' then we can turn the whole thing
4110         * into a NOTHING
4111         */
4112         if ( startbranch == first ) {
4113          regnode *opt;
4114          /* the entire thing is a NOTHING sequence,
4115          * something like this: (?:|) So we can
4116          * turn it into a plain NOTHING op. */
4117          DEBUG_TRIE_COMPILE_r({
4118           regprop(RExC_rx, mysv, cur, NULL);
4119           PerlIO_printf( Perl_debug_log,
4120           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4121           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4122
4123          });
4124          OP(startbranch)= NOTHING;
4125          NEXT_OFF(startbranch)= tail - startbranch;
4126          for ( opt= startbranch + 1; opt < tail ; opt++ )
4127           OP(opt)= OPTIMIZED;
4128         }
4129        }
4130       } /* end if ( last) */
4131      } /* TRIE_MAXBUF is non zero */
4132
4133     } /* do trie */
4134
4135    }
4136    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4137     scan = NEXTOPER(NEXTOPER(scan));
4138    } else   /* single branch is optimized. */
4139     scan = NEXTOPER(scan);
4140    continue;
4141   } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4142    scan_frame *newframe = NULL;
4143    I32 paren;
4144    regnode *start;
4145    regnode *end;
4146    U32 my_recursed_depth= recursed_depth;
4147
4148    if (OP(scan) != SUSPEND) {
4149     /* set the pointer */
4150     if (OP(scan) == GOSUB) {
4151      paren = ARG(scan);
4152      RExC_recurse[ARG2L(scan)] = scan;
4153      start = RExC_open_parens[paren-1];
4154      end   = RExC_close_parens[paren-1];
4155     } else {
4156      paren = 0;
4157      start = RExC_rxi->program + 1;
4158      end   = RExC_opend;
4159     }
4160     if (!recursed_depth
4161      ||
4162      !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4163     ) {
4164      if (!recursed_depth) {
4165       Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4166      } else {
4167       Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4168        RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4169        RExC_study_chunk_recursed_bytes, U8);
4170      }
4171      /* we havent recursed into this paren yet, so recurse into it */
4172      DEBUG_STUDYDATA("set:", data,depth);
4173      PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4174      my_recursed_depth= recursed_depth + 1;
4175      Newx(newframe,1,scan_frame);
4176     } else {
4177      DEBUG_STUDYDATA("inf:", data,depth);
4178      /* some form of infinite recursion, assume infinite length
4179      * */
4180      if (flags & SCF_DO_SUBSTR) {
4181       scan_commit(pRExC_state, data, minlenp, is_inf);
4182       data->longest = &(data->longest_float);
4183      }
4184      is_inf = is_inf_internal = 1;
4185      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4186       ssc_anything(data->start_class);
4187      flags &= ~SCF_DO_STCLASS;
4188     }
4189    } else {
4190     Newx(newframe,1,scan_frame);
4191     paren = stopparen;
4192     start = scan+2;
4193     end = regnext(scan);
4194    }
4195    if (newframe) {
4196     assert(start);
4197     assert(end);
4198     SAVEFREEPV(newframe);
4199     newframe->next = regnext(scan);
4200     newframe->last = last;
4201     newframe->stop = stopparen;
4202     newframe->prev = frame;
4203     newframe->prev_recursed_depth = recursed_depth;
4204
4205     DEBUG_STUDYDATA("frame-new:",data,depth);
4206     DEBUG_PEEP("fnew", scan, depth);
4207
4208     frame = newframe;
4209     scan =  start;
4210     stopparen = paren;
4211     last = end;
4212     depth = depth + 1;
4213     recursed_depth= my_recursed_depth;
4214
4215     continue;
4216    }
4217   }
4218   else if (OP(scan) == EXACT) {
4219    SSize_t l = STR_LEN(scan);
4220    UV uc;
4221    if (UTF) {
4222     const U8 * const s = (U8*)STRING(scan);
4223     uc = utf8_to_uvchr_buf(s, s + l, NULL);
4224     l = utf8_length(s, s + l);
4225    } else {
4226     uc = *((U8*)STRING(scan));
4227    }
4228    min += l;
4229    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4230     /* The code below prefers earlier match for fixed
4231     offset, later match for variable offset.  */
4232     if (data->last_end == -1) { /* Update the start info. */
4233      data->last_start_min = data->pos_min;
4234      data->last_start_max = is_inf
4235       ? SSize_t_MAX : data->pos_min + data->pos_delta;
4236     }
4237     sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4238     if (UTF)
4239      SvUTF8_on(data->last_found);
4240     {
4241      SV * const sv = data->last_found;
4242      MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4243       mg_find(sv, PERL_MAGIC_utf8) : NULL;
4244      if (mg && mg->mg_len >= 0)
4245       mg->mg_len += utf8_length((U8*)STRING(scan),
4246            (U8*)STRING(scan)+STR_LEN(scan));
4247     }
4248     data->last_end = data->pos_min + l;
4249     data->pos_min += l; /* As in the first entry. */
4250     data->flags &= ~SF_BEFORE_EOL;
4251    }
4252
4253    /* ANDing the code point leaves at most it, and not in locale, and
4254    * can't match null string */
4255    if (flags & SCF_DO_STCLASS_AND) {
4256     ssc_cp_and(data->start_class, uc);
4257     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4258     ssc_clear_locale(data->start_class);
4259    }
4260    else if (flags & SCF_DO_STCLASS_OR) {
4261     ssc_add_cp(data->start_class, uc);
4262     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4263
4264     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4265     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4266    }
4267    flags &= ~SCF_DO_STCLASS;
4268   }
4269   else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is
4270              EXACTFish */
4271    SSize_t l = STR_LEN(scan);
4272    UV uc = *((U8*)STRING(scan));
4273    SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4274              separate code points */
4275    const U8 * s = (U8*)STRING(scan);
4276
4277    /* Search for fixed substrings supports EXACT only. */
4278    if (flags & SCF_DO_SUBSTR) {
4279     assert(data);
4280     scan_commit(pRExC_state, data, minlenp, is_inf);
4281    }
4282    if (UTF) {
4283     uc = utf8_to_uvchr_buf(s, s + l, NULL);
4284     l = utf8_length(s, s + l);
4285    }
4286    if (unfolded_multi_char) {
4287     RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4288    }
4289    min += l - min_subtract;
4290    assert (min >= 0);
4291    delta += min_subtract;
4292    if (flags & SCF_DO_SUBSTR) {
4293     data->pos_min += l - min_subtract;
4294     if (data->pos_min < 0) {
4295      data->pos_min = 0;
4296     }
4297     data->pos_delta += min_subtract;
4298     if (min_subtract) {
4299      data->longest = &(data->longest_float);
4300     }
4301    }
4302
4303    if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
4304     ssc_clear_locale(data->start_class);
4305    }
4306
4307    if (! UTF) {
4308
4309     /* We punt and assume can match anything if the node begins
4310     * with a multi-character fold.  Things are complicated.  For
4311     * example, /ffi/i could match any of:
4312     *  "\N{LATIN SMALL LIGATURE FFI}"
4313     *  "\N{LATIN SMALL LIGATURE FF}I"
4314     *  "F\N{LATIN SMALL LIGATURE FI}"
4315     *  plus several other things; and making sure we have all the
4316     *  possibilities is hard. */
4317     if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
4318      EXACTF_invlist =
4319        _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4320     }
4321     else {
4322
4323      /* Any Latin1 range character can potentially match any
4324      * other depending on the locale */
4325      if (OP(scan) == EXACTFL) {
4326       _invlist_union(EXACTF_invlist, PL_Latin1,
4327                &EXACTF_invlist);
4328      }
4329      else {
4330       /* But otherwise, it matches at least itself.  We can
4331       * quickly tell if it has a distinct fold, and if so,
4332       * it matches that as well */
4333       EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4334       if (IS_IN_SOME_FOLD_L1(uc)) {
4335        EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4336               PL_fold_latin1[uc]);
4337       }
4338      }
4339
4340      /* Some characters match above-Latin1 ones under /i.  This
4341      * is true of EXACTFL ones when the locale is UTF-8 */
4342      if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
4343       && (! isASCII(uc) || (OP(scan) != EXACTFA
4344            && OP(scan) != EXACTFA_NO_TRIE)))
4345      {
4346       add_above_Latin1_folds(pRExC_state,
4347            (U8) uc,
4348            &EXACTF_invlist);
4349      }
4350     }
4351    }
4352    else {  /* Pattern is UTF-8 */
4353     U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4354     STRLEN foldlen = UTF8SKIP(s);
4355     const U8* e = s + STR_LEN(scan);
4356     SV** listp;
4357
4358     /* The only code points that aren't folded in a UTF EXACTFish
4359     * node are are the problematic ones in EXACTFL nodes */
4360     if (OP(scan) == EXACTFL
4361      && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
4362     {
4363      /* We need to check for the possibility that this EXACTFL
4364      * node begins with a multi-char fold.  Therefore we fold
4365      * the first few characters of it so that we can make that
4366      * check */
4367      U8 *d = folded;
4368      int i;
4369
4370      for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
4371       if (isASCII(*s)) {
4372        *(d++) = (U8) toFOLD(*s);
4373        s++;
4374       }
4375       else {
4376        STRLEN len;
4377        to_utf8_fold(s, d, &len);
4378        d += len;
4379        s += UTF8SKIP(s);
4380       }
4381      }
4382
4383      /* And set up so the code below that looks in this folded
4384      * buffer instead of the node's string */
4385      e = d;
4386      foldlen = UTF8SKIP(folded);
4387      s = folded;
4388     }
4389
4390     /* When we reach here 's' points to the fold of the first
4391     * character(s) of the node; and 'e' points to far enough along
4392     * the folded string to be just past any possible multi-char
4393     * fold. 'foldlen' is the length in bytes of the first
4394     * character in 's'
4395     *
4396     * Unlike the non-UTF-8 case, the macro for determining if a
4397     * string is a multi-char fold requires all the characters to
4398     * already be folded.  This is because of all the complications
4399     * if not.  Note that they are folded anyway, except in EXACTFL
4400     * nodes.  Like the non-UTF case above, we punt if the node
4401     * begins with a multi-char fold  */
4402
4403     if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
4404      EXACTF_invlist =
4405        _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4406     }
4407     else {  /* Single char fold */
4408
4409      /* It matches all the things that fold to it, which are
4410      * found in PL_utf8_foldclosures (including itself) */
4411      EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4412      if (! PL_utf8_foldclosures) {
4413       _load_PL_utf8_foldclosures();
4414      }
4415      if ((listp = hv_fetch(PL_utf8_foldclosures,
4416           (char *) s, foldlen, FALSE)))
4417      {
4418       AV* list = (AV*) *listp;
4419       IV k;
4420       for (k = 0; k <= av_tindex(list); k++) {
4421        SV** c_p = av_fetch(list, k, FALSE);
4422        UV c;
4423        assert(c_p);
4424
4425        c = SvUV(*c_p);
4426
4427        /* /aa doesn't allow folds between ASCII and non- */
4428        if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
4429         && isASCII(c) != isASCII(uc))
4430        {
4431         continue;
4432        }
4433
4434        EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
4435       }
4436      }
4437     }
4438    }
4439    if (flags & SCF_DO_STCLASS_AND) {
4440     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4441     ANYOF_POSIXL_ZERO(data->start_class);
4442     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4443    }
4444    else if (flags & SCF_DO_STCLASS_OR) {
4445     ssc_union(data->start_class, EXACTF_invlist, FALSE);
4446     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4447
4448     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4449     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4450    }
4451    flags &= ~SCF_DO_STCLASS;
4452    SvREFCNT_dec(EXACTF_invlist);
4453   }
4454   else if (REGNODE_VARIES(OP(scan))) {
4455    SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4456    I32 fl = 0, f = flags;
4457    regnode * const oscan = scan;
4458    regnode_ssc this_class;
4459    regnode_ssc *oclass = NULL;
4460    I32 next_is_eval = 0;
4461
4462    switch (PL_regkind[OP(scan)]) {
4463    case WHILEM:  /* End of (?:...)* . */
4464     scan = NEXTOPER(scan);
4465     goto finish;
4466    case PLUS:
4467     if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4468      next = NEXTOPER(scan);
4469      if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4470       mincount = 1;
4471       maxcount = REG_INFTY;
4472       next = regnext(scan);
4473       scan = NEXTOPER(scan);
4474       goto do_curly;
4475      }
4476     }
4477     if (flags & SCF_DO_SUBSTR)
4478      data->pos_min++;
4479     min++;
4480     /* FALLTHROUGH */
4481    case STAR:
4482     if (flags & SCF_DO_STCLASS) {
4483      mincount = 0;
4484      maxcount = REG_INFTY;
4485      next = regnext(scan);
4486      scan = NEXTOPER(scan);
4487      goto do_curly;
4488     }
4489     if (flags & SCF_DO_SUBSTR) {
4490      scan_commit(pRExC_state, data, minlenp, is_inf);
4491      /* Cannot extend fixed substrings */
4492      data->longest = &(data->longest_float);
4493     }
4494     is_inf = is_inf_internal = 1;
4495     scan = regnext(scan);
4496     goto optimize_curly_tail;
4497    case CURLY:
4498     if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4499      && (scan->flags == stopparen))
4500     {
4501      mincount = 1;
4502      maxcount = 1;
4503     } else {
4504      mincount = ARG1(scan);
4505      maxcount = ARG2(scan);
4506     }
4507     next = regnext(scan);
4508     if (OP(scan) == CURLYX) {
4509      I32 lp = (data ? *(data->last_closep) : 0);
4510      scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4511     }
4512     scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4513     next_is_eval = (OP(scan) == EVAL);
4514    do_curly:
4515     if (flags & SCF_DO_SUBSTR) {
4516      if (mincount == 0)
4517       scan_commit(pRExC_state, data, minlenp, is_inf);
4518      /* Cannot extend fixed substrings */
4519      pos_before = data->pos_min;
4520     }
4521     if (data) {
4522      fl = data->flags;
4523      data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4524      if (is_inf)
4525       data->flags |= SF_IS_INF;
4526     }
4527     if (flags & SCF_DO_STCLASS) {
4528      ssc_init(pRExC_state, &this_class);
4529      oclass = data->start_class;
4530      data->start_class = &this_class;
4531      f |= SCF_DO_STCLASS_AND;
4532      f &= ~SCF_DO_STCLASS_OR;
4533     }
4534     /* Exclude from super-linear cache processing any {n,m}
4535     regops for which the combination of input pos and regex
4536     pos is not enough information to determine if a match
4537     will be possible.
4538
4539     For example, in the regex /foo(bar\s*){4,8}baz/ with the
4540     regex pos at the \s*, the prospects for a match depend not
4541     only on the input position but also on how many (bar\s*)
4542     repeats into the {4,8} we are. */
4543    if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4544      f &= ~SCF_WHILEM_VISITED_POS;
4545
4546     /* This will finish on WHILEM, setting scan, or on NULL: */
4547     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4548         last, data, stopparen, recursed_depth, NULL,
4549         (mincount == 0
4550         ? (f & ~SCF_DO_SUBSTR)
4551         : f)
4552         ,depth+1);
4553
4554     if (flags & SCF_DO_STCLASS)
4555      data->start_class = oclass;
4556     if (mincount == 0 || minnext == 0) {
4557      if (flags & SCF_DO_STCLASS_OR) {
4558       ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4559      }
4560      else if (flags & SCF_DO_STCLASS_AND) {
4561       /* Switch to OR mode: cache the old value of
4562       * data->start_class */
4563       INIT_AND_WITHP;
4564       StructCopy(data->start_class, and_withp, regnode_ssc);
4565       flags &= ~SCF_DO_STCLASS_AND;
4566       StructCopy(&this_class, data->start_class, regnode_ssc);
4567       flags |= SCF_DO_STCLASS_OR;
4568       ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4569      }
4570     } else {  /* Non-zero len */
4571      if (flags & SCF_DO_STCLASS_OR) {
4572       ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4573       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4574      }
4575      else if (flags & SCF_DO_STCLASS_AND)
4576       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4577      flags &= ~SCF_DO_STCLASS;
4578     }
4579     if (!scan)   /* It was not CURLYX, but CURLY. */
4580      scan = next;
4581     if (!(flags & SCF_TRIE_DOING_RESTUDY)
4582      /* ? quantifier ok, except for (?{ ... }) */
4583      && (next_is_eval || !(mincount == 0 && maxcount == 1))
4584      && (minnext == 0) && (deltanext == 0)
4585      && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4586      && maxcount <= REG_INFTY/3) /* Complement check for big
4587             count */
4588     {
4589      /* Fatal warnings may leak the regexp without this: */
4590      SAVEFREESV(RExC_rx_sv);
4591      ckWARNreg(RExC_parse,
4592        "Quantifier unexpected on zero-length expression");
4593      (void)ReREFCNT_inc(RExC_rx_sv);
4594     }
4595
4596     min += minnext * mincount;
4597     is_inf_internal |= deltanext == SSize_t_MAX
4598       || (maxcount == REG_INFTY && minnext + deltanext > 0);
4599     is_inf |= is_inf_internal;
4600     if (is_inf) {
4601      delta = SSize_t_MAX;
4602     } else {
4603      delta += (minnext + deltanext) * maxcount
4604        - minnext * mincount;
4605     }
4606     /* Try powerful optimization CURLYX => CURLYN. */
4607     if (  OP(oscan) == CURLYX && data
4608      && data->flags & SF_IN_PAR
4609      && !(data->flags & SF_HAS_EVAL)
4610      && !deltanext && minnext == 1 ) {
4611      /* Try to optimize to CURLYN.  */
4612      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4613      regnode * const nxt1 = nxt;
4614 #ifdef DEBUGGING
4615      regnode *nxt2;
4616 #endif
4617
4618      /* Skip open. */
4619      nxt = regnext(nxt);
4620      if (!REGNODE_SIMPLE(OP(nxt))
4621       && !(PL_regkind[OP(nxt)] == EXACT
4622        && STR_LEN(nxt) == 1))
4623       goto nogo;
4624 #ifdef DEBUGGING
4625      nxt2 = nxt;
4626 #endif
4627      nxt = regnext(nxt);
4628      if (OP(nxt) != CLOSE)
4629       goto nogo;
4630      if (RExC_open_parens) {
4631       RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4632       RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4633      }
4634      /* Now we know that nxt2 is the only contents: */
4635      oscan->flags = (U8)ARG(nxt);
4636      OP(oscan) = CURLYN;
4637      OP(nxt1) = NOTHING; /* was OPEN. */
4638
4639 #ifdef DEBUGGING
4640      OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4641      NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4642      NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4643      OP(nxt) = OPTIMIZED; /* was CLOSE. */
4644      OP(nxt + 1) = OPTIMIZED; /* was count. */
4645      NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4646 #endif
4647     }
4648    nogo:
4649
4650     /* Try optimization CURLYX => CURLYM. */
4651     if (  OP(oscan) == CURLYX && data
4652      && !(data->flags & SF_HAS_PAR)
4653      && !(data->flags & SF_HAS_EVAL)
4654      && !deltanext /* atom is fixed width */
4655      && minnext != 0 /* CURLYM can't handle zero width */
4656
4657       /* Nor characters whose fold at run-time may be
4658       * multi-character */
4659      && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4660     ) {
4661      /* XXXX How to optimize if data == 0? */
4662      /* Optimize to a simpler form.  */
4663      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4664      regnode *nxt2;
4665
4666      OP(oscan) = CURLYM;
4667      while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4668        && (OP(nxt2) != WHILEM))
4669       nxt = nxt2;
4670      OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4671      /* Need to optimize away parenths. */
4672      if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4673       /* Set the parenth number.  */
4674       regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4675
4676       oscan->flags = (U8)ARG(nxt);
4677       if (RExC_open_parens) {
4678        RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4679        RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4680       }
4681       OP(nxt1) = OPTIMIZED; /* was OPEN. */
4682       OP(nxt) = OPTIMIZED; /* was CLOSE. */
4683
4684 #ifdef DEBUGGING
4685       OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4686       OP(nxt + 1) = OPTIMIZED; /* was count. */
4687       NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4688       NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4689 #endif
4690 #if 0
4691       while ( nxt1 && (OP(nxt1) != WHILEM)) {
4692        regnode *nnxt = regnext(nxt1);
4693        if (nnxt == nxt) {
4694         if (reg_off_by_arg[OP(nxt1)])
4695          ARG_SET(nxt1, nxt2 - nxt1);
4696         else if (nxt2 - nxt1 < U16_MAX)
4697          NEXT_OFF(nxt1) = nxt2 - nxt1;
4698         else
4699          OP(nxt) = NOTHING; /* Cannot beautify */
4700        }
4701        nxt1 = nnxt;
4702       }
4703 #endif
4704       /* Optimize again: */
4705       study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4706          NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4707      }
4708      else
4709       oscan->flags = 0;
4710     }
4711     else if ((OP(oscan) == CURLYX)
4712       && (flags & SCF_WHILEM_VISITED_POS)
4713       /* See the comment on a similar expression above.
4714        However, this time it's not a subexpression
4715        we care about, but the expression itself. */
4716       && (maxcount == REG_INFTY)
4717       && data && ++data->whilem_c < 16) {
4718      /* This stays as CURLYX, we can put the count/of pair. */
4719      /* Find WHILEM (as in regexec.c) */
4720      regnode *nxt = oscan + NEXT_OFF(oscan);
4721
4722      if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4723       nxt += ARG(nxt);
4724      PREVOPER(nxt)->flags = (U8)(data->whilem_c
4725       | (RExC_whilem_seen << 4)); /* On WHILEM */
4726     }
4727     if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4728      pars++;
4729     if (flags & SCF_DO_SUBSTR) {
4730      SV *last_str = NULL;
4731      STRLEN last_chrs = 0;
4732      int counted = mincount != 0;
4733
4734      if (data->last_end > 0 && mincount != 0) { /* Ends with a
4735                 string. */
4736       SSize_t b = pos_before >= data->last_start_min
4737        ? pos_before : data->last_start_min;
4738       STRLEN l;
4739       const char * const s = SvPV_const(data->last_found, l);
4740       SSize_t old = b - data->last_start_min;
4741
4742       if (UTF)
4743        old = utf8_hop((U8*)s, old) - (U8*)s;
4744       l -= old;
4745       /* Get the added string: */
4746       last_str = newSVpvn_utf8(s  + old, l, UTF);
4747       last_chrs = UTF ? utf8_length((U8*)(s + old),
4748            (U8*)(s + old + l)) : l;
4749       if (deltanext == 0 && pos_before == b) {
4750        /* What was added is a constant string */
4751        if (mincount > 1) {
4752
4753         SvGROW(last_str, (mincount * l) + 1);
4754         repeatcpy(SvPVX(last_str) + l,
4755           SvPVX_const(last_str), l,
4756           mincount - 1);
4757         SvCUR_set(last_str, SvCUR(last_str) * mincount);
4758         /* Add additional parts. */
4759         SvCUR_set(data->last_found,
4760           SvCUR(data->last_found) - l);
4761         sv_catsv(data->last_found, last_str);
4762         {
4763          SV * sv = data->last_found;
4764          MAGIC *mg =
4765           SvUTF8(sv) && SvMAGICAL(sv) ?
4766           mg_find(sv, PERL_MAGIC_utf8) : NULL;
4767          if (mg && mg->mg_len >= 0)
4768           mg->mg_len += last_chrs * (mincount-1);
4769         }
4770         last_chrs *= mincount;
4771         data->last_end += l * (mincount - 1);
4772        }
4773       } else {
4774        /* start offset must point into the last copy */
4775        data->last_start_min += minnext * (mincount - 1);
4776        data->last_start_max += is_inf ? SSize_t_MAX
4777         : (maxcount - 1) * (minnext + data->pos_delta);
4778       }
4779      }
4780      /* It is counted once already... */
4781      data->pos_min += minnext * (mincount - counted);
4782 #if 0
4783 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
4784        " SSize_t_MAX=%"UVuf" minnext=%"UVuf
4785        " maxcount=%"UVuf" mincount=%"UVuf"\n",
4786  (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4787  (UV)mincount);
4788 if (deltanext != SSize_t_MAX)
4789 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
4790  (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4791   - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4792 #endif
4793      if (deltanext == SSize_t_MAX
4794       || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4795       data->pos_delta = SSize_t_MAX;
4796      else
4797       data->pos_delta += - counted * deltanext +
4798       (minnext + deltanext) * maxcount - minnext * mincount;
4799      if (mincount != maxcount) {
4800       /* Cannot extend fixed substrings found inside
4801        the group.  */
4802       scan_commit(pRExC_state, data, minlenp, is_inf);
4803       if (mincount && last_str) {
4804        SV * const sv = data->last_found;
4805        MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4806         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4807
4808        if (mg)
4809         mg->mg_len = -1;
4810        sv_setsv(sv, last_str);
4811        data->last_end = data->pos_min;
4812        data->last_start_min = data->pos_min - last_chrs;
4813        data->last_start_max = is_inf
4814         ? SSize_t_MAX
4815         : data->pos_min + data->pos_delta - last_chrs;
4816       }
4817       data->longest = &(data->longest_float);
4818      }
4819      SvREFCNT_dec(last_str);
4820     }
4821     if (data && (fl & SF_HAS_EVAL))
4822      data->flags |= SF_HAS_EVAL;
4823    optimize_curly_tail:
4824     if (OP(oscan) != CURLYX) {
4825      while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4826       && NEXT_OFF(next))
4827       NEXT_OFF(oscan) += NEXT_OFF(next);
4828     }
4829     continue;
4830
4831    default:
4832 #ifdef DEBUGGING
4833     Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4834                  OP(scan));
4835 #endif
4836    case REF:
4837    case CLUMP:
4838     if (flags & SCF_DO_SUBSTR) {
4839      /* Cannot expect anything... */
4840      scan_commit(pRExC_state, data, minlenp, is_inf);
4841      data->longest = &(data->longest_float);
4842     }
4843     is_inf = is_inf_internal = 1;
4844     if (flags & SCF_DO_STCLASS_OR) {
4845      if (OP(scan) == CLUMP) {
4846       /* Actually is any start char, but very few code points
4847       * aren't start characters */
4848       ssc_match_all_cp(data->start_class);
4849      }
4850      else {
4851       ssc_anything(data->start_class);
4852      }
4853     }
4854     flags &= ~SCF_DO_STCLASS;
4855     break;
4856    }
4857   }
4858   else if (OP(scan) == LNBREAK) {
4859    if (flags & SCF_DO_STCLASS) {
4860      if (flags & SCF_DO_STCLASS_AND) {
4861      ssc_intersection(data->start_class,
4862          PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4863      ssc_clear_locale(data->start_class);
4864      ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4865     }
4866     else if (flags & SCF_DO_STCLASS_OR) {
4867      ssc_union(data->start_class,
4868        PL_XPosix_ptrs[_CC_VERTSPACE],
4869        FALSE);
4870      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4871
4872      /* See commit msg for
4873      * 749e076fceedeb708a624933726e7989f2302f6a */
4874      ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4875     }
4876     flags &= ~SCF_DO_STCLASS;
4877    }
4878    min++;
4879    delta++;    /* Because of the 2 char string cr-lf */
4880    if (flags & SCF_DO_SUBSTR) {
4881     /* Cannot expect anything... */
4882     scan_commit(pRExC_state, data, minlenp, is_inf);
4883      data->pos_min += 1;
4884     data->pos_delta += 1;
4885     data->longest = &(data->longest_float);
4886     }
4887   }
4888   else if (REGNODE_SIMPLE(OP(scan))) {
4889
4890    if (flags & SCF_DO_SUBSTR) {
4891     scan_commit(pRExC_state, data, minlenp, is_inf);
4892     data->pos_min++;
4893    }
4894    min++;
4895    if (flags & SCF_DO_STCLASS) {
4896     bool invert = 0;
4897     SV* my_invlist = sv_2mortal(_new_invlist(0));
4898     U8 namedclass;
4899
4900     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4901     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4902
4903     /* Some of the logic below assumes that switching
4904     locale on will only add false positives. */
4905     switch (OP(scan)) {
4906
4907     default:
4908 #ifdef DEBUGGING
4909     Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4910                  OP(scan));
4911 #endif
4912     case CANY:
4913     case SANY:
4914      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4915       ssc_match_all_cp(data->start_class);
4916      break;
4917
4918     case REG_ANY:
4919      {
4920       SV* REG_ANY_invlist = _new_invlist(2);
4921       REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4922                '\n');
4923       if (flags & SCF_DO_STCLASS_OR) {
4924        ssc_union(data->start_class,
4925          REG_ANY_invlist,
4926          TRUE /* TRUE => invert, hence all but \n
4927            */
4928          );
4929       }
4930       else if (flags & SCF_DO_STCLASS_AND) {
4931        ssc_intersection(data->start_class,
4932            REG_ANY_invlist,
4933            TRUE  /* TRUE => invert */
4934            );
4935        ssc_clear_locale(data->start_class);
4936       }
4937       SvREFCNT_dec_NN(REG_ANY_invlist);
4938      }
4939      break;
4940
4941     case ANYOF:
4942      if (flags & SCF_DO_STCLASS_AND)
4943       ssc_and(pRExC_state, data->start_class,
4944         (regnode_charclass *) scan);
4945      else
4946       ssc_or(pRExC_state, data->start_class,
4947               (regnode_charclass *) scan);
4948      break;
4949
4950     case NPOSIXL:
4951      invert = 1;
4952      /* FALLTHROUGH */
4953
4954     case POSIXL:
4955      namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4956      if (flags & SCF_DO_STCLASS_AND) {
4957       bool was_there = cBOOL(
4958           ANYOF_POSIXL_TEST(data->start_class,
4959                 namedclass));
4960       ANYOF_POSIXL_ZERO(data->start_class);
4961       if (was_there) {    /* Do an AND */
4962        ANYOF_POSIXL_SET(data->start_class, namedclass);
4963       }
4964       /* No individual code points can now match */
4965       data->start_class->invlist
4966             = sv_2mortal(_new_invlist(0));
4967      }
4968      else {
4969       int complement = namedclass + ((invert) ? -1 : 1);
4970
4971       assert(flags & SCF_DO_STCLASS_OR);
4972
4973       /* If the complement of this class was already there,
4974       * the result is that they match all code points,
4975       * (\d + \D == everything).  Remove the classes from
4976       * future consideration.  Locale is not relevant in
4977       * this case */
4978       if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4979        ssc_match_all_cp(data->start_class);
4980        ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4981        ANYOF_POSIXL_CLEAR(data->start_class, complement);
4982       }
4983       else {  /* The usual case; just add this class to the
4984         existing set */
4985        ANYOF_POSIXL_SET(data->start_class, namedclass);
4986       }
4987      }
4988      break;
4989
4990     case NPOSIXA:   /* For these, we always know the exact set of
4991         what's matched */
4992      invert = 1;
4993      /* FALLTHROUGH */
4994     case POSIXA:
4995      if (FLAGS(scan) == _CC_ASCII) {
4996       my_invlist = PL_XPosix_ptrs[_CC_ASCII];
4997      }
4998      else {
4999       _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5000            PL_XPosix_ptrs[_CC_ASCII],
5001            &my_invlist);
5002      }
5003      goto join_posix;
5004
5005     case NPOSIXD:
5006     case NPOSIXU:
5007      invert = 1;
5008      /* FALLTHROUGH */
5009     case POSIXD:
5010     case POSIXU:
5011      my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5012
5013      /* NPOSIXD matches all upper Latin1 code points unless the
5014      * target string being matched is UTF-8, which is
5015      * unknowable until match time.  Since we are going to
5016      * invert, we want to get rid of all of them so that the
5017      * inversion will match all */
5018      if (OP(scan) == NPOSIXD) {
5019       _invlist_subtract(my_invlist, PL_UpperLatin1,
5020           &my_invlist);
5021      }
5022
5023     join_posix:
5024
5025      if (flags & SCF_DO_STCLASS_AND) {
5026       ssc_intersection(data->start_class, my_invlist, invert);
5027       ssc_clear_locale(data->start_class);
5028      }
5029      else {
5030       assert(flags & SCF_DO_STCLASS_OR);
5031       ssc_union(data->start_class, my_invlist, invert);
5032      }
5033     }
5034     if (flags & SCF_DO_STCLASS_OR)
5035      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5036     flags &= ~SCF_DO_STCLASS;
5037    }
5038   }
5039   else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5040    data->flags |= (OP(scan) == MEOL
5041        ? SF_BEFORE_MEOL
5042        : SF_BEFORE_SEOL);
5043    scan_commit(pRExC_state, data, minlenp, is_inf);
5044
5045   }
5046   else if (  PL_regkind[OP(scan)] == BRANCHJ
5047     /* Lookbehind, or need to calculate parens/evals/stclass: */
5048     && (scan->flags || data || (flags & SCF_DO_STCLASS))
5049     && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
5050    if ( OP(scan) == UNLESSM &&
5051     scan->flags == 0 &&
5052     OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
5053     OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
5054    ) {
5055     regnode *opt;
5056     regnode *upto= regnext(scan);
5057     DEBUG_PARSE_r({
5058      SV * const mysv_val=sv_newmortal();
5059      DEBUG_STUDYDATA("OPFAIL",data,depth);
5060
5061      /*DEBUG_PARSE_MSG("opfail");*/
5062      regprop(RExC_rx, mysv_val, upto, NULL);
5063      PerlIO_printf(Perl_debug_log,
5064       "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
5065       SvPV_nolen_const(mysv_val),
5066       (IV)REG_NODE_NUM(upto),
5067       (IV)(upto - scan)
5068      );
5069     });
5070     OP(scan) = OPFAIL;
5071     NEXT_OFF(scan) = upto - scan;
5072     for (opt= scan + 1; opt < upto ; opt++)
5073      OP(opt) = OPTIMIZED;
5074     scan= upto;
5075     continue;
5076    }
5077    if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5078     || OP(scan) == UNLESSM )
5079    {
5080     /* Negative Lookahead/lookbehind
5081     In this case we can't do fixed string optimisation.
5082     */
5083
5084     SSize_t deltanext, minnext, fake = 0;
5085     regnode *nscan;
5086     regnode_ssc intrnl;
5087     int f = 0;
5088
5089     data_fake.flags = 0;
5090     if (data) {
5091      data_fake.whilem_c = data->whilem_c;
5092      data_fake.last_closep = data->last_closep;
5093     }
5094     else
5095      data_fake.last_closep = &fake;
5096     data_fake.pos_delta = delta;
5097     if ( flags & SCF_DO_STCLASS && !scan->flags
5098      && OP(scan) == IFMATCH ) { /* Lookahead */
5099      ssc_init(pRExC_state, &intrnl);
5100      data_fake.start_class = &intrnl;
5101      f |= SCF_DO_STCLASS_AND;
5102     }
5103     if (flags & SCF_WHILEM_VISITED_POS)
5104      f |= SCF_WHILEM_VISITED_POS;
5105     next = regnext(scan);
5106     nscan = NEXTOPER(NEXTOPER(scan));
5107     minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5108          last, &data_fake, stopparen,
5109          recursed_depth, NULL, f, depth+1);
5110     if (scan->flags) {
5111      if (deltanext) {
5112       FAIL("Variable length lookbehind not implemented");
5113      }
5114      else if (minnext > (I32)U8_MAX) {
5115       FAIL2("Lookbehind longer than %"UVuf" not implemented",
5116        (UV)U8_MAX);
5117      }
5118      scan->flags = (U8)minnext;
5119     }
5120     if (data) {
5121      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5122       pars++;
5123      if (data_fake.flags & SF_HAS_EVAL)
5124       data->flags |= SF_HAS_EVAL;
5125      data->whilem_c = data_fake.whilem_c;
5126     }
5127     if (f & SCF_DO_STCLASS_AND) {
5128      if (flags & SCF_DO_STCLASS_OR) {
5129       /* OR before, AND after: ideally we would recurse with
5130       * data_fake to get the AND applied by study of the
5131       * remainder of the pattern, and then derecurse;
5132       * *** HACK *** for now just treat as "no information".
5133       * See [perl #56690].
5134       */
5135       ssc_init(pRExC_state, data->start_class);
5136      }  else {
5137       /* AND before and after: combine and continue */
5138       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5139      }
5140     }
5141    }
5142 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5143    else {
5144     /* Positive Lookahead/lookbehind
5145     In this case we can do fixed string optimisation,
5146     but we must be careful about it. Note in the case of
5147     lookbehind the positions will be offset by the minimum
5148     length of the pattern, something we won't know about
5149     until after the recurse.
5150     */
5151     SSize_t deltanext, fake = 0;
5152     regnode *nscan;
5153     regnode_ssc intrnl;
5154     int f = 0;
5155     /* We use SAVEFREEPV so that when the full compile
5156      is finished perl will clean up the allocated
5157      minlens when it's all done. This way we don't
5158      have to worry about freeing them when we know
5159      they wont be used, which would be a pain.
5160     */
5161     SSize_t *minnextp;
5162     Newx( minnextp, 1, SSize_t );
5163     SAVEFREEPV(minnextp);
5164
5165     if (data) {
5166      StructCopy(data, &data_fake, scan_data_t);
5167      if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5168       f |= SCF_DO_SUBSTR;
5169       if (scan->flags)
5170        scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5171       data_fake.last_found=newSVsv(data->last_found);
5172      }
5173     }
5174     else
5175      data_fake.last_closep = &fake;
5176     data_fake.flags = 0;
5177     data_fake.pos_delta = delta;
5178     if (is_inf)
5179      data_fake.flags |= SF_IS_INF;
5180     if ( flags & SCF_DO_STCLASS && !scan->flags
5181      && OP(scan) == IFMATCH ) { /* Lookahead */
5182      ssc_init(pRExC_state, &intrnl);
5183      data_fake.start_class = &intrnl;
5184      f |= SCF_DO_STCLASS_AND;
5185     }
5186     if (flags & SCF_WHILEM_VISITED_POS)
5187      f |= SCF_WHILEM_VISITED_POS;
5188     next = regnext(scan);
5189     nscan = NEXTOPER(NEXTOPER(scan));
5190
5191     *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5192           &deltanext, last, &data_fake,
5193           stopparen, recursed_depth, NULL,
5194           f,depth+1);
5195     if (scan->flags) {
5196      if (deltanext) {
5197       FAIL("Variable length lookbehind not implemented");
5198      }
5199      else if (*minnextp > (I32)U8_MAX) {
5200       FAIL2("Lookbehind longer than %"UVuf" not implemented",
5201        (UV)U8_MAX);
5202      }
5203      scan->flags = (U8)*minnextp;
5204     }
5205
5206     *minnextp += min;
5207
5208     if (f & SCF_DO_STCLASS_AND) {
5209      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5210     }
5211     if (data) {
5212      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5213       pars++;
5214      if (data_fake.flags & SF_HAS_EVAL)
5215       data->flags |= SF_HAS_EVAL;
5216      data->whilem_c = data_fake.whilem_c;
5217      if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5218       if (RExC_rx->minlen<*minnextp)
5219        RExC_rx->minlen=*minnextp;
5220       scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5221       SvREFCNT_dec_NN(data_fake.last_found);
5222
5223       if ( data_fake.minlen_fixed != minlenp )
5224       {
5225        data->offset_fixed= data_fake.offset_fixed;
5226        data->minlen_fixed= data_fake.minlen_fixed;
5227        data->lookbehind_fixed+= scan->flags;
5228       }
5229       if ( data_fake.minlen_float != minlenp )
5230       {
5231        data->minlen_float= data_fake.minlen_float;
5232        data->offset_float_min=data_fake.offset_float_min;
5233        data->offset_float_max=data_fake.offset_float_max;
5234        data->lookbehind_float+= scan->flags;
5235       }
5236      }
5237     }
5238    }
5239 #endif
5240   }
5241   else if (OP(scan) == OPEN) {
5242    if (stopparen != (I32)ARG(scan))
5243     pars++;
5244   }
5245   else if (OP(scan) == CLOSE) {
5246    if (stopparen == (I32)ARG(scan)) {
5247     break;
5248    }
5249    if ((I32)ARG(scan) == is_par) {
5250     next = regnext(scan);
5251
5252     if ( next && (OP(next) != WHILEM) && next < last)
5253      is_par = 0;  /* Disable optimization */
5254    }
5255    if (data)
5256     *(data->last_closep) = ARG(scan);
5257   }
5258   else if (OP(scan) == EVAL) {
5259     if (data)
5260      data->flags |= SF_HAS_EVAL;
5261   }
5262   else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5263    if (flags & SCF_DO_SUBSTR) {
5264     scan_commit(pRExC_state, data, minlenp, is_inf);
5265     flags &= ~SCF_DO_SUBSTR;
5266    }
5267    if (data && OP(scan)==ACCEPT) {
5268     data->flags |= SCF_SEEN_ACCEPT;
5269     if (stopmin > min)
5270      stopmin = min;
5271    }
5272   }
5273   else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5274   {
5275     if (flags & SCF_DO_SUBSTR) {
5276      scan_commit(pRExC_state, data, minlenp, is_inf);
5277      data->longest = &(data->longest_float);
5278     }
5279     is_inf = is_inf_internal = 1;
5280     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5281      ssc_anything(data->start_class);
5282     flags &= ~SCF_DO_STCLASS;
5283   }
5284   else if (OP(scan) == GPOS) {
5285    if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5286     !(delta || is_inf || (data && data->pos_delta)))
5287    {
5288     if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5289      RExC_rx->intflags |= PREGf_ANCH_GPOS;
5290     if (RExC_rx->gofs < (STRLEN)min)
5291      RExC_rx->gofs = min;
5292    } else {
5293     RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5294     RExC_rx->gofs = 0;
5295    }
5296   }
5297 #ifdef TRIE_STUDY_OPT
5298 #ifdef FULL_TRIE_STUDY
5299   else if (PL_regkind[OP(scan)] == TRIE) {
5300    /* NOTE - There is similar code to this block above for handling
5301    BRANCH nodes on the initial study.  If you change stuff here
5302    check there too. */
5303    regnode *trie_node= scan;
5304    regnode *tail= regnext(scan);
5305    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5306    SSize_t max1 = 0, min1 = SSize_t_MAX;
5307    regnode_ssc accum;
5308
5309    if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5310     /* Cannot merge strings after this. */
5311     scan_commit(pRExC_state, data, minlenp, is_inf);
5312    }
5313    if (flags & SCF_DO_STCLASS)
5314     ssc_init_zero(pRExC_state, &accum);
5315
5316    if (!trie->jump) {
5317     min1= trie->minlen;
5318     max1= trie->maxlen;
5319    } else {
5320     const regnode *nextbranch= NULL;
5321     U32 word;
5322
5323     for ( word=1 ; word <= trie->wordcount ; word++)
5324     {
5325      SSize_t deltanext=0, minnext=0, f = 0, fake;
5326      regnode_ssc this_class;
5327
5328      data_fake.flags = 0;
5329      if (data) {
5330       data_fake.whilem_c = data->whilem_c;
5331       data_fake.last_closep = data->last_closep;
5332      }
5333      else
5334       data_fake.last_closep = &fake;
5335      data_fake.pos_delta = delta;
5336      if (flags & SCF_DO_STCLASS) {
5337       ssc_init(pRExC_state, &this_class);
5338       data_fake.start_class = &this_class;
5339       f = SCF_DO_STCLASS_AND;
5340      }
5341      if (flags & SCF_WHILEM_VISITED_POS)
5342       f |= SCF_WHILEM_VISITED_POS;
5343
5344      if (trie->jump[word]) {
5345       if (!nextbranch)
5346        nextbranch = trie_node + trie->jump[0];
5347       scan= trie_node + trie->jump[word];
5348       /* We go from the jump point to the branch that follows
5349       it. Note this means we need the vestigal unused
5350       branches even though they arent otherwise used. */
5351       minnext = study_chunk(pRExC_state, &scan, minlenp,
5352        &deltanext, (regnode *)nextbranch, &data_fake,
5353        stopparen, recursed_depth, NULL, f,depth+1);
5354      }
5355      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5356       nextbranch= regnext((regnode*)nextbranch);
5357
5358      if (min1 > (SSize_t)(minnext + trie->minlen))
5359       min1 = minnext + trie->minlen;
5360      if (deltanext == SSize_t_MAX) {
5361       is_inf = is_inf_internal = 1;
5362       max1 = SSize_t_MAX;
5363      } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5364       max1 = minnext + deltanext + trie->maxlen;
5365
5366      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5367       pars++;
5368      if (data_fake.flags & SCF_SEEN_ACCEPT) {
5369       if ( stopmin > min + min1)
5370        stopmin = min + min1;
5371       flags &= ~SCF_DO_SUBSTR;
5372       if (data)
5373        data->flags |= SCF_SEEN_ACCEPT;
5374      }
5375      if (data) {
5376       if (data_fake.flags & SF_HAS_EVAL)
5377        data->flags |= SF_HAS_EVAL;
5378       data->whilem_c = data_fake.whilem_c;
5379      }
5380      if (flags & SCF_DO_STCLASS)
5381       ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5382     }
5383    }
5384    if (flags & SCF_DO_SUBSTR) {
5385     data->pos_min += min1;
5386     data->pos_delta += max1 - min1;
5387     if (max1 != min1 || is_inf)
5388      data->longest = &(data->longest_float);
5389    }
5390    min += min1;
5391    delta += max1 - min1;
5392    if (flags & SCF_DO_STCLASS_OR) {
5393     ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5394     if (min1) {
5395      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5396      flags &= ~SCF_DO_STCLASS;
5397     }
5398    }
5399    else if (flags & SCF_DO_STCLASS_AND) {
5400     if (min1) {
5401      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5402      flags &= ~SCF_DO_STCLASS;
5403     }
5404     else {
5405      /* Switch to OR mode: cache the old value of
5406      * data->start_class */
5407      INIT_AND_WITHP;
5408      StructCopy(data->start_class, and_withp, regnode_ssc);
5409      flags &= ~SCF_DO_STCLASS_AND;
5410      StructCopy(&accum, data->start_class, regnode_ssc);
5411      flags |= SCF_DO_STCLASS_OR;
5412     }
5413    }
5414    scan= tail;
5415    continue;
5416   }
5417 #else
5418   else if (PL_regkind[OP(scan)] == TRIE) {
5419    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5420    U8*bang=NULL;
5421
5422    min += trie->minlen;
5423    delta += (trie->maxlen - trie->minlen);
5424    flags &= ~SCF_DO_STCLASS; /* xxx */
5425    if (flags & SCF_DO_SUBSTR) {
5426     /* Cannot expect anything... */
5427     scan_commit(pRExC_state, data, minlenp, is_inf);
5428      data->pos_min += trie->minlen;
5429      data->pos_delta += (trie->maxlen - trie->minlen);
5430     if (trie->maxlen != trie->minlen)
5431      data->longest = &(data->longest_float);
5432     }
5433     if (trie->jump) /* no more substrings -- for now /grr*/
5434    flags &= ~SCF_DO_SUBSTR;
5435   }
5436 #endif /* old or new */
5437 #endif /* TRIE_STUDY_OPT */
5438
5439   /* Else: zero-length, ignore. */
5440   scan = regnext(scan);
5441  }
5442  /* If we are exiting a recursion we can unset its recursed bit
5443  * and allow ourselves to enter it again - no danger of an
5444  * infinite loop there.
5445  if (stopparen > -1 && recursed) {
5446   DEBUG_STUDYDATA("unset:", data,depth);
5447   PAREN_UNSET( recursed, stopparen);
5448  }
5449  */
5450  if (frame) {
5451   DEBUG_STUDYDATA("frame-end:",data,depth);
5452   DEBUG_PEEP("fend", scan, depth);
5453   /* restore previous context */
5454   last = frame->last;
5455   scan = frame->next;
5456   stopparen = frame->stop;
5457   recursed_depth = frame->prev_recursed_depth;
5458   depth = depth - 1;
5459
5460   frame = frame->prev;
5461   goto fake_study_recurse;
5462  }
5463
5464   finish:
5465  assert(!frame);
5466  DEBUG_STUDYDATA("pre-fin:",data,depth);
5467
5468  *scanp = scan;
5469  *deltap = is_inf_internal ? SSize_t_MAX : delta;
5470
5471  if (flags & SCF_DO_SUBSTR && is_inf)
5472   data->pos_delta = SSize_t_MAX - data->pos_min;
5473  if (is_par > (I32)U8_MAX)
5474   is_par = 0;
5475  if (is_par && pars==1 && data) {
5476   data->flags |= SF_IN_PAR;
5477   data->flags &= ~SF_HAS_PAR;
5478  }
5479  else if (pars && data) {
5480   data->flags |= SF_HAS_PAR;
5481   data->flags &= ~SF_IN_PAR;
5482  }
5483  if (flags & SCF_DO_STCLASS_OR)
5484   ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5485  if (flags & SCF_TRIE_RESTUDY)
5486   data->flags |=  SCF_TRIE_RESTUDY;
5487
5488  DEBUG_STUDYDATA("post-fin:",data,depth);
5489
5490  {
5491   SSize_t final_minlen= min < stopmin ? min : stopmin;
5492
5493   if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5494    RExC_maxlen = final_minlen + delta;
5495   }
5496   return final_minlen;
5497  }
5498  /* not-reached */
5499 }
5500
5501 STATIC U32
5502 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5503 {
5504  U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5505
5506  PERL_ARGS_ASSERT_ADD_DATA;
5507
5508  Renewc(RExC_rxi->data,
5509   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5510   char, struct reg_data);
5511  if(count)
5512   Renew(RExC_rxi->data->what, count + n, U8);
5513  else
5514   Newx(RExC_rxi->data->what, n, U8);
5515  RExC_rxi->data->count = count + n;
5516  Copy(s, RExC_rxi->data->what + count, n, U8);
5517  return count;
5518 }
5519
5520 /*XXX: todo make this not included in a non debugging perl, but appears to be
5521  * used anyway there, in 'use re' */
5522 #ifndef PERL_IN_XSUB_RE
5523 void
5524 Perl_reginitcolors(pTHX)
5525 {
5526  dVAR;
5527  const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5528  if (s) {
5529   char *t = savepv(s);
5530   int i = 0;
5531   PL_colors[0] = t;
5532   while (++i < 6) {
5533    t = strchr(t, '\t');
5534    if (t) {
5535     *t = '\0';
5536     PL_colors[i] = ++t;
5537    }
5538    else
5539     PL_colors[i] = t = (char *)"";
5540   }
5541  } else {
5542   int i = 0;
5543   while (i < 6)
5544    PL_colors[i++] = (char *)"";
5545  }
5546  PL_colorset = 1;
5547 }
5548 #endif
5549
5550
5551 #ifdef TRIE_STUDY_OPT
5552 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5553  STMT_START {                                            \
5554   if (                                                \
5555    (data.flags & SCF_TRIE_RESTUDY)               \
5556    && ! restudied++                              \
5557   ) {                                                 \
5558    dOsomething;                                    \
5559    goto reStudy;                                   \
5560   }                                                   \
5561  } STMT_END
5562 #else
5563 #define CHECK_RESTUDY_GOTO_butfirst
5564 #endif
5565
5566 /*
5567  * pregcomp - compile a regular expression into internal code
5568  *
5569  * Decides which engine's compiler to call based on the hint currently in
5570  * scope
5571  */
5572
5573 #ifndef PERL_IN_XSUB_RE
5574
5575 /* return the currently in-scope regex engine (or the default if none)  */
5576
5577 regexp_engine const *
5578 Perl_current_re_engine(pTHX)
5579 {
5580  dVAR;
5581
5582  if (IN_PERL_COMPILETIME) {
5583   HV * const table = GvHV(PL_hintgv);
5584   SV **ptr;
5585
5586   if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5587    return &reh_regexp_engine;
5588   ptr = hv_fetchs(table, "regcomp", FALSE);
5589   if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5590    return &reh_regexp_engine;
5591   return INT2PTR(regexp_engine*,SvIV(*ptr));
5592  }
5593  else {
5594   SV *ptr;
5595   if (!PL_curcop->cop_hints_hash)
5596    return &reh_regexp_engine;
5597   ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5598   if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5599    return &reh_regexp_engine;
5600   return INT2PTR(regexp_engine*,SvIV(ptr));
5601  }
5602 }
5603
5604
5605 REGEXP *
5606 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5607 {
5608  dVAR;
5609  regexp_engine const *eng = current_re_engine();
5610  GET_RE_DEBUG_FLAGS_DECL;
5611
5612  PERL_ARGS_ASSERT_PREGCOMP;
5613
5614  /* Dispatch a request to compile a regexp to correct regexp engine. */
5615  DEBUG_COMPILE_r({
5616   PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5617       PTR2UV(eng));
5618  });
5619  return CALLREGCOMP_ENG(eng, pattern, flags);
5620 }
5621 #endif
5622
5623 /* public(ish) entry point for the perl core's own regex compiling code.
5624  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5625  * pattern rather than a list of OPs, and uses the internal engine rather
5626  * than the current one */
5627
5628 REGEXP *
5629 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5630 {
5631  SV *pat = pattern; /* defeat constness! */
5632  PERL_ARGS_ASSERT_RE_COMPILE;
5633  return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5634 #ifdef PERL_IN_XSUB_RE
5635         &my_reg_engine,
5636 #else
5637         &reh_regexp_engine,
5638 #endif
5639         NULL, NULL, rx_flags, 0);
5640 }
5641
5642
5643 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5644  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5645  * point to the realloced string and length.
5646  *
5647  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5648  * stuff added */
5649
5650 static void
5651 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5652      char **pat_p, STRLEN *plen_p, int num_code_blocks)
5653 {
5654  U8 *const src = (U8*)*pat_p;
5655  U8 *dst;
5656  int n=0;
5657  STRLEN s = 0, d = 0;
5658  bool do_end = 0;
5659  GET_RE_DEBUG_FLAGS_DECL;
5660
5661  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5662   "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5663
5664  Newx(dst, *plen_p * 2 + 1, U8);
5665
5666  while (s < *plen_p) {
5667   if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5668    dst[d]   = src[s];
5669   else {
5670    dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5671    dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
5672   }
5673   if (n < num_code_blocks) {
5674    if (!do_end && pRExC_state->code_blocks[n].start == s) {
5675     pRExC_state->code_blocks[n].start = d;
5676     assert(dst[d] == '(');
5677     do_end = 1;
5678    }
5679    else if (do_end && pRExC_state->code_blocks[n].end == s) {
5680     pRExC_state->code_blocks[n].end = d;
5681     assert(dst[d] == ')');
5682     do_end = 0;
5683     n++;
5684    }
5685   }
5686   s++;
5687   d++;
5688  }
5689  dst[d] = '\0';
5690  *plen_p = d;
5691  *pat_p = (char*) dst;
5692  SAVEFREEPV(*pat_p);
5693  RExC_orig_utf8 = RExC_utf8 = 1;
5694 }
5695
5696
5697
5698 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5699  * while recording any code block indices, and handling overloading,
5700  * nested qr// objects etc.  If pat is null, it will allocate a new
5701  * string, or just return the first arg, if there's only one.
5702  *
5703  * Returns the malloced/updated pat.
5704  * patternp and pat_count is the array of SVs to be concatted;
5705  * oplist is the optional list of ops that generated the SVs;
5706  * recompile_p is a pointer to a boolean that will be set if
5707  *   the regex will need to be recompiled.
5708  * delim, if non-null is an SV that will be inserted between each element
5709  */
5710
5711 static SV*
5712 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5713     SV *pat, SV ** const patternp, int pat_count,
5714     OP *oplist, bool *recompile_p, SV *delim)
5715 {
5716  SV **svp;
5717  int n = 0;
5718  bool use_delim = FALSE;
5719  bool alloced = FALSE;
5720
5721  /* if we know we have at least two args, create an empty string,
5722  * then concatenate args to that. For no args, return an empty string */
5723  if (!pat && pat_count != 1) {
5724   pat = newSVpvs("");
5725   SAVEFREESV(pat);
5726   alloced = TRUE;
5727  }
5728
5729  for (svp = patternp; svp < patternp + pat_count; svp++) {
5730   SV *sv;
5731   SV *rx  = NULL;
5732   STRLEN orig_patlen = 0;
5733   bool code = 0;
5734   SV *msv = use_delim ? delim : *svp;
5735   if (!msv) msv = &PL_sv_undef;
5736
5737   /* if we've got a delimiter, we go round the loop twice for each
5738   * svp slot (except the last), using the delimiter the second
5739   * time round */
5740   if (use_delim) {
5741    svp--;
5742    use_delim = FALSE;
5743   }
5744   else if (delim)
5745    use_delim = TRUE;
5746
5747   if (SvTYPE(msv) == SVt_PVAV) {
5748    /* we've encountered an interpolated array within
5749    * the pattern, e.g. /...@a..../. Expand the list of elements,
5750    * then recursively append elements.
5751    * The code in this block is based on S_pushav() */
5752
5753    AV *const av = (AV*)msv;
5754    const SSize_t maxarg = AvFILL(av) + 1;
5755    SV **array;
5756
5757    if (oplist) {
5758     assert(oplist->op_type == OP_PADAV
5759      || oplist->op_type == OP_RV2AV);
5760     oplist = oplist->op_sibling;;
5761    }
5762
5763    if (SvRMAGICAL(av)) {
5764     SSize_t i;
5765
5766     Newx(array, maxarg, SV*);
5767     SAVEFREEPV(array);
5768     for (i=0; i < maxarg; i++) {
5769      SV ** const svp = av_fetch(av, i, FALSE);
5770      array[i] = svp ? *svp : &PL_sv_undef;
5771     }
5772    }
5773    else
5774     array = AvARRAY(av);
5775
5776    pat = S_concat_pat(aTHX_ pRExC_state, pat,
5777         array, maxarg, NULL, recompile_p,
5778         /* $" */
5779         GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5780
5781    continue;
5782   }
5783
5784
5785   /* we make the assumption here that each op in the list of
5786   * op_siblings maps to one SV pushed onto the stack,
5787   * except for code blocks, with have both an OP_NULL and
5788   * and OP_CONST.
5789   * This allows us to match up the list of SVs against the
5790   * list of OPs to find the next code block.
5791   *
5792   * Note that       PUSHMARK PADSV PADSV ..
5793   * is optimised to
5794   *                 PADRANGE PADSV  PADSV  ..
5795   * so the alignment still works. */
5796
5797   if (oplist) {
5798    if (oplist->op_type == OP_NULL
5799     && (oplist->op_flags & OPf_SPECIAL))
5800    {
5801     assert(n < pRExC_state->num_code_blocks);
5802     pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5803     pRExC_state->code_blocks[n].block = oplist;
5804     pRExC_state->code_blocks[n].src_regex = NULL;
5805     n++;
5806     code = 1;
5807     oplist = oplist->op_sibling; /* skip CONST */
5808     assert(oplist);
5809    }
5810    oplist = oplist->op_sibling;;
5811   }
5812
5813   /* apply magic and QR overloading to arg */
5814
5815   SvGETMAGIC(msv);
5816   if (SvROK(msv) && SvAMAGIC(msv)) {
5817    SV *sv = AMG_CALLunary(msv, regexp_amg);
5818    if (sv) {
5819     if (SvROK(sv))
5820      sv = SvRV(sv);
5821     if (SvTYPE(sv) != SVt_REGEXP)
5822      Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5823     msv = sv;
5824    }
5825   }
5826
5827   /* try concatenation overload ... */
5828   if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5829     (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5830   {
5831    sv_setsv(pat, sv);
5832    /* overloading involved: all bets are off over literal
5833    * code. Pretend we haven't seen it */
5834    pRExC_state->num_code_blocks -= n;
5835    n = 0;
5836   }
5837   else  {
5838    /* ... or failing that, try "" overload */
5839    while (SvAMAGIC(msv)
5840      && (sv = AMG_CALLunary(msv, string_amg))
5841      && sv != msv
5842      &&  !(   SvROK(msv)
5843       && SvROK(sv)
5844       && SvRV(msv) == SvRV(sv))
5845    ) {
5846     msv = sv;
5847     SvGETMAGIC(msv);
5848    }
5849    if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5850     msv = SvRV(msv);
5851
5852    if (pat) {
5853     /* this is a partially unrolled
5854     *     sv_catsv_nomg(pat, msv);
5855     * that allows us to adjust code block indices if
5856     * needed */
5857     STRLEN dlen;
5858     char *dst = SvPV_force_nomg(pat, dlen);
5859     orig_patlen = dlen;
5860     if (SvUTF8(msv) && !SvUTF8(pat)) {
5861      S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5862      sv_setpvn(pat, dst, dlen);
5863      SvUTF8_on(pat);
5864     }
5865     sv_catsv_nomg(pat, msv);
5866     rx = msv;
5867    }
5868    else
5869     pat = msv;
5870
5871    if (code)
5872     pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5873   }
5874
5875   /* extract any code blocks within any embedded qr//'s */
5876   if (rx && SvTYPE(rx) == SVt_REGEXP
5877    && RX_ENGINE((REGEXP*)rx)->op_comp)
5878   {
5879
5880    RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5881    if (ri->num_code_blocks) {
5882     int i;
5883     /* the presence of an embedded qr// with code means
5884     * we should always recompile: the text of the
5885     * qr// may not have changed, but it may be a
5886     * different closure than last time */
5887     *recompile_p = 1;
5888     Renew(pRExC_state->code_blocks,
5889      pRExC_state->num_code_blocks + ri->num_code_blocks,
5890      struct reg_code_block);
5891     pRExC_state->num_code_blocks += ri->num_code_blocks;
5892
5893     for (i=0; i < ri->num_code_blocks; i++) {
5894      struct reg_code_block *src, *dst;
5895      STRLEN offset =  orig_patlen
5896       + ReANY((REGEXP *)rx)->pre_prefix;
5897      assert(n < pRExC_state->num_code_blocks);
5898      src = &ri->code_blocks[i];
5899      dst = &pRExC_state->code_blocks[n];
5900      dst->start     = src->start + offset;
5901      dst->end     = src->end   + offset;
5902      dst->block     = src->block;
5903      dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5904            src->src_regex
5905             ? src->src_regex
5906             : (REGEXP*)rx);
5907      n++;
5908     }
5909    }
5910   }
5911  }
5912  /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5913  if (alloced)
5914   SvSETMAGIC(pat);
5915
5916  return pat;
5917 }
5918
5919
5920
5921 /* see if there are any run-time code blocks in the pattern.
5922  * False positives are allowed */
5923
5924 static bool
5925 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5926      char *pat, STRLEN plen)
5927 {
5928  int n = 0;
5929  STRLEN s;
5930
5931  PERL_UNUSED_CONTEXT;
5932
5933  for (s = 0; s < plen; s++) {
5934   if (n < pRExC_state->num_code_blocks
5935    && s == pRExC_state->code_blocks[n].start)
5936   {
5937    s = pRExC_state->code_blocks[n].end;
5938    n++;
5939    continue;
5940   }
5941   /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5942   * positives here */
5943   if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5944    (pat[s+2] == '{'
5945     || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5946   )
5947    return 1;
5948  }
5949  return 0;
5950 }
5951
5952 /* Handle run-time code blocks. We will already have compiled any direct
5953  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5954  * copy of it, but with any literal code blocks blanked out and
5955  * appropriate chars escaped; then feed it into
5956  *
5957  *    eval "qr'modified_pattern'"
5958  *
5959  * For example,
5960  *
5961  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5962  *
5963  * becomes
5964  *
5965  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5966  *
5967  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5968  * and merge them with any code blocks of the original regexp.
5969  *
5970  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5971  * instead, just save the qr and return FALSE; this tells our caller that
5972  * the original pattern needs upgrading to utf8.
5973  */
5974
5975 static bool
5976 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5977  char *pat, STRLEN plen)
5978 {
5979  SV *qr;
5980
5981  GET_RE_DEBUG_FLAGS_DECL;
5982
5983  if (pRExC_state->runtime_code_qr) {
5984   /* this is the second time we've been called; this should
5985   * only happen if the main pattern got upgraded to utf8
5986   * during compilation; re-use the qr we compiled first time
5987   * round (which should be utf8 too)
5988   */
5989   qr = pRExC_state->runtime_code_qr;
5990   pRExC_state->runtime_code_qr = NULL;
5991   assert(RExC_utf8 && SvUTF8(qr));
5992  }
5993  else {
5994   int n = 0;
5995   STRLEN s;
5996   char *p, *newpat;
5997   int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5998   SV *sv, *qr_ref;
5999   dSP;
6000
6001   /* determine how many extra chars we need for ' and \ escaping */
6002   for (s = 0; s < plen; s++) {
6003    if (pat[s] == '\'' || pat[s] == '\\')
6004     newlen++;
6005   }
6006
6007   Newx(newpat, newlen, char);
6008   p = newpat;
6009   *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6010
6011   for (s = 0; s < plen; s++) {
6012    if (n < pRExC_state->num_code_blocks
6013     && s == pRExC_state->code_blocks[n].start)
6014    {
6015     /* blank out literal code block */
6016     assert(pat[s] == '(');
6017     while (s <= pRExC_state->code_blocks[n].end) {
6018      *p++ = '_';
6019      s++;
6020     }
6021     s--;
6022     n++;
6023     continue;
6024    }
6025    if (pat[s] == '\'' || pat[s] == '\\')
6026     *p++ = '\\';
6027    *p++ = pat[s];
6028   }
6029   *p++ = '\'';
6030   if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6031    *p++ = 'x';
6032   *p++ = '\0';
6033   DEBUG_COMPILE_r({
6034    PerlIO_printf(Perl_debug_log,
6035     "%sre-parsing pattern for runtime code:%s %s\n",
6036     PL_colors[4],PL_colors[5],newpat);
6037   });
6038
6039   sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6040   Safefree(newpat);
6041
6042   ENTER;
6043   SAVETMPS;
6044   save_re_context();
6045   PUSHSTACKi(PERLSI_REQUIRE);
6046   /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6047   * parsing qr''; normally only q'' does this. It also alters
6048   * hints handling */
6049   eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6050   SvREFCNT_dec_NN(sv);
6051   SPAGAIN;
6052   qr_ref = POPs;
6053   PUTBACK;
6054   {
6055    SV * const errsv = ERRSV;
6056    if (SvTRUE_NN(errsv))
6057    {
6058     Safefree(pRExC_state->code_blocks);
6059     /* use croak_sv ? */
6060     Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6061    }
6062   }
6063   assert(SvROK(qr_ref));
6064   qr = SvRV(qr_ref);
6065   assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6066   /* the leaving below frees the tmp qr_ref.
6067   * Give qr a life of its own */
6068   SvREFCNT_inc(qr);
6069   POPSTACK;
6070   FREETMPS;
6071   LEAVE;
6072
6073  }
6074
6075  if (!RExC_utf8 && SvUTF8(qr)) {
6076   /* first time through; the pattern got upgraded; save the
6077   * qr for the next time through */
6078   assert(!pRExC_state->runtime_code_qr);
6079   pRExC_state->runtime_code_qr = qr;
6080   return 0;
6081  }
6082
6083
6084  /* extract any code blocks within the returned qr//  */
6085
6086
6087  /* merge the main (r1) and run-time (r2) code blocks into one */
6088  {
6089   RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6090   struct reg_code_block *new_block, *dst;
6091   RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6092   int i1 = 0, i2 = 0;
6093
6094   if (!r2->num_code_blocks) /* we guessed wrong */
6095   {
6096    SvREFCNT_dec_NN(qr);
6097    return 1;
6098   }
6099
6100   Newx(new_block,
6101    r1->num_code_blocks + r2->num_code_blocks,
6102    struct reg_code_block);
6103   dst = new_block;
6104
6105   while (    i1 < r1->num_code_blocks
6106     || i2 < r2->num_code_blocks)
6107   {
6108    struct reg_code_block *src;
6109    bool is_qr = 0;
6110
6111    if (i1 == r1->num_code_blocks) {
6112     src = &r2->code_blocks[i2++];
6113     is_qr = 1;
6114    }
6115    else if (i2 == r2->num_code_blocks)
6116     src = &r1->code_blocks[i1++];
6117    else if (  r1->code_blocks[i1].start
6118      < r2->code_blocks[i2].start)
6119    {
6120     src = &r1->code_blocks[i1++];
6121     assert(src->end < r2->code_blocks[i2].start);
6122    }
6123    else {
6124     assert(  r1->code_blocks[i1].start
6125      > r2->code_blocks[i2].start);
6126     src = &r2->code_blocks[i2++];
6127     is_qr = 1;
6128     assert(src->end < r1->code_blocks[i1].start);
6129    }
6130
6131    assert(pat[src->start] == '(');
6132    assert(pat[src->end]   == ')');
6133    dst->start     = src->start;
6134    dst->end     = src->end;
6135    dst->block     = src->block;
6136    dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6137          : src->src_regex;
6138    dst++;
6139   }
6140   r1->num_code_blocks += r2->num_code_blocks;
6141   Safefree(r1->code_blocks);
6142   r1->code_blocks = new_block;
6143  }
6144
6145  SvREFCNT_dec_NN(qr);
6146  return 1;
6147 }
6148
6149
6150 STATIC bool
6151 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6152      SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6153      SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6154      STRLEN longest_length, bool eol, bool meol)
6155 {
6156  /* This is the common code for setting up the floating and fixed length
6157  * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6158  * as to whether succeeded or not */
6159
6160  I32 t;
6161  SSize_t ml;
6162
6163  if (! (longest_length
6164   || (eol /* Can't have SEOL and MULTI */
6165    && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6166   )
6167    /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6168   || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6169  {
6170   return FALSE;
6171  }
6172
6173  /* copy the information about the longest from the reg_scan_data
6174   over to the program. */
6175  if (SvUTF8(sv_longest)) {
6176   *rx_utf8 = sv_longest;
6177   *rx_substr = NULL;
6178  } else {
6179   *rx_substr = sv_longest;
6180   *rx_utf8 = NULL;
6181  }
6182  /* end_shift is how many chars that must be matched that
6183   follow this item. We calculate it ahead of time as once the
6184   lookbehind offset is added in we lose the ability to correctly
6185   calculate it.*/
6186  ml = minlen ? *(minlen) : (SSize_t)longest_length;
6187  *rx_end_shift = ml - offset
6188   - longest_length + (SvTAIL(sv_longest) != 0)
6189   + lookbehind;
6190
6191  t = (eol/* Can't have SEOL and MULTI */
6192   && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6193  fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6194
6195  return TRUE;
6196 }
6197
6198 /*
6199  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6200  * regular expression into internal code.
6201  * The pattern may be passed either as:
6202  *    a list of SVs (patternp plus pat_count)
6203  *    a list of OPs (expr)
6204  * If both are passed, the SV list is used, but the OP list indicates
6205  * which SVs are actually pre-compiled code blocks
6206  *
6207  * The SVs in the list have magic and qr overloading applied to them (and
6208  * the list may be modified in-place with replacement SVs in the latter
6209  * case).
6210  *
6211  * If the pattern hasn't changed from old_re, then old_re will be
6212  * returned.
6213  *
6214  * eng is the current engine. If that engine has an op_comp method, then
6215  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6216  * do the initial concatenation of arguments and pass on to the external
6217  * engine.
6218  *
6219  * If is_bare_re is not null, set it to a boolean indicating whether the
6220  * arg list reduced (after overloading) to a single bare regex which has
6221  * been returned (i.e. /$qr/).
6222  *
6223  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6224  *
6225  * pm_flags contains the PMf_* flags, typically based on those from the
6226  * pm_flags field of the related PMOP. Currently we're only interested in
6227  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6228  *
6229  * We can't allocate space until we know how big the compiled form will be,
6230  * but we can't compile it (and thus know how big it is) until we've got a
6231  * place to put the code.  So we cheat:  we compile it twice, once with code
6232  * generation turned off and size counting turned on, and once "for real".
6233  * This also means that we don't allocate space until we are sure that the
6234  * thing really will compile successfully, and we never have to move the
6235  * code and thus invalidate pointers into it.  (Note that it has to be in
6236  * one piece because free() must be able to free it all.) [NB: not true in perl]
6237  *
6238  * Beware that the optimization-preparation code in here knows about some
6239  * of the structure of the compiled regexp.  [I'll say.]
6240  */
6241
6242 REGEXP *
6243 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6244      OP *expr, const regexp_engine* eng, REGEXP *old_re,
6245      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6246 {
6247  dVAR;
6248  REGEXP *rx;
6249  struct regexp *r;
6250  regexp_internal *ri;
6251  STRLEN plen;
6252  char *exp;
6253  regnode *scan;
6254  I32 flags;
6255  SSize_t minlen = 0;
6256  U32 rx_flags;
6257  SV *pat;
6258  SV *code_blocksv = NULL;
6259  SV** new_patternp = patternp;
6260
6261  /* these are all flags - maybe they should be turned
6262  * into a single int with different bit masks */
6263  I32 sawlookahead = 0;
6264  I32 sawplus = 0;
6265  I32 sawopen = 0;
6266  I32 sawminmod = 0;
6267
6268  regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6269  bool recompile = 0;
6270  bool runtime_code = 0;
6271  scan_data_t data;
6272  RExC_state_t RExC_state;
6273  RExC_state_t * const pRExC_state = &RExC_state;
6274 #ifdef TRIE_STUDY_OPT
6275  int restudied = 0;
6276  RExC_state_t copyRExC_state;
6277 #endif
6278  GET_RE_DEBUG_FLAGS_DECL;
6279
6280  PERL_ARGS_ASSERT_RE_OP_COMPILE;
6281
6282  DEBUG_r(if (!PL_colorset) reginitcolors());
6283
6284 #ifndef PERL_IN_XSUB_RE
6285  /* Initialize these here instead of as-needed, as is quick and avoids
6286  * having to test them each time otherwise */
6287  if (! PL_AboveLatin1) {
6288   PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6289   PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6290   PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6291   PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6292   PL_HasMultiCharFold =
6293      _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6294  }
6295 #endif
6296
6297  pRExC_state->code_blocks = NULL;
6298  pRExC_state->num_code_blocks = 0;
6299
6300  if (is_bare_re)
6301   *is_bare_re = FALSE;
6302
6303  if (expr && (expr->op_type == OP_LIST ||
6304     (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6305   /* allocate code_blocks if needed */
6306   OP *o;
6307   int ncode = 0;
6308
6309   for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6310    if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6311     ncode++; /* count of DO blocks */
6312   if (ncode) {
6313    pRExC_state->num_code_blocks = ncode;
6314    Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6315   }
6316  }
6317
6318  if (!pat_count) {
6319   /* compile-time pattern with just OP_CONSTs and DO blocks */
6320
6321   int n;
6322   OP *o;
6323
6324   /* find how many CONSTs there are */
6325   assert(expr);
6326   n = 0;
6327   if (expr->op_type == OP_CONST)
6328    n = 1;
6329   else
6330    for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6331     if (o->op_type == OP_CONST)
6332      n++;
6333    }
6334
6335   /* fake up an SV array */
6336
6337   assert(!new_patternp);
6338   Newx(new_patternp, n, SV*);
6339   SAVEFREEPV(new_patternp);
6340   pat_count = n;
6341
6342   n = 0;
6343   if (expr->op_type == OP_CONST)
6344    new_patternp[n] = cSVOPx_sv(expr);
6345   else
6346    for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6347     if (o->op_type == OP_CONST)
6348      new_patternp[n++] = cSVOPo_sv;
6349    }
6350
6351  }
6352
6353  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6354   "Assembling pattern from %d elements%s\n", pat_count,
6355    orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6356
6357  /* set expr to the first arg op */
6358
6359  if (pRExC_state->num_code_blocks
6360   && expr->op_type != OP_CONST)
6361  {
6362    expr = cLISTOPx(expr)->op_first;
6363    assert(   expr->op_type == OP_PUSHMARK
6364     || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6365     || expr->op_type == OP_PADRANGE);
6366    expr = expr->op_sibling;
6367  }
6368
6369  pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6370       expr, &recompile, NULL);
6371
6372  /* handle bare (possibly after overloading) regex: foo =~ $re */
6373  {
6374   SV *re = pat;
6375   if (SvROK(re))
6376    re = SvRV(re);
6377   if (SvTYPE(re) == SVt_REGEXP) {
6378    if (is_bare_re)
6379     *is_bare_re = TRUE;
6380    SvREFCNT_inc(re);
6381    Safefree(pRExC_state->code_blocks);
6382    DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6383     "Precompiled pattern%s\n",
6384      orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6385
6386    return (REGEXP*)re;
6387   }
6388  }
6389
6390  exp = SvPV_nomg(pat, plen);
6391
6392  if (!eng->op_comp) {
6393   if ((SvUTF8(pat) && IN_BYTES)
6394     || SvGMAGICAL(pat) || SvAMAGIC(pat))
6395   {
6396    /* make a temporary copy; either to convert to bytes,
6397    * or to avoid repeating get-magic / overloaded stringify */
6398    pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6399           (IN_BYTES ? 0 : SvUTF8(pat)));
6400   }
6401   Safefree(pRExC_state->code_blocks);
6402   return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6403  }
6404
6405  /* ignore the utf8ness if the pattern is 0 length */
6406  RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6407  RExC_uni_semantics = 0;
6408  RExC_contains_locale = 0;
6409  RExC_contains_i = 0;
6410  pRExC_state->runtime_code_qr = NULL;
6411
6412  DEBUG_COMPILE_r({
6413    SV *dsv= sv_newmortal();
6414    RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6415    PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6416       PL_colors[4],PL_colors[5],s);
6417   });
6418
6419   redo_first_pass:
6420  /* we jump here if we upgrade the pattern to utf8 and have to
6421  * recompile */
6422
6423  if ((pm_flags & PMf_USE_RE_EVAL)
6424     /* this second condition covers the non-regex literal case,
6425     * i.e.  $foo =~ '(?{})'. */
6426     || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6427  )
6428   runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6429
6430  /* return old regex if pattern hasn't changed */
6431  /* XXX: note in the below we have to check the flags as well as the
6432  * pattern.
6433  *
6434  * Things get a touch tricky as we have to compare the utf8 flag
6435  * independently from the compile flags.  */
6436
6437  if (   old_re
6438   && !recompile
6439   && !!RX_UTF8(old_re) == !!RExC_utf8
6440   && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6441   && RX_PRECOMP(old_re)
6442   && RX_PRELEN(old_re) == plen
6443   && memEQ(RX_PRECOMP(old_re), exp, plen)
6444   && !runtime_code /* with runtime code, always recompile */ )
6445  {
6446   Safefree(pRExC_state->code_blocks);
6447   return old_re;
6448  }
6449
6450  rx_flags = orig_rx_flags;
6451
6452  if (rx_flags & PMf_FOLD) {
6453   RExC_contains_i = 1;
6454  }
6455  if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6456
6457   /* Set to use unicode semantics if the pattern is in utf8 and has the
6458   * 'depends' charset specified, as it means unicode when utf8  */
6459   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6460  }
6461
6462  RExC_precomp = exp;
6463  RExC_flags = rx_flags;
6464  RExC_pm_flags = pm_flags;
6465
6466  if (runtime_code) {
6467   if (TAINTING_get && TAINT_get)
6468    Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6469
6470   if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6471    /* whoops, we have a non-utf8 pattern, whilst run-time code
6472    * got compiled as utf8. Try again with a utf8 pattern */
6473    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6474          pRExC_state->num_code_blocks);
6475    goto redo_first_pass;
6476   }
6477  }
6478  assert(!pRExC_state->runtime_code_qr);
6479
6480  RExC_sawback = 0;
6481
6482  RExC_seen = 0;
6483  RExC_maxlen = 0;
6484  RExC_in_lookbehind = 0;
6485  RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6486  RExC_extralen = 0;
6487  RExC_override_recoding = 0;
6488  RExC_in_multi_char_class = 0;
6489
6490  /* First pass: determine size, legality. */
6491  RExC_parse = exp;
6492  RExC_start = exp;
6493  RExC_end = exp + plen;
6494  RExC_naughty = 0;
6495  RExC_npar = 1;
6496  RExC_nestroot = 0;
6497  RExC_size = 0L;
6498  RExC_emit = (regnode *) &RExC_emit_dummy;
6499  RExC_whilem_seen = 0;
6500  RExC_open_parens = NULL;
6501  RExC_close_parens = NULL;
6502  RExC_opend = NULL;
6503  RExC_paren_names = NULL;
6504 #ifdef DEBUGGING
6505  RExC_paren_name_list = NULL;
6506 #endif
6507  RExC_recurse = NULL;
6508  RExC_study_chunk_recursed = NULL;
6509  RExC_study_chunk_recursed_bytes= 0;
6510  RExC_recurse_count = 0;
6511  pRExC_state->code_index = 0;
6512
6513 #if 0 /* REGC() is (currently) a NOP at the first pass.
6514  * Clever compilers notice this and complain. --jhi */
6515  REGC((U8)REG_MAGIC, (char*)RExC_emit);
6516 #endif
6517  DEBUG_PARSE_r(
6518   PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6519   RExC_lastnum=0;
6520   RExC_lastparse=NULL;
6521  );
6522  /* reg may croak on us, not giving us a chance to free
6523  pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6524  need it to survive as long as the regexp (qr/(?{})/).
6525  We must check that code_blocksv is not already set, because we may
6526  have jumped back to restart the sizing pass. */
6527  if (pRExC_state->code_blocks && !code_blocksv) {
6528   code_blocksv = newSV_type(SVt_PV);
6529   SAVEFREESV(code_blocksv);
6530   SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6531   SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6532  }
6533  if (reg(pRExC_state, 0, &flags,1) == NULL) {
6534   /* It's possible to write a regexp in ascii that represents Unicode
6535   codepoints outside of the byte range, such as via \x{100}. If we
6536   detect such a sequence we have to convert the entire pattern to utf8
6537   and then recompile, as our sizing calculation will have been based
6538   on 1 byte == 1 character, but we will need to use utf8 to encode
6539   at least some part of the pattern, and therefore must convert the whole
6540   thing.
6541   -- dmq */
6542   if (flags & RESTART_UTF8) {
6543    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6544          pRExC_state->num_code_blocks);
6545    goto redo_first_pass;
6546   }
6547   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6548  }
6549  if (code_blocksv)
6550   SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6551
6552  DEBUG_PARSE_r({
6553   PerlIO_printf(Perl_debug_log,
6554    "Required size %"IVdf" nodes\n"
6555    "Starting second pass (creation)\n",
6556    (IV)RExC_size);
6557   RExC_lastnum=0;
6558   RExC_lastparse=NULL;
6559  });
6560
6561  /* The first pass could have found things that force Unicode semantics */
6562  if ((RExC_utf8 || RExC_uni_semantics)
6563   && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6564  {
6565   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6566  }
6567
6568  /* Small enough for pointer-storage convention?
6569  If extralen==0, this means that we will not need long jumps. */
6570  if (RExC_size >= 0x10000L && RExC_extralen)
6571   RExC_size += RExC_extralen;
6572  else
6573   RExC_extralen = 0;
6574  if (RExC_whilem_seen > 15)
6575   RExC_whilem_seen = 15;
6576
6577  /* Allocate space and zero-initialize. Note, the two step process
6578  of zeroing when in debug mode, thus anything assigned has to
6579  happen after that */
6580  rx = (REGEXP*) newSV_type(SVt_REGEXP);
6581  r = ReANY(rx);
6582  Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6583   char, regexp_internal);
6584  if ( r == NULL || ri == NULL )
6585   FAIL("Regexp out of space");
6586 #ifdef DEBUGGING
6587  /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6588  Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6589   char);
6590 #else
6591  /* bulk initialize base fields with 0. */
6592  Zero(ri, sizeof(regexp_internal), char);
6593 #endif
6594
6595  /* non-zero initialization begins here */
6596  RXi_SET( r, ri );
6597  r->engine= eng;
6598  r->extflags = rx_flags;
6599  RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6600
6601  if (pm_flags & PMf_IS_QR) {
6602   ri->code_blocks = pRExC_state->code_blocks;
6603   ri->num_code_blocks = pRExC_state->num_code_blocks;
6604  }
6605  else
6606  {
6607   int n;
6608   for (n = 0; n < pRExC_state->num_code_blocks; n++)
6609    if (pRExC_state->code_blocks[n].src_regex)
6610     SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6611   SAVEFREEPV(pRExC_state->code_blocks);
6612  }
6613
6614  {
6615   bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6616   bool has_charset = (get_regex_charset(r->extflags)
6617              != REGEX_DEPENDS_CHARSET);
6618
6619   /* The caret is output if there are any defaults: if not all the STD
6620   * flags are set, or if no character set specifier is needed */
6621   bool has_default =
6622      (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6623      || ! has_charset);
6624   bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6625             == REG_RUN_ON_COMMENT_SEEN);
6626   U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6627        >> RXf_PMf_STD_PMMOD_SHIFT);
6628   const char *fptr = STD_PAT_MODS;        /*"msix"*/
6629   char *p;
6630   /* Allocate for the worst case, which is all the std flags are turned
6631   * on.  If more precision is desired, we could do a population count of
6632   * the flags set.  This could be done with a small lookup table, or by
6633   * shifting, masking and adding, or even, when available, assembly
6634   * language for a machine-language population count.
6635   * We never output a minus, as all those are defaults, so are
6636   * covered by the caret */
6637   const STRLEN wraplen = plen + has_p + has_runon
6638    + has_default       /* If needs a caret */
6639
6640     /* If needs a character set specifier */
6641    + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6642    + (sizeof(STD_PAT_MODS) - 1)
6643    + (sizeof("(?:)") - 1);
6644
6645   Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6646   r->xpv_len_u.xpvlenu_pv = p;
6647   if (RExC_utf8)
6648    SvFLAGS(rx) |= SVf_UTF8;
6649   *p++='('; *p++='?';
6650
6651   /* If a default, cover it using the caret */
6652   if (has_default) {
6653    *p++= DEFAULT_PAT_MOD;
6654   }
6655   if (has_charset) {
6656    STRLEN len;
6657    const char* const name = get_regex_charset_name(r->extflags, &len);
6658    Copy(name, p, len, char);
6659    p += len;
6660   }
6661   if (has_p)
6662    *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6663   {
6664    char ch;
6665    while((ch = *fptr++)) {
6666     if(reganch & 1)
6667      *p++ = ch;
6668     reganch >>= 1;
6669    }
6670   }
6671
6672   *p++ = ':';
6673   Copy(RExC_precomp, p, plen, char);
6674   assert ((RX_WRAPPED(rx) - p) < 16);
6675   r->pre_prefix = p - RX_WRAPPED(rx);
6676   p += plen;
6677   if (has_runon)
6678    *p++ = '\n';
6679   *p++ = ')';
6680   *p = 0;
6681   SvCUR_set(rx, p - RX_WRAPPED(rx));
6682  }
6683
6684  r->intflags = 0;
6685  r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6686
6687  /* setup various meta data about recursion, this all requires
6688  * RExC_npar to be correctly set, and a bit later on we clear it */
6689  if (RExC_seen & REG_RECURSE_SEEN) {
6690   Newxz(RExC_open_parens, RExC_npar,regnode *);
6691   SAVEFREEPV(RExC_open_parens);
6692   Newxz(RExC_close_parens,RExC_npar,regnode *);
6693   SAVEFREEPV(RExC_close_parens);
6694  }
6695  if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6696   /* Note, RExC_npar is 1 + the number of parens in a pattern.
6697   * So its 1 if there are no parens. */
6698   RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6699           ((RExC_npar & 0x07) != 0);
6700   Newx(RExC_study_chunk_recursed,
6701    RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6702   SAVEFREEPV(RExC_study_chunk_recursed);
6703  }
6704
6705  /* Useful during FAIL. */
6706 #ifdef RE_TRACK_PATTERN_OFFSETS
6707  Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6708  DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6709       "%s %"UVuf" bytes for offset annotations.\n",
6710       ri->u.offsets ? "Got" : "Couldn't get",
6711       (UV)((2*RExC_size+1) * sizeof(U32))));
6712 #endif
6713  SetProgLen(ri,RExC_size);
6714  RExC_rx_sv = rx;
6715  RExC_rx = r;
6716  RExC_rxi = ri;
6717  REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
6718
6719  /* Second pass: emit code. */
6720  RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6721  RExC_pm_flags = pm_flags;
6722  RExC_parse = exp;
6723  RExC_end = exp + plen;
6724  RExC_naughty = 0;
6725  RExC_npar = 1;
6726  RExC_emit_start = ri->program;
6727  RExC_emit = ri->program;
6728  RExC_emit_bound = ri->program + RExC_size + 1;
6729  pRExC_state->code_index = 0;
6730
6731  REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6732  if (reg(pRExC_state, 0, &flags,1) == NULL) {
6733   ReREFCNT_dec(rx);
6734   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6735  }
6736  /* XXXX To minimize changes to RE engine we always allocate
6737  3-units-long substrs field. */
6738  Newx(r->substrs, 1, struct reg_substr_data);
6739  if (RExC_recurse_count) {
6740   Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6741   SAVEFREEPV(RExC_recurse);
6742  }
6743
6744 reStudy:
6745  r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6746  Zero(r->substrs, 1, struct reg_substr_data);
6747  if (RExC_study_chunk_recursed)
6748   Zero(RExC_study_chunk_recursed,
6749    RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6750
6751 #ifdef TRIE_STUDY_OPT
6752  if (!restudied) {
6753   StructCopy(&zero_scan_data, &data, scan_data_t);
6754   copyRExC_state = RExC_state;
6755  } else {
6756   U32 seen=RExC_seen;
6757   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6758
6759   RExC_state = copyRExC_state;
6760   if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6761    RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6762   else
6763    RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6764   StructCopy(&zero_scan_data, &data, scan_data_t);
6765  }
6766 #else
6767  StructCopy(&zero_scan_data, &data, scan_data_t);
6768 #endif
6769
6770  /* Dig out information for optimizations. */
6771  r->extflags = RExC_flags; /* was pm_op */
6772  /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6773
6774  if (UTF)
6775   SvUTF8_on(rx); /* Unicode in it? */
6776  ri->regstclass = NULL;
6777  if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6778   r->intflags |= PREGf_NAUGHTY;
6779  scan = ri->program + 1;  /* First BRANCH. */
6780
6781  /* testing for BRANCH here tells us whether there is "must appear"
6782  data in the pattern. If there is then we can use it for optimisations */
6783  if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
6784             */
6785   SSize_t fake;
6786   STRLEN longest_float_length, longest_fixed_length;
6787   regnode_ssc ch_class; /* pointed to by data */
6788   int stclass_flag;
6789   SSize_t last_close = 0; /* pointed to by data */
6790   regnode *first= scan;
6791   regnode *first_next= regnext(first);
6792   /*
6793   * Skip introductions and multiplicators >= 1
6794   * so that we can extract the 'meat' of the pattern that must
6795   * match in the large if() sequence following.
6796   * NOTE that EXACT is NOT covered here, as it is normally
6797   * picked up by the optimiser separately.
6798   *
6799   * This is unfortunate as the optimiser isnt handling lookahead
6800   * properly currently.
6801   *
6802   */
6803   while ((OP(first) == OPEN && (sawopen = 1)) ||
6804    /* An OR of *one* alternative - should not happen now. */
6805    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6806    /* for now we can't handle lookbehind IFMATCH*/
6807    (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6808    (OP(first) == PLUS) ||
6809    (OP(first) == MINMOD) ||
6810    /* An {n,m} with n>0 */
6811    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6812    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6813   {
6814     /*
6815     * the only op that could be a regnode is PLUS, all the rest
6816     * will be regnode_1 or regnode_2.
6817     *
6818     * (yves doesn't think this is true)
6819     */
6820     if (OP(first) == PLUS)
6821      sawplus = 1;
6822     else {
6823      if (OP(first) == MINMOD)
6824       sawminmod = 1;
6825      first += regarglen[OP(first)];
6826     }
6827     first = NEXTOPER(first);
6828     first_next= regnext(first);
6829   }
6830
6831   /* Starting-point info. */
6832  again:
6833   DEBUG_PEEP("first:",first,0);
6834   /* Ignore EXACT as we deal with it later. */
6835   if (PL_regkind[OP(first)] == EXACT) {
6836    if (OP(first) == EXACT)
6837     NOOP; /* Empty, get anchored substr later. */
6838    else
6839     ri->regstclass = first;
6840   }
6841 #ifdef TRIE_STCLASS
6842   else if (PL_regkind[OP(first)] == TRIE &&
6843     ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6844   {
6845    /* this can happen only on restudy */
6846    ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
6847   }
6848 #endif
6849   else if (REGNODE_SIMPLE(OP(first)))
6850    ri->regstclass = first;
6851   else if (PL_regkind[OP(first)] == BOUND ||
6852     PL_regkind[OP(first)] == NBOUND)
6853    ri->regstclass = first;
6854   else if (PL_regkind[OP(first)] == BOL) {
6855    r->intflags |= (OP(first) == MBOL
6856       ? PREGf_ANCH_MBOL
6857       : (OP(first) == SBOL
6858        ? PREGf_ANCH_SBOL
6859        : PREGf_ANCH_BOL));
6860    first = NEXTOPER(first);
6861    goto again;
6862   }
6863   else if (OP(first) == GPOS) {
6864    r->intflags |= PREGf_ANCH_GPOS;
6865    first = NEXTOPER(first);
6866    goto again;
6867   }
6868   else if ((!sawopen || !RExC_sawback) &&
6869    (OP(first) == STAR &&
6870    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6871    !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
6872   {
6873    /* turn .* into ^.* with an implied $*=1 */
6874    const int type =
6875     (OP(NEXTOPER(first)) == REG_ANY)
6876      ? PREGf_ANCH_MBOL
6877      : PREGf_ANCH_SBOL;
6878    r->intflags |= (type | PREGf_IMPLICIT);
6879    first = NEXTOPER(first);
6880    goto again;
6881   }
6882   if (sawplus && !sawminmod && !sawlookahead
6883    && (!sawopen || !RExC_sawback)
6884    && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6885    /* x+ must match at the 1st pos of run of x's */
6886    r->intflags |= PREGf_SKIP;
6887
6888   /* Scan is after the zeroth branch, first is atomic matcher. */
6889 #ifdef TRIE_STUDY_OPT
6890   DEBUG_PARSE_r(
6891    if (!restudied)
6892     PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6893        (IV)(first - scan + 1))
6894   );
6895 #else
6896   DEBUG_PARSE_r(
6897    PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6898     (IV)(first - scan + 1))
6899   );
6900 #endif
6901
6902
6903   /*
6904   * If there's something expensive in the r.e., find the
6905   * longest literal string that must appear and make it the
6906   * regmust.  Resolve ties in favor of later strings, since
6907   * the regstart check works with the beginning of the r.e.
6908   * and avoiding duplication strengthens checking.  Not a
6909   * strong reason, but sufficient in the absence of others.
6910   * [Now we resolve ties in favor of the earlier string if
6911   * it happens that c_offset_min has been invalidated, since the
6912   * earlier string may buy us something the later one won't.]
6913   */
6914
6915   data.longest_fixed = newSVpvs("");
6916   data.longest_float = newSVpvs("");
6917   data.last_found = newSVpvs("");
6918   data.longest = &(data.longest_fixed);
6919   ENTER_with_name("study_chunk");
6920   SAVEFREESV(data.longest_fixed);
6921   SAVEFREESV(data.longest_float);
6922   SAVEFREESV(data.last_found);
6923   first = scan;
6924   if (!ri->regstclass) {
6925    ssc_init(pRExC_state, &ch_class);
6926    data.start_class = &ch_class;
6927    stclass_flag = SCF_DO_STCLASS_AND;
6928   } else    /* XXXX Check for BOUND? */
6929    stclass_flag = 0;
6930   data.last_closep = &last_close;
6931
6932   DEBUG_RExC_seen();
6933   minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6934        scan + RExC_size, /* Up to end */
6935    &data, -1, 0, NULL,
6936    SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6937       | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6938    0);
6939
6940
6941   CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6942
6943
6944   if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6945    && data.last_start_min == 0 && data.last_end > 0
6946    && !RExC_seen_zerolen
6947    && !(RExC_seen & REG_VERBARG_SEEN)
6948    && !(RExC_seen & REG_GPOS_SEEN)
6949   ){
6950    r->extflags |= RXf_CHECK_ALL;
6951   }
6952   scan_commit(pRExC_state, &data,&minlen,0);
6953
6954   longest_float_length = CHR_SVLEN(data.longest_float);
6955
6956   if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6957     && data.offset_fixed == data.offset_float_min
6958     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6959    && S_setup_longest (aTHX_ pRExC_state,
6960          data.longest_float,
6961          &(r->float_utf8),
6962          &(r->float_substr),
6963          &(r->float_end_shift),
6964          data.lookbehind_float,
6965          data.offset_float_min,
6966          data.minlen_float,
6967          longest_float_length,
6968          cBOOL(data.flags & SF_FL_BEFORE_EOL),
6969          cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6970   {
6971    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6972    r->float_max_offset = data.offset_float_max;
6973    if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6974     r->float_max_offset -= data.lookbehind_float;
6975    SvREFCNT_inc_simple_void_NN(data.longest_float);
6976   }
6977   else {
6978    r->float_substr = r->float_utf8 = NULL;
6979    longest_float_length = 0;
6980   }
6981
6982   longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6983
6984   if (S_setup_longest (aTHX_ pRExC_state,
6985         data.longest_fixed,
6986         &(r->anchored_utf8),
6987         &(r->anchored_substr),
6988         &(r->anchored_end_shift),
6989         data.lookbehind_fixed,
6990         data.offset_fixed,
6991         data.minlen_fixed,
6992         longest_fixed_length,
6993         cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6994         cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6995   {
6996    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6997    SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6998   }
6999   else {
7000    r->anchored_substr = r->anchored_utf8 = NULL;
7001    longest_fixed_length = 0;
7002   }
7003   LEAVE_with_name("study_chunk");
7004
7005   if (ri->regstclass
7006    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7007    ri->regstclass = NULL;
7008
7009   if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7010    && stclass_flag
7011    && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7012    && !ssc_is_anything(data.start_class))
7013   {
7014    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7015
7016    ssc_finalize(pRExC_state, data.start_class);
7017
7018    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7019    StructCopy(data.start_class,
7020      (regnode_ssc*)RExC_rxi->data->data[n],
7021      regnode_ssc);
7022    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7023    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7024    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7025      regprop(r, sv, (regnode*)data.start_class, NULL);
7026      PerlIO_printf(Perl_debug_log,
7027          "synthetic stclass \"%s\".\n",
7028          SvPVX_const(sv));});
7029    data.start_class = NULL;
7030   }
7031
7032   /* A temporary algorithm prefers floated substr to fixed one to dig
7033   * more info. */
7034   if (longest_fixed_length > longest_float_length) {
7035    r->substrs->check_ix = 0;
7036    r->check_end_shift = r->anchored_end_shift;
7037    r->check_substr = r->anchored_substr;
7038    r->check_utf8 = r->anchored_utf8;
7039    r->check_offset_min = r->check_offset_max = r->anchored_offset;
7040    if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7041     r->intflags |= PREGf_NOSCAN;
7042   }
7043   else {
7044    r->substrs->check_ix = 1;
7045    r->check_end_shift = r->float_end_shift;
7046    r->check_substr = r->float_substr;
7047    r->check_utf8 = r->float_utf8;
7048    r->check_offset_min = r->float_min_offset;
7049    r->check_offset_max = r->float_max_offset;
7050   }
7051   if ((r->check_substr || r->check_utf8) ) {
7052    r->extflags |= RXf_USE_INTUIT;
7053    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7054     r->extflags |= RXf_INTUIT_TAIL;
7055   }
7056   r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7057
7058   /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7059   if ( (STRLEN)minlen < longest_float_length )
7060    minlen= longest_float_length;
7061   if ( (STRLEN)minlen < longest_fixed_length )
7062    minlen= longest_fixed_length;
7063   */
7064  }
7065  else {
7066   /* Several toplevels. Best we can is to set minlen. */
7067   SSize_t fake;
7068   regnode_ssc ch_class;
7069   SSize_t last_close = 0;
7070
7071   DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7072
7073   scan = ri->program + 1;
7074   ssc_init(pRExC_state, &ch_class);
7075   data.start_class = &ch_class;
7076   data.last_closep = &last_close;
7077
7078   DEBUG_RExC_seen();
7079   minlen = study_chunk(pRExC_state,
7080    &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7081    SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7082              ? SCF_TRIE_DOING_RESTUDY
7083              : 0),
7084    0);
7085
7086   CHECK_RESTUDY_GOTO_butfirst(NOOP);
7087
7088   r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7089     = r->float_substr = r->float_utf8 = NULL;
7090
7091   if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7092    && ! ssc_is_anything(data.start_class))
7093   {
7094    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7095
7096    ssc_finalize(pRExC_state, data.start_class);
7097
7098    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7099    StructCopy(data.start_class,
7100      (regnode_ssc*)RExC_rxi->data->data[n],
7101      regnode_ssc);
7102    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7103    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7104    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7105      regprop(r, sv, (regnode*)data.start_class, NULL);
7106      PerlIO_printf(Perl_debug_log,
7107          "synthetic stclass \"%s\".\n",
7108          SvPVX_const(sv));});
7109    data.start_class = NULL;
7110   }
7111  }
7112
7113  if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7114   r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7115   r->maxlen = REG_INFTY;
7116  }
7117  else {
7118   r->maxlen = RExC_maxlen;
7119  }
7120
7121  /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7122  the "real" pattern. */
7123  DEBUG_OPTIMISE_r({
7124   PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
7125      (IV)minlen, (IV)r->minlen, RExC_maxlen);
7126  });
7127  r->minlenret = minlen;
7128  if (r->minlen < minlen)
7129   r->minlen = minlen;
7130
7131  if (RExC_seen & REG_GPOS_SEEN)
7132   r->intflags |= PREGf_GPOS_SEEN;
7133  if (RExC_seen & REG_LOOKBEHIND_SEEN)
7134   r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7135             lookbehind */
7136  if (pRExC_state->num_code_blocks)
7137   r->extflags |= RXf_EVAL_SEEN;
7138  if (RExC_seen & REG_CANY_SEEN)
7139   r->intflags |= PREGf_CANY_SEEN;
7140  if (RExC_seen & REG_VERBARG_SEEN)
7141  {
7142   r->intflags |= PREGf_VERBARG_SEEN;
7143   r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7144  }
7145  if (RExC_seen & REG_CUTGROUP_SEEN)
7146   r->intflags |= PREGf_CUTGROUP_SEEN;
7147  if (pm_flags & PMf_USE_RE_EVAL)
7148   r->intflags |= PREGf_USE_RE_EVAL;
7149  if (RExC_paren_names)
7150   RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7151  else
7152   RXp_PAREN_NAMES(r) = NULL;
7153
7154  /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7155  * so it can be used in pp.c */
7156  if (r->intflags & PREGf_ANCH)
7157   r->extflags |= RXf_IS_ANCHORED;
7158
7159
7160  {
7161   /* this is used to identify "special" patterns that might result
7162   * in Perl NOT calling the regex engine and instead doing the match "itself",
7163   * particularly special cases in split//. By having the regex compiler
7164   * do this pattern matching at a regop level (instead of by inspecting the pattern)
7165   * we avoid weird issues with equivalent patterns resulting in different behavior,
7166   * AND we allow non Perl engines to get the same optimizations by the setting the
7167   * flags appropriately - Yves */
7168   regnode *first = ri->program + 1;
7169   U8 fop = OP(first);
7170   regnode *next = NEXTOPER(first);
7171   U8 nop = OP(next);
7172
7173   if (PL_regkind[fop] == NOTHING && nop == END)
7174    r->extflags |= RXf_NULL;
7175   else if (PL_regkind[fop] == BOL && nop == END)
7176    r->extflags |= RXf_START_ONLY;
7177   else if (fop == PLUS
7178     && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7179     && OP(regnext(first)) == END)
7180    r->extflags |= RXf_WHITE;
7181   else if ( r->extflags & RXf_SPLIT
7182     && fop == EXACT
7183     && STR_LEN(first) == 1
7184     && *(STRING(first)) == ' '
7185     && OP(regnext(first)) == END )
7186    r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7187
7188  }
7189
7190  if (RExC_contains_locale) {
7191   RXp_EXTFLAGS(r) |= RXf_TAINTED;
7192  }
7193
7194 #ifdef DEBUGGING
7195  if (RExC_paren_names) {
7196   ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7197   ri->data->data[ri->name_list_idx]
7198         = (void*)SvREFCNT_inc(RExC_paren_name_list);
7199  } else
7200 #endif
7201   ri->name_list_idx = 0;
7202
7203  if (RExC_recurse_count) {
7204   for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7205    const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7206    ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7207   }
7208  }
7209  Newxz(r->offs, RExC_npar, regexp_paren_pair);
7210  /* assume we don't need to swap parens around before we match */
7211
7212  DEBUG_DUMP_r({
7213   DEBUG_RExC_seen();
7214   PerlIO_printf(Perl_debug_log,"Final program:\n");
7215   regdump(r);
7216  });
7217 #ifdef RE_TRACK_PATTERN_OFFSETS
7218  DEBUG_OFFSETS_r(if (ri->u.offsets) {
7219   const STRLEN len = ri->u.offsets[0];
7220   STRLEN i;
7221   GET_RE_DEBUG_FLAGS_DECL;
7222   PerlIO_printf(Perl_debug_log,
7223      "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7224   for (i = 1; i <= len; i++) {
7225    if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7226     PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7227     (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7228    }
7229   PerlIO_printf(Perl_debug_log, "\n");
7230  });
7231 #endif
7232
7233 #ifdef USE_ITHREADS
7234  /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7235  * by setting the regexp SV to readonly-only instead. If the
7236  * pattern's been recompiled, the USEDness should remain. */
7237  if (old_re && SvREADONLY(old_re))
7238   SvREADONLY_on(rx);
7239 #endif
7240  return rx;
7241 }
7242
7243
7244 SV*
7245 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7246      const U32 flags)
7247 {
7248  PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7249
7250  PERL_UNUSED_ARG(value);
7251
7252  if (flags & RXapif_FETCH) {
7253   return reg_named_buff_fetch(rx, key, flags);
7254  } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7255   Perl_croak_no_modify();
7256   return NULL;
7257  } else if (flags & RXapif_EXISTS) {
7258   return reg_named_buff_exists(rx, key, flags)
7259    ? &PL_sv_yes
7260    : &PL_sv_no;
7261  } else if (flags & RXapif_REGNAMES) {
7262   return reg_named_buff_all(rx, flags);
7263  } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7264   return reg_named_buff_scalar(rx, flags);
7265  } else {
7266   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7267   return NULL;
7268  }
7269 }
7270
7271 SV*
7272 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7273       const U32 flags)
7274 {
7275  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7276  PERL_UNUSED_ARG(lastkey);
7277
7278  if (flags & RXapif_FIRSTKEY)
7279   return reg_named_buff_firstkey(rx, flags);
7280  else if (flags & RXapif_NEXTKEY)
7281   return reg_named_buff_nextkey(rx, flags);
7282  else {
7283   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7284            (int)flags);
7285   return NULL;
7286  }
7287 }
7288
7289 SV*
7290 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7291       const U32 flags)
7292 {
7293  AV *retarray = NULL;
7294  SV *ret;
7295  struct regexp *const rx = ReANY(r);
7296
7297  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7298
7299  if (flags & RXapif_ALL)
7300   retarray=newAV();
7301
7302  if (rx && RXp_PAREN_NAMES(rx)) {
7303   HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7304   if (he_str) {
7305    IV i;
7306    SV* sv_dat=HeVAL(he_str);
7307    I32 *nums=(I32*)SvPVX(sv_dat);
7308    for ( i=0; i<SvIVX(sv_dat); i++ ) {
7309     if ((I32)(rx->nparens) >= nums[i]
7310      && rx->offs[nums[i]].start != -1
7311      && rx->offs[nums[i]].end != -1)
7312     {
7313      ret = newSVpvs("");
7314      CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7315      if (!retarray)
7316       return ret;
7317     } else {
7318      if (retarray)
7319       ret = newSVsv(&PL_sv_undef);
7320     }
7321     if (retarray)
7322      av_push(retarray, ret);
7323    }
7324    if (retarray)
7325     return newRV_noinc(MUTABLE_SV(retarray));
7326   }
7327  }
7328  return NULL;
7329 }
7330
7331 bool
7332 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7333       const U32 flags)
7334 {
7335  struct regexp *const rx = ReANY(r);
7336
7337  PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7338
7339  if (rx && RXp_PAREN_NAMES(rx)) {
7340   if (flags & RXapif_ALL) {
7341    return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7342   } else {
7343    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7344    if (sv) {
7345     SvREFCNT_dec_NN(sv);
7346     return TRUE;
7347    } else {
7348     return FALSE;
7349    }
7350   }
7351  } else {
7352   return FALSE;
7353  }
7354 }
7355
7356 SV*
7357 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7358 {
7359  struct regexp *const rx = ReANY(r);
7360
7361  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7362
7363  if ( rx && RXp_PAREN_NAMES(rx) ) {
7364   (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7365
7366   return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7367  } else {
7368   return FALSE;
7369  }
7370 }
7371
7372 SV*
7373 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7374 {
7375  struct regexp *const rx = ReANY(r);
7376  GET_RE_DEBUG_FLAGS_DECL;
7377
7378  PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7379
7380  if (rx && RXp_PAREN_NAMES(rx)) {
7381   HV *hv = RXp_PAREN_NAMES(rx);
7382   HE *temphe;
7383   while ( (temphe = hv_iternext_flags(hv,0)) ) {
7384    IV i;
7385    IV parno = 0;
7386    SV* sv_dat = HeVAL(temphe);
7387    I32 *nums = (I32*)SvPVX(sv_dat);
7388    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7389     if ((I32)(rx->lastparen) >= nums[i] &&
7390      rx->offs[nums[i]].start != -1 &&
7391      rx->offs[nums[i]].end != -1)
7392     {
7393      parno = nums[i];
7394      break;
7395     }
7396    }
7397    if (parno || flags & RXapif_ALL) {
7398     return newSVhek(HeKEY_hek(temphe));
7399    }
7400   }
7401  }
7402  return NULL;
7403 }
7404
7405 SV*
7406 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7407 {
7408  SV *ret;
7409  AV *av;
7410  SSize_t length;
7411  struct regexp *const rx = ReANY(r);
7412
7413  PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7414
7415  if (rx && RXp_PAREN_NAMES(rx)) {
7416   if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7417    return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7418   } else if (flags & RXapif_ONE) {
7419    ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7420    av = MUTABLE_AV(SvRV(ret));
7421    length = av_tindex(av);
7422    SvREFCNT_dec_NN(ret);
7423    return newSViv(length + 1);
7424   } else {
7425    Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7426             (int)flags);
7427    return NULL;
7428   }
7429  }
7430  return &PL_sv_undef;
7431 }
7432
7433 SV*
7434 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7435 {
7436  struct regexp *const rx = ReANY(r);
7437  AV *av = newAV();
7438
7439  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7440
7441  if (rx && RXp_PAREN_NAMES(rx)) {
7442   HV *hv= RXp_PAREN_NAMES(rx);
7443   HE *temphe;
7444   (void)hv_iterinit(hv);
7445   while ( (temphe = hv_iternext_flags(hv,0)) ) {
7446    IV i;
7447    IV parno = 0;
7448    SV* sv_dat = HeVAL(temphe);
7449    I32 *nums = (I32*)SvPVX(sv_dat);
7450    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7451     if ((I32)(rx->lastparen) >= nums[i] &&
7452      rx->offs[nums[i]].start != -1 &&
7453      rx->offs[nums[i]].end != -1)
7454     {
7455      parno = nums[i];
7456      break;
7457     }
7458    }
7459    if (parno || flags & RXapif_ALL) {
7460     av_push(av, newSVhek(HeKEY_hek(temphe)));
7461    }
7462   }
7463  }
7464
7465  return newRV_noinc(MUTABLE_SV(av));
7466 }
7467
7468 void
7469 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7470        SV * const sv)
7471 {
7472  struct regexp *const rx = ReANY(r);
7473  char *s = NULL;
7474  SSize_t i = 0;
7475  SSize_t s1, t1;
7476  I32 n = paren;
7477
7478  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7479
7480  if (      n == RX_BUFF_IDX_CARET_PREMATCH
7481   || n == RX_BUFF_IDX_CARET_FULLMATCH
7482   || n == RX_BUFF_IDX_CARET_POSTMATCH
7483  )
7484  {
7485   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7486   if (!keepcopy) {
7487    /* on something like
7488    *    $r = qr/.../;
7489    *    /$qr/p;
7490    * the KEEPCOPY is set on the PMOP rather than the regex */
7491    if (PL_curpm && r == PM_GETRE(PL_curpm))
7492     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7493   }
7494   if (!keepcopy)
7495    goto ret_undef;
7496  }
7497
7498  if (!rx->subbeg)
7499   goto ret_undef;
7500
7501  if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7502   /* no need to distinguish between them any more */
7503   n = RX_BUFF_IDX_FULLMATCH;
7504
7505  if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7506   && rx->offs[0].start != -1)
7507  {
7508   /* $`, ${^PREMATCH} */
7509   i = rx->offs[0].start;
7510   s = rx->subbeg;
7511  }
7512  else
7513  if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7514   && rx->offs[0].end != -1)
7515  {
7516   /* $', ${^POSTMATCH} */
7517   s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7518   i = rx->sublen + rx->suboffset - rx->offs[0].end;
7519  }
7520  else
7521  if ( 0 <= n && n <= (I32)rx->nparens &&
7522   (s1 = rx->offs[n].start) != -1 &&
7523   (t1 = rx->offs[n].end) != -1)
7524  {
7525   /* $&, ${^MATCH},  $1 ... */
7526   i = t1 - s1;
7527   s = rx->subbeg + s1 - rx->suboffset;
7528  } else {
7529   goto ret_undef;
7530  }
7531
7532  assert(s >= rx->subbeg);
7533  assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7534  if (i >= 0) {
7535 #ifdef NO_TAINT_SUPPORT
7536   sv_setpvn(sv, s, i);
7537 #else
7538   const int oldtainted = TAINT_get;
7539   TAINT_NOT;
7540   sv_setpvn(sv, s, i);
7541   TAINT_set(oldtainted);
7542 #endif
7543   if ( (rx->intflags & PREGf_CANY_SEEN)
7544    ? (RXp_MATCH_UTF8(rx)
7545       && (!i || is_utf8_string((U8*)s, i)))
7546    : (RXp_MATCH_UTF8(rx)) )
7547   {
7548    SvUTF8_on(sv);
7549   }
7550   else
7551    SvUTF8_off(sv);
7552   if (TAINTING_get) {
7553    if (RXp_MATCH_TAINTED(rx)) {
7554     if (SvTYPE(sv) >= SVt_PVMG) {
7555      MAGIC* const mg = SvMAGIC(sv);
7556      MAGIC* mgt;
7557      TAINT;
7558      SvMAGIC_set(sv, mg->mg_moremagic);
7559      SvTAINT(sv);
7560      if ((mgt = SvMAGIC(sv))) {
7561       mg->mg_moremagic = mgt;
7562       SvMAGIC_set(sv, mg);
7563      }
7564     } else {
7565      TAINT;
7566      SvTAINT(sv);
7567     }
7568    } else
7569     SvTAINTED_off(sv);
7570   }
7571  } else {
7572  ret_undef:
7573   sv_setsv(sv,&PL_sv_undef);
7574   return;
7575  }
7576 }
7577
7578 void
7579 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7580               SV const * const value)
7581 {
7582  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7583
7584  PERL_UNUSED_ARG(rx);
7585  PERL_UNUSED_ARG(paren);
7586  PERL_UNUSED_ARG(value);
7587
7588  if (!PL_localizing)
7589   Perl_croak_no_modify();
7590 }
7591
7592 I32
7593 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7594        const I32 paren)
7595 {
7596  struct regexp *const rx = ReANY(r);
7597  I32 i;
7598  I32 s1, t1;
7599
7600  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7601
7602  if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7603   || paren == RX_BUFF_IDX_CARET_FULLMATCH
7604   || paren == RX_BUFF_IDX_CARET_POSTMATCH
7605  )
7606  {
7607   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7608   if (!keepcopy) {
7609    /* on something like
7610    *    $r = qr/.../;
7611    *    /$qr/p;
7612    * the KEEPCOPY is set on the PMOP rather than the regex */
7613    if (PL_curpm && r == PM_GETRE(PL_curpm))
7614     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7615   }
7616   if (!keepcopy)
7617    goto warn_undef;
7618  }
7619
7620  /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7621  switch (paren) {
7622  case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7623  case RX_BUFF_IDX_PREMATCH:       /* $` */
7624   if (rx->offs[0].start != -1) {
7625       i = rx->offs[0].start;
7626       if (i > 0) {
7627         s1 = 0;
7628         t1 = i;
7629         goto getlen;
7630       }
7631    }
7632   return 0;
7633
7634  case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7635  case RX_BUFF_IDX_POSTMATCH:       /* $' */
7636    if (rx->offs[0].end != -1) {
7637       i = rx->sublen - rx->offs[0].end;
7638       if (i > 0) {
7639         s1 = rx->offs[0].end;
7640         t1 = rx->sublen;
7641         goto getlen;
7642       }
7643    }
7644   return 0;
7645
7646  default: /* $& / ${^MATCH}, $1, $2, ... */
7647    if (paren <= (I32)rx->nparens &&
7648    (s1 = rx->offs[paren].start) != -1 &&
7649    (t1 = rx->offs[paren].end) != -1)
7650    {
7651    i = t1 - s1;
7652    goto getlen;
7653   } else {
7654   warn_undef:
7655    if (ckWARN(WARN_UNINITIALIZED))
7656     report_uninit((const SV *)sv);
7657    return 0;
7658   }
7659  }
7660   getlen:
7661  if (i > 0 && RXp_MATCH_UTF8(rx)) {
7662   const char * const s = rx->subbeg - rx->suboffset + s1;
7663   const U8 *ep;
7664   STRLEN el;
7665
7666   i = t1 - s1;
7667   if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7668       i = el;
7669  }
7670  return i;
7671 }
7672
7673 SV*
7674 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7675 {
7676  PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7677   PERL_UNUSED_ARG(rx);
7678   if (0)
7679    return NULL;
7680   else
7681    return newSVpvs("Regexp");
7682 }
7683
7684 /* Scans the name of a named buffer from the pattern.
7685  * If flags is REG_RSN_RETURN_NULL returns null.
7686  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7687  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7688  * to the parsed name as looked up in the RExC_paren_names hash.
7689  * If there is an error throws a vFAIL().. type exception.
7690  */
7691
7692 #define REG_RSN_RETURN_NULL    0
7693 #define REG_RSN_RETURN_NAME    1
7694 #define REG_RSN_RETURN_DATA    2
7695
7696 STATIC SV*
7697 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7698 {
7699  char *name_start = RExC_parse;
7700
7701  PERL_ARGS_ASSERT_REG_SCAN_NAME;
7702
7703  assert (RExC_parse <= RExC_end);
7704  if (RExC_parse == RExC_end) NOOP;
7705  else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7706   /* skip IDFIRST by using do...while */
7707   if (UTF)
7708    do {
7709     RExC_parse += UTF8SKIP(RExC_parse);
7710    } while (isWORDCHAR_utf8((U8*)RExC_parse));
7711   else
7712    do {
7713     RExC_parse++;
7714    } while (isWORDCHAR(*RExC_parse));
7715  } else {
7716   RExC_parse++; /* so the <- from the vFAIL is after the offending
7717       character */
7718   vFAIL("Group name must start with a non-digit word character");
7719  }
7720  if ( flags ) {
7721   SV* sv_name
7722    = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7723        SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7724   if ( flags == REG_RSN_RETURN_NAME)
7725    return sv_name;
7726   else if (flags==REG_RSN_RETURN_DATA) {
7727    HE *he_str = NULL;
7728    SV *sv_dat = NULL;
7729    if ( ! sv_name )      /* should not happen*/
7730     Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7731    if (RExC_paren_names)
7732     he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7733    if ( he_str )
7734     sv_dat = HeVAL(he_str);
7735    if ( ! sv_dat )
7736     vFAIL("Reference to nonexistent named group");
7737    return sv_dat;
7738   }
7739   else {
7740    Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7741      (unsigned long) flags);
7742   }
7743   assert(0); /* NOT REACHED */
7744  }
7745  return NULL;
7746 }
7747
7748 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7749  int rem=(int)(RExC_end - RExC_parse);                       \
7750  int cut;                                                    \
7751  int num;                                                    \
7752  int iscut=0;                                                \
7753  if (rem>10) {                                               \
7754   rem=10;                                                 \
7755   iscut=1;                                                \
7756  }                                                           \
7757  cut=10-rem;                                                 \
7758  if (RExC_lastparse!=RExC_parse)                             \
7759   PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
7760    rem, RExC_parse,                                    \
7761    cut + 4,                                            \
7762    iscut ? "..." : "<"                                 \
7763   );                                                      \
7764  else                                                        \
7765   PerlIO_printf(Perl_debug_log,"%16s","");                \
7766                 \
7767  if (SIZE_ONLY)                                              \
7768  num = RExC_size + 1;                                     \
7769  else                                                        \
7770  num=REG_NODE_NUM(RExC_emit);                             \
7771  if (RExC_lastnum!=num)                                      \
7772  PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7773  else                                                        \
7774  PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7775  PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7776   (int)((depth*2)), "",                                   \
7777   (funcname)                                              \
7778  );                                                          \
7779  RExC_lastnum=num;                                           \
7780  RExC_lastparse=RExC_parse;                                  \
7781 })
7782
7783
7784
7785 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7786  DEBUG_PARSE_MSG((funcname));                            \
7787  PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7788 })
7789 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7790  DEBUG_PARSE_MSG((funcname));                            \
7791  PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7792 })
7793
7794 /* This section of code defines the inversion list object and its methods.  The
7795  * interfaces are highly subject to change, so as much as possible is static to
7796  * this file.  An inversion list is here implemented as a malloc'd C UV array
7797  * as an SVt_INVLIST scalar.
7798  *
7799  * An inversion list for Unicode is an array of code points, sorted by ordinal
7800  * number.  The zeroth element is the first code point in the list.  The 1th
7801  * element is the first element beyond that not in the list.  In other words,
7802  * the first range is
7803  *  invlist[0]..(invlist[1]-1)
7804  * The other ranges follow.  Thus every element whose index is divisible by two
7805  * marks the beginning of a range that is in the list, and every element not
7806  * divisible by two marks the beginning of a range not in the list.  A single
7807  * element inversion list that contains the single code point N generally
7808  * consists of two elements
7809  *  invlist[0] == N
7810  *  invlist[1] == N+1
7811  * (The exception is when N is the highest representable value on the
7812  * machine, in which case the list containing just it would be a single
7813  * element, itself.  By extension, if the last range in the list extends to
7814  * infinity, then the first element of that range will be in the inversion list
7815  * at a position that is divisible by two, and is the final element in the
7816  * list.)
7817  * Taking the complement (inverting) an inversion list is quite simple, if the
7818  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7819  * This implementation reserves an element at the beginning of each inversion
7820  * list to always contain 0; there is an additional flag in the header which
7821  * indicates if the list begins at the 0, or is offset to begin at the next
7822  * element.
7823  *
7824  * More about inversion lists can be found in "Unicode Demystified"
7825  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7826  * More will be coming when functionality is added later.
7827  *
7828  * The inversion list data structure is currently implemented as an SV pointing
7829  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7830  * array of UV whose memory management is automatically handled by the existing
7831  * facilities for SV's.
7832  *
7833  * Some of the methods should always be private to the implementation, and some
7834  * should eventually be made public */
7835
7836 /* The header definitions are in F<inline_invlist.c> */
7837
7838 PERL_STATIC_INLINE UV*
7839 S__invlist_array_init(SV* const invlist, const bool will_have_0)
7840 {
7841  /* Returns a pointer to the first element in the inversion list's array.
7842  * This is called upon initialization of an inversion list.  Where the
7843  * array begins depends on whether the list has the code point U+0000 in it
7844  * or not.  The other parameter tells it whether the code that follows this
7845  * call is about to put a 0 in the inversion list or not.  The first
7846  * element is either the element reserved for 0, if TRUE, or the element
7847  * after it, if FALSE */
7848
7849  bool* offset = get_invlist_offset_addr(invlist);
7850  UV* zero_addr = (UV *) SvPVX(invlist);
7851
7852  PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7853
7854  /* Must be empty */
7855  assert(! _invlist_len(invlist));
7856
7857  *zero_addr = 0;
7858
7859  /* 1^1 = 0; 1^0 = 1 */
7860  *offset = 1 ^ will_have_0;
7861  return zero_addr + *offset;
7862 }
7863
7864 PERL_STATIC_INLINE UV*
7865 S_invlist_array(SV* const invlist)
7866 {
7867  /* Returns the pointer to the inversion list's array.  Every time the
7868  * length changes, this needs to be called in case malloc or realloc moved
7869  * it */
7870
7871  PERL_ARGS_ASSERT_INVLIST_ARRAY;
7872
7873  /* Must not be empty.  If these fail, you probably didn't check for <len>
7874  * being non-zero before trying to get the array */
7875  assert(_invlist_len(invlist));
7876
7877  /* The very first element always contains zero, The array begins either
7878  * there, or if the inversion list is offset, at the element after it.
7879  * The offset header field determines which; it contains 0 or 1 to indicate
7880  * how much additionally to add */
7881  assert(0 == *(SvPVX(invlist)));
7882  return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7883 }
7884
7885 PERL_STATIC_INLINE void
7886 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7887 {
7888  /* Sets the current number of elements stored in the inversion list.
7889  * Updates SvCUR correspondingly */
7890  PERL_UNUSED_CONTEXT;
7891  PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7892
7893  assert(SvTYPE(invlist) == SVt_INVLIST);
7894
7895  SvCUR_set(invlist,
7896    (len == 0)
7897    ? 0
7898    : TO_INTERNAL_SIZE(len + offset));
7899  assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7900 }
7901
7902 PERL_STATIC_INLINE IV*
7903 S_get_invlist_previous_index_addr(SV* invlist)
7904 {
7905  /* Return the address of the IV that is reserved to hold the cached index
7906  * */
7907  PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7908
7909  assert(SvTYPE(invlist) == SVt_INVLIST);
7910
7911  return &(((XINVLIST*) SvANY(invlist))->prev_index);
7912 }
7913
7914 PERL_STATIC_INLINE IV
7915 S_invlist_previous_index(SV* const invlist)
7916 {
7917  /* Returns cached index of previous search */
7918
7919  PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7920
7921  return *get_invlist_previous_index_addr(invlist);
7922 }
7923
7924 PERL_STATIC_INLINE void
7925 S_invlist_set_previous_index(SV* const invlist, const IV index)
7926 {
7927  /* Caches <index> for later retrieval */
7928
7929  PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7930
7931  assert(index == 0 || index < (int) _invlist_len(invlist));
7932
7933  *get_invlist_previous_index_addr(invlist) = index;
7934 }
7935
7936 PERL_STATIC_INLINE UV
7937 S_invlist_max(SV* const invlist)
7938 {
7939  /* Returns the maximum number of elements storable in the inversion list's
7940  * array, without having to realloc() */
7941
7942  PERL_ARGS_ASSERT_INVLIST_MAX;
7943
7944  assert(SvTYPE(invlist) == SVt_INVLIST);
7945
7946  /* Assumes worst case, in which the 0 element is not counted in the
7947  * inversion list, so subtracts 1 for that */
7948  return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7949   ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7950   : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7951 }
7952
7953 #ifndef PERL_IN_XSUB_RE
7954 SV*
7955 Perl__new_invlist(pTHX_ IV initial_size)
7956 {
7957
7958  /* Return a pointer to a newly constructed inversion list, with enough
7959  * space to store 'initial_size' elements.  If that number is negative, a
7960  * system default is used instead */
7961
7962  SV* new_list;
7963
7964  if (initial_size < 0) {
7965   initial_size = 10;
7966  }
7967
7968  /* Allocate the initial space */
7969  new_list = newSV_type(SVt_INVLIST);
7970
7971  /* First 1 is in case the zero element isn't in the list; second 1 is for
7972  * trailing NUL */
7973  SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7974  invlist_set_len(new_list, 0, 0);
7975
7976  /* Force iterinit() to be used to get iteration to work */
7977  *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7978
7979  *get_invlist_previous_index_addr(new_list) = 0;
7980
7981  return new_list;
7982 }
7983
7984 SV*
7985 Perl__new_invlist_C_array(pTHX_ const UV* const list)
7986 {
7987  /* Return a pointer to a newly constructed inversion list, initialized to
7988  * point to <list>, which has to be in the exact correct inversion list
7989  * form, including internal fields.  Thus this is a dangerous routine that
7990  * should not be used in the wrong hands.  The passed in 'list' contains
7991  * several header fields at the beginning that are not part of the
7992  * inversion list body proper */
7993
7994  const STRLEN length = (STRLEN) list[0];
7995  const UV version_id =          list[1];
7996  const bool offset   =    cBOOL(list[2]);
7997 #define HEADER_LENGTH 3
7998  /* If any of the above changes in any way, you must change HEADER_LENGTH
7999  * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8000  *      perl -E 'say int(rand 2**31-1)'
8001  */
8002 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8003           data structure type, so that one being
8004           passed in can be validated to be an
8005           inversion list of the correct vintage.
8006          */
8007
8008  SV* invlist = newSV_type(SVt_INVLIST);
8009
8010  PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8011
8012  if (version_id != INVLIST_VERSION_ID) {
8013   Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8014  }
8015
8016  /* The generated array passed in includes header elements that aren't part
8017  * of the list proper, so start it just after them */
8018  SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8019
8020  SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
8021        shouldn't touch it */
8022
8023  *(get_invlist_offset_addr(invlist)) = offset;
8024
8025  /* The 'length' passed to us is the physical number of elements in the
8026  * inversion list.  But if there is an offset the logical number is one
8027  * less than that */
8028  invlist_set_len(invlist, length  - offset, offset);
8029
8030  invlist_set_previous_index(invlist, 0);
8031
8032  /* Initialize the iteration pointer. */
8033  invlist_iterfinish(invlist);
8034
8035  SvREADONLY_on(invlist);
8036
8037  return invlist;
8038 }
8039 #endif /* ifndef PERL_IN_XSUB_RE */
8040
8041 STATIC void
8042 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8043 {
8044  /* Grow the maximum size of an inversion list */
8045
8046  PERL_ARGS_ASSERT_INVLIST_EXTEND;
8047
8048  assert(SvTYPE(invlist) == SVt_INVLIST);
8049
8050  /* Add one to account for the zero element at the beginning which may not
8051  * be counted by the calling parameters */
8052  SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8053 }
8054
8055 PERL_STATIC_INLINE void
8056 S_invlist_trim(SV* const invlist)
8057 {
8058  PERL_ARGS_ASSERT_INVLIST_TRIM;
8059
8060  assert(SvTYPE(invlist) == SVt_INVLIST);
8061
8062  /* Change the length of the inversion list to how many entries it currently
8063  * has */
8064  SvPV_shrink_to_cur((SV *) invlist);
8065 }
8066
8067 STATIC void
8068 S__append_range_to_invlist(pTHX_ SV* const invlist,
8069         const UV start, const UV end)
8070 {
8071    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8072  * the end of the inversion list.  The range must be above any existing
8073  * ones. */
8074
8075  UV* array;
8076  UV max = invlist_max(invlist);
8077  UV len = _invlist_len(invlist);
8078  bool offset;
8079
8080  PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8081
8082  if (len == 0) { /* Empty lists must be initialized */
8083   offset = start != 0;
8084   array = _invlist_array_init(invlist, ! offset);
8085  }
8086  else {
8087   /* Here, the existing list is non-empty. The current max entry in the
8088   * list is generally the first value not in the set, except when the
8089   * set extends to the end of permissible values, in which case it is
8090   * the first entry in that final set, and so this call is an attempt to
8091   * append out-of-order */
8092
8093   UV final_element = len - 1;
8094   array = invlist_array(invlist);
8095   if (array[final_element] > start
8096    || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8097   {
8098    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8099      array[final_element], start,
8100      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8101   }
8102
8103   /* Here, it is a legal append.  If the new range begins with the first
8104   * value not in the set, it is extending the set, so the new first
8105   * value not in the set is one greater than the newly extended range.
8106   * */
8107   offset = *get_invlist_offset_addr(invlist);
8108   if (array[final_element] == start) {
8109    if (end != UV_MAX) {
8110     array[final_element] = end + 1;
8111    }
8112    else {
8113     /* But if the end is the maximum representable on the machine,
8114     * just let the range that this would extend to have no end */
8115     invlist_set_len(invlist, len - 1, offset);
8116    }
8117    return;
8118   }
8119  }
8120
8121  /* Here the new range doesn't extend any existing set.  Add it */
8122
8123  len += 2; /* Includes an element each for the start and end of range */
8124
8125  /* If wll overflow the existing space, extend, which may cause the array to
8126  * be moved */
8127  if (max < len) {
8128   invlist_extend(invlist, len);
8129
8130   /* Have to set len here to avoid assert failure in invlist_array() */
8131   invlist_set_len(invlist, len, offset);
8132
8133   array = invlist_array(invlist);
8134  }
8135  else {
8136   invlist_set_len(invlist, len, offset);
8137  }
8138
8139  /* The next item on the list starts the range, the one after that is
8140  * one past the new range.  */
8141  array[len - 2] = start;
8142  if (end != UV_MAX) {
8143   array[len - 1] = end + 1;
8144  }
8145  else {
8146   /* But if the end is the maximum representable on the machine, just let
8147   * the range have no end */
8148   invlist_set_len(invlist, len - 1, offset);
8149  }
8150 }
8151
8152 #ifndef PERL_IN_XSUB_RE
8153
8154 IV
8155 Perl__invlist_search(SV* const invlist, const UV cp)
8156 {
8157  /* Searches the inversion list for the entry that contains the input code
8158  * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8159  * return value is the index into the list's array of the range that
8160  * contains <cp> */
8161
8162  IV low = 0;
8163  IV mid;
8164  IV high = _invlist_len(invlist);
8165  const IV highest_element = high - 1;
8166  const UV* array;
8167
8168  PERL_ARGS_ASSERT__INVLIST_SEARCH;
8169
8170  /* If list is empty, return failure. */
8171  if (high == 0) {
8172   return -1;
8173  }
8174
8175  /* (We can't get the array unless we know the list is non-empty) */
8176  array = invlist_array(invlist);
8177
8178  mid = invlist_previous_index(invlist);
8179  assert(mid >=0 && mid <= highest_element);
8180
8181  /* <mid> contains the cache of the result of the previous call to this
8182  * function (0 the first time).  See if this call is for the same result,
8183  * or if it is for mid-1.  This is under the theory that calls to this
8184  * function will often be for related code points that are near each other.
8185  * And benchmarks show that caching gives better results.  We also test
8186  * here if the code point is within the bounds of the list.  These tests
8187  * replace others that would have had to be made anyway to make sure that
8188  * the array bounds were not exceeded, and these give us extra information
8189  * at the same time */
8190  if (cp >= array[mid]) {
8191   if (cp >= array[highest_element]) {
8192    return highest_element;
8193   }
8194
8195   /* Here, array[mid] <= cp < array[highest_element].  This means that
8196   * the final element is not the answer, so can exclude it; it also
8197   * means that <mid> is not the final element, so can refer to 'mid + 1'
8198   * safely */
8199   if (cp < array[mid + 1]) {
8200    return mid;
8201   }
8202   high--;
8203   low = mid + 1;
8204  }
8205  else { /* cp < aray[mid] */
8206   if (cp < array[0]) { /* Fail if outside the array */
8207    return -1;
8208   }
8209   high = mid;
8210   if (cp >= array[mid - 1]) {
8211    goto found_entry;
8212   }
8213  }
8214
8215  /* Binary search.  What we are looking for is <i> such that
8216  * array[i] <= cp < array[i+1]
8217  * The loop below converges on the i+1.  Note that there may not be an
8218  * (i+1)th element in the array, and things work nonetheless */
8219  while (low < high) {
8220   mid = (low + high) / 2;
8221   assert(mid <= highest_element);
8222   if (array[mid] <= cp) { /* cp >= array[mid] */
8223    low = mid + 1;
8224
8225    /* We could do this extra test to exit the loop early.
8226    if (cp < array[low]) {
8227     return mid;
8228    }
8229    */
8230   }
8231   else { /* cp < array[mid] */
8232    high = mid;
8233   }
8234  }
8235
8236   found_entry:
8237  high--;
8238  invlist_set_previous_index(invlist, high);
8239  return high;
8240 }
8241
8242 void
8243 Perl__invlist_populate_swatch(SV* const invlist,
8244        const UV start, const UV end, U8* swatch)
8245 {
8246  /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8247  * but is used when the swash has an inversion list.  This makes this much
8248  * faster, as it uses a binary search instead of a linear one.  This is
8249  * intimately tied to that function, and perhaps should be in utf8.c,
8250  * except it is intimately tied to inversion lists as well.  It assumes
8251  * that <swatch> is all 0's on input */
8252
8253  UV current = start;
8254  const IV len = _invlist_len(invlist);
8255  IV i;
8256  const UV * array;
8257
8258  PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8259
8260  if (len == 0) { /* Empty inversion list */
8261   return;
8262  }
8263
8264  array = invlist_array(invlist);
8265
8266  /* Find which element it is */
8267  i = _invlist_search(invlist, start);
8268
8269  /* We populate from <start> to <end> */
8270  while (current < end) {
8271   UV upper;
8272
8273   /* The inversion list gives the results for every possible code point
8274   * after the first one in the list.  Only those ranges whose index is
8275   * even are ones that the inversion list matches.  For the odd ones,
8276   * and if the initial code point is not in the list, we have to skip
8277   * forward to the next element */
8278   if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8279    i++;
8280    if (i >= len) { /* Finished if beyond the end of the array */
8281     return;
8282    }
8283    current = array[i];
8284    if (current >= end) {   /* Finished if beyond the end of what we
8285          are populating */
8286     if (LIKELY(end < UV_MAX)) {
8287      return;
8288     }
8289
8290     /* We get here when the upper bound is the maximum
8291     * representable on the machine, and we are looking for just
8292     * that code point.  Have to special case it */
8293     i = len;
8294     goto join_end_of_list;
8295    }
8296   }
8297   assert(current >= start);
8298
8299   /* The current range ends one below the next one, except don't go past
8300   * <end> */
8301   i++;
8302   upper = (i < len && array[i] < end) ? array[i] : end;
8303
8304   /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8305   * for each code point in it */
8306   for (; current < upper; current++) {
8307    const STRLEN offset = (STRLEN)(current - start);
8308    swatch[offset >> 3] |= 1 << (offset & 7);
8309   }
8310
8311  join_end_of_list:
8312
8313   /* Quit if at the end of the list */
8314   if (i >= len) {
8315
8316    /* But first, have to deal with the highest possible code point on
8317    * the platform.  The previous code assumes that <end> is one
8318    * beyond where we want to populate, but that is impossible at the
8319    * platform's infinity, so have to handle it specially */
8320    if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8321    {
8322     const STRLEN offset = (STRLEN)(end - start);
8323     swatch[offset >> 3] |= 1 << (offset & 7);
8324    }
8325    return;
8326   }
8327
8328   /* Advance to the next range, which will be for code points not in the
8329   * inversion list */
8330   current = array[i];
8331  }
8332
8333  return;
8334 }
8335
8336 void
8337 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8338           const bool complement_b, SV** output)
8339 {
8340  /* Take the union of two inversion lists and point <output> to it.  *output
8341  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8342  * the reference count to that list will be decremented if not already a
8343  * temporary (mortal); otherwise *output will be made correspondingly
8344  * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8345  * second list is returned.  If <complement_b> is TRUE, the union is taken
8346  * of the complement (inversion) of <b> instead of b itself.
8347  *
8348  * The basis for this comes from "Unicode Demystified" Chapter 13 by
8349  * Richard Gillam, published by Addison-Wesley, and explained at some
8350  * length there.  The preface says to incorporate its examples into your
8351  * code at your own risk.
8352  *
8353  * The algorithm is like a merge sort.
8354  *
8355  * XXX A potential performance improvement is to keep track as we go along
8356  * if only one of the inputs contributes to the result, meaning the other
8357  * is a subset of that one.  In that case, we can skip the final copy and
8358  * return the larger of the input lists, but then outside code might need
8359  * to keep track of whether to free the input list or not */
8360
8361  const UV* array_a;    /* a's array */
8362  const UV* array_b;
8363  UV len_a;     /* length of a's array */
8364  UV len_b;
8365
8366  SV* u;   /* the resulting union */
8367  UV* array_u;
8368  UV len_u;
8369
8370  UV i_a = 0;      /* current index into a's array */
8371  UV i_b = 0;
8372  UV i_u = 0;
8373
8374  /* running count, as explained in the algorithm source book; items are
8375  * stopped accumulating and are output when the count changes to/from 0.
8376  * The count is incremented when we start a range that's in the set, and
8377  * decremented when we start a range that's not in the set.  So its range
8378  * is 0 to 2.  Only when the count is zero is something not in the set.
8379  */
8380  UV count = 0;
8381
8382  PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8383  assert(a != b);
8384
8385  /* If either one is empty, the union is the other one */
8386  if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8387   bool make_temp = FALSE; /* Should we mortalize the result? */
8388
8389   if (*output == a) {
8390    if (a != NULL) {
8391     if (! (make_temp = cBOOL(SvTEMP(a)))) {
8392      SvREFCNT_dec_NN(a);
8393     }
8394    }
8395   }
8396   if (*output != b) {
8397    *output = invlist_clone(b);
8398    if (complement_b) {
8399     _invlist_invert(*output);
8400    }
8401   } /* else *output already = b; */
8402
8403   if (make_temp) {
8404    sv_2mortal(*output);
8405   }
8406   return;
8407  }
8408  else if ((len_b = _invlist_len(b)) == 0) {
8409   bool make_temp = FALSE;
8410   if (*output == b) {
8411    if (! (make_temp = cBOOL(SvTEMP(b)))) {
8412     SvREFCNT_dec_NN(b);
8413    }
8414   }
8415
8416   /* The complement of an empty list is a list that has everything in it,
8417   * so the union with <a> includes everything too */
8418   if (complement_b) {
8419    if (a == *output) {
8420     if (! (make_temp = cBOOL(SvTEMP(a)))) {
8421      SvREFCNT_dec_NN(a);
8422     }
8423    }
8424    *output = _new_invlist(1);
8425    _append_range_to_invlist(*output, 0, UV_MAX);
8426   }
8427   else if (*output != a) {
8428    *output = invlist_clone(a);
8429   }
8430   /* else *output already = a; */
8431
8432   if (make_temp) {
8433    sv_2mortal(*output);
8434   }
8435   return;
8436  }
8437
8438  /* Here both lists exist and are non-empty */
8439  array_a = invlist_array(a);
8440  array_b = invlist_array(b);
8441
8442  /* If are to take the union of 'a' with the complement of b, set it
8443  * up so are looking at b's complement. */
8444  if (complement_b) {
8445
8446   /* To complement, we invert: if the first element is 0, remove it.  To
8447   * do this, we just pretend the array starts one later */
8448   if (array_b[0] == 0) {
8449    array_b++;
8450    len_b--;
8451   }
8452   else {
8453
8454    /* But if the first element is not zero, we pretend the list starts
8455    * at the 0 that is always stored immediately before the array. */
8456    array_b--;
8457    len_b++;
8458   }
8459  }
8460
8461  /* Size the union for the worst case: that the sets are completely
8462  * disjoint */
8463  u = _new_invlist(len_a + len_b);
8464
8465  /* Will contain U+0000 if either component does */
8466  array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8467          || (len_b > 0 && array_b[0] == 0));
8468
8469  /* Go through each list item by item, stopping when exhausted one of
8470  * them */
8471  while (i_a < len_a && i_b < len_b) {
8472   UV cp;     /* The element to potentially add to the union's array */
8473   bool cp_in_set;   /* is it in the the input list's set or not */
8474
8475   /* We need to take one or the other of the two inputs for the union.
8476   * Since we are merging two sorted lists, we take the smaller of the
8477   * next items.  In case of a tie, we take the one that is in its set
8478   * first.  If we took one not in the set first, it would decrement the
8479   * count, possibly to 0 which would cause it to be output as ending the
8480   * range, and the next time through we would take the same number, and
8481   * output it again as beginning the next range.  By doing it the
8482   * opposite way, there is no possibility that the count will be
8483   * momentarily decremented to 0, and thus the two adjoining ranges will
8484   * be seamlessly merged.  (In a tie and both are in the set or both not
8485   * in the set, it doesn't matter which we take first.) */
8486   if (array_a[i_a] < array_b[i_b]
8487    || (array_a[i_a] == array_b[i_b]
8488     && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8489   {
8490    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8491    cp= array_a[i_a++];
8492   }
8493   else {
8494    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8495    cp = array_b[i_b++];
8496   }
8497
8498   /* Here, have chosen which of the two inputs to look at.  Only output
8499   * if the running count changes to/from 0, which marks the
8500   * beginning/end of a range in that's in the set */
8501   if (cp_in_set) {
8502    if (count == 0) {
8503     array_u[i_u++] = cp;
8504    }
8505    count++;
8506   }
8507   else {
8508    count--;
8509    if (count == 0) {
8510     array_u[i_u++] = cp;
8511    }
8512   }
8513  }
8514
8515  /* Here, we are finished going through at least one of the lists, which
8516  * means there is something remaining in at most one.  We check if the list
8517  * that hasn't been exhausted is positioned such that we are in the middle
8518  * of a range in its set or not.  (i_a and i_b point to the element beyond
8519  * the one we care about.) If in the set, we decrement 'count'; if 0, there
8520  * is potentially more to output.
8521  * There are four cases:
8522  * 1) Both weren't in their sets, count is 0, and remains 0.  What's left
8523  *    in the union is entirely from the non-exhausted set.
8524  * 2) Both were in their sets, count is 2.  Nothing further should
8525  *    be output, as everything that remains will be in the exhausted
8526  *    list's set, hence in the union; decrementing to 1 but not 0 insures
8527  *    that
8528  * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8529  *    Nothing further should be output because the union includes
8530  *    everything from the exhausted set.  Not decrementing ensures that.
8531  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8532  *    decrementing to 0 insures that we look at the remainder of the
8533  *    non-exhausted set */
8534  if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8535   || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8536  {
8537   count--;
8538  }
8539
8540  /* The final length is what we've output so far, plus what else is about to
8541  * be output.  (If 'count' is non-zero, then the input list we exhausted
8542  * has everything remaining up to the machine's limit in its set, and hence
8543  * in the union, so there will be no further output. */
8544  len_u = i_u;
8545  if (count == 0) {
8546   /* At most one of the subexpressions will be non-zero */
8547   len_u += (len_a - i_a) + (len_b - i_b);
8548  }
8549
8550  /* Set result to final length, which can change the pointer to array_u, so
8551  * re-find it */
8552  if (len_u != _invlist_len(u)) {
8553   invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8554   invlist_trim(u);
8555   array_u = invlist_array(u);
8556  }
8557
8558  /* When 'count' is 0, the list that was exhausted (if one was shorter than
8559  * the other) ended with everything above it not in its set.  That means
8560  * that the remaining part of the union is precisely the same as the
8561  * non-exhausted list, so can just copy it unchanged.  (If both list were
8562  * exhausted at the same time, then the operations below will be both 0.)
8563  */
8564  if (count == 0) {
8565   IV copy_count; /* At most one will have a non-zero copy count */
8566   if ((copy_count = len_a - i_a) > 0) {
8567    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8568   }
8569   else if ((copy_count = len_b - i_b) > 0) {
8570    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8571   }
8572  }
8573
8574  /*  We may be removing a reference to one of the inputs.  If so, the output
8575  *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8576  *  count decremented) */
8577  if (a == *output || b == *output) {
8578   assert(! invlist_is_iterating(*output));
8579   if ((SvTEMP(*output))) {
8580    sv_2mortal(u);
8581   }
8582   else {
8583    SvREFCNT_dec_NN(*output);
8584   }
8585  }
8586
8587  *output = u;
8588
8589  return;
8590 }
8591
8592 void
8593 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8594            const bool complement_b, SV** i)
8595 {
8596  /* Take the intersection of two inversion lists and point <i> to it.  *i
8597  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8598  * the reference count to that list will be decremented if not already a
8599  * temporary (mortal); otherwise *i will be made correspondingly mortal.
8600  * The first list, <a>, may be NULL, in which case an empty list is
8601  * returned.  If <complement_b> is TRUE, the result will be the
8602  * intersection of <a> and the complement (or inversion) of <b> instead of
8603  * <b> directly.
8604  *
8605  * The basis for this comes from "Unicode Demystified" Chapter 13 by
8606  * Richard Gillam, published by Addison-Wesley, and explained at some
8607  * length there.  The preface says to incorporate its examples into your
8608  * code at your own risk.  In fact, it had bugs
8609  *
8610  * The algorithm is like a merge sort, and is essentially the same as the
8611  * union above
8612  */
8613
8614  const UV* array_a;  /* a's array */
8615  const UV* array_b;
8616  UV len_a; /* length of a's array */
8617  UV len_b;
8618
8619  SV* r;       /* the resulting intersection */
8620  UV* array_r;
8621  UV len_r;
8622
8623  UV i_a = 0;      /* current index into a's array */
8624  UV i_b = 0;
8625  UV i_r = 0;
8626
8627  /* running count, as explained in the algorithm source book; items are
8628  * stopped accumulating and are output when the count changes to/from 2.
8629  * The count is incremented when we start a range that's in the set, and
8630  * decremented when we start a range that's not in the set.  So its range
8631  * is 0 to 2.  Only when the count is 2 is something in the intersection.
8632  */
8633  UV count = 0;
8634
8635  PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8636  assert(a != b);
8637
8638  /* Special case if either one is empty */
8639  len_a = (a == NULL) ? 0 : _invlist_len(a);
8640  if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8641   bool make_temp = FALSE;
8642
8643   if (len_a != 0 && complement_b) {
8644
8645    /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8646    * be empty.  Here, also we are using 'b's complement, which hence
8647    * must be every possible code point.  Thus the intersection is
8648    * simply 'a'. */
8649    if (*i != a) {
8650     if (*i == b) {
8651      if (! (make_temp = cBOOL(SvTEMP(b)))) {
8652       SvREFCNT_dec_NN(b);
8653      }
8654     }
8655
8656     *i = invlist_clone(a);
8657    }
8658    /* else *i is already 'a' */
8659
8660    if (make_temp) {
8661     sv_2mortal(*i);
8662    }
8663    return;
8664   }
8665
8666   /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8667   * intersection must be empty */
8668   if (*i == a) {
8669    if (! (make_temp = cBOOL(SvTEMP(a)))) {
8670     SvREFCNT_dec_NN(a);
8671    }
8672   }
8673   else if (*i == b) {
8674    if (! (make_temp = cBOOL(SvTEMP(b)))) {
8675     SvREFCNT_dec_NN(b);
8676    }
8677   }
8678   *i = _new_invlist(0);
8679   if (make_temp) {
8680    sv_2mortal(*i);
8681   }
8682
8683   return;
8684  }
8685
8686  /* Here both lists exist and are non-empty */
8687  array_a = invlist_array(a);
8688  array_b = invlist_array(b);
8689
8690  /* If are to take the intersection of 'a' with the complement of b, set it
8691  * up so are looking at b's complement. */
8692  if (complement_b) {
8693
8694   /* To complement, we invert: if the first element is 0, remove it.  To
8695   * do this, we just pretend the array starts one later */
8696   if (array_b[0] == 0) {
8697    array_b++;
8698    len_b--;
8699   }
8700   else {
8701
8702    /* But if the first element is not zero, we pretend the list starts
8703    * at the 0 that is always stored immediately before the array. */
8704    array_b--;
8705    len_b++;
8706   }
8707  }
8708
8709  /* Size the intersection for the worst case: that the intersection ends up
8710  * fragmenting everything to be completely disjoint */
8711  r= _new_invlist(len_a + len_b);
8712
8713  /* Will contain U+0000 iff both components do */
8714  array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8715          && len_b > 0 && array_b[0] == 0);
8716
8717  /* Go through each list item by item, stopping when exhausted one of
8718  * them */
8719  while (i_a < len_a && i_b < len_b) {
8720   UV cp;     /* The element to potentially add to the intersection's
8721      array */
8722   bool cp_in_set; /* Is it in the input list's set or not */
8723
8724   /* We need to take one or the other of the two inputs for the
8725   * intersection.  Since we are merging two sorted lists, we take the
8726   * smaller of the next items.  In case of a tie, we take the one that
8727   * is not in its set first (a difference from the union algorithm).  If
8728   * we took one in the set first, it would increment the count, possibly
8729   * to 2 which would cause it to be output as starting a range in the
8730   * intersection, and the next time through we would take that same
8731   * number, and output it again as ending the set.  By doing it the
8732   * opposite of this, there is no possibility that the count will be
8733   * momentarily incremented to 2.  (In a tie and both are in the set or
8734   * both not in the set, it doesn't matter which we take first.) */
8735   if (array_a[i_a] < array_b[i_b]
8736    || (array_a[i_a] == array_b[i_b]
8737     && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8738   {
8739    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8740    cp= array_a[i_a++];
8741   }
8742   else {
8743    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8744    cp= array_b[i_b++];
8745   }
8746
8747   /* Here, have chosen which of the two inputs to look at.  Only output
8748   * if the running count changes to/from 2, which marks the
8749   * beginning/end of a range that's in the intersection */
8750   if (cp_in_set) {
8751    count++;
8752    if (count == 2) {
8753     array_r[i_r++] = cp;
8754    }
8755   }
8756   else {
8757    if (count == 2) {
8758     array_r[i_r++] = cp;
8759    }
8760    count--;
8761   }
8762  }
8763
8764  /* Here, we are finished going through at least one of the lists, which
8765  * means there is something remaining in at most one.  We check if the list
8766  * that has been exhausted is positioned such that we are in the middle
8767  * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8768  * the ones we care about.)  There are four cases:
8769  * 1) Both weren't in their sets, count is 0, and remains 0.  There's
8770  *    nothing left in the intersection.
8771  * 2) Both were in their sets, count is 2 and perhaps is incremented to
8772  *    above 2.  What should be output is exactly that which is in the
8773  *    non-exhausted set, as everything it has is also in the intersection
8774  *    set, and everything it doesn't have can't be in the intersection
8775  * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8776  *    gets incremented to 2.  Like the previous case, the intersection is
8777  *    everything that remains in the non-exhausted set.
8778  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8779  *    remains 1.  And the intersection has nothing more. */
8780  if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8781   || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8782  {
8783   count++;
8784  }
8785
8786  /* The final length is what we've output so far plus what else is in the
8787  * intersection.  At most one of the subexpressions below will be non-zero
8788  * */
8789  len_r = i_r;
8790  if (count >= 2) {
8791   len_r += (len_a - i_a) + (len_b - i_b);
8792  }
8793
8794  /* Set result to final length, which can change the pointer to array_r, so
8795  * re-find it */
8796  if (len_r != _invlist_len(r)) {
8797   invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8798   invlist_trim(r);
8799   array_r = invlist_array(r);
8800  }
8801
8802  /* Finish outputting any remaining */
8803  if (count >= 2) { /* At most one will have a non-zero copy count */
8804   IV copy_count;
8805   if ((copy_count = len_a - i_a) > 0) {
8806    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8807   }
8808   else if ((copy_count = len_b - i_b) > 0) {
8809    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8810   }
8811  }
8812
8813  /*  We may be removing a reference to one of the inputs.  If so, the output
8814  *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8815  *  count decremented) */
8816  if (a == *i || b == *i) {
8817   assert(! invlist_is_iterating(*i));
8818   if (SvTEMP(*i)) {
8819    sv_2mortal(r);
8820   }
8821   else {
8822    SvREFCNT_dec_NN(*i);
8823   }
8824  }
8825
8826  *i = r;
8827
8828  return;
8829 }
8830
8831 SV*
8832 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8833 {
8834  /* Add the range from 'start' to 'end' inclusive to the inversion list's
8835  * set.  A pointer to the inversion list is returned.  This may actually be
8836  * a new list, in which case the passed in one has been destroyed.  The
8837  * passed in inversion list can be NULL, in which case a new one is created
8838  * with just the one range in it */
8839
8840  SV* range_invlist;
8841  UV len;
8842
8843  if (invlist == NULL) {
8844   invlist = _new_invlist(2);
8845   len = 0;
8846  }
8847  else {
8848   len = _invlist_len(invlist);
8849  }
8850
8851  /* If comes after the final entry actually in the list, can just append it
8852  * to the end, */
8853  if (len == 0
8854   || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8855    && start >= invlist_array(invlist)[len - 1]))
8856  {
8857   _append_range_to_invlist(invlist, start, end);
8858   return invlist;
8859  }
8860
8861  /* Here, can't just append things, create and return a new inversion list
8862  * which is the union of this range and the existing inversion list */
8863  range_invlist = _new_invlist(2);
8864  _append_range_to_invlist(range_invlist, start, end);
8865
8866  _invlist_union(invlist, range_invlist, &invlist);
8867
8868  /* The temporary can be freed */
8869  SvREFCNT_dec_NN(range_invlist);
8870
8871  return invlist;
8872 }
8873
8874 SV*
8875 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8876         UV** other_elements_ptr)
8877 {
8878  /* Create and return an inversion list whose contents are to be populated
8879  * by the caller.  The caller gives the number of elements (in 'size') and
8880  * the very first element ('element0').  This function will set
8881  * '*other_elements_ptr' to an array of UVs, where the remaining elements
8882  * are to be placed.
8883  *
8884  * Obviously there is some trust involved that the caller will properly
8885  * fill in the other elements of the array.
8886  *
8887  * (The first element needs to be passed in, as the underlying code does
8888  * things differently depending on whether it is zero or non-zero) */
8889
8890  SV* invlist = _new_invlist(size);
8891  bool offset;
8892
8893  PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8894
8895  _append_range_to_invlist(invlist, element0, element0);
8896  offset = *get_invlist_offset_addr(invlist);
8897
8898  invlist_set_len(invlist, size, offset);
8899  *other_elements_ptr = invlist_array(invlist) + 1;
8900  return invlist;
8901 }
8902
8903 #endif
8904
8905 PERL_STATIC_INLINE SV*
8906 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8907  return _add_range_to_invlist(invlist, cp, cp);
8908 }
8909
8910 #ifndef PERL_IN_XSUB_RE
8911 void
8912 Perl__invlist_invert(pTHX_ SV* const invlist)
8913 {
8914  /* Complement the input inversion list.  This adds a 0 if the list didn't
8915  * have a zero; removes it otherwise.  As described above, the data
8916  * structure is set up so that this is very efficient */
8917
8918  PERL_ARGS_ASSERT__INVLIST_INVERT;
8919
8920  assert(! invlist_is_iterating(invlist));
8921
8922  /* The inverse of matching nothing is matching everything */
8923  if (_invlist_len(invlist) == 0) {
8924   _append_range_to_invlist(invlist, 0, UV_MAX);
8925   return;
8926  }
8927
8928  *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8929 }
8930
8931 #endif
8932
8933 PERL_STATIC_INLINE SV*
8934 S_invlist_clone(pTHX_ SV* const invlist)
8935 {
8936
8937  /* Return a new inversion list that is a copy of the input one, which is
8938  * unchanged.  The new list will not be mortal even if the old one was. */
8939
8940  /* Need to allocate extra space to accommodate Perl's addition of a
8941  * trailing NUL to SvPV's, since it thinks they are always strings */
8942  SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8943  STRLEN physical_length = SvCUR(invlist);
8944  bool offset = *(get_invlist_offset_addr(invlist));
8945
8946  PERL_ARGS_ASSERT_INVLIST_CLONE;
8947
8948  *(get_invlist_offset_addr(new_invlist)) = offset;
8949  invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8950  Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8951
8952  return new_invlist;
8953 }
8954
8955 PERL_STATIC_INLINE STRLEN*
8956 S_get_invlist_iter_addr(SV* invlist)
8957 {
8958  /* Return the address of the UV that contains the current iteration
8959  * position */
8960
8961  PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8962
8963  assert(SvTYPE(invlist) == SVt_INVLIST);
8964
8965  return &(((XINVLIST*) SvANY(invlist))->iterator);
8966 }
8967
8968 PERL_STATIC_INLINE void
8969 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
8970 {
8971  PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8972
8973  *get_invlist_iter_addr(invlist) = 0;
8974 }
8975
8976 PERL_STATIC_INLINE void
8977 S_invlist_iterfinish(SV* invlist)
8978 {
8979  /* Terminate iterator for invlist.  This is to catch development errors.
8980  * Any iteration that is interrupted before completed should call this
8981  * function.  Functions that add code points anywhere else but to the end
8982  * of an inversion list assert that they are not in the middle of an
8983  * iteration.  If they were, the addition would make the iteration
8984  * problematical: if the iteration hadn't reached the place where things
8985  * were being added, it would be ok */
8986
8987  PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8988
8989  *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8990 }
8991
8992 STATIC bool
8993 S_invlist_iternext(SV* invlist, UV* start, UV* end)
8994 {
8995  /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8996  * This call sets in <*start> and <*end>, the next range in <invlist>.
8997  * Returns <TRUE> if successful and the next call will return the next
8998  * range; <FALSE> if was already at the end of the list.  If the latter,
8999  * <*start> and <*end> are unchanged, and the next call to this function
9000  * will start over at the beginning of the list */
9001
9002  STRLEN* pos = get_invlist_iter_addr(invlist);
9003  UV len = _invlist_len(invlist);
9004  UV *array;
9005
9006  PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9007
9008  if (*pos >= len) {
9009   *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9010   return FALSE;
9011  }
9012
9013  array = invlist_array(invlist);
9014
9015  *start = array[(*pos)++];
9016
9017  if (*pos >= len) {
9018   *end = UV_MAX;
9019  }
9020  else {
9021   *end = array[(*pos)++] - 1;
9022  }
9023
9024  return TRUE;
9025 }
9026
9027 PERL_STATIC_INLINE bool
9028 S_invlist_is_iterating(SV* const invlist)
9029 {
9030  PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9031
9032  return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9033 }
9034
9035 PERL_STATIC_INLINE UV
9036 S_invlist_highest(SV* const invlist)
9037 {
9038  /* Returns the highest code point that matches an inversion list.  This API
9039  * has an ambiguity, as it returns 0 under either the highest is actually
9040  * 0, or if the list is empty.  If this distinction matters to you, check
9041  * for emptiness before calling this function */
9042
9043  UV len = _invlist_len(invlist);
9044  UV *array;
9045
9046  PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9047
9048  if (len == 0) {
9049   return 0;
9050  }
9051
9052  array = invlist_array(invlist);
9053
9054  /* The last element in the array in the inversion list always starts a
9055  * range that goes to infinity.  That range may be for code points that are
9056  * matched in the inversion list, or it may be for ones that aren't
9057  * matched.  In the latter case, the highest code point in the set is one
9058  * less than the beginning of this range; otherwise it is the final element
9059  * of this range: infinity */
9060  return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9061   ? UV_MAX
9062   : array[len - 1] - 1;
9063 }
9064
9065 #ifndef PERL_IN_XSUB_RE
9066 SV *
9067 Perl__invlist_contents(pTHX_ SV* const invlist)
9068 {
9069  /* Get the contents of an inversion list into a string SV so that they can
9070  * be printed out.  It uses the format traditionally done for debug tracing
9071  */
9072
9073  UV start, end;
9074  SV* output = newSVpvs("\n");
9075
9076  PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9077
9078  assert(! invlist_is_iterating(invlist));
9079
9080  invlist_iterinit(invlist);
9081  while (invlist_iternext(invlist, &start, &end)) {
9082   if (end == UV_MAX) {
9083    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9084   }
9085   else if (end != start) {
9086    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9087      start,       end);
9088   }
9089   else {
9090    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9091   }
9092  }
9093
9094  return output;
9095 }
9096 #endif
9097
9098 #ifndef PERL_IN_XSUB_RE
9099 void
9100 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9101       const char * const indent, SV* const invlist)
9102 {
9103  /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9104  * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9105  * the string 'indent'.  The output looks like this:
9106   [0] 0x000A .. 0x000D
9107   [2] 0x0085
9108   [4] 0x2028 .. 0x2029
9109   [6] 0x3104 .. INFINITY
9110  * This means that the first range of code points matched by the list are
9111  * 0xA through 0xD; the second range contains only the single code point
9112  * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9113  * are used to define each range (except if the final range extends to
9114  * infinity, only a single element is needed).  The array index of the
9115  * first element for the corresponding range is given in brackets. */
9116
9117  UV start, end;
9118  STRLEN count = 0;
9119
9120  PERL_ARGS_ASSERT__INVLIST_DUMP;
9121
9122  if (invlist_is_iterating(invlist)) {
9123   Perl_dump_indent(aTHX_ level, file,
9124    "%sCan't dump inversion list because is in middle of iterating\n",
9125    indent);
9126   return;
9127  }
9128
9129  invlist_iterinit(invlist);
9130  while (invlist_iternext(invlist, &start, &end)) {
9131   if (end == UV_MAX) {
9132    Perl_dump_indent(aTHX_ level, file,
9133          "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9134         indent, (UV)count, start);
9135   }
9136   else if (end != start) {
9137    Perl_dump_indent(aTHX_ level, file,
9138          "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9139         indent, (UV)count, start,         end);
9140   }
9141   else {
9142    Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9143            indent, (UV)count, start);
9144   }
9145   count += 2;
9146  }
9147 }
9148
9149 void
9150 Perl__load_PL_utf8_foldclosures (pTHX)
9151 {
9152  assert(! PL_utf8_foldclosures);
9153
9154  /* If the folds haven't been read in, call a fold function
9155  * to force that */
9156  if (! PL_utf8_tofold) {
9157   U8 dummy[UTF8_MAXBYTES_CASE+1];
9158
9159   /* This string is just a short named one above \xff */
9160   to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9161   assert(PL_utf8_tofold); /* Verify that worked */
9162  }
9163  PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9164 }
9165 #endif
9166
9167 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9168 bool
9169 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9170 {
9171  /* Return a boolean as to if the two passed in inversion lists are
9172  * identical.  The final argument, if TRUE, says to take the complement of
9173  * the second inversion list before doing the comparison */
9174
9175  const UV* array_a = invlist_array(a);
9176  const UV* array_b = invlist_array(b);
9177  UV len_a = _invlist_len(a);
9178  UV len_b = _invlist_len(b);
9179
9180  UV i = 0;      /* current index into the arrays */
9181  bool retval = TRUE;     /* Assume are identical until proven otherwise */
9182
9183  PERL_ARGS_ASSERT__INVLISTEQ;
9184
9185  /* If are to compare 'a' with the complement of b, set it
9186  * up so are looking at b's complement. */
9187  if (complement_b) {
9188
9189   /* The complement of nothing is everything, so <a> would have to have
9190   * just one element, starting at zero (ending at infinity) */
9191   if (len_b == 0) {
9192    return (len_a == 1 && array_a[0] == 0);
9193   }
9194   else if (array_b[0] == 0) {
9195
9196    /* Otherwise, to complement, we invert.  Here, the first element is
9197    * 0, just remove it.  To do this, we just pretend the array starts
9198    * one later */
9199
9200    array_b++;
9201    len_b--;
9202   }
9203   else {
9204
9205    /* But if the first element is not zero, we pretend the list starts
9206    * at the 0 that is always stored immediately before the array. */
9207    array_b--;
9208    len_b++;
9209   }
9210  }
9211
9212  /* Make sure that the lengths are the same, as well as the final element
9213  * before looping through the remainder.  (Thus we test the length, final,
9214  * and first elements right off the bat) */
9215  if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9216   retval = FALSE;
9217  }
9218  else for (i = 0; i < len_a - 1; i++) {
9219   if (array_a[i] != array_b[i]) {
9220    retval = FALSE;
9221    break;
9222   }
9223  }
9224
9225  return retval;
9226 }
9227 #endif
9228
9229 #undef HEADER_LENGTH
9230 #undef TO_INTERNAL_SIZE
9231 #undef FROM_INTERNAL_SIZE
9232 #undef INVLIST_VERSION_ID
9233
9234 /* End of inversion list object */
9235
9236 STATIC void
9237 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9238 {
9239  /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9240  * constructs, and updates RExC_flags with them.  On input, RExC_parse
9241  * should point to the first flag; it is updated on output to point to the
9242  * final ')' or ':'.  There needs to be at least one flag, or this will
9243  * abort */
9244
9245  /* for (?g), (?gc), and (?o) warnings; warning
9246  about (?c) will warn about (?g) -- japhy    */
9247
9248 #define WASTED_O  0x01
9249 #define WASTED_G  0x02
9250 #define WASTED_C  0x04
9251 #define WASTED_GC (WASTED_G|WASTED_C)
9252  I32 wastedflags = 0x00;
9253  U32 posflags = 0, negflags = 0;
9254  U32 *flagsp = &posflags;
9255  char has_charset_modifier = '\0';
9256  regex_charset cs;
9257  bool has_use_defaults = FALSE;
9258  const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9259
9260  PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9261
9262  /* '^' as an initial flag sets certain defaults */
9263  if (UCHARAT(RExC_parse) == '^') {
9264   RExC_parse++;
9265   has_use_defaults = TRUE;
9266   STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9267   set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9268           ? REGEX_UNICODE_CHARSET
9269           : REGEX_DEPENDS_CHARSET);
9270  }
9271
9272  cs = get_regex_charset(RExC_flags);
9273  if (cs == REGEX_DEPENDS_CHARSET
9274   && (RExC_utf8 || RExC_uni_semantics))
9275  {
9276   cs = REGEX_UNICODE_CHARSET;
9277  }
9278
9279  while (*RExC_parse) {
9280   /* && strchr("iogcmsx", *RExC_parse) */
9281   /* (?g), (?gc) and (?o) are useless here
9282   and must be globally applied -- japhy */
9283   switch (*RExC_parse) {
9284
9285    /* Code for the imsx flags */
9286    CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9287
9288    case LOCALE_PAT_MOD:
9289     if (has_charset_modifier) {
9290      goto excess_modifier;
9291     }
9292     else if (flagsp == &negflags) {
9293      goto neg_modifier;
9294     }
9295     cs = REGEX_LOCALE_CHARSET;
9296     has_charset_modifier = LOCALE_PAT_MOD;
9297     break;
9298    case UNICODE_PAT_MOD:
9299     if (has_charset_modifier) {
9300      goto excess_modifier;
9301     }
9302     else if (flagsp == &negflags) {
9303      goto neg_modifier;
9304     }
9305     cs = REGEX_UNICODE_CHARSET;
9306     has_charset_modifier = UNICODE_PAT_MOD;
9307     break;
9308    case ASCII_RESTRICT_PAT_MOD:
9309     if (flagsp == &negflags) {
9310      goto neg_modifier;
9311     }
9312     if (has_charset_modifier) {
9313      if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9314       goto excess_modifier;
9315      }
9316      /* Doubled modifier implies more restricted */
9317      cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9318     }
9319     else {
9320      cs = REGEX_ASCII_RESTRICTED_CHARSET;
9321     }
9322     has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9323     break;
9324    case DEPENDS_PAT_MOD:
9325     if (has_use_defaults) {
9326      goto fail_modifiers;
9327     }
9328     else if (flagsp == &negflags) {
9329      goto neg_modifier;
9330     }
9331     else if (has_charset_modifier) {
9332      goto excess_modifier;
9333     }
9334
9335     /* The dual charset means unicode semantics if the
9336     * pattern (or target, not known until runtime) are
9337     * utf8, or something in the pattern indicates unicode
9338     * semantics */
9339     cs = (RExC_utf8 || RExC_uni_semantics)
9340      ? REGEX_UNICODE_CHARSET
9341      : REGEX_DEPENDS_CHARSET;
9342     has_charset_modifier = DEPENDS_PAT_MOD;
9343     break;
9344    excess_modifier:
9345     RExC_parse++;
9346     if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9347      vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9348     }
9349     else if (has_charset_modifier == *(RExC_parse - 1)) {
9350      vFAIL2("Regexp modifier \"%c\" may not appear twice",
9351           *(RExC_parse - 1));
9352     }
9353     else {
9354      vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9355     }
9356     /*NOTREACHED*/
9357    neg_modifier:
9358     RExC_parse++;
9359     vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9360          *(RExC_parse - 1));
9361     /*NOTREACHED*/
9362    case ONCE_PAT_MOD: /* 'o' */
9363    case GLOBAL_PAT_MOD: /* 'g' */
9364     if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9365      const I32 wflagbit = *RExC_parse == 'o'
9366           ? WASTED_O
9367           : WASTED_G;
9368      if (! (wastedflags & wflagbit) ) {
9369       wastedflags |= wflagbit;
9370       /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9371       vWARN5(
9372        RExC_parse + 1,
9373        "Useless (%s%c) - %suse /%c modifier",
9374        flagsp == &negflags ? "?-" : "?",
9375        *RExC_parse,
9376        flagsp == &negflags ? "don't " : "",
9377        *RExC_parse
9378       );
9379      }
9380     }
9381     break;
9382
9383    case CONTINUE_PAT_MOD: /* 'c' */
9384     if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9385      if (! (wastedflags & WASTED_C) ) {
9386       wastedflags |= WASTED_GC;
9387       /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9388       vWARN3(
9389        RExC_parse + 1,
9390        "Useless (%sc) - %suse /gc modifier",
9391        flagsp == &negflags ? "?-" : "?",
9392        flagsp == &negflags ? "don't " : ""
9393       );
9394      }
9395     }
9396     break;
9397    case KEEPCOPY_PAT_MOD: /* 'p' */
9398     if (flagsp == &negflags) {
9399      if (SIZE_ONLY)
9400       ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9401     } else {
9402      *flagsp |= RXf_PMf_KEEPCOPY;
9403     }
9404     break;
9405    case '-':
9406     /* A flag is a default iff it is following a minus, so
9407     * if there is a minus, it means will be trying to
9408     * re-specify a default which is an error */
9409     if (has_use_defaults || flagsp == &negflags) {
9410      goto fail_modifiers;
9411     }
9412     flagsp = &negflags;
9413     wastedflags = 0;  /* reset so (?g-c) warns twice */
9414     break;
9415    case ':':
9416    case ')':
9417     RExC_flags |= posflags;
9418     RExC_flags &= ~negflags;
9419     set_regex_charset(&RExC_flags, cs);
9420     if (RExC_flags & RXf_PMf_FOLD) {
9421      RExC_contains_i = 1;
9422     }
9423     return;
9424     /*NOTREACHED*/
9425    default:
9426    fail_modifiers:
9427     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9428     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9429     vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9430      UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9431     /*NOTREACHED*/
9432   }
9433
9434   ++RExC_parse;
9435  }
9436 }
9437
9438 /*
9439  - reg - regular expression, i.e. main body or parenthesized thing
9440  *
9441  * Caller must absorb opening parenthesis.
9442  *
9443  * Combining parenthesis handling with the base level of regular expression
9444  * is a trifle forced, but the need to tie the tails of the branches to what
9445  * follows makes it hard to avoid.
9446  */
9447 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9448 #ifdef DEBUGGING
9449 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9450 #else
9451 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9452 #endif
9453
9454 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9455    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9456    needs to be restarted.
9457    Otherwise would only return NULL if regbranch() returns NULL, which
9458    cannot happen.  */
9459 STATIC regnode *
9460 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9461  /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9462  * 2 is like 1, but indicates that nextchar() has been called to advance
9463  * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9464  * this flag alerts us to the need to check for that */
9465 {
9466  dVAR;
9467  regnode *ret;  /* Will be the head of the group. */
9468  regnode *br;
9469  regnode *lastbr;
9470  regnode *ender = NULL;
9471  I32 parno = 0;
9472  I32 flags;
9473  U32 oregflags = RExC_flags;
9474  bool have_branch = 0;
9475  bool is_open = 0;
9476  I32 freeze_paren = 0;
9477  I32 after_freeze = 0;
9478  I32 num; /* numeric backreferences */
9479
9480  char * parse_start = RExC_parse; /* MJD */
9481  char * const oregcomp_parse = RExC_parse;
9482
9483  GET_RE_DEBUG_FLAGS_DECL;
9484
9485  PERL_ARGS_ASSERT_REG;
9486  DEBUG_PARSE("reg ");
9487
9488  *flagp = 0;    /* Tentatively. */
9489
9490
9491  /* Make an OPEN node, if parenthesized. */
9492  if (paren) {
9493
9494   /* Under /x, space and comments can be gobbled up between the '(' and
9495   * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9496   * intervening space, as the sequence is a token, and a token should be
9497   * indivisible */
9498   bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9499
9500   if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9501    char *start_verb = RExC_parse;
9502    STRLEN verb_len = 0;
9503    char *start_arg = NULL;
9504    unsigned char op = 0;
9505    int argok = 1;
9506    int internal_argval = 0; /* internal_argval is only useful if
9507           !argok */
9508
9509    if (has_intervening_patws) {
9510     RExC_parse++;
9511     vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9512    }
9513    while ( *RExC_parse && *RExC_parse != ')' ) {
9514     if ( *RExC_parse == ':' ) {
9515      start_arg = RExC_parse + 1;
9516      break;
9517     }
9518     RExC_parse++;
9519    }
9520    ++start_verb;
9521    verb_len = RExC_parse - start_verb;
9522    if ( start_arg ) {
9523     RExC_parse++;
9524     while ( *RExC_parse && *RExC_parse != ')' )
9525      RExC_parse++;
9526     if ( *RExC_parse != ')' )
9527      vFAIL("Unterminated verb pattern argument");
9528     if ( RExC_parse == start_arg )
9529      start_arg = NULL;
9530    } else {
9531     if ( *RExC_parse != ')' )
9532      vFAIL("Unterminated verb pattern");
9533    }
9534
9535    switch ( *start_verb ) {
9536    case 'A':  /* (*ACCEPT) */
9537     if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9538      op = ACCEPT;
9539      internal_argval = RExC_nestroot;
9540     }
9541     break;
9542    case 'C':  /* (*COMMIT) */
9543     if ( memEQs(start_verb,verb_len,"COMMIT") )
9544      op = COMMIT;
9545     break;
9546    case 'F':  /* (*FAIL) */
9547     if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9548      op = OPFAIL;
9549      argok = 0;
9550     }
9551     break;
9552    case ':':  /* (*:NAME) */
9553    case 'M':  /* (*MARK:NAME) */
9554     if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9555      op = MARKPOINT;
9556      argok = -1;
9557     }
9558     break;
9559    case 'P':  /* (*PRUNE) */
9560     if ( memEQs(start_verb,verb_len,"PRUNE") )
9561      op = PRUNE;
9562     break;
9563    case 'S':   /* (*SKIP) */
9564     if ( memEQs(start_verb,verb_len,"SKIP") )
9565      op = SKIP;
9566     break;
9567    case 'T':  /* (*THEN) */
9568     /* [19:06] <TimToady> :: is then */
9569     if ( memEQs(start_verb,verb_len,"THEN") ) {
9570      op = CUTGROUP;
9571      RExC_seen |= REG_CUTGROUP_SEEN;
9572     }
9573     break;
9574    }
9575    if ( ! op ) {
9576     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9577     vFAIL2utf8f(
9578      "Unknown verb pattern '%"UTF8f"'",
9579      UTF8fARG(UTF, verb_len, start_verb));
9580    }
9581    if ( argok ) {
9582     if ( start_arg && internal_argval ) {
9583      vFAIL3("Verb pattern '%.*s' may not have an argument",
9584       verb_len, start_verb);
9585     } else if ( argok < 0 && !start_arg ) {
9586      vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9587       verb_len, start_verb);
9588     } else {
9589      ret = reganode(pRExC_state, op, internal_argval);
9590      if ( ! internal_argval && ! SIZE_ONLY ) {
9591       if (start_arg) {
9592        SV *sv = newSVpvn( start_arg,
9593            RExC_parse - start_arg);
9594        ARG(ret) = add_data( pRExC_state,
9595             STR_WITH_LEN("S"));
9596        RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9597        ret->flags = 0;
9598       } else {
9599        ret->flags = 1;
9600       }
9601      }
9602     }
9603     if (!internal_argval)
9604      RExC_seen |= REG_VERBARG_SEEN;
9605    } else if ( start_arg ) {
9606     vFAIL3("Verb pattern '%.*s' may not have an argument",
9607       verb_len, start_verb);
9608    } else {
9609     ret = reg_node(pRExC_state, op);
9610    }
9611    nextchar(pRExC_state);
9612    return ret;
9613   }
9614   else if (*RExC_parse == '?') { /* (?...) */
9615    bool is_logical = 0;
9616    const char * const seqstart = RExC_parse;
9617    if (has_intervening_patws) {
9618     RExC_parse++;
9619     vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9620    }
9621
9622    RExC_parse++;
9623    paren = *RExC_parse++;
9624    ret = NULL;   /* For look-ahead/behind. */
9625    switch (paren) {
9626
9627    case 'P': /* (?P...) variants for those used to PCRE/Python */
9628     paren = *RExC_parse++;
9629     if ( paren == '<')         /* (?P<...>) named capture */
9630      goto named_capture;
9631     else if (paren == '>') {   /* (?P>name) named recursion */
9632      goto named_recursion;
9633     }
9634     else if (paren == '=') {   /* (?P=...)  named backref */
9635      /* this pretty much dupes the code for \k<NAME> in
9636      * regatom(), if you change this make sure you change that
9637      * */
9638      char* name_start = RExC_parse;
9639      U32 num = 0;
9640      SV *sv_dat = reg_scan_name(pRExC_state,
9641       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9642      if (RExC_parse == name_start || *RExC_parse != ')')
9643       /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9644       vFAIL2("Sequence %.3s... not terminated",parse_start);
9645
9646      if (!SIZE_ONLY) {
9647       num = add_data( pRExC_state, STR_WITH_LEN("S"));
9648       RExC_rxi->data->data[num]=(void*)sv_dat;
9649       SvREFCNT_inc_simple_void(sv_dat);
9650      }
9651      RExC_sawback = 1;
9652      ret = reganode(pRExC_state,
9653         ((! FOLD)
9654          ? NREF
9655          : (ASCII_FOLD_RESTRICTED)
9656          ? NREFFA
9657          : (AT_LEAST_UNI_SEMANTICS)
9658           ? NREFFU
9659           : (LOC)
9660           ? NREFFL
9661           : NREFF),
9662          num);
9663      *flagp |= HASWIDTH;
9664
9665      Set_Node_Offset(ret, parse_start+1);
9666      Set_Node_Cur_Length(ret, parse_start);
9667
9668      nextchar(pRExC_state);
9669      return ret;
9670     }
9671     RExC_parse++;
9672     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9673     vFAIL3("Sequence (%.*s...) not recognized",
9674         RExC_parse-seqstart, seqstart);
9675     /*NOTREACHED*/
9676    case '<':           /* (?<...) */
9677     if (*RExC_parse == '!')
9678      paren = ',';
9679     else if (*RExC_parse != '=')
9680    named_capture:
9681     {               /* (?<...>) */
9682      char *name_start;
9683      SV *svname;
9684      paren= '>';
9685    case '\'':          /* (?'...') */
9686       name_start= RExC_parse;
9687       svname = reg_scan_name(pRExC_state,
9688       SIZE_ONLY    /* reverse test from the others */
9689       ? REG_RSN_RETURN_NAME
9690       : REG_RSN_RETURN_NULL);
9691      if (RExC_parse == name_start || *RExC_parse != paren)
9692       vFAIL2("Sequence (?%c... not terminated",
9693        paren=='>' ? '<' : paren);
9694      if (SIZE_ONLY) {
9695       HE *he_str;
9696       SV *sv_dat = NULL;
9697       if (!svname) /* shouldn't happen */
9698        Perl_croak(aTHX_
9699         "panic: reg_scan_name returned NULL");
9700       if (!RExC_paren_names) {
9701        RExC_paren_names= newHV();
9702        sv_2mortal(MUTABLE_SV(RExC_paren_names));
9703 #ifdef DEBUGGING
9704        RExC_paren_name_list= newAV();
9705        sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9706 #endif
9707       }
9708       he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9709       if ( he_str )
9710        sv_dat = HeVAL(he_str);
9711       if ( ! sv_dat ) {
9712        /* croak baby croak */
9713        Perl_croak(aTHX_
9714         "panic: paren_name hash element allocation failed");
9715       } else if ( SvPOK(sv_dat) ) {
9716        /* (?|...) can mean we have dupes so scan to check
9717        its already been stored. Maybe a flag indicating
9718        we are inside such a construct would be useful,
9719        but the arrays are likely to be quite small, so
9720        for now we punt -- dmq */
9721        IV count = SvIV(sv_dat);
9722        I32 *pv = (I32*)SvPVX(sv_dat);
9723        IV i;
9724        for ( i = 0 ; i < count ; i++ ) {
9725         if ( pv[i] == RExC_npar ) {
9726          count = 0;
9727          break;
9728         }
9729        }
9730        if ( count ) {
9731         pv = (I32*)SvGROW(sv_dat,
9732             SvCUR(sv_dat) + sizeof(I32)+1);
9733         SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9734         pv[count] = RExC_npar;
9735         SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9736        }
9737       } else {
9738        (void)SvUPGRADE(sv_dat,SVt_PVNV);
9739        sv_setpvn(sv_dat, (char *)&(RExC_npar),
9740                 sizeof(I32));
9741        SvIOK_on(sv_dat);
9742        SvIV_set(sv_dat, 1);
9743       }
9744 #ifdef DEBUGGING
9745       /* Yes this does cause a memory leak in debugging Perls
9746       * */
9747       if (!av_store(RExC_paren_name_list,
9748          RExC_npar, SvREFCNT_inc(svname)))
9749        SvREFCNT_dec_NN(svname);
9750 #endif
9751
9752       /*sv_dump(sv_dat);*/
9753      }
9754      nextchar(pRExC_state);
9755      paren = 1;
9756      goto capturing_parens;
9757     }
9758     RExC_seen |= REG_LOOKBEHIND_SEEN;
9759     RExC_in_lookbehind++;
9760     RExC_parse++;
9761     /* FALLTHROUGH */
9762    case '=':           /* (?=...) */
9763     RExC_seen_zerolen++;
9764     break;
9765    case '!':           /* (?!...) */
9766     RExC_seen_zerolen++;
9767     if (*RExC_parse == ')') {
9768      ret=reg_node(pRExC_state, OPFAIL);
9769      nextchar(pRExC_state);
9770      return ret;
9771     }
9772     break;
9773    case '|':           /* (?|...) */
9774     /* branch reset, behave like a (?:...) except that
9775     buffers in alternations share the same numbers */
9776     paren = ':';
9777     after_freeze = freeze_paren = RExC_npar;
9778     break;
9779    case ':':           /* (?:...) */
9780    case '>':           /* (?>...) */
9781     break;
9782    case '$':           /* (?$...) */
9783    case '@':           /* (?@...) */
9784     vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9785     break;
9786    case '0' :           /* (?0) */
9787    case 'R' :           /* (?R) */
9788     if (*RExC_parse != ')')
9789      FAIL("Sequence (?R) not terminated");
9790     ret = reg_node(pRExC_state, GOSTART);
9791      RExC_seen |= REG_GOSTART_SEEN;
9792     *flagp |= POSTPONED;
9793     nextchar(pRExC_state);
9794     return ret;
9795     /*notreached*/
9796    /* named and numeric backreferences */
9797    case '&':            /* (?&NAME) */
9798     parse_start = RExC_parse - 1;
9799    named_recursion:
9800     {
9801       SV *sv_dat = reg_scan_name(pRExC_state,
9802        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9803       num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9804     }
9805     if (RExC_parse == RExC_end || *RExC_parse != ')')
9806      vFAIL("Sequence (?&... not terminated");
9807     goto gen_recurse_regop;
9808     assert(0); /* NOT REACHED */
9809    case '+':
9810     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9811      RExC_parse++;
9812      vFAIL("Illegal pattern");
9813     }
9814     goto parse_recursion;
9815     /* NOT REACHED*/
9816    case '-': /* (?-1) */
9817     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9818      RExC_parse--; /* rewind to let it be handled later */
9819      goto parse_flags;
9820     }
9821     /* FALLTHROUGH */
9822    case '1': case '2': case '3': case '4': /* (?1) */
9823    case '5': case '6': case '7': case '8': case '9':
9824     RExC_parse--;
9825    parse_recursion:
9826     num = atoi(RExC_parse);
9827     parse_start = RExC_parse - 1; /* MJD */
9828     if (*RExC_parse == '-')
9829      RExC_parse++;
9830     while (isDIGIT(*RExC_parse))
9831       RExC_parse++;
9832     if (*RExC_parse!=')')
9833      vFAIL("Expecting close bracket");
9834
9835    gen_recurse_regop:
9836     if ( paren == '-' ) {
9837      /*
9838      Diagram of capture buffer numbering.
9839      Top line is the normal capture buffer numbers
9840      Bottom line is the negative indexing as from
9841      the X (the (?-2))
9842
9843      +   1 2    3 4 5 X          6 7
9844      /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9845      -   5 4    3 2 1 X          x x
9846
9847      */
9848      num = RExC_npar + num;
9849      if (num < 1)  {
9850       RExC_parse++;
9851       vFAIL("Reference to nonexistent group");
9852      }
9853     } else if ( paren == '+' ) {
9854      num = RExC_npar + num - 1;
9855     }
9856
9857     ret = reganode(pRExC_state, GOSUB, num);
9858     if (!SIZE_ONLY) {
9859      if (num > (I32)RExC_rx->nparens) {
9860       RExC_parse++;
9861       vFAIL("Reference to nonexistent group");
9862      }
9863      ARG2L_SET( ret, RExC_recurse_count++);
9864      RExC_emit++;
9865      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9866       "Recurse #%"UVuf" to %"IVdf"\n",
9867        (UV)ARG(ret), (IV)ARG2L(ret)));
9868     } else {
9869      RExC_size++;
9870      }
9871      RExC_seen |= REG_RECURSE_SEEN;
9872     Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9873     Set_Node_Offset(ret, parse_start); /* MJD */
9874
9875     *flagp |= POSTPONED;
9876     nextchar(pRExC_state);
9877     return ret;
9878
9879    assert(0); /* NOT REACHED */
9880
9881    case '?':           /* (??...) */
9882     is_logical = 1;
9883     if (*RExC_parse != '{') {
9884      RExC_parse++;
9885      /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9886      vFAIL2utf8f(
9887       "Sequence (%"UTF8f"...) not recognized",
9888       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9889      /*NOTREACHED*/
9890     }
9891     *flagp |= POSTPONED;
9892     paren = *RExC_parse++;
9893     /* FALLTHROUGH */
9894    case '{':           /* (?{...}) */
9895    {
9896     U32 n = 0;
9897     struct reg_code_block *cb;
9898
9899     RExC_seen_zerolen++;
9900
9901     if (   !pRExC_state->num_code_blocks
9902      || pRExC_state->code_index >= pRExC_state->num_code_blocks
9903      || pRExC_state->code_blocks[pRExC_state->code_index].start
9904       != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9905        - RExC_start)
9906     ) {
9907      if (RExC_pm_flags & PMf_USE_RE_EVAL)
9908       FAIL("panic: Sequence (?{...}): no code block found\n");
9909      FAIL("Eval-group not allowed at runtime, use re 'eval'");
9910     }
9911     /* this is a pre-compiled code block (?{...}) */
9912     cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9913     RExC_parse = RExC_start + cb->end;
9914     if (!SIZE_ONLY) {
9915      OP *o = cb->block;
9916      if (cb->src_regex) {
9917       n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9918       RExC_rxi->data->data[n] =
9919        (void*)SvREFCNT_inc((SV*)cb->src_regex);
9920       RExC_rxi->data->data[n+1] = (void*)o;
9921      }
9922      else {
9923       n = add_data(pRExC_state,
9924        (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9925       RExC_rxi->data->data[n] = (void*)o;
9926      }
9927     }
9928     pRExC_state->code_index++;
9929     nextchar(pRExC_state);
9930
9931     if (is_logical) {
9932      regnode *eval;
9933      ret = reg_node(pRExC_state, LOGICAL);
9934      eval = reganode(pRExC_state, EVAL, n);
9935      if (!SIZE_ONLY) {
9936       ret->flags = 2;
9937       /* for later propagation into (??{}) return value */
9938       eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9939      }
9940      REGTAIL(pRExC_state, ret, eval);
9941      /* deal with the length of this later - MJD */
9942      return ret;
9943     }
9944     ret = reganode(pRExC_state, EVAL, n);
9945     Set_Node_Length(ret, RExC_parse - parse_start + 1);
9946     Set_Node_Offset(ret, parse_start);
9947     return ret;
9948    }
9949    case '(':           /* (?(?{...})...) and (?(?=...)...) */
9950    {
9951     int is_define= 0;
9952     if (RExC_parse[0] == '?') {        /* (?(?...)) */
9953      if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9954       || RExC_parse[1] == '<'
9955       || RExC_parse[1] == '{') { /* Lookahead or eval. */
9956       I32 flag;
9957       regnode *tail;
9958
9959       ret = reg_node(pRExC_state, LOGICAL);
9960       if (!SIZE_ONLY)
9961        ret->flags = 1;
9962
9963       tail = reg(pRExC_state, 1, &flag, depth+1);
9964       if (flag & RESTART_UTF8) {
9965        *flagp = RESTART_UTF8;
9966        return NULL;
9967       }
9968       REGTAIL(pRExC_state, ret, tail);
9969       goto insert_if;
9970      }
9971     }
9972     else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9973       || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9974     {
9975      char ch = RExC_parse[0] == '<' ? '>' : '\'';
9976      char *name_start= RExC_parse++;
9977      U32 num = 0;
9978      SV *sv_dat=reg_scan_name(pRExC_state,
9979       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9980      if (RExC_parse == name_start || *RExC_parse != ch)
9981       vFAIL2("Sequence (?(%c... not terminated",
9982        (ch == '>' ? '<' : ch));
9983      RExC_parse++;
9984      if (!SIZE_ONLY) {
9985       num = add_data( pRExC_state, STR_WITH_LEN("S"));
9986       RExC_rxi->data->data[num]=(void*)sv_dat;
9987       SvREFCNT_inc_simple_void(sv_dat);
9988      }
9989      ret = reganode(pRExC_state,NGROUPP,num);
9990      goto insert_if_check_paren;
9991     }
9992     else if (RExC_parse[0] == 'D' &&
9993       RExC_parse[1] == 'E' &&
9994       RExC_parse[2] == 'F' &&
9995       RExC_parse[3] == 'I' &&
9996       RExC_parse[4] == 'N' &&
9997       RExC_parse[5] == 'E')
9998     {
9999      ret = reganode(pRExC_state,DEFINEP,0);
10000      RExC_parse +=6 ;
10001      is_define = 1;
10002      goto insert_if_check_paren;
10003     }
10004     else if (RExC_parse[0] == 'R') {
10005      RExC_parse++;
10006      parno = 0;
10007      if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10008       parno = atoi(RExC_parse++);
10009       while (isDIGIT(*RExC_parse))
10010        RExC_parse++;
10011      } else if (RExC_parse[0] == '&') {
10012       SV *sv_dat;
10013       RExC_parse++;
10014       sv_dat = reg_scan_name(pRExC_state,
10015        SIZE_ONLY
10016        ? REG_RSN_RETURN_NULL
10017        : REG_RSN_RETURN_DATA);
10018        parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10019      }
10020      ret = reganode(pRExC_state,INSUBP,parno);
10021      goto insert_if_check_paren;
10022     }
10023     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10024      /* (?(1)...) */
10025      char c;
10026      char *tmp;
10027      parno = atoi(RExC_parse++);
10028
10029      while (isDIGIT(*RExC_parse))
10030       RExC_parse++;
10031      ret = reganode(pRExC_state, GROUPP, parno);
10032
10033     insert_if_check_paren:
10034      if (*(tmp = nextchar(pRExC_state)) != ')') {
10035       /* nextchar also skips comments, so undo its work
10036       * and skip over the the next character.
10037       */
10038       RExC_parse = tmp;
10039       RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10040       vFAIL("Switch condition not recognized");
10041      }
10042     insert_if:
10043      REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10044      br = regbranch(pRExC_state, &flags, 1,depth+1);
10045      if (br == NULL) {
10046       if (flags & RESTART_UTF8) {
10047        *flagp = RESTART_UTF8;
10048        return NULL;
10049       }
10050       FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10051        (UV) flags);
10052      } else
10053       REGTAIL(pRExC_state, br, reganode(pRExC_state,
10054               LONGJMP, 0));
10055      c = *nextchar(pRExC_state);
10056      if (flags&HASWIDTH)
10057       *flagp |= HASWIDTH;
10058      if (c == '|') {
10059       if (is_define)
10060        vFAIL("(?(DEFINE)....) does not allow branches");
10061
10062       /* Fake one for optimizer.  */
10063       lastbr = reganode(pRExC_state, IFTHEN, 0);
10064
10065       if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10066        if (flags & RESTART_UTF8) {
10067         *flagp = RESTART_UTF8;
10068         return NULL;
10069        }
10070        FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10071         (UV) flags);
10072       }
10073       REGTAIL(pRExC_state, ret, lastbr);
10074       if (flags&HASWIDTH)
10075        *flagp |= HASWIDTH;
10076       c = *nextchar(pRExC_state);
10077      }
10078      else
10079       lastbr = NULL;
10080      if (c != ')')
10081       vFAIL("Switch (?(condition)... contains too many branches");
10082      ender = reg_node(pRExC_state, TAIL);
10083      REGTAIL(pRExC_state, br, ender);
10084      if (lastbr) {
10085       REGTAIL(pRExC_state, lastbr, ender);
10086       REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10087      }
10088      else
10089       REGTAIL(pRExC_state, ret, ender);
10090      RExC_size++; /* XXX WHY do we need this?!!
10091          For large programs it seems to be required
10092          but I can't figure out why. -- dmq*/
10093      return ret;
10094     }
10095     else {
10096      RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10097      vFAIL("Unknown switch condition (?(...))");
10098     }
10099    }
10100    case '[':           /* (?[ ... ]) */
10101     return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10102           oregcomp_parse);
10103    case 0:
10104     RExC_parse--; /* for vFAIL to print correctly */
10105     vFAIL("Sequence (? incomplete");
10106     break;
10107    default: /* e.g., (?i) */
10108     --RExC_parse;
10109    parse_flags:
10110     parse_lparen_question_flags(pRExC_state);
10111     if (UCHARAT(RExC_parse) != ':') {
10112      nextchar(pRExC_state);
10113      *flagp = TRYAGAIN;
10114      return NULL;
10115     }
10116     paren = ':';
10117     nextchar(pRExC_state);
10118     ret = NULL;
10119     goto parse_rest;
10120    } /* end switch */
10121   }
10122   else {                  /* (...) */
10123   capturing_parens:
10124    parno = RExC_npar;
10125    RExC_npar++;
10126
10127    ret = reganode(pRExC_state, OPEN, parno);
10128    if (!SIZE_ONLY ){
10129     if (!RExC_nestroot)
10130      RExC_nestroot = parno;
10131     if (RExC_seen & REG_RECURSE_SEEN
10132      && !RExC_open_parens[parno-1])
10133     {
10134      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10135       "Setting open paren #%"IVdf" to %d\n",
10136       (IV)parno, REG_NODE_NUM(ret)));
10137      RExC_open_parens[parno-1]= ret;
10138     }
10139    }
10140    Set_Node_Length(ret, 1); /* MJD */
10141    Set_Node_Offset(ret, RExC_parse); /* MJD */
10142    is_open = 1;
10143   }
10144  }
10145  else                        /* ! paren */
10146   ret = NULL;
10147
10148    parse_rest:
10149  /* Pick up the branches, linking them together. */
10150  parse_start = RExC_parse;   /* MJD */
10151  br = regbranch(pRExC_state, &flags, 1,depth+1);
10152
10153  /*     branch_len = (paren != 0); */
10154
10155  if (br == NULL) {
10156   if (flags & RESTART_UTF8) {
10157    *flagp = RESTART_UTF8;
10158    return NULL;
10159   }
10160   FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10161  }
10162  if (*RExC_parse == '|') {
10163   if (!SIZE_ONLY && RExC_extralen) {
10164    reginsert(pRExC_state, BRANCHJ, br, depth+1);
10165   }
10166   else {                  /* MJD */
10167    reginsert(pRExC_state, BRANCH, br, depth+1);
10168    Set_Node_Length(br, paren != 0);
10169    Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10170   }
10171   have_branch = 1;
10172   if (SIZE_ONLY)
10173    RExC_extralen += 1;  /* For BRANCHJ-BRANCH. */
10174  }
10175  else if (paren == ':') {
10176   *flagp |= flags&SIMPLE;
10177  }
10178  if (is_open) {    /* Starts with OPEN. */
10179   REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10180  }
10181  else if (paren != '?')  /* Not Conditional */
10182   ret = br;
10183  *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10184  lastbr = br;
10185  while (*RExC_parse == '|') {
10186   if (!SIZE_ONLY && RExC_extralen) {
10187    ender = reganode(pRExC_state, LONGJMP,0);
10188
10189    /* Append to the previous. */
10190    REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10191   }
10192   if (SIZE_ONLY)
10193    RExC_extralen += 2;  /* Account for LONGJMP. */
10194   nextchar(pRExC_state);
10195   if (freeze_paren) {
10196    if (RExC_npar > after_freeze)
10197     after_freeze = RExC_npar;
10198    RExC_npar = freeze_paren;
10199   }
10200   br = regbranch(pRExC_state, &flags, 0, depth+1);
10201
10202   if (br == NULL) {
10203    if (flags & RESTART_UTF8) {
10204     *flagp = RESTART_UTF8;
10205     return NULL;
10206    }
10207    FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10208   }
10209   REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10210   lastbr = br;
10211   *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10212  }
10213
10214  if (have_branch || paren != ':') {
10215   /* Make a closing node, and hook it on the end. */
10216   switch (paren) {
10217   case ':':
10218    ender = reg_node(pRExC_state, TAIL);
10219    break;
10220   case 1: case 2:
10221    ender = reganode(pRExC_state, CLOSE, parno);
10222    if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10223     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10224       "Setting close paren #%"IVdf" to %d\n",
10225       (IV)parno, REG_NODE_NUM(ender)));
10226     RExC_close_parens[parno-1]= ender;
10227     if (RExC_nestroot == parno)
10228      RExC_nestroot = 0;
10229    }
10230    Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10231    Set_Node_Length(ender,1); /* MJD */
10232    break;
10233   case '<':
10234   case ',':
10235   case '=':
10236   case '!':
10237    *flagp &= ~HASWIDTH;
10238    /* FALLTHROUGH */
10239   case '>':
10240    ender = reg_node(pRExC_state, SUCCEED);
10241    break;
10242   case 0:
10243    ender = reg_node(pRExC_state, END);
10244    if (!SIZE_ONLY) {
10245     assert(!RExC_opend); /* there can only be one! */
10246     RExC_opend = ender;
10247    }
10248    break;
10249   }
10250   DEBUG_PARSE_r(if (!SIZE_ONLY) {
10251    SV * const mysv_val1=sv_newmortal();
10252    SV * const mysv_val2=sv_newmortal();
10253    DEBUG_PARSE_MSG("lsbr");
10254    regprop(RExC_rx, mysv_val1, lastbr, NULL);
10255    regprop(RExC_rx, mysv_val2, ender, NULL);
10256    PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10257       SvPV_nolen_const(mysv_val1),
10258       (IV)REG_NODE_NUM(lastbr),
10259       SvPV_nolen_const(mysv_val2),
10260       (IV)REG_NODE_NUM(ender),
10261       (IV)(ender - lastbr)
10262    );
10263   });
10264   REGTAIL(pRExC_state, lastbr, ender);
10265
10266   if (have_branch && !SIZE_ONLY) {
10267    char is_nothing= 1;
10268    if (depth==1)
10269     RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10270
10271    /* Hook the tails of the branches to the closing node. */
10272    for (br = ret; br; br = regnext(br)) {
10273     const U8 op = PL_regkind[OP(br)];
10274     if (op == BRANCH) {
10275      REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10276      if ( OP(NEXTOPER(br)) != NOTHING
10277       || regnext(NEXTOPER(br)) != ender)
10278       is_nothing= 0;
10279     }
10280     else if (op == BRANCHJ) {
10281      REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10282      /* for now we always disable this optimisation * /
10283      if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10284       || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10285      */
10286       is_nothing= 0;
10287     }
10288    }
10289    if (is_nothing) {
10290     br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10291     DEBUG_PARSE_r(if (!SIZE_ONLY) {
10292      SV * const mysv_val1=sv_newmortal();
10293      SV * const mysv_val2=sv_newmortal();
10294      DEBUG_PARSE_MSG("NADA");
10295      regprop(RExC_rx, mysv_val1, ret, NULL);
10296      regprop(RExC_rx, mysv_val2, ender, NULL);
10297      PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10298         SvPV_nolen_const(mysv_val1),
10299         (IV)REG_NODE_NUM(ret),
10300         SvPV_nolen_const(mysv_val2),
10301         (IV)REG_NODE_NUM(ender),
10302         (IV)(ender - ret)
10303      );
10304     });
10305     OP(br)= NOTHING;
10306     if (OP(ender) == TAIL) {
10307      NEXT_OFF(br)= 0;
10308      RExC_emit= br + 1;
10309     } else {
10310      regnode *opt;
10311      for ( opt= br + 1; opt < ender ; opt++ )
10312       OP(opt)= OPTIMIZED;
10313      NEXT_OFF(br)= ender - br;
10314     }
10315    }
10316   }
10317  }
10318
10319  {
10320   const char *p;
10321   static const char parens[] = "=!<,>";
10322
10323   if (paren && (p = strchr(parens, paren))) {
10324    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10325    int flag = (p - parens) > 1;
10326
10327    if (paren == '>')
10328     node = SUSPEND, flag = 0;
10329    reginsert(pRExC_state, node,ret, depth+1);
10330    Set_Node_Cur_Length(ret, parse_start);
10331    Set_Node_Offset(ret, parse_start + 1);
10332    ret->flags = flag;
10333    REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10334   }
10335  }
10336
10337  /* Check for proper termination. */
10338  if (paren) {
10339   /* restore original flags, but keep (?p) */
10340   RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10341   if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10342    RExC_parse = oregcomp_parse;
10343    vFAIL("Unmatched (");
10344   }
10345  }
10346  else if (!paren && RExC_parse < RExC_end) {
10347   if (*RExC_parse == ')') {
10348    RExC_parse++;
10349    vFAIL("Unmatched )");
10350   }
10351   else
10352    FAIL("Junk on end of regexp"); /* "Can't happen". */
10353   assert(0); /* NOTREACHED */
10354  }
10355
10356  if (RExC_in_lookbehind) {
10357   RExC_in_lookbehind--;
10358  }
10359  if (after_freeze > RExC_npar)
10360   RExC_npar = after_freeze;
10361  return(ret);
10362 }
10363
10364 /*
10365  - regbranch - one alternative of an | operator
10366  *
10367  * Implements the concatenation operator.
10368  *
10369  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10370  * restarted.
10371  */
10372 STATIC regnode *
10373 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10374 {
10375  dVAR;
10376  regnode *ret;
10377  regnode *chain = NULL;
10378  regnode *latest;
10379  I32 flags = 0, c = 0;
10380  GET_RE_DEBUG_FLAGS_DECL;
10381
10382  PERL_ARGS_ASSERT_REGBRANCH;
10383
10384  DEBUG_PARSE("brnc");
10385
10386  if (first)
10387   ret = NULL;
10388  else {
10389   if (!SIZE_ONLY && RExC_extralen)
10390    ret = reganode(pRExC_state, BRANCHJ,0);
10391   else {
10392    ret = reg_node(pRExC_state, BRANCH);
10393    Set_Node_Length(ret, 1);
10394   }
10395  }
10396
10397  if (!first && SIZE_ONLY)
10398   RExC_extralen += 1;   /* BRANCHJ */
10399
10400  *flagp = WORST;   /* Tentatively. */
10401
10402  RExC_parse--;
10403  nextchar(pRExC_state);
10404  while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10405   flags &= ~TRYAGAIN;
10406   latest = regpiece(pRExC_state, &flags,depth+1);
10407   if (latest == NULL) {
10408    if (flags & TRYAGAIN)
10409     continue;
10410    if (flags & RESTART_UTF8) {
10411     *flagp = RESTART_UTF8;
10412     return NULL;
10413    }
10414    FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10415   }
10416   else if (ret == NULL)
10417    ret = latest;
10418   *flagp |= flags&(HASWIDTH|POSTPONED);
10419   if (chain == NULL)  /* First piece. */
10420    *flagp |= flags&SPSTART;
10421   else {
10422    RExC_naughty++;
10423    REGTAIL(pRExC_state, chain, latest);
10424   }
10425   chain = latest;
10426   c++;
10427  }
10428  if (chain == NULL) { /* Loop ran zero times. */
10429   chain = reg_node(pRExC_state, NOTHING);
10430   if (ret == NULL)
10431    ret = chain;
10432  }
10433  if (c == 1) {
10434   *flagp |= flags&SIMPLE;
10435  }
10436
10437  return ret;
10438 }
10439
10440 /*
10441  - regpiece - something followed by possible [*+?]
10442  *
10443  * Note that the branching code sequences used for ? and the general cases
10444  * of * and + are somewhat optimized:  they use the same NOTHING node as
10445  * both the endmarker for their branch list and the body of the last branch.
10446  * It might seem that this node could be dispensed with entirely, but the
10447  * endmarker role is not redundant.
10448  *
10449  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10450  * TRYAGAIN.
10451  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10452  * restarted.
10453  */
10454 STATIC regnode *
10455 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10456 {
10457  dVAR;
10458  regnode *ret;
10459  char op;
10460  char *next;
10461  I32 flags;
10462  const char * const origparse = RExC_parse;
10463  I32 min;
10464  I32 max = REG_INFTY;
10465 #ifdef RE_TRACK_PATTERN_OFFSETS
10466  char *parse_start;
10467 #endif
10468  const char *maxpos = NULL;
10469
10470  /* Save the original in case we change the emitted regop to a FAIL. */
10471  regnode * const orig_emit = RExC_emit;
10472
10473  GET_RE_DEBUG_FLAGS_DECL;
10474
10475  PERL_ARGS_ASSERT_REGPIECE;
10476
10477  DEBUG_PARSE("piec");
10478
10479  ret = regatom(pRExC_state, &flags,depth+1);
10480  if (ret == NULL) {
10481   if (flags & (TRYAGAIN|RESTART_UTF8))
10482    *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10483   else
10484    FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10485   return(NULL);
10486  }
10487
10488  op = *RExC_parse;
10489
10490  if (op == '{' && regcurly(RExC_parse)) {
10491   maxpos = NULL;
10492 #ifdef RE_TRACK_PATTERN_OFFSETS
10493   parse_start = RExC_parse; /* MJD */
10494 #endif
10495   next = RExC_parse + 1;
10496   while (isDIGIT(*next) || *next == ',') {
10497    if (*next == ',') {
10498     if (maxpos)
10499      break;
10500     else
10501      maxpos = next;
10502    }
10503    next++;
10504   }
10505   if (*next == '}') {  /* got one */
10506    if (!maxpos)
10507     maxpos = next;
10508    RExC_parse++;
10509    min = atoi(RExC_parse);
10510    if (*maxpos == ',')
10511     maxpos++;
10512    else
10513     maxpos = RExC_parse;
10514    max = atoi(maxpos);
10515    if (!max && *maxpos != '0')
10516     max = REG_INFTY;  /* meaning "infinity" */
10517    else if (max >= REG_INFTY)
10518     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10519    RExC_parse = next;
10520    nextchar(pRExC_state);
10521    if (max < min) {    /* If can't match, warn and optimize to fail
10522         unconditionally */
10523     if (SIZE_ONLY) {
10524      ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10525
10526      /* We can't back off the size because we have to reserve
10527      * enough space for all the things we are about to throw
10528      * away, but we can shrink it by the ammount we are about
10529      * to re-use here */
10530      RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10531     }
10532     else {
10533      RExC_emit = orig_emit;
10534     }
10535     ret = reg_node(pRExC_state, OPFAIL);
10536     return ret;
10537    }
10538    else if (min == max
10539      && RExC_parse < RExC_end
10540      && (*RExC_parse == '?' || *RExC_parse == '+'))
10541    {
10542     if (SIZE_ONLY) {
10543      ckWARN2reg(RExC_parse + 1,
10544        "Useless use of greediness modifier '%c'",
10545        *RExC_parse);
10546     }
10547     /* Absorb the modifier, so later code doesn't see nor use
10548      * it */
10549     nextchar(pRExC_state);
10550    }
10551
10552   do_curly:
10553    if ((flags&SIMPLE)) {
10554     RExC_naughty += 2 + RExC_naughty / 2;
10555     reginsert(pRExC_state, CURLY, ret, depth+1);
10556     Set_Node_Offset(ret, parse_start+1); /* MJD */
10557     Set_Node_Cur_Length(ret, parse_start);
10558    }
10559    else {
10560     regnode * const w = reg_node(pRExC_state, WHILEM);
10561
10562     w->flags = 0;
10563     REGTAIL(pRExC_state, ret, w);
10564     if (!SIZE_ONLY && RExC_extralen) {
10565      reginsert(pRExC_state, LONGJMP,ret, depth+1);
10566      reginsert(pRExC_state, NOTHING,ret, depth+1);
10567      NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10568     }
10569     reginsert(pRExC_state, CURLYX,ret, depth+1);
10570         /* MJD hk */
10571     Set_Node_Offset(ret, parse_start+1);
10572     Set_Node_Length(ret,
10573         op == '{' ? (RExC_parse - parse_start) : 1);
10574
10575     if (!SIZE_ONLY && RExC_extralen)
10576      NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10577     REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10578     if (SIZE_ONLY)
10579      RExC_whilem_seen++, RExC_extralen += 3;
10580     RExC_naughty += 4 + RExC_naughty; /* compound interest */
10581    }
10582    ret->flags = 0;
10583
10584    if (min > 0)
10585     *flagp = WORST;
10586    if (max > 0)
10587     *flagp |= HASWIDTH;
10588    if (!SIZE_ONLY) {
10589     ARG1_SET(ret, (U16)min);
10590     ARG2_SET(ret, (U16)max);
10591    }
10592    if (max == REG_INFTY)
10593     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10594
10595    goto nest_check;
10596   }
10597  }
10598
10599  if (!ISMULT1(op)) {
10600   *flagp = flags;
10601   return(ret);
10602  }
10603
10604 #if 0    /* Now runtime fix should be reliable. */
10605
10606  /* if this is reinstated, don't forget to put this back into perldiag:
10607
10608    =item Regexp *+ operand could be empty at {#} in regex m/%s/
10609
10610   (F) The part of the regexp subject to either the * or + quantifier
10611   could match an empty string. The {#} shows in the regular
10612   expression about where the problem was discovered.
10613
10614  */
10615
10616  if (!(flags&HASWIDTH) && op != '?')
10617  vFAIL("Regexp *+ operand could be empty");
10618 #endif
10619
10620 #ifdef RE_TRACK_PATTERN_OFFSETS
10621  parse_start = RExC_parse;
10622 #endif
10623  nextchar(pRExC_state);
10624
10625  *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10626
10627  if (op == '*' && (flags&SIMPLE)) {
10628   reginsert(pRExC_state, STAR, ret, depth+1);
10629   ret->flags = 0;
10630   RExC_naughty += 4;
10631   RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10632  }
10633  else if (op == '*') {
10634   min = 0;
10635   goto do_curly;
10636  }
10637  else if (op == '+' && (flags&SIMPLE)) {
10638   reginsert(pRExC_state, PLUS, ret, depth+1);
10639   ret->flags = 0;
10640   RExC_naughty += 3;
10641   RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10642  }
10643  else if (op == '+') {
10644   min = 1;
10645   goto do_curly;
10646  }
10647  else if (op == '?') {
10648   min = 0; max = 1;
10649   goto do_curly;
10650  }
10651   nest_check:
10652  if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10653   SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10654   ckWARN2reg(RExC_parse,
10655     "%"UTF8f" matches null string many times",
10656     UTF8fARG(UTF, (RExC_parse >= origparse
10657         ? RExC_parse - origparse
10658         : 0),
10659     origparse));
10660   (void)ReREFCNT_inc(RExC_rx_sv);
10661  }
10662
10663  if (RExC_parse < RExC_end && *RExC_parse == '?') {
10664   nextchar(pRExC_state);
10665   reginsert(pRExC_state, MINMOD, ret, depth+1);
10666   REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10667  }
10668  else
10669  if (RExC_parse < RExC_end && *RExC_parse == '+') {
10670   regnode *ender;
10671   nextchar(pRExC_state);
10672   ender = reg_node(pRExC_state, SUCCEED);
10673   REGTAIL(pRExC_state, ret, ender);
10674   reginsert(pRExC_state, SUSPEND, ret, depth+1);
10675   ret->flags = 0;
10676   ender = reg_node(pRExC_state, TAIL);
10677   REGTAIL(pRExC_state, ret, ender);
10678  }
10679
10680  if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10681   RExC_parse++;
10682   vFAIL("Nested quantifiers");
10683  }
10684
10685  return(ret);
10686 }
10687
10688 STATIC bool
10689 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10690      UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10691      const bool strict   /* Apply stricter parsing rules? */
10692  )
10693 {
10694
10695  /* This is expected to be called by a parser routine that has recognized '\N'
10696    and needs to handle the rest. RExC_parse is expected to point at the first
10697    char following the N at the time of the call.  On successful return,
10698    RExC_parse has been updated to point to just after the sequence identified
10699    by this routine, and <*flagp> has been updated.
10700
10701    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10702    character class.
10703
10704    \N may begin either a named sequence, or if outside a character class, mean
10705    to match a non-newline.  For non single-quoted regexes, the tokenizer has
10706    attempted to decide which, and in the case of a named sequence, converted it
10707    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10708    where c1... are the characters in the sequence.  For single-quoted regexes,
10709    the tokenizer passes the \N sequence through unchanged; this code will not
10710    attempt to determine this nor expand those, instead raising a syntax error.
10711    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10712    or there is no '}', it signals that this \N occurrence means to match a
10713    non-newline.
10714
10715    Only the \N{U+...} form should occur in a character class, for the same
10716    reason that '.' inside a character class means to just match a period: it
10717    just doesn't make sense.
10718
10719    The function raises an error (via vFAIL), and doesn't return for various
10720    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
10721    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10722    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10723    only possible if node_p is non-NULL.
10724
10725
10726    If <valuep> is non-null, it means the caller can accept an input sequence
10727    consisting of a just a single code point; <*valuep> is set to that value
10728    if the input is such.
10729
10730    If <node_p> is non-null it signifies that the caller can accept any other
10731    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
10732    is set as follows:
10733  1) \N means not-a-NL: points to a newly created REG_ANY node;
10734  2) \N{}:              points to a new NOTHING node;
10735  3) otherwise:         points to a new EXACT node containing the resolved
10736       string.
10737    Note that FALSE is returned for single code point sequences if <valuep> is
10738    null.
10739  */
10740
10741  char * endbrace;    /* '}' following the name */
10742  char* p;
10743  char *endchar; /* Points to '.' or '}' ending cur char in the input
10744       stream */
10745  bool has_multiple_chars; /* true if the input stream contains a sequence of
10746         more than one character */
10747
10748  GET_RE_DEBUG_FLAGS_DECL;
10749
10750  PERL_ARGS_ASSERT_GROK_BSLASH_N;
10751
10752  GET_RE_DEBUG_FLAGS;
10753
10754  assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
10755
10756  /* The [^\n] meaning of \N ignores spaces and comments under the /x
10757  * modifier.  The other meaning does not, so use a temporary until we find
10758  * out which we are being called with */
10759  p = (RExC_flags & RXf_PMf_EXTENDED)
10760   ? regpatws(pRExC_state, RExC_parse,
10761         TRUE) /* means recognize comments */
10762   : RExC_parse;
10763
10764  /* Disambiguate between \N meaning a named character versus \N meaning
10765  * [^\n].  The former is assumed when it can't be the latter. */
10766  if (*p != '{' || regcurly(p)) {
10767   RExC_parse = p;
10768   if (! node_p) {
10769    /* no bare \N allowed in a charclass */
10770    if (in_char_class) {
10771     vFAIL("\\N in a character class must be a named character: \\N{...}");
10772    }
10773    return FALSE;
10774   }
10775   RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
10776       current char */
10777   nextchar(pRExC_state);
10778   *node_p = reg_node(pRExC_state, REG_ANY);
10779   *flagp |= HASWIDTH|SIMPLE;
10780   RExC_naughty++;
10781   Set_Node_Length(*node_p, 1); /* MJD */
10782   return TRUE;
10783  }
10784
10785  /* Here, we have decided it should be a named character or sequence */
10786
10787  /* The test above made sure that the next real character is a '{', but
10788  * under the /x modifier, it could be separated by space (or a comment and
10789  * \n) and this is not allowed (for consistency with \x{...} and the
10790  * tokenizer handling of \N{NAME}). */
10791  if (*RExC_parse != '{') {
10792   vFAIL("Missing braces on \\N{}");
10793  }
10794
10795  RExC_parse++; /* Skip past the '{' */
10796
10797  if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10798   || ! (endbrace == RExC_parse  /* nothing between the {} */
10799    || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below
10800             */
10801     && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10802              */
10803  {
10804   if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
10805   vFAIL("\\N{NAME} must be resolved by the lexer");
10806  }
10807
10808  if (endbrace == RExC_parse) {   /* empty: \N{} */
10809   bool ret = TRUE;
10810   if (node_p) {
10811    *node_p = reg_node(pRExC_state,NOTHING);
10812   }
10813   else if (in_char_class) {
10814    if (SIZE_ONLY && in_char_class) {
10815     if (strict) {
10816      RExC_parse++;   /* Position after the "}" */
10817      vFAIL("Zero length \\N{}");
10818     }
10819     else {
10820      ckWARNreg(RExC_parse,
10821        "Ignoring zero length \\N{} in character class");
10822     }
10823    }
10824    ret = FALSE;
10825   }
10826   else {
10827    return FALSE;
10828   }
10829   nextchar(pRExC_state);
10830   return ret;
10831  }
10832
10833  RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10834  RExC_parse += 2; /* Skip past the 'U+' */
10835
10836  endchar = RExC_parse + strcspn(RExC_parse, ".}");
10837
10838  /* Code points are separated by dots.  If none, there is only one code
10839  * point, and is terminated by the brace */
10840  has_multiple_chars = (endchar < endbrace);
10841
10842  if (valuep && (! has_multiple_chars || in_char_class)) {
10843   /* We only pay attention to the first char of
10844   multichar strings being returned in char classes. I kinda wonder
10845   if this makes sense as it does change the behaviour
10846   from earlier versions, OTOH that behaviour was broken
10847   as well. XXX Solution is to recharacterize as
10848   [rest-of-class]|multi1|multi2... */
10849
10850   STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10851   I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10852    | PERL_SCAN_DISALLOW_PREFIX
10853    | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10854
10855   *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10856
10857   /* The tokenizer should have guaranteed validity, but it's possible to
10858   * bypass it by using single quoting, so check */
10859   if (length_of_hex == 0
10860    || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10861   {
10862    RExC_parse += length_of_hex; /* Includes all the valid */
10863    RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
10864        ? UTF8SKIP(RExC_parse)
10865        : 1;
10866    /* Guard against malformed utf8 */
10867    if (RExC_parse >= endchar) {
10868     RExC_parse = endchar;
10869    }
10870    vFAIL("Invalid hexadecimal number in \\N{U+...}");
10871   }
10872
10873   if (in_char_class && has_multiple_chars) {
10874    if (strict) {
10875     RExC_parse = endbrace;
10876     vFAIL("\\N{} in character class restricted to one character");
10877    }
10878    else {
10879     ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10880    }
10881   }
10882
10883   RExC_parse = endbrace + 1;
10884  }
10885  else if (! node_p || ! has_multiple_chars) {
10886
10887   /* Here, the input is legal, but not according to the caller's
10888   * options.  We fail without advancing the parse, so that the
10889   * caller can try again */
10890   RExC_parse = p;
10891   return FALSE;
10892  }
10893  else {
10894
10895   /* What is done here is to convert this to a sub-pattern of the form
10896   * (?:\x{char1}\x{char2}...)
10897   * and then call reg recursively.  That way, it retains its atomicness,
10898   * while not having to worry about special handling that some code
10899   * points may have.  toke.c has converted the original Unicode values
10900   * to native, so that we can just pass on the hex values unchanged.  We
10901   * do have to set a flag to keep recoding from happening in the
10902   * recursion */
10903
10904   SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10905   STRLEN len;
10906   char *orig_end = RExC_end;
10907   I32 flags;
10908
10909   while (RExC_parse < endbrace) {
10910
10911    /* Convert to notation the rest of the code understands */
10912    sv_catpv(substitute_parse, "\\x{");
10913    sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10914    sv_catpv(substitute_parse, "}");
10915
10916    /* Point to the beginning of the next character in the sequence. */
10917    RExC_parse = endchar + 1;
10918    endchar = RExC_parse + strcspn(RExC_parse, ".}");
10919   }
10920   sv_catpv(substitute_parse, ")");
10921
10922   RExC_parse = SvPV(substitute_parse, len);
10923
10924   /* Don't allow empty number */
10925   if (len < 8) {
10926    vFAIL("Invalid hexadecimal number in \\N{U+...}");
10927   }
10928   RExC_end = RExC_parse + len;
10929
10930   /* The values are Unicode, and therefore not subject to recoding */
10931   RExC_override_recoding = 1;
10932
10933   if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10934    if (flags & RESTART_UTF8) {
10935     *flagp = RESTART_UTF8;
10936     return FALSE;
10937    }
10938    FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10939     (UV) flags);
10940   }
10941   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10942
10943   RExC_parse = endbrace;
10944   RExC_end = orig_end;
10945   RExC_override_recoding = 0;
10946
10947   nextchar(pRExC_state);
10948  }
10949
10950  return TRUE;
10951 }
10952
10953
10954 /*
10955  * reg_recode
10956  *
10957  * It returns the code point in utf8 for the value in *encp.
10958  *    value: a code value in the source encoding
10959  *    encp:  a pointer to an Encode object
10960  *
10961  * If the result from Encode is not a single character,
10962  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10963  */
10964 STATIC UV
10965 S_reg_recode(pTHX_ const char value, SV **encp)
10966 {
10967  STRLEN numlen = 1;
10968  SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10969  const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10970  const STRLEN newlen = SvCUR(sv);
10971  UV uv = UNICODE_REPLACEMENT;
10972
10973  PERL_ARGS_ASSERT_REG_RECODE;
10974
10975  if (newlen)
10976   uv = SvUTF8(sv)
10977    ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10978    : *(U8*)s;
10979
10980  if (!newlen || numlen != newlen) {
10981   uv = UNICODE_REPLACEMENT;
10982   *encp = NULL;
10983  }
10984  return uv;
10985 }
10986
10987 PERL_STATIC_INLINE U8
10988 S_compute_EXACTish(RExC_state_t *pRExC_state)
10989 {
10990  U8 op;
10991
10992  PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10993
10994  if (! FOLD) {
10995   return EXACT;
10996  }
10997
10998  op = get_regex_charset(RExC_flags);
10999  if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11000   op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11001     been, so there is no hole */
11002  }
11003
11004  return op + EXACTF;
11005 }
11006
11007 PERL_STATIC_INLINE void
11008 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11009       regnode *node, I32* flagp, STRLEN len, UV code_point,
11010       bool downgradable)
11011 {
11012  /* This knows the details about sizing an EXACTish node, setting flags for
11013  * it (by setting <*flagp>, and potentially populating it with a single
11014  * character.
11015  *
11016  * If <len> (the length in bytes) is non-zero, this function assumes that
11017  * the node has already been populated, and just does the sizing.  In this
11018  * case <code_point> should be the final code point that has already been
11019  * placed into the node.  This value will be ignored except that under some
11020  * circumstances <*flagp> is set based on it.
11021  *
11022  * If <len> is zero, the function assumes that the node is to contain only
11023  * the single character given by <code_point> and calculates what <len>
11024  * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
11025  * additionally will populate the node's STRING with <code_point> or its
11026  * fold if folding.
11027  *
11028  * In both cases <*flagp> is appropriately set
11029  *
11030  * It knows that under FOLD, the Latin Sharp S and UTF characters above
11031  * 255, must be folded (the former only when the rules indicate it can
11032  * match 'ss')
11033  *
11034  * When it does the populating, it looks at the flag 'downgradable'.  If
11035  * true with a node that folds, it checks if the single code point
11036  * participates in a fold, and if not downgrades the node to an EXACT.
11037  * This helps the optimizer */
11038
11039  bool len_passed_in = cBOOL(len != 0);
11040  U8 character[UTF8_MAXBYTES_CASE+1];
11041
11042  PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11043
11044  /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11045  * sizing difference, and is extra work that is thrown away */
11046  if (downgradable && ! PASS2) {
11047   downgradable = FALSE;
11048  }
11049
11050  if (! len_passed_in) {
11051   if (UTF) {
11052    if (UNI_IS_INVARIANT(code_point)) {
11053     if (LOC || ! FOLD) {    /* /l defers folding until runtime */
11054      *character = (U8) code_point;
11055     }
11056     else { /* Here is /i and not /l (toFOLD() is defined on just
11057       ASCII, which isn't the same thing as INVARIANT on
11058       EBCDIC, but it works there, as the extra invariants
11059       fold to themselves) */
11060      *character = toFOLD((U8) code_point);
11061      if (downgradable
11062       && *character == code_point
11063       && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point))
11064      {
11065       OP(node) = EXACT;
11066      }
11067     }
11068     len = 1;
11069    }
11070    else if (FOLD && (! LOC
11071        || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11072    {   /* Folding, and ok to do so now */
11073     UV folded = _to_uni_fold_flags(
11074         code_point,
11075         character,
11076         &len,
11077         FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11078              ? FOLD_FLAGS_NOMIX_ASCII
11079              : 0));
11080     if (downgradable
11081      && folded == code_point
11082      && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11083     {
11084      OP(node) = EXACT;
11085     }
11086    }
11087    else if (code_point <= MAX_UTF8_TWO_BYTE) {
11088
11089     /* Not folding this cp, and can output it directly */
11090     *character = UTF8_TWO_BYTE_HI(code_point);
11091     *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11092     len = 2;
11093    }
11094    else {
11095     uvchr_to_utf8( character, code_point);
11096     len = UTF8SKIP(character);
11097    }
11098   } /* Else pattern isn't UTF8.  */
11099   else if (! FOLD) {
11100    *character = (U8) code_point;
11101    len = 1;
11102   } /* Else is folded non-UTF8 */
11103   else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11104
11105    /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11106    * comments at join_exact()); */
11107    *character = (U8) code_point;
11108    len = 1;
11109
11110    /* Can turn into an EXACT node if we know the fold at compile time,
11111    * and it folds to itself and doesn't particpate in other folds */
11112    if (downgradable
11113     && ! LOC
11114     && PL_fold_latin1[code_point] == code_point
11115     && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11116      || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11117    {
11118     OP(node) = EXACT;
11119    }
11120   } /* else is Sharp s.  May need to fold it */
11121   else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11122    *character = 's';
11123    *(character + 1) = 's';
11124    len = 2;
11125   }
11126   else {
11127    *character = LATIN_SMALL_LETTER_SHARP_S;
11128    len = 1;
11129   }
11130  }
11131
11132  if (SIZE_ONLY) {
11133   RExC_size += STR_SZ(len);
11134  }
11135  else {
11136   RExC_emit += STR_SZ(len);
11137   STR_LEN(node) = len;
11138   if (! len_passed_in) {
11139    Copy((char *) character, STRING(node), len, char);
11140   }
11141  }
11142
11143  *flagp |= HASWIDTH;
11144
11145  /* A single character node is SIMPLE, except for the special-cased SHARP S
11146  * under /di. */
11147  if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11148   && (code_point != LATIN_SMALL_LETTER_SHARP_S
11149    || ! FOLD || ! DEPENDS_SEMANTICS))
11150  {
11151   *flagp |= SIMPLE;
11152  }
11153
11154  /* The OP may not be well defined in PASS1 */
11155  if (PASS2 && OP(node) == EXACTFL) {
11156   RExC_contains_locale = 1;
11157  }
11158 }
11159
11160
11161 /* return atoi(p), unless it's too big to sensibly be a backref,
11162  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11163
11164 static I32
11165 S_backref_value(char *p)
11166 {
11167  char *q = p;
11168
11169  for (;isDIGIT(*q); q++) {} /* calculate length of num */
11170  if (q - p == 0 || q - p > 9)
11171   return I32_MAX;
11172  return atoi(p);
11173 }
11174
11175
11176 /*
11177  - regatom - the lowest level
11178
11179    Try to identify anything special at the start of the pattern. If there
11180    is, then handle it as required. This may involve generating a single regop,
11181    such as for an assertion; or it may involve recursing, such as to
11182    handle a () structure.
11183
11184    If the string doesn't start with something special then we gobble up
11185    as much literal text as we can.
11186
11187    Once we have been able to handle whatever type of thing started the
11188    sequence, we return.
11189
11190    Note: we have to be careful with escapes, as they can be both literal
11191    and special, and in the case of \10 and friends, context determines which.
11192
11193    A summary of the code structure is:
11194
11195    switch (first_byte) {
11196   cases for each special:
11197    handle this special;
11198    break;
11199   case '\\':
11200    switch (2nd byte) {
11201     cases for each unambiguous special:
11202      handle this special;
11203      break;
11204     cases for each ambigous special/literal:
11205      disambiguate;
11206      if (special)  handle here
11207      else goto defchar;
11208     default: // unambiguously literal:
11209      goto defchar;
11210    }
11211   default:  // is a literal char
11212    // FALL THROUGH
11213   defchar:
11214    create EXACTish node for literal;
11215    while (more input and node isn't full) {
11216     switch (input_byte) {
11217     cases for each special;
11218      make sure parse pointer is set so that the next call to
11219       regatom will see this special first
11220      goto loopdone; // EXACTish node terminated by prev. char
11221     default:
11222      append char to EXACTISH node;
11223     }
11224     get next input byte;
11225    }
11226   loopdone:
11227    }
11228    return the generated node;
11229
11230    Specifically there are two separate switches for handling
11231    escape sequences, with the one for handling literal escapes requiring
11232    a dummy entry for all of the special escapes that are actually handled
11233    by the other.
11234
11235    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11236    TRYAGAIN.
11237    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11238    restarted.
11239    Otherwise does not return NULL.
11240 */
11241
11242 STATIC regnode *
11243 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11244 {
11245  dVAR;
11246  regnode *ret = NULL;
11247  I32 flags = 0;
11248  char *parse_start = RExC_parse;
11249  U8 op;
11250  int invert = 0;
11251  U8 arg;
11252
11253  GET_RE_DEBUG_FLAGS_DECL;
11254
11255  *flagp = WORST;  /* Tentatively. */
11256
11257  DEBUG_PARSE("atom");
11258
11259  PERL_ARGS_ASSERT_REGATOM;
11260
11261 tryagain:
11262  switch ((U8)*RExC_parse) {
11263  case '^':
11264   RExC_seen_zerolen++;
11265   nextchar(pRExC_state);
11266   if (RExC_flags & RXf_PMf_MULTILINE)
11267    ret = reg_node(pRExC_state, MBOL);
11268   else if (RExC_flags & RXf_PMf_SINGLELINE)
11269    ret = reg_node(pRExC_state, SBOL);
11270   else
11271    ret = reg_node(pRExC_state, BOL);
11272   Set_Node_Length(ret, 1); /* MJD */
11273   break;
11274  case '$':
11275   nextchar(pRExC_state);
11276   if (*RExC_parse)
11277    RExC_seen_zerolen++;
11278   if (RExC_flags & RXf_PMf_MULTILINE)
11279    ret = reg_node(pRExC_state, MEOL);
11280   else if (RExC_flags & RXf_PMf_SINGLELINE)
11281    ret = reg_node(pRExC_state, SEOL);
11282   else
11283    ret = reg_node(pRExC_state, EOL);
11284   Set_Node_Length(ret, 1); /* MJD */
11285   break;
11286  case '.':
11287   nextchar(pRExC_state);
11288   if (RExC_flags & RXf_PMf_SINGLELINE)
11289    ret = reg_node(pRExC_state, SANY);
11290   else
11291    ret = reg_node(pRExC_state, REG_ANY);
11292   *flagp |= HASWIDTH|SIMPLE;
11293   RExC_naughty++;
11294   Set_Node_Length(ret, 1); /* MJD */
11295   break;
11296  case '[':
11297  {
11298   char * const oregcomp_parse = ++RExC_parse;
11299   ret = regclass(pRExC_state, flagp,depth+1,
11300      FALSE, /* means parse the whole char class */
11301      TRUE, /* allow multi-char folds */
11302      FALSE, /* don't silence non-portable warnings. */
11303      NULL);
11304   if (*RExC_parse != ']') {
11305    RExC_parse = oregcomp_parse;
11306    vFAIL("Unmatched [");
11307   }
11308   if (ret == NULL) {
11309    if (*flagp & RESTART_UTF8)
11310     return NULL;
11311    FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11312     (UV) *flagp);
11313   }
11314   nextchar(pRExC_state);
11315   Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11316   break;
11317  }
11318  case '(':
11319   nextchar(pRExC_state);
11320   ret = reg(pRExC_state, 2, &flags,depth+1);
11321   if (ret == NULL) {
11322     if (flags & TRYAGAIN) {
11323      if (RExC_parse == RExC_end) {
11324       /* Make parent create an empty node if needed. */
11325       *flagp |= TRYAGAIN;
11326       return(NULL);
11327      }
11328      goto tryagain;
11329     }
11330     if (flags & RESTART_UTF8) {
11331      *flagp = RESTART_UTF8;
11332      return NULL;
11333     }
11334     FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11335                 (UV) flags);
11336   }
11337   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11338   break;
11339  case '|':
11340  case ')':
11341   if (flags & TRYAGAIN) {
11342    *flagp |= TRYAGAIN;
11343    return NULL;
11344   }
11345   vFAIL("Internal urp");
11346         /* Supposed to be caught earlier. */
11347   break;
11348  case '?':
11349  case '+':
11350  case '*':
11351   RExC_parse++;
11352   vFAIL("Quantifier follows nothing");
11353   break;
11354  case '\\':
11355   /* Special Escapes
11356
11357   This switch handles escape sequences that resolve to some kind
11358   of special regop and not to literal text. Escape sequnces that
11359   resolve to literal text are handled below in the switch marked
11360   "Literal Escapes".
11361
11362   Every entry in this switch *must* have a corresponding entry
11363   in the literal escape switch. However, the opposite is not
11364   required, as the default for this switch is to jump to the
11365   literal text handling code.
11366   */
11367   switch ((U8)*++RExC_parse) {
11368   /* Special Escapes */
11369   case 'A':
11370    RExC_seen_zerolen++;
11371    ret = reg_node(pRExC_state, SBOL);
11372    *flagp |= SIMPLE;
11373    goto finish_meta_pat;
11374   case 'G':
11375    ret = reg_node(pRExC_state, GPOS);
11376    RExC_seen |= REG_GPOS_SEEN;
11377    *flagp |= SIMPLE;
11378    goto finish_meta_pat;
11379   case 'K':
11380    RExC_seen_zerolen++;
11381    ret = reg_node(pRExC_state, KEEPS);
11382    *flagp |= SIMPLE;
11383    /* XXX:dmq : disabling in-place substitution seems to
11384    * be necessary here to avoid cases of memory corruption, as
11385    * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11386    */
11387    RExC_seen |= REG_LOOKBEHIND_SEEN;
11388    goto finish_meta_pat;
11389   case 'Z':
11390    ret = reg_node(pRExC_state, SEOL);
11391    *flagp |= SIMPLE;
11392    RExC_seen_zerolen++;  /* Do not optimize RE away */
11393    goto finish_meta_pat;
11394   case 'z':
11395    ret = reg_node(pRExC_state, EOS);
11396    *flagp |= SIMPLE;
11397    RExC_seen_zerolen++;  /* Do not optimize RE away */
11398    goto finish_meta_pat;
11399   case 'C':
11400    ret = reg_node(pRExC_state, CANY);
11401    RExC_seen |= REG_CANY_SEEN;
11402    *flagp |= HASWIDTH|SIMPLE;
11403    goto finish_meta_pat;
11404   case 'X':
11405    ret = reg_node(pRExC_state, CLUMP);
11406    *flagp |= HASWIDTH;
11407    goto finish_meta_pat;
11408
11409   case 'W':
11410    invert = 1;
11411    /* FALLTHROUGH */
11412   case 'w':
11413    arg = ANYOF_WORDCHAR;
11414    goto join_posix;
11415
11416   case 'b':
11417    RExC_seen_zerolen++;
11418    RExC_seen |= REG_LOOKBEHIND_SEEN;
11419    op = BOUND + get_regex_charset(RExC_flags);
11420    if (op > BOUNDA) {  /* /aa is same as /a */
11421     op = BOUNDA;
11422    }
11423    else if (op == BOUNDL) {
11424     RExC_contains_locale = 1;
11425    }
11426    ret = reg_node(pRExC_state, op);
11427    FLAGS(ret) = get_regex_charset(RExC_flags);
11428    *flagp |= SIMPLE;
11429    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11430     /* diag_listed_as: Use "%s" instead of "%s" */
11431     vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11432    }
11433    goto finish_meta_pat;
11434   case 'B':
11435    RExC_seen_zerolen++;
11436    RExC_seen |= REG_LOOKBEHIND_SEEN;
11437    op = NBOUND + get_regex_charset(RExC_flags);
11438    if (op > NBOUNDA) { /* /aa is same as /a */
11439     op = NBOUNDA;
11440    }
11441    else if (op == NBOUNDL) {
11442     RExC_contains_locale = 1;
11443    }
11444    ret = reg_node(pRExC_state, op);
11445    FLAGS(ret) = get_regex_charset(RExC_flags);
11446    *flagp |= SIMPLE;
11447    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11448     /* diag_listed_as: Use "%s" instead of "%s" */
11449     vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11450    }
11451    goto finish_meta_pat;
11452
11453   case 'D':
11454    invert = 1;
11455    /* FALLTHROUGH */
11456   case 'd':
11457    arg = ANYOF_DIGIT;
11458    goto join_posix;
11459
11460   case 'R':
11461    ret = reg_node(pRExC_state, LNBREAK);
11462    *flagp |= HASWIDTH|SIMPLE;
11463    goto finish_meta_pat;
11464
11465   case 'H':
11466    invert = 1;
11467    /* FALLTHROUGH */
11468   case 'h':
11469    arg = ANYOF_BLANK;
11470    op = POSIXU;
11471    goto join_posix_op_known;
11472
11473   case 'V':
11474    invert = 1;
11475    /* FALLTHROUGH */
11476   case 'v':
11477    arg = ANYOF_VERTWS;
11478    op = POSIXU;
11479    goto join_posix_op_known;
11480
11481   case 'S':
11482    invert = 1;
11483    /* FALLTHROUGH */
11484   case 's':
11485    arg = ANYOF_SPACE;
11486
11487   join_posix:
11488
11489    op = POSIXD + get_regex_charset(RExC_flags);
11490    if (op > POSIXA) {  /* /aa is same as /a */
11491     op = POSIXA;
11492    }
11493    else if (op == POSIXL) {
11494     RExC_contains_locale = 1;
11495    }
11496
11497   join_posix_op_known:
11498
11499    if (invert) {
11500     op += NPOSIXD - POSIXD;
11501    }
11502
11503    ret = reg_node(pRExC_state, op);
11504    if (! SIZE_ONLY) {
11505     FLAGS(ret) = namedclass_to_classnum(arg);
11506    }
11507
11508    *flagp |= HASWIDTH|SIMPLE;
11509    /* FALLTHROUGH */
11510
11511   finish_meta_pat:
11512    nextchar(pRExC_state);
11513    Set_Node_Length(ret, 2); /* MJD */
11514    break;
11515   case 'p':
11516   case 'P':
11517    {
11518 #ifdef DEBUGGING
11519     char* parse_start = RExC_parse - 2;
11520 #endif
11521
11522     RExC_parse--;
11523
11524     ret = regclass(pRExC_state, flagp,depth+1,
11525        TRUE, /* means just parse this element */
11526        FALSE, /* don't allow multi-char folds */
11527        FALSE, /* don't silence non-portable warnings.
11528           It would be a bug if these returned
11529           non-portables */
11530        NULL);
11531     /* regclass() can only return RESTART_UTF8 if multi-char folds
11532     are allowed.  */
11533     if (!ret)
11534      FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11535       (UV) *flagp);
11536
11537     RExC_parse--;
11538
11539     Set_Node_Offset(ret, parse_start + 2);
11540     Set_Node_Cur_Length(ret, parse_start);
11541     nextchar(pRExC_state);
11542    }
11543    break;
11544   case 'N':
11545    /* Handle \N and \N{NAME} with multiple code points here and not
11546    * below because it can be multicharacter. join_exact() will join
11547    * them up later on.  Also this makes sure that things like
11548    * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11549    * The options to the grok function call causes it to fail if the
11550    * sequence is just a single code point.  We then go treat it as
11551    * just another character in the current EXACT node, and hence it
11552    * gets uniform treatment with all the other characters.  The
11553    * special treatment for quantifiers is not needed for such single
11554    * character sequences */
11555    ++RExC_parse;
11556    if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11557         FALSE /* not strict */ )) {
11558     if (*flagp & RESTART_UTF8)
11559      return NULL;
11560     RExC_parse--;
11561     goto defchar;
11562    }
11563    break;
11564   case 'k':    /* Handle \k<NAME> and \k'NAME' */
11565   parse_named_seq:
11566   {
11567    char ch= RExC_parse[1];
11568    if (ch != '<' && ch != '\'' && ch != '{') {
11569     RExC_parse++;
11570     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11571     vFAIL2("Sequence %.2s... not terminated",parse_start);
11572    } else {
11573     /* this pretty much dupes the code for (?P=...) in reg(), if
11574     you change this make sure you change that */
11575     char* name_start = (RExC_parse += 2);
11576     U32 num = 0;
11577     SV *sv_dat = reg_scan_name(pRExC_state,
11578      SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11579     ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11580     if (RExC_parse == name_start || *RExC_parse != ch)
11581      /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11582      vFAIL2("Sequence %.3s... not terminated",parse_start);
11583
11584     if (!SIZE_ONLY) {
11585      num = add_data( pRExC_state, STR_WITH_LEN("S"));
11586      RExC_rxi->data->data[num]=(void*)sv_dat;
11587      SvREFCNT_inc_simple_void(sv_dat);
11588     }
11589
11590     RExC_sawback = 1;
11591     ret = reganode(pRExC_state,
11592        ((! FOLD)
11593         ? NREF
11594         : (ASCII_FOLD_RESTRICTED)
11595         ? NREFFA
11596         : (AT_LEAST_UNI_SEMANTICS)
11597          ? NREFFU
11598          : (LOC)
11599          ? NREFFL
11600          : NREFF),
11601         num);
11602     *flagp |= HASWIDTH;
11603
11604     /* override incorrect value set in reganode MJD */
11605     Set_Node_Offset(ret, parse_start+1);
11606     Set_Node_Cur_Length(ret, parse_start);
11607     nextchar(pRExC_state);
11608
11609    }
11610    break;
11611   }
11612   case 'g':
11613   case '1': case '2': case '3': case '4':
11614   case '5': case '6': case '7': case '8': case '9':
11615    {
11616     I32 num;
11617     bool hasbrace = 0;
11618
11619     if (*RExC_parse == 'g') {
11620      bool isrel = 0;
11621
11622      RExC_parse++;
11623      if (*RExC_parse == '{') {
11624       RExC_parse++;
11625       hasbrace = 1;
11626      }
11627      if (*RExC_parse == '-') {
11628       RExC_parse++;
11629       isrel = 1;
11630      }
11631      if (hasbrace && !isDIGIT(*RExC_parse)) {
11632       if (isrel) RExC_parse--;
11633       RExC_parse -= 2;
11634       goto parse_named_seq;
11635      }
11636
11637      num = S_backref_value(RExC_parse);
11638      if (num == 0)
11639       vFAIL("Reference to invalid group 0");
11640      else if (num == I32_MAX) {
11641       if (isDIGIT(*RExC_parse))
11642        vFAIL("Reference to nonexistent group");
11643       else
11644        vFAIL("Unterminated \\g... pattern");
11645      }
11646
11647      if (isrel) {
11648       num = RExC_npar - num;
11649       if (num < 1)
11650        vFAIL("Reference to nonexistent or unclosed group");
11651      }
11652     }
11653     else {
11654      num = S_backref_value(RExC_parse);
11655      /* bare \NNN might be backref or octal - if it is larger than or equal
11656      * RExC_npar then it is assumed to be and octal escape.
11657      * Note RExC_npar is +1 from the actual number of parens*/
11658      if (num == I32_MAX || (num > 9 && num >= RExC_npar
11659        && *RExC_parse != '8' && *RExC_parse != '9'))
11660      {
11661       /* Probably a character specified in octal, e.g. \35 */
11662       goto defchar;
11663      }
11664     }
11665
11666     /* at this point RExC_parse definitely points to a backref
11667     * number */
11668     {
11669 #ifdef RE_TRACK_PATTERN_OFFSETS
11670      char * const parse_start = RExC_parse - 1; /* MJD */
11671 #endif
11672      while (isDIGIT(*RExC_parse))
11673       RExC_parse++;
11674      if (hasbrace) {
11675       if (*RExC_parse != '}')
11676        vFAIL("Unterminated \\g{...} pattern");
11677       RExC_parse++;
11678      }
11679      if (!SIZE_ONLY) {
11680       if (num > (I32)RExC_rx->nparens)
11681        vFAIL("Reference to nonexistent group");
11682      }
11683      RExC_sawback = 1;
11684      ret = reganode(pRExC_state,
11685         ((! FOLD)
11686          ? REF
11687          : (ASCII_FOLD_RESTRICTED)
11688          ? REFFA
11689          : (AT_LEAST_UNI_SEMANTICS)
11690           ? REFFU
11691           : (LOC)
11692           ? REFFL
11693           : REFF),
11694          num);
11695      *flagp |= HASWIDTH;
11696
11697      /* override incorrect value set in reganode MJD */
11698      Set_Node_Offset(ret, parse_start+1);
11699      Set_Node_Cur_Length(ret, parse_start);
11700      RExC_parse--;
11701      nextchar(pRExC_state);
11702     }
11703    }
11704    break;
11705   case '\0':
11706    if (RExC_parse >= RExC_end)
11707     FAIL("Trailing \\");
11708    /* FALLTHROUGH */
11709   default:
11710    /* Do not generate "unrecognized" warnings here, we fall
11711    back into the quick-grab loop below */
11712    parse_start--;
11713    goto defchar;
11714   }
11715   break;
11716
11717  case '#':
11718   if (RExC_flags & RXf_PMf_EXTENDED) {
11719    RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
11720    if (RExC_parse < RExC_end)
11721     goto tryagain;
11722   }
11723   /* FALLTHROUGH */
11724
11725  default:
11726
11727    parse_start = RExC_parse - 1;
11728
11729    RExC_parse++;
11730
11731   defchar: {
11732    STRLEN len = 0;
11733    UV ender = 0;
11734    char *p;
11735    char *s;
11736 #define MAX_NODE_STRING_SIZE 127
11737    char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11738    char *s0;
11739    U8 upper_parse = MAX_NODE_STRING_SIZE;
11740    U8 node_type = compute_EXACTish(pRExC_state);
11741    bool next_is_quantifier;
11742    char * oldp = NULL;
11743
11744    /* We can convert EXACTF nodes to EXACTFU if they contain only
11745    * characters that match identically regardless of the target
11746    * string's UTF8ness.  The reason to do this is that EXACTF is not
11747    * trie-able, EXACTFU is.
11748    *
11749    * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11750    * contain only above-Latin1 characters (hence must be in UTF8),
11751    * which don't participate in folds with Latin1-range characters,
11752    * as the latter's folds aren't known until runtime.  (We don't
11753    * need to figure this out until pass 2) */
11754    bool maybe_exactfu = PASS2
11755        && (node_type == EXACTF || node_type == EXACTFL);
11756
11757    /* If a folding node contains only code points that don't
11758    * participate in folds, it can be changed into an EXACT node,
11759    * which allows the optimizer more things to look for */
11760    bool maybe_exact;
11761
11762    ret = reg_node(pRExC_state, node_type);
11763
11764    /* In pass1, folded, we use a temporary buffer instead of the
11765    * actual node, as the node doesn't exist yet */
11766    s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11767
11768    s0 = s;
11769
11770   reparse:
11771
11772    /* We do the EXACTFish to EXACT node only if folding.  (And we
11773    * don't need to figure this out until pass 2) */
11774    maybe_exact = FOLD && PASS2;
11775
11776    /* XXX The node can hold up to 255 bytes, yet this only goes to
11777    * 127.  I (khw) do not know why.  Keeping it somewhat less than
11778    * 255 allows us to not have to worry about overflow due to
11779    * converting to utf8 and fold expansion, but that value is
11780    * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
11781    * split up by this limit into a single one using the real max of
11782    * 255.  Even at 127, this breaks under rare circumstances.  If
11783    * folding, we do not want to split a node at a character that is a
11784    * non-final in a multi-char fold, as an input string could just
11785    * happen to want to match across the node boundary.  The join
11786    * would solve that problem if the join actually happens.  But a
11787    * series of more than two nodes in a row each of 127 would cause
11788    * the first join to succeed to get to 254, but then there wouldn't
11789    * be room for the next one, which could at be one of those split
11790    * multi-char folds.  I don't know of any fool-proof solution.  One
11791    * could back off to end with only a code point that isn't such a
11792    * non-final, but it is possible for there not to be any in the
11793    * entire node. */
11794    for (p = RExC_parse - 1;
11795     len < upper_parse && p < RExC_end;
11796     len++)
11797    {
11798     oldp = p;
11799
11800     if (RExC_flags & RXf_PMf_EXTENDED)
11801      p = regpatws(pRExC_state, p,
11802           TRUE); /* means recognize comments */
11803     switch ((U8)*p) {
11804     case '^':
11805     case '$':
11806     case '.':
11807     case '[':
11808     case '(':
11809     case ')':
11810     case '|':
11811      goto loopdone;
11812     case '\\':
11813      /* Literal Escapes Switch
11814
11815      This switch is meant to handle escape sequences that
11816      resolve to a literal character.
11817
11818      Every escape sequence that represents something
11819      else, like an assertion or a char class, is handled
11820      in the switch marked 'Special Escapes' above in this
11821      routine, but also has an entry here as anything that
11822      isn't explicitly mentioned here will be treated as
11823      an unescaped equivalent literal.
11824      */
11825
11826      switch ((U8)*++p) {
11827      /* These are all the special escapes. */
11828      case 'A':             /* Start assertion */
11829      case 'b': case 'B':   /* Word-boundary assertion*/
11830      case 'C':             /* Single char !DANGEROUS! */
11831      case 'd': case 'D':   /* digit class */
11832      case 'g': case 'G':   /* generic-backref, pos assertion */
11833      case 'h': case 'H':   /* HORIZWS */
11834      case 'k': case 'K':   /* named backref, keep marker */
11835      case 'p': case 'P':   /* Unicode property */
11836        case 'R':   /* LNBREAK */
11837      case 's': case 'S':   /* space class */
11838      case 'v': case 'V':   /* VERTWS */
11839      case 'w': case 'W':   /* word class */
11840      case 'X':             /* eXtended Unicode "combining
11841            character sequence" */
11842      case 'z': case 'Z':   /* End of line/string assertion */
11843       --p;
11844       goto loopdone;
11845
11846      /* Anything after here is an escape that resolves to a
11847      literal. (Except digits, which may or may not)
11848      */
11849      case 'n':
11850       ender = '\n';
11851       p++;
11852       break;
11853      case 'N': /* Handle a single-code point named character. */
11854       /* The options cause it to fail if a multiple code
11855       * point sequence.  Handle those in the switch() above
11856       * */
11857       RExC_parse = p + 1;
11858       if (! grok_bslash_N(pRExC_state, NULL, &ender,
11859            flagp, depth, FALSE,
11860            FALSE /* not strict */ ))
11861       {
11862        if (*flagp & RESTART_UTF8)
11863         FAIL("panic: grok_bslash_N set RESTART_UTF8");
11864        RExC_parse = p = oldp;
11865        goto loopdone;
11866       }
11867       p = RExC_parse;
11868       if (ender > 0xff) {
11869        REQUIRE_UTF8;
11870       }
11871       break;
11872      case 'r':
11873       ender = '\r';
11874       p++;
11875       break;
11876      case 't':
11877       ender = '\t';
11878       p++;
11879       break;
11880      case 'f':
11881       ender = '\f';
11882       p++;
11883       break;
11884      case 'e':
11885       ender = ASCII_TO_NATIVE('\033');
11886       p++;
11887       break;
11888      case 'a':
11889       ender = '\a';
11890       p++;
11891       break;
11892      case 'o':
11893       {
11894        UV result;
11895        const char* error_msg;
11896
11897        bool valid = grok_bslash_o(&p,
11898              &result,
11899              &error_msg,
11900              TRUE, /* out warnings */
11901              FALSE, /* not strict */
11902              TRUE, /* Output warnings
11903                 for non-
11904                 portables */
11905              UTF);
11906        if (! valid) {
11907         RExC_parse = p; /* going to die anyway; point
11908             to exact spot of failure */
11909         vFAIL(error_msg);
11910        }
11911        ender = result;
11912        if (PL_encoding && ender < 0x100) {
11913         goto recode_encoding;
11914        }
11915        if (ender > 0xff) {
11916         REQUIRE_UTF8;
11917        }
11918        break;
11919       }
11920      case 'x':
11921       {
11922        UV result = UV_MAX; /* initialize to erroneous
11923             value */
11924        const char* error_msg;
11925
11926        bool valid = grok_bslash_x(&p,
11927              &result,
11928              &error_msg,
11929              TRUE, /* out warnings */
11930              FALSE, /* not strict */
11931              TRUE, /* Output warnings
11932                 for non-
11933                 portables */
11934              UTF);
11935        if (! valid) {
11936         RExC_parse = p; /* going to die anyway; point
11937             to exact spot of failure */
11938         vFAIL(error_msg);
11939        }
11940        ender = result;
11941
11942        if (PL_encoding && ender < 0x100) {
11943         goto recode_encoding;
11944        }
11945        if (ender > 0xff) {
11946         REQUIRE_UTF8;
11947        }
11948        break;
11949       }
11950      case 'c':
11951       p++;
11952       ender = grok_bslash_c(*p++, SIZE_ONLY);
11953       break;
11954      case '8': case '9': /* must be a backreference */
11955       --p;
11956       goto loopdone;
11957      case '1': case '2': case '3':case '4':
11958      case '5': case '6': case '7':
11959       /* When we parse backslash escapes there is ambiguity
11960       * between backreferences and octal escapes. Any escape
11961       * from \1 - \9 is a backreference, any multi-digit
11962       * escape which does not start with 0 and which when
11963       * evaluated as decimal could refer to an already
11964       * parsed capture buffer is a backslash. Anything else
11965       * is octal.
11966       *
11967       * Note this implies that \118 could be interpreted as
11968       * 118 OR as "\11" . "8" depending on whether there
11969       * were 118 capture buffers defined already in the
11970       * pattern.  */
11971
11972       /* NOTE, RExC_npar is 1 more than the actual number of
11973       * parens we have seen so far, hence the < RExC_npar below. */
11974
11975       if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
11976       {  /* Not to be treated as an octal constant, go
11977         find backref */
11978        --p;
11979        goto loopdone;
11980       }
11981       /* FALLTHROUGH */
11982      case '0':
11983       {
11984        I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11985        STRLEN numlen = 3;
11986        ender = grok_oct(p, &numlen, &flags, NULL);
11987        if (ender > 0xff) {
11988         REQUIRE_UTF8;
11989        }
11990        p += numlen;
11991        if (SIZE_ONLY   /* like \08, \178 */
11992         && numlen < 3
11993         && p < RExC_end
11994         && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11995        {
11996         reg_warn_non_literal_string(
11997           p + 1,
11998           form_short_octal_warning(p, numlen));
11999        }
12000       }
12001       if (PL_encoding && ender < 0x100)
12002        goto recode_encoding;
12003       break;
12004      recode_encoding:
12005       if (! RExC_override_recoding) {
12006        SV* enc = PL_encoding;
12007        ender = reg_recode((const char)(U8)ender, &enc);
12008        if (!enc && SIZE_ONLY)
12009         ckWARNreg(p, "Invalid escape in the specified encoding");
12010        REQUIRE_UTF8;
12011       }
12012       break;
12013      case '\0':
12014       if (p >= RExC_end)
12015        FAIL("Trailing \\");
12016       /* FALLTHROUGH */
12017      default:
12018       if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12019        /* Include any { following the alpha to emphasize
12020        * that it could be part of an escape at some point
12021        * in the future */
12022        int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12023        ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12024       }
12025       goto normal_default;
12026      } /* End of switch on '\' */
12027      break;
12028     case '{':
12029      /* Currently we don't warn when the lbrace is at the start
12030      * of a construct.  This catches it in the middle of a
12031      * literal string, or when its the first thing after
12032      * something like "\b" */
12033      if (! SIZE_ONLY
12034       && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12035      {
12036       ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12037      }
12038      /*FALLTHROUGH*/
12039     default:    /* A literal character */
12040     normal_default:
12041      if (UTF8_IS_START(*p) && UTF) {
12042       STRLEN numlen;
12043       ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12044            &numlen, UTF8_ALLOW_DEFAULT);
12045       p += numlen;
12046      }
12047      else
12048       ender = (U8) *p++;
12049      break;
12050     } /* End of switch on the literal */
12051
12052     /* Here, have looked at the literal character and <ender>
12053     * contains its ordinal, <p> points to the character after it
12054     */
12055
12056     if ( RExC_flags & RXf_PMf_EXTENDED)
12057      p = regpatws(pRExC_state, p,
12058           TRUE); /* means recognize comments */
12059
12060     /* If the next thing is a quantifier, it applies to this
12061     * character only, which means that this character has to be in
12062     * its own node and can't just be appended to the string in an
12063     * existing node, so if there are already other characters in
12064     * the node, close the node with just them, and set up to do
12065     * this character again next time through, when it will be the
12066     * only thing in its new node */
12067     if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12068     {
12069      p = oldp;
12070      goto loopdone;
12071     }
12072
12073     if (! FOLD   /* The simple case, just append the literal */
12074      || (LOC  /* Also don't fold for tricky chars under /l */
12075       && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12076     {
12077      if (UTF) {
12078       const STRLEN unilen = reguni(pRExC_state, ender, s);
12079       if (unilen > 0) {
12080       s   += unilen;
12081       len += unilen;
12082       }
12083
12084       /* The loop increments <len> each time, as all but this
12085       * path (and one other) through it add a single byte to
12086       * the EXACTish node.  But this one has changed len to
12087       * be the correct final value, so subtract one to
12088       * cancel out the increment that follows */
12089       len--;
12090      }
12091      else {
12092       REGC((char)ender, s++);
12093      }
12094
12095      /* Can get here if folding only if is one of the /l
12096      * characters whose fold depends on the locale.  The
12097      * occurrence of any of these indicate that we can't
12098      * simplify things */
12099      if (FOLD) {
12100       maybe_exact = FALSE;
12101       maybe_exactfu = FALSE;
12102      }
12103     }
12104     else             /* FOLD */
12105      if (! ( UTF
12106       /* See comments for join_exact() as to why we fold this
12107       * non-UTF at compile time */
12108       || (node_type == EXACTFU
12109        && ender == LATIN_SMALL_LETTER_SHARP_S)))
12110     {
12111      /* Here, are folding and are not UTF-8 encoded; therefore
12112      * the character must be in the range 0-255, and is not /l
12113      * (Not /l because we already handled these under /l in
12114      * is_PROBLEMATIC_LOCALE_FOLD_cp */
12115      if (IS_IN_SOME_FOLD_L1(ender)) {
12116       maybe_exact = FALSE;
12117
12118       /* See if the character's fold differs between /d and
12119       * /u.  This includes the multi-char fold SHARP S to
12120       * 'ss' */
12121       if (maybe_exactfu
12122        && (PL_fold[ender] != PL_fold_latin1[ender]
12123         || ender == LATIN_SMALL_LETTER_SHARP_S
12124         || (len > 0
12125         && isARG2_lower_or_UPPER_ARG1('s', ender)
12126         && isARG2_lower_or_UPPER_ARG1('s',
12127                 *(s-1)))))
12128       {
12129        maybe_exactfu = FALSE;
12130       }
12131      }
12132
12133      /* Even when folding, we store just the input character, as
12134      * we have an array that finds its fold quickly */
12135      *(s++) = (char) ender;
12136     }
12137     else {  /* FOLD and UTF */
12138      /* Unlike the non-fold case, we do actually have to
12139      * calculate the results here in pass 1.  This is for two
12140      * reasons, the folded length may be longer than the
12141      * unfolded, and we have to calculate how many EXACTish
12142      * nodes it will take; and we may run out of room in a node
12143      * in the middle of a potential multi-char fold, and have
12144      * to back off accordingly.  (Hence we can't use REGC for
12145      * the simple case just below.) */
12146
12147      UV folded;
12148      if (isASCII(ender)) {
12149       folded = toFOLD(ender);
12150       *(s)++ = (U8) folded;
12151      }
12152      else {
12153       STRLEN foldlen;
12154
12155       folded = _to_uni_fold_flags(
12156          ender,
12157          (U8 *) s,
12158          &foldlen,
12159          FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12160               ? FOLD_FLAGS_NOMIX_ASCII
12161               : 0));
12162       s += foldlen;
12163
12164       /* The loop increments <len> each time, as all but this
12165       * path (and one other) through it add a single byte to
12166       * the EXACTish node.  But this one has changed len to
12167       * be the correct final value, so subtract one to
12168       * cancel out the increment that follows */
12169       len += foldlen - 1;
12170      }
12171      /* If this node only contains non-folding code points so
12172      * far, see if this new one is also non-folding */
12173      if (maybe_exact) {
12174       if (folded != ender) {
12175        maybe_exact = FALSE;
12176       }
12177       else {
12178        /* Here the fold is the original; we have to check
12179        * further to see if anything folds to it */
12180        if (_invlist_contains_cp(PL_utf8_foldable,
12181               ender))
12182        {
12183         maybe_exact = FALSE;
12184        }
12185       }
12186      }
12187      ender = folded;
12188     }
12189
12190     if (next_is_quantifier) {
12191
12192      /* Here, the next input is a quantifier, and to get here,
12193      * the current character is the only one in the node.
12194      * Also, here <len> doesn't include the final byte for this
12195      * character */
12196      len++;
12197      goto loopdone;
12198     }
12199
12200    } /* End of loop through literal characters */
12201
12202    /* Here we have either exhausted the input or ran out of room in
12203    * the node.  (If we encountered a character that can't be in the
12204    * node, transfer is made directly to <loopdone>, and so we
12205    * wouldn't have fallen off the end of the loop.)  In the latter
12206    * case, we artificially have to split the node into two, because
12207    * we just don't have enough space to hold everything.  This
12208    * creates a problem if the final character participates in a
12209    * multi-character fold in the non-final position, as a match that
12210    * should have occurred won't, due to the way nodes are matched,
12211    * and our artificial boundary.  So back off until we find a non-
12212    * problematic character -- one that isn't at the beginning or
12213    * middle of such a fold.  (Either it doesn't participate in any
12214    * folds, or appears only in the final position of all the folds it
12215    * does participate in.)  A better solution with far fewer false
12216    * positives, and that would fill the nodes more completely, would
12217    * be to actually have available all the multi-character folds to
12218    * test against, and to back-off only far enough to be sure that
12219    * this node isn't ending with a partial one.  <upper_parse> is set
12220    * further below (if we need to reparse the node) to include just
12221    * up through that final non-problematic character that this code
12222    * identifies, so when it is set to less than the full node, we can
12223    * skip the rest of this */
12224    if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12225
12226     const STRLEN full_len = len;
12227
12228     assert(len >= MAX_NODE_STRING_SIZE);
12229
12230     /* Here, <s> points to the final byte of the final character.
12231     * Look backwards through the string until find a non-
12232     * problematic character */
12233
12234     if (! UTF) {
12235
12236      /* This has no multi-char folds to non-UTF characters */
12237      if (ASCII_FOLD_RESTRICTED) {
12238       goto loopdone;
12239      }
12240
12241      while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12242      len = s - s0 + 1;
12243     }
12244     else {
12245      if (!  PL_NonL1NonFinalFold) {
12246       PL_NonL1NonFinalFold = _new_invlist_C_array(
12247           NonL1_Perl_Non_Final_Folds_invlist);
12248      }
12249
12250      /* Point to the first byte of the final character */
12251      s = (char *) utf8_hop((U8 *) s, -1);
12252
12253      while (s >= s0) {   /* Search backwards until find
12254           non-problematic char */
12255       if (UTF8_IS_INVARIANT(*s)) {
12256
12257        /* There are no ascii characters that participate
12258        * in multi-char folds under /aa.  In EBCDIC, the
12259        * non-ascii invariants are all control characters,
12260        * so don't ever participate in any folds. */
12261        if (ASCII_FOLD_RESTRICTED
12262         || ! IS_NON_FINAL_FOLD(*s))
12263        {
12264         break;
12265        }
12266       }
12267       else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12268        if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12269                 *s, *(s+1))))
12270        {
12271         break;
12272        }
12273       }
12274       else if (! _invlist_contains_cp(
12275           PL_NonL1NonFinalFold,
12276           valid_utf8_to_uvchr((U8 *) s, NULL)))
12277       {
12278        break;
12279       }
12280
12281       /* Here, the current character is problematic in that
12282       * it does occur in the non-final position of some
12283       * fold, so try the character before it, but have to
12284       * special case the very first byte in the string, so
12285       * we don't read outside the string */
12286       s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12287      } /* End of loop backwards through the string */
12288
12289      /* If there were only problematic characters in the string,
12290      * <s> will point to before s0, in which case the length
12291      * should be 0, otherwise include the length of the
12292      * non-problematic character just found */
12293      len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12294     }
12295
12296     /* Here, have found the final character, if any, that is
12297     * non-problematic as far as ending the node without splitting
12298     * it across a potential multi-char fold.  <len> contains the
12299     * number of bytes in the node up-to and including that
12300     * character, or is 0 if there is no such character, meaning
12301     * the whole node contains only problematic characters.  In
12302     * this case, give up and just take the node as-is.  We can't
12303     * do any better */
12304     if (len == 0) {
12305      len = full_len;
12306
12307      /* If the node ends in an 's' we make sure it stays EXACTF,
12308      * as if it turns into an EXACTFU, it could later get
12309      * joined with another 's' that would then wrongly match
12310      * the sharp s */
12311      if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
12312      {
12313       maybe_exactfu = FALSE;
12314      }
12315     } else {
12316
12317      /* Here, the node does contain some characters that aren't
12318      * problematic.  If one such is the final character in the
12319      * node, we are done */
12320      if (len == full_len) {
12321       goto loopdone;
12322      }
12323      else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12324
12325       /* If the final character is problematic, but the
12326       * penultimate is not, back-off that last character to
12327       * later start a new node with it */
12328       p = oldp;
12329       goto loopdone;
12330      }
12331
12332      /* Here, the final non-problematic character is earlier
12333      * in the input than the penultimate character.  What we do
12334      * is reparse from the beginning, going up only as far as
12335      * this final ok one, thus guaranteeing that the node ends
12336      * in an acceptable character.  The reason we reparse is
12337      * that we know how far in the character is, but we don't
12338      * know how to correlate its position with the input parse.
12339      * An alternate implementation would be to build that
12340      * correlation as we go along during the original parse,
12341      * but that would entail extra work for every node, whereas
12342      * this code gets executed only when the string is too
12343      * large for the node, and the final two characters are
12344      * problematic, an infrequent occurrence.  Yet another
12345      * possible strategy would be to save the tail of the
12346      * string, and the next time regatom is called, initialize
12347      * with that.  The problem with this is that unless you
12348      * back off one more character, you won't be guaranteed
12349      * regatom will get called again, unless regbranch,
12350      * regpiece ... are also changed.  If you do back off that
12351      * extra character, so that there is input guaranteed to
12352      * force calling regatom, you can't handle the case where
12353      * just the first character in the node is acceptable.  I
12354      * (khw) decided to try this method which doesn't have that
12355      * pitfall; if performance issues are found, we can do a
12356      * combination of the current approach plus that one */
12357      upper_parse = len;
12358      len = 0;
12359      s = s0;
12360      goto reparse;
12361     }
12362    }   /* End of verifying node ends with an appropriate char */
12363
12364   loopdone:   /* Jumped to when encounters something that shouldn't be in
12365      the node */
12366
12367    /* I (khw) don't know if you can get here with zero length, but the
12368    * old code handled this situation by creating a zero-length EXACT
12369    * node.  Might as well be NOTHING instead */
12370    if (len == 0) {
12371     OP(ret) = NOTHING;
12372    }
12373    else {
12374     if (FOLD) {
12375      /* If 'maybe_exact' is still set here, means there are no
12376      * code points in the node that participate in folds;
12377      * similarly for 'maybe_exactfu' and code points that match
12378      * differently depending on UTF8ness of the target string
12379      * (for /u), or depending on locale for /l */
12380      if (maybe_exact) {
12381       OP(ret) = EXACT;
12382      }
12383      else if (maybe_exactfu) {
12384       OP(ret) = EXACTFU;
12385      }
12386     }
12387     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12388           FALSE /* Don't look to see if could
12389              be turned into an EXACT
12390              node, as we have already
12391              computed that */
12392           );
12393    }
12394
12395    RExC_parse = p - 1;
12396    Set_Node_Cur_Length(ret, parse_start);
12397    nextchar(pRExC_state);
12398    {
12399     /* len is STRLEN which is unsigned, need to copy to signed */
12400     IV iv = len;
12401     if (iv < 0)
12402      vFAIL("Internal disaster");
12403    }
12404
12405   } /* End of label 'defchar:' */
12406   break;
12407  } /* End of giant switch on input character */
12408
12409  return(ret);
12410 }
12411
12412 STATIC char *
12413 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12414 {
12415  /* Returns the next non-pattern-white space, non-comment character (the
12416  * latter only if 'recognize_comment is true) in the string p, which is
12417  * ended by RExC_end.  See also reg_skipcomment */
12418  const char *e = RExC_end;
12419
12420  PERL_ARGS_ASSERT_REGPATWS;
12421
12422  while (p < e) {
12423   STRLEN len;
12424   if ((len = is_PATWS_safe(p, e, UTF))) {
12425    p += len;
12426   }
12427   else if (recognize_comment && *p == '#') {
12428    p = reg_skipcomment(pRExC_state, p);
12429   }
12430   else
12431    break;
12432  }
12433  return p;
12434 }
12435
12436 STATIC void
12437 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12438 {
12439  /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12440  * sets up the bitmap and any flags, removing those code points from the
12441  * inversion list, setting it to NULL should it become completely empty */
12442
12443  PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12444  assert(PL_regkind[OP(node)] == ANYOF);
12445
12446  ANYOF_BITMAP_ZERO(node);
12447  if (*invlist_ptr) {
12448
12449   /* This gets set if we actually need to modify things */
12450   bool change_invlist = FALSE;
12451
12452   UV start, end;
12453
12454   /* Start looking through *invlist_ptr */
12455   invlist_iterinit(*invlist_ptr);
12456   while (invlist_iternext(*invlist_ptr, &start, &end)) {
12457    UV high;
12458    int i;
12459
12460    if (end == UV_MAX && start <= 256) {
12461     ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12462    }
12463    else if (end >= 256) {
12464     ANYOF_FLAGS(node) |= ANYOF_UTF8;
12465    }
12466
12467    /* Quit if are above what we should change */
12468    if (start > 255) {
12469     break;
12470    }
12471
12472    change_invlist = TRUE;
12473
12474    /* Set all the bits in the range, up to the max that we are doing */
12475    high = (end < 255) ? end : 255;
12476    for (i = start; i <= (int) high; i++) {
12477     if (! ANYOF_BITMAP_TEST(node, i)) {
12478      ANYOF_BITMAP_SET(node, i);
12479     }
12480    }
12481   }
12482   invlist_iterfinish(*invlist_ptr);
12483
12484   /* Done with loop; remove any code points that are in the bitmap from
12485   * *invlist_ptr; similarly for code points above latin1 if we have a
12486   * flag to match all of them anyways */
12487   if (change_invlist) {
12488    _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12489   }
12490   if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12491    _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12492   }
12493
12494   /* If have completely emptied it, remove it completely */
12495   if (_invlist_len(*invlist_ptr) == 0) {
12496    SvREFCNT_dec_NN(*invlist_ptr);
12497    *invlist_ptr = NULL;
12498   }
12499  }
12500 }
12501
12502 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12503    Character classes ([:foo:]) can also be negated ([:^foo:]).
12504    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12505    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12506    but trigger failures because they are currently unimplemented. */
12507
12508 #define POSIXCC_DONE(c)   ((c) == ':')
12509 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12510 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12511
12512 PERL_STATIC_INLINE I32
12513 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12514 {
12515  dVAR;
12516  I32 namedclass = OOB_NAMEDCLASS;
12517
12518  PERL_ARGS_ASSERT_REGPPOSIXCC;
12519
12520  if (value == '[' && RExC_parse + 1 < RExC_end &&
12521   /* I smell either [: or [= or [. -- POSIX has been here, right? */
12522   POSIXCC(UCHARAT(RExC_parse)))
12523  {
12524   const char c = UCHARAT(RExC_parse);
12525   char* const s = RExC_parse++;
12526
12527   while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12528    RExC_parse++;
12529   if (RExC_parse == RExC_end) {
12530    if (strict) {
12531
12532     /* Try to give a better location for the error (than the end of
12533     * the string) by looking for the matching ']' */
12534     RExC_parse = s;
12535     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12536      RExC_parse++;
12537     }
12538     vFAIL2("Unmatched '%c' in POSIX class", c);
12539    }
12540    /* Grandfather lone [:, [=, [. */
12541    RExC_parse = s;
12542   }
12543   else {
12544    const char* const t = RExC_parse++; /* skip over the c */
12545    assert(*t == c);
12546
12547    if (UCHARAT(RExC_parse) == ']') {
12548     const char *posixcc = s + 1;
12549     RExC_parse++; /* skip over the ending ] */
12550
12551     if (*s == ':') {
12552      const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12553      const I32 skip = t - posixcc;
12554
12555      /* Initially switch on the length of the name.  */
12556      switch (skip) {
12557      case 4:
12558       if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12559               this is the Perl \w
12560               */
12561        namedclass = ANYOF_WORDCHAR;
12562       break;
12563      case 5:
12564       /* Names all of length 5.  */
12565       /* alnum alpha ascii blank cntrl digit graph lower
12566       print punct space upper  */
12567       /* Offset 4 gives the best switch position.  */
12568       switch (posixcc[4]) {
12569       case 'a':
12570        if (memEQ(posixcc, "alph", 4)) /* alpha */
12571         namedclass = ANYOF_ALPHA;
12572        break;
12573       case 'e':
12574        if (memEQ(posixcc, "spac", 4)) /* space */
12575         namedclass = ANYOF_PSXSPC;
12576        break;
12577       case 'h':
12578        if (memEQ(posixcc, "grap", 4)) /* graph */
12579         namedclass = ANYOF_GRAPH;
12580        break;
12581       case 'i':
12582        if (memEQ(posixcc, "asci", 4)) /* ascii */
12583         namedclass = ANYOF_ASCII;
12584        break;
12585       case 'k':
12586        if (memEQ(posixcc, "blan", 4)) /* blank */
12587         namedclass = ANYOF_BLANK;
12588        break;
12589       case 'l':
12590        if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12591         namedclass = ANYOF_CNTRL;
12592        break;
12593       case 'm':
12594        if (memEQ(posixcc, "alnu", 4)) /* alnum */
12595         namedclass = ANYOF_ALPHANUMERIC;
12596        break;
12597       case 'r':
12598        if (memEQ(posixcc, "lowe", 4)) /* lower */
12599         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12600        else if (memEQ(posixcc, "uppe", 4)) /* upper */
12601         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12602        break;
12603       case 't':
12604        if (memEQ(posixcc, "digi", 4)) /* digit */
12605         namedclass = ANYOF_DIGIT;
12606        else if (memEQ(posixcc, "prin", 4)) /* print */
12607         namedclass = ANYOF_PRINT;
12608        else if (memEQ(posixcc, "punc", 4)) /* punct */
12609         namedclass = ANYOF_PUNCT;
12610        break;
12611       }
12612       break;
12613      case 6:
12614       if (memEQ(posixcc, "xdigit", 6))
12615        namedclass = ANYOF_XDIGIT;
12616       break;
12617      }
12618
12619      if (namedclass == OOB_NAMEDCLASS)
12620       vFAIL2utf8f(
12621        "POSIX class [:%"UTF8f":] unknown",
12622        UTF8fARG(UTF, t - s - 1, s + 1));
12623
12624      /* The #defines are structured so each complement is +1 to
12625      * the normal one */
12626      if (complement) {
12627       namedclass++;
12628      }
12629      assert (posixcc[skip] == ':');
12630      assert (posixcc[skip+1] == ']');
12631     } else if (!SIZE_ONLY) {
12632      /* [[=foo=]] and [[.foo.]] are still future. */
12633
12634      /* adjust RExC_parse so the warning shows after
12635      the class closes */
12636      while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12637       RExC_parse++;
12638      vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12639     }
12640    } else {
12641     /* Maternal grandfather:
12642     * "[:" ending in ":" but not in ":]" */
12643     if (strict) {
12644      vFAIL("Unmatched '[' in POSIX class");
12645     }
12646
12647     /* Grandfather lone [:, [=, [. */
12648     RExC_parse = s;
12649    }
12650   }
12651  }
12652
12653  return namedclass;
12654 }
12655
12656 STATIC bool
12657 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
12658 {
12659  /* This applies some heuristics at the current parse position (which should
12660  * be at a '[') to see if what follows might be intended to be a [:posix:]
12661  * class.  It returns true if it really is a posix class, of course, but it
12662  * also can return true if it thinks that what was intended was a posix
12663  * class that didn't quite make it.
12664  *
12665  * It will return true for
12666  *      [:alphanumerics:
12667  *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
12668  *                         ')' indicating the end of the (?[
12669  *      [:any garbage including %^&$ punctuation:]
12670  *
12671  * This is designed to be called only from S_handle_regex_sets; it could be
12672  * easily adapted to be called from the spot at the beginning of regclass()
12673  * that checks to see in a normal bracketed class if the surrounding []
12674  * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
12675  * change long-standing behavior, so I (khw) didn't do that */
12676  char* p = RExC_parse + 1;
12677  char first_char = *p;
12678
12679  PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12680
12681  assert(*(p - 1) == '[');
12682
12683  if (! POSIXCC(first_char)) {
12684   return FALSE;
12685  }
12686
12687  p++;
12688  while (p < RExC_end && isWORDCHAR(*p)) p++;
12689
12690  if (p >= RExC_end) {
12691   return FALSE;
12692  }
12693
12694  if (p - RExC_parse > 2    /* Got at least 1 word character */
12695   && (*p == first_char
12696    || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12697  {
12698   return TRUE;
12699  }
12700
12701  p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12702
12703  return (p
12704    && p - RExC_parse > 2 /* [:] evaluates to colon;
12705          [::] is a bad posix class. */
12706    && first_char == *(p - 1));
12707 }
12708
12709 STATIC regnode *
12710 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12711      I32 *flagp, U32 depth,
12712      char * const oregcomp_parse)
12713 {
12714  /* Handle the (?[...]) construct to do set operations */
12715
12716  U8 curchar;
12717  UV start, end; /* End points of code point ranges */
12718  SV* result_string;
12719  char *save_end, *save_parse;
12720  SV* final;
12721  STRLEN len;
12722  regnode* node;
12723  AV* stack;
12724  const bool save_fold = FOLD;
12725
12726  GET_RE_DEBUG_FLAGS_DECL;
12727
12728  PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12729
12730  if (LOC) {
12731   vFAIL("(?[...]) not valid in locale");
12732  }
12733  RExC_uni_semantics = 1;
12734
12735  /* This will return only an ANYOF regnode, or (unlikely) something smaller
12736  * (such as EXACT).  Thus we can skip most everything if just sizing.  We
12737  * call regclass to handle '[]' so as to not have to reinvent its parsing
12738  * rules here (throwing away the size it computes each time).  And, we exit
12739  * upon an unescaped ']' that isn't one ending a regclass.  To do both
12740  * these things, we need to realize that something preceded by a backslash
12741  * is escaped, so we have to keep track of backslashes */
12742  if (SIZE_ONLY) {
12743   UV depth = 0; /* how many nested (?[...]) constructs */
12744
12745   Perl_ck_warner_d(aTHX_
12746    packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12747    "The regex_sets feature is experimental" REPORT_LOCATION,
12748     UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12749     UTF8fARG(UTF,
12750       RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12751       RExC_precomp + (RExC_parse - RExC_precomp)));
12752
12753   while (RExC_parse < RExC_end) {
12754    SV* current = NULL;
12755    RExC_parse = regpatws(pRExC_state, RExC_parse,
12756           TRUE); /* means recognize comments */
12757    switch (*RExC_parse) {
12758     case '?':
12759      if (RExC_parse[1] == '[') depth++, RExC_parse++;
12760      /* FALLTHROUGH */
12761     default:
12762      break;
12763     case '\\':
12764      /* Skip the next byte (which could cause us to end up in
12765      * the middle of a UTF-8 character, but since none of those
12766      * are confusable with anything we currently handle in this
12767      * switch (invariants all), it's safe.  We'll just hit the
12768      * default: case next time and keep on incrementing until
12769      * we find one of the invariants we do handle. */
12770      RExC_parse++;
12771      break;
12772     case '[':
12773     {
12774      /* If this looks like it is a [:posix:] class, leave the
12775      * parse pointer at the '[' to fool regclass() into
12776      * thinking it is part of a '[[:posix:]]'.  That function
12777      * will use strict checking to force a syntax error if it
12778      * doesn't work out to a legitimate class */
12779      bool is_posix_class
12780          = could_it_be_a_POSIX_class(pRExC_state);
12781      if (! is_posix_class) {
12782       RExC_parse++;
12783      }
12784
12785      /* regclass() can only return RESTART_UTF8 if multi-char
12786      folds are allowed.  */
12787      if (!regclass(pRExC_state, flagp,depth+1,
12788         is_posix_class, /* parse the whole char
12789              class only if not a
12790              posix class */
12791         FALSE, /* don't allow multi-char folds */
12792         TRUE, /* silence non-portable warnings. */
12793         &current))
12794       FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12795        (UV) *flagp);
12796
12797      /* function call leaves parse pointing to the ']', except
12798      * if we faked it */
12799      if (is_posix_class) {
12800       RExC_parse--;
12801      }
12802
12803      SvREFCNT_dec(current);   /* In case it returned something */
12804      break;
12805     }
12806
12807     case ']':
12808      if (depth--) break;
12809      RExC_parse++;
12810      if (RExC_parse < RExC_end
12811       && *RExC_parse == ')')
12812      {
12813       node = reganode(pRExC_state, ANYOF, 0);
12814       RExC_size += ANYOF_SKIP;
12815       nextchar(pRExC_state);
12816       Set_Node_Length(node,
12817         RExC_parse - oregcomp_parse + 1); /* MJD */
12818       return node;
12819      }
12820      goto no_close;
12821    }
12822    RExC_parse++;
12823   }
12824
12825   no_close:
12826   FAIL("Syntax error in (?[...])");
12827  }
12828
12829  /* Pass 2 only after this.  Everything in this construct is a
12830  * metacharacter.  Operands begin with either a '\' (for an escape
12831  * sequence), or a '[' for a bracketed character class.  Any other
12832  * character should be an operator, or parenthesis for grouping.  Both
12833  * types of operands are handled by calling regclass() to parse them.  It
12834  * is called with a parameter to indicate to return the computed inversion
12835  * list.  The parsing here is implemented via a stack.  Each entry on the
12836  * stack is a single character representing one of the operators, or the
12837  * '('; or else a pointer to an operand inversion list. */
12838
12839 #define IS_OPERAND(a)  (! SvIOK(a))
12840
12841  /* The stack starts empty.  It is a syntax error if the first thing parsed
12842  * is a binary operator; everything else is pushed on the stack.  When an
12843  * operand is parsed, the top of the stack is examined.  If it is a binary
12844  * operator, the item before it should be an operand, and both are replaced
12845  * by the result of doing that operation on the new operand and the one on
12846  * the stack.   Thus a sequence of binary operands is reduced to a single
12847  * one before the next one is parsed.
12848  *
12849  * A unary operator may immediately follow a binary in the input, for
12850  * example
12851  *      [a] + ! [b]
12852  * When an operand is parsed and the top of the stack is a unary operator,
12853  * the operation is performed, and then the stack is rechecked to see if
12854  * this new operand is part of a binary operation; if so, it is handled as
12855  * above.
12856  *
12857  * A '(' is simply pushed on the stack; it is valid only if the stack is
12858  * empty, or the top element of the stack is an operator or another '('
12859  * (for which the parenthesized expression will become an operand).  By the
12860  * time the corresponding ')' is parsed everything in between should have
12861  * been parsed and evaluated to a single operand (or else is a syntax
12862  * error), and is handled as a regular operand */
12863
12864  sv_2mortal((SV *)(stack = newAV()));
12865
12866  while (RExC_parse < RExC_end) {
12867   I32 top_index = av_tindex(stack);
12868   SV** top_ptr;
12869   SV* current = NULL;
12870
12871   /* Skip white space */
12872   RExC_parse = regpatws(pRExC_state, RExC_parse,
12873           TRUE /* means recognize comments */ );
12874   if (RExC_parse >= RExC_end) {
12875    Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12876   }
12877   if ((curchar = UCHARAT(RExC_parse)) == ']') {
12878    break;
12879   }
12880
12881   switch (curchar) {
12882
12883    case '?':
12884     if (av_tindex(stack) >= 0   /* This makes sure that we can
12885            safely subtract 1 from
12886            RExC_parse in the next clause.
12887            If we have something on the
12888            stack, we have parsed something
12889            */
12890      && UCHARAT(RExC_parse - 1) == '('
12891      && RExC_parse < RExC_end)
12892     {
12893      /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12894      * This happens when we have some thing like
12895      *
12896      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12897      *   ...
12898      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
12899      *
12900      * Here we would be handling the interpolated
12901      * '$thai_or_lao'.  We handle this by a recursive call to
12902      * ourselves which returns the inversion list the
12903      * interpolated expression evaluates to.  We use the flags
12904      * from the interpolated pattern. */
12905      U32 save_flags = RExC_flags;
12906      const char * const save_parse = ++RExC_parse;
12907
12908      parse_lparen_question_flags(pRExC_state);
12909
12910      if (RExC_parse == save_parse  /* Makes sure there was at
12911              least one flag (or this
12912              embedding wasn't compiled)
12913             */
12914       || RExC_parse >= RExC_end - 4
12915       || UCHARAT(RExC_parse) != ':'
12916       || UCHARAT(++RExC_parse) != '('
12917       || UCHARAT(++RExC_parse) != '?'
12918       || UCHARAT(++RExC_parse) != '[')
12919      {
12920
12921       /* In combination with the above, this moves the
12922       * pointer to the point just after the first erroneous
12923       * character (or if there are no flags, to where they
12924       * should have been) */
12925       if (RExC_parse >= RExC_end - 4) {
12926        RExC_parse = RExC_end;
12927       }
12928       else if (RExC_parse != save_parse) {
12929        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12930       }
12931       vFAIL("Expecting '(?flags:(?[...'");
12932      }
12933      RExC_parse++;
12934      (void) handle_regex_sets(pRExC_state, &current, flagp,
12935              depth+1, oregcomp_parse);
12936
12937      /* Here, 'current' contains the embedded expression's
12938      * inversion list, and RExC_parse points to the trailing
12939      * ']'; the next character should be the ')' which will be
12940      * paired with the '(' that has been put on the stack, so
12941      * the whole embedded expression reduces to '(operand)' */
12942      RExC_parse++;
12943
12944      RExC_flags = save_flags;
12945      goto handle_operand;
12946     }
12947     /* FALLTHROUGH */
12948
12949    default:
12950     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12951     vFAIL("Unexpected character");
12952
12953    case '\\':
12954     /* regclass() can only return RESTART_UTF8 if multi-char
12955     folds are allowed.  */
12956     if (!regclass(pRExC_state, flagp,depth+1,
12957        TRUE, /* means parse just the next thing */
12958        FALSE, /* don't allow multi-char folds */
12959        FALSE, /* don't silence non-portable warnings.  */
12960        &current))
12961      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12962       (UV) *flagp);
12963     /* regclass() will return with parsing just the \ sequence,
12964     * leaving the parse pointer at the next thing to parse */
12965     RExC_parse--;
12966     goto handle_operand;
12967
12968    case '[':   /* Is a bracketed character class */
12969    {
12970     bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12971
12972     if (! is_posix_class) {
12973      RExC_parse++;
12974     }
12975
12976     /* regclass() can only return RESTART_UTF8 if multi-char
12977     folds are allowed.  */
12978     if(!regclass(pRExC_state, flagp,depth+1,
12979        is_posix_class, /* parse the whole char class
12980             only if not a posix class */
12981        FALSE, /* don't allow multi-char folds */
12982        FALSE, /* don't silence non-portable warnings.  */
12983        &current))
12984      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12985       (UV) *flagp);
12986     /* function call leaves parse pointing to the ']', except if we
12987     * faked it */
12988     if (is_posix_class) {
12989      RExC_parse--;
12990     }
12991
12992     goto handle_operand;
12993    }
12994
12995    case '&':
12996    case '|':
12997    case '+':
12998    case '-':
12999    case '^':
13000     if (top_index < 0
13001      || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13002      || ! IS_OPERAND(*top_ptr))
13003     {
13004      RExC_parse++;
13005      vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
13006     }
13007     av_push(stack, newSVuv(curchar));
13008     break;
13009
13010    case '!':
13011     av_push(stack, newSVuv(curchar));
13012     break;
13013
13014    case '(':
13015     if (top_index >= 0) {
13016      top_ptr = av_fetch(stack, top_index, FALSE);
13017      assert(top_ptr);
13018      if (IS_OPERAND(*top_ptr)) {
13019       RExC_parse++;
13020       vFAIL("Unexpected '(' with no preceding operator");
13021      }
13022     }
13023     av_push(stack, newSVuv(curchar));
13024     break;
13025
13026    case ')':
13027    {
13028     SV* lparen;
13029     if (top_index < 1
13030      || ! (current = av_pop(stack))
13031      || ! IS_OPERAND(current)
13032      || ! (lparen = av_pop(stack))
13033      || IS_OPERAND(lparen)
13034      || SvUV(lparen) != '(')
13035     {
13036      SvREFCNT_dec(current);
13037      RExC_parse++;
13038      vFAIL("Unexpected ')'");
13039     }
13040     top_index -= 2;
13041     SvREFCNT_dec_NN(lparen);
13042
13043     /* FALLTHROUGH */
13044    }
13045
13046    handle_operand:
13047
13048     /* Here, we have an operand to process, in 'current' */
13049
13050     if (top_index < 0) {    /* Just push if stack is empty */
13051      av_push(stack, current);
13052     }
13053     else {
13054      SV* top = av_pop(stack);
13055      SV *prev = NULL;
13056      char current_operator;
13057
13058      if (IS_OPERAND(top)) {
13059       SvREFCNT_dec_NN(top);
13060       SvREFCNT_dec_NN(current);
13061       vFAIL("Operand with no preceding operator");
13062      }
13063      current_operator = (char) SvUV(top);
13064      switch (current_operator) {
13065       case '(':   /* Push the '(' back on followed by the new
13066          operand */
13067        av_push(stack, top);
13068        av_push(stack, current);
13069        SvREFCNT_inc(top);  /* Counters the '_dec' done
13070             just after the 'break', so
13071             it doesn't get wrongly freed
13072             */
13073        break;
13074
13075       case '!':
13076        _invlist_invert(current);
13077
13078        /* Unlike binary operators, the top of the stack,
13079        * now that this unary one has been popped off, may
13080        * legally be an operator, and we now have operand
13081        * for it. */
13082        top_index--;
13083        SvREFCNT_dec_NN(top);
13084        goto handle_operand;
13085
13086       case '&':
13087        prev = av_pop(stack);
13088        _invlist_intersection(prev,
13089             current,
13090             &current);
13091        av_push(stack, current);
13092        break;
13093
13094       case '|':
13095       case '+':
13096        prev = av_pop(stack);
13097        _invlist_union(prev, current, &current);
13098        av_push(stack, current);
13099        break;
13100
13101       case '-':
13102        prev = av_pop(stack);;
13103        _invlist_subtract(prev, current, &current);
13104        av_push(stack, current);
13105        break;
13106
13107       case '^':   /* The union minus the intersection */
13108       {
13109        SV* i = NULL;
13110        SV* u = NULL;
13111        SV* element;
13112
13113        prev = av_pop(stack);
13114        _invlist_union(prev, current, &u);
13115        _invlist_intersection(prev, current, &i);
13116        /* _invlist_subtract will overwrite current
13117         without freeing what it already contains */
13118        element = current;
13119        _invlist_subtract(u, i, &current);
13120        av_push(stack, current);
13121        SvREFCNT_dec_NN(i);
13122        SvREFCNT_dec_NN(u);
13123        SvREFCNT_dec_NN(element);
13124        break;
13125       }
13126
13127       default:
13128        Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13129     }
13130     SvREFCNT_dec_NN(top);
13131     SvREFCNT_dec(prev);
13132    }
13133   }
13134
13135   RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13136  }
13137
13138  if (av_tindex(stack) < 0   /* Was empty */
13139   || ((final = av_pop(stack)) == NULL)
13140   || ! IS_OPERAND(final)
13141   || av_tindex(stack) >= 0)  /* More left on stack */
13142  {
13143   vFAIL("Incomplete expression within '(?[ ])'");
13144  }
13145
13146  /* Here, 'final' is the resultant inversion list from evaluating the
13147  * expression.  Return it if so requested */
13148  if (return_invlist) {
13149   *return_invlist = final;
13150   return END;
13151  }
13152
13153  /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13154  * expecting a string of ranges and individual code points */
13155  invlist_iterinit(final);
13156  result_string = newSVpvs("");
13157  while (invlist_iternext(final, &start, &end)) {
13158   if (start == end) {
13159    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13160   }
13161   else {
13162    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13163              start,          end);
13164   }
13165  }
13166
13167  save_parse = RExC_parse;
13168  RExC_parse = SvPV(result_string, len);
13169  save_end = RExC_end;
13170  RExC_end = RExC_parse + len;
13171
13172  /* We turn off folding around the call, as the class we have constructed
13173  * already has all folding taken into consideration, and we don't want
13174  * regclass() to add to that */
13175  RExC_flags &= ~RXf_PMf_FOLD;
13176  /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13177  */
13178  node = regclass(pRExC_state, flagp,depth+1,
13179      FALSE, /* means parse the whole char class */
13180      FALSE, /* don't allow multi-char folds */
13181      TRUE, /* silence non-portable warnings.  The above may very
13182        well have generated non-portable code points, but
13183        they're valid on this machine */
13184      NULL);
13185  if (!node)
13186   FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13187      PTR2UV(flagp));
13188  if (save_fold) {
13189   RExC_flags |= RXf_PMf_FOLD;
13190  }
13191  RExC_parse = save_parse + 1;
13192  RExC_end = save_end;
13193  SvREFCNT_dec_NN(final);
13194  SvREFCNT_dec_NN(result_string);
13195
13196  nextchar(pRExC_state);
13197  Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13198  return node;
13199 }
13200 #undef IS_OPERAND
13201
13202 STATIC void
13203 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13204 {
13205  /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13206  * innocent-looking character class, like /[ks]/i won't have to go out to
13207  * disk to find the possible matches.
13208  *
13209  * This should be called only for a Latin1-range code points, cp, which is
13210  * known to be involved in a fold with other code points above Latin1.  It
13211  * would give false results if /aa has been specified.  Multi-char folds
13212  * are outside the scope of this, and must be handled specially.
13213  *
13214  * XXX It would be better to generate these via regen, in case a new
13215  * version of the Unicode standard adds new mappings, though that is not
13216  * really likely, and may be caught by the default: case of the switch
13217  * below. */
13218
13219  PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13220
13221  switch (cp) {
13222   case 'k':
13223   case 'K':
13224   *invlist =
13225    add_cp_to_invlist(*invlist, KELVIN_SIGN);
13226    break;
13227   case 's':
13228   case 'S':
13229   *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13230    break;
13231   case MICRO_SIGN:
13232   *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13233   *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13234    break;
13235   case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13236   case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13237   *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13238    break;
13239   case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13240   *invlist = add_cp_to_invlist(*invlist,
13241           LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13242    break;
13243   case LATIN_SMALL_LETTER_SHARP_S:
13244   *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13245    break;
13246   case 'F': case 'f':
13247   case 'I': case 'i':
13248   case 'L': case 'l':
13249   case 'T': case 't':
13250   case 'A': case 'a':
13251   case 'H': case 'h':
13252   case 'J': case 'j':
13253   case 'N': case 'n':
13254   case 'W': case 'w':
13255   case 'Y': case 'y':
13256    /* These all are targets of multi-character folds from code points
13257    * that require UTF8 to express, so they can't match unless the
13258    * target string is in UTF-8, so no action here is necessary, as
13259    * regexec.c properly handles the general case for UTF-8 matching
13260    * and multi-char folds */
13261    break;
13262   default:
13263    /* Use deprecated warning to increase the chances of this being
13264    * output */
13265    ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13266    break;
13267  }
13268 }
13269
13270 /* The names of properties whose definitions are not known at compile time are
13271  * stored in this SV, after a constant heading.  So if the length has been
13272  * changed since initialization, then there is a run-time definition. */
13273 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13274           (SvCUR(listsv) != initial_listsv_len)
13275
13276 STATIC regnode *
13277 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13278     const bool stop_at_1,  /* Just parse the next thing, don't
13279           look for a full character class */
13280     bool allow_multi_folds,
13281     const bool silence_non_portable,   /* Don't output warnings
13282              about too large
13283              characters */
13284     SV** ret_invlist)  /* Return an inversion list, not a node */
13285 {
13286  /* parse a bracketed class specification.  Most of these will produce an
13287  * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13288  * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13289  * under /i with multi-character folds: it will be rewritten following the
13290  * paradigm of this example, where the <multi-fold>s are characters which
13291  * fold to multiple character sequences:
13292  *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13293  * gets effectively rewritten as:
13294  *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13295  * reg() gets called (recursively) on the rewritten version, and this
13296  * function will return what it constructs.  (Actually the <multi-fold>s
13297  * aren't physically removed from the [abcdefghi], it's just that they are
13298  * ignored in the recursion by means of a flag:
13299  * <RExC_in_multi_char_class>.)
13300  *
13301  * ANYOF nodes contain a bit map for the first 256 characters, with the
13302  * corresponding bit set if that character is in the list.  For characters
13303  * above 255, a range list or swash is used.  There are extra bits for \w,
13304  * etc. in locale ANYOFs, as what these match is not determinable at
13305  * compile time
13306  *
13307  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13308  * to be restarted.  This can only happen if ret_invlist is non-NULL.
13309  */
13310
13311  dVAR;
13312  UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13313  IV range = 0;
13314  UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13315  regnode *ret;
13316  STRLEN numlen;
13317  IV namedclass = OOB_NAMEDCLASS;
13318  char *rangebegin = NULL;
13319  bool need_class = 0;
13320  SV *listsv = NULL;
13321  STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13322          than just initialized.  */
13323  SV* properties = NULL;    /* Code points that match \p{} \P{} */
13324  SV* posixes = NULL;     /* Code points that match classes like [:word:],
13325        extended beyond the Latin1 range.  These have to
13326        be kept separate from other code points for much
13327        of this function because their handling  is
13328        different under /i, and for most classes under
13329        /d as well */
13330  SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13331        separate for a while from the non-complemented
13332        versions because of complications with /d
13333        matching */
13334  UV element_count = 0;   /* Number of distinct elements in the class.
13335        Optimizations may be possible if this is tiny */
13336  AV * multi_char_matches = NULL; /* Code points that fold to more than one
13337          character; used under /i */
13338  UV n;
13339  char * stop_ptr = RExC_end;    /* where to stop parsing */
13340  const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13341             space? */
13342  const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13343
13344  /* Unicode properties are stored in a swash; this holds the current one
13345  * being parsed.  If this swash is the only above-latin1 component of the
13346  * character class, an optimization is to pass it directly on to the
13347  * execution engine.  Otherwise, it is set to NULL to indicate that there
13348  * are other things in the class that have to be dealt with at execution
13349  * time */
13350  SV* swash = NULL;  /* Code points that match \p{} \P{} */
13351
13352  /* Set if a component of this character class is user-defined; just passed
13353  * on to the engine */
13354  bool has_user_defined_property = FALSE;
13355
13356  /* inversion list of code points this node matches only when the target
13357  * string is in UTF-8.  (Because is under /d) */
13358  SV* depends_list = NULL;
13359
13360  /* Inversion list of code points this node matches regardless of things
13361  * like locale, folding, utf8ness of the target string */
13362  SV* cp_list = NULL;
13363
13364  /* Like cp_list, but code points on this list need to be checked for things
13365  * that fold to/from them under /i */
13366  SV* cp_foldable_list = NULL;
13367
13368  /* Like cp_list, but code points on this list are valid only when the
13369  * runtime locale is UTF-8 */
13370  SV* only_utf8_locale_list = NULL;
13371
13372 #ifdef EBCDIC
13373  /* In a range, counts how many 0-2 of the ends of it came from literals,
13374  * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13375  UV literal_endpoint = 0;
13376 #endif
13377  bool invert = FALSE;    /* Is this class to be complemented */
13378
13379  bool warn_super = ALWAYS_WARN_SUPER;
13380
13381  regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13382   case we need to change the emitted regop to an EXACT. */
13383  const char * orig_parse = RExC_parse;
13384  const SSize_t orig_size = RExC_size;
13385  bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13386  GET_RE_DEBUG_FLAGS_DECL;
13387
13388  PERL_ARGS_ASSERT_REGCLASS;
13389 #ifndef DEBUGGING
13390  PERL_UNUSED_ARG(depth);
13391 #endif
13392
13393  DEBUG_PARSE("clas");
13394
13395  /* Assume we are going to generate an ANYOF node. */
13396  ret = reganode(pRExC_state, ANYOF, 0);
13397
13398  if (SIZE_ONLY) {
13399   RExC_size += ANYOF_SKIP;
13400   listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13401  }
13402  else {
13403   ANYOF_FLAGS(ret) = 0;
13404
13405   RExC_emit += ANYOF_SKIP;
13406   listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13407   initial_listsv_len = SvCUR(listsv);
13408   SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13409  }
13410
13411  if (skip_white) {
13412   RExC_parse = regpatws(pRExC_state, RExC_parse,
13413        FALSE /* means don't recognize comments */ );
13414  }
13415
13416  if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
13417   RExC_parse++;
13418   invert = TRUE;
13419   allow_multi_folds = FALSE;
13420   RExC_naughty++;
13421   if (skip_white) {
13422    RExC_parse = regpatws(pRExC_state, RExC_parse,
13423         FALSE /* means don't recognize comments */ );
13424   }
13425  }
13426
13427  /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13428  if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13429   const char *s = RExC_parse;
13430   const char  c = *s++;
13431
13432   while (isWORDCHAR(*s))
13433    s++;
13434   if (*s && c == *s && s[1] == ']') {
13435    SAVEFREESV(RExC_rx_sv);
13436    ckWARN3reg(s+2,
13437      "POSIX syntax [%c %c] belongs inside character classes",
13438      c, c);
13439    (void)ReREFCNT_inc(RExC_rx_sv);
13440   }
13441  }
13442
13443  /* If the caller wants us to just parse a single element, accomplish this
13444  * by faking the loop ending condition */
13445  if (stop_at_1 && RExC_end > RExC_parse) {
13446   stop_ptr = RExC_parse + 1;
13447  }
13448
13449  /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13450  if (UCHARAT(RExC_parse) == ']')
13451   goto charclassloop;
13452
13453 parseit:
13454  while (1) {
13455   if  (RExC_parse >= stop_ptr) {
13456    break;
13457   }
13458
13459   if (skip_white) {
13460    RExC_parse = regpatws(pRExC_state, RExC_parse,
13461         FALSE /* means don't recognize comments */ );
13462   }
13463
13464   if  (UCHARAT(RExC_parse) == ']') {
13465    break;
13466   }
13467
13468  charclassloop:
13469
13470   namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13471   save_value = value;
13472   save_prevvalue = prevvalue;
13473
13474   if (!range) {
13475    rangebegin = RExC_parse;
13476    element_count++;
13477   }
13478   if (UTF) {
13479    value = utf8n_to_uvchr((U8*)RExC_parse,
13480         RExC_end - RExC_parse,
13481         &numlen, UTF8_ALLOW_DEFAULT);
13482    RExC_parse += numlen;
13483   }
13484   else
13485    value = UCHARAT(RExC_parse++);
13486
13487   if (value == '['
13488    && RExC_parse < RExC_end
13489    && POSIXCC(UCHARAT(RExC_parse)))
13490   {
13491    namedclass = regpposixcc(pRExC_state, value, strict);
13492   }
13493   else if (value == '\\') {
13494    if (UTF) {
13495     value = utf8n_to_uvchr((U8*)RExC_parse,
13496         RExC_end - RExC_parse,
13497         &numlen, UTF8_ALLOW_DEFAULT);
13498     RExC_parse += numlen;
13499    }
13500    else
13501     value = UCHARAT(RExC_parse++);
13502
13503    /* Some compilers cannot handle switching on 64-bit integer
13504    * values, therefore value cannot be an UV.  Yes, this will
13505    * be a problem later if we want switch on Unicode.
13506    * A similar issue a little bit later when switching on
13507    * namedclass. --jhi */
13508
13509    /* If the \ is escaping white space when white space is being
13510    * skipped, it means that that white space is wanted literally, and
13511    * is already in 'value'.  Otherwise, need to translate the escape
13512    * into what it signifies. */
13513    if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13514
13515    case 'w': namedclass = ANYOF_WORDCHAR; break;
13516    case 'W': namedclass = ANYOF_NWORDCHAR; break;
13517    case 's': namedclass = ANYOF_SPACE; break;
13518    case 'S': namedclass = ANYOF_NSPACE; break;
13519    case 'd': namedclass = ANYOF_DIGIT; break;
13520    case 'D': namedclass = ANYOF_NDIGIT; break;
13521    case 'v': namedclass = ANYOF_VERTWS; break;
13522    case 'V': namedclass = ANYOF_NVERTWS; break;
13523    case 'h': namedclass = ANYOF_HORIZWS; break;
13524    case 'H': namedclass = ANYOF_NHORIZWS; break;
13525    case 'N':  /* Handle \N{NAME} in class */
13526     {
13527      /* We only pay attention to the first char of
13528      multichar strings being returned. I kinda wonder
13529      if this makes sense as it does change the behaviour
13530      from earlier versions, OTOH that behaviour was broken
13531      as well. */
13532      if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13533          TRUE, /* => charclass */
13534          strict))
13535      {
13536       if (*flagp & RESTART_UTF8)
13537        FAIL("panic: grok_bslash_N set RESTART_UTF8");
13538       goto parseit;
13539      }
13540     }
13541     break;
13542    case 'p':
13543    case 'P':
13544     {
13545     char *e;
13546
13547     /* We will handle any undefined properties ourselves */
13548     U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13549          /* And we actually would prefer to get
13550           * the straight inversion list of the
13551           * swash, since we will be accessing it
13552           * anyway, to save a little time */
13553          |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13554
13555     if (RExC_parse >= RExC_end)
13556      vFAIL2("Empty \\%c{}", (U8)value);
13557     if (*RExC_parse == '{') {
13558      const U8 c = (U8)value;
13559      e = strchr(RExC_parse++, '}');
13560      if (!e)
13561       vFAIL2("Missing right brace on \\%c{}", c);
13562      while (isSPACE(*RExC_parse))
13563       RExC_parse++;
13564      if (e == RExC_parse)
13565       vFAIL2("Empty \\%c{}", c);
13566      n = e - RExC_parse;
13567      while (isSPACE(*(RExC_parse + n - 1)))
13568       n--;
13569     }
13570     else {
13571      e = RExC_parse;
13572      n = 1;
13573     }
13574     if (!SIZE_ONLY) {
13575      SV* invlist;
13576      char* name;
13577
13578      if (UCHARAT(RExC_parse) == '^') {
13579       RExC_parse++;
13580       n--;
13581       /* toggle.  (The rhs xor gets the single bit that
13582       * differs between P and p; the other xor inverts just
13583       * that bit) */
13584       value ^= 'P' ^ 'p';
13585
13586       while (isSPACE(*RExC_parse)) {
13587        RExC_parse++;
13588        n--;
13589       }
13590      }
13591      /* Try to get the definition of the property into
13592      * <invlist>.  If /i is in effect, the effective property
13593      * will have its name be <__NAME_i>.  The design is
13594      * discussed in commit
13595      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13596      name = savepv(Perl_form(aTHX_
13597           "%s%.*s%s\n",
13598           (FOLD) ? "__" : "",
13599           (int)n,
13600           RExC_parse,
13601           (FOLD) ? "_i" : ""
13602         ));
13603
13604      /* Look up the property name, and get its swash and
13605      * inversion list, if the property is found  */
13606      if (swash) {
13607       SvREFCNT_dec_NN(swash);
13608      }
13609      swash = _core_swash_init("utf8", name, &PL_sv_undef,
13610            1, /* binary */
13611            0, /* not tr/// */
13612            NULL, /* No inversion list */
13613            &swash_init_flags
13614            );
13615      if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13616       HV* curpkg = (IN_PERL_COMPILETIME)
13617          ? PL_curstash
13618          : CopSTASH(PL_curcop);
13619       if (swash) {
13620        SvREFCNT_dec_NN(swash);
13621        swash = NULL;
13622       }
13623
13624       /* Here didn't find it.  It could be a user-defined
13625       * property that will be available at run-time.  If we
13626       * accept only compile-time properties, is an error;
13627       * otherwise add it to the list for run-time look up */
13628       if (ret_invlist) {
13629        RExC_parse = e + 1;
13630        vFAIL2utf8f(
13631         "Property '%"UTF8f"' is unknown",
13632         UTF8fARG(UTF, n, name));
13633       }
13634
13635       /* If the property name doesn't already have a package
13636       * name, add the current one to it so that it can be
13637       * referred to outside it. [perl #121777] */
13638       if (curpkg && ! instr(name, "::")) {
13639        char* pkgname = HvNAME(curpkg);
13640        if (strNE(pkgname, "main")) {
13641         char* full_name = Perl_form(aTHX_
13642                "%s::%s",
13643                pkgname,
13644                name);
13645         n = strlen(full_name);
13646         Safefree(name);
13647         name = savepvn(full_name, n);
13648        }
13649       }
13650       Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13651           (value == 'p' ? '+' : '!'),
13652           UTF8fARG(UTF, n, name));
13653       has_user_defined_property = TRUE;
13654
13655       /* We don't know yet, so have to assume that the
13656       * property could match something in the Latin1 range,
13657       * hence something that isn't utf8.  Note that this
13658       * would cause things in <depends_list> to match
13659       * inappropriately, except that any \p{}, including
13660       * this one forces Unicode semantics, which means there
13661       * is no <depends_list> */
13662       ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13663      }
13664      else {
13665
13666       /* Here, did get the swash and its inversion list.  If
13667       * the swash is from a user-defined property, then this
13668       * whole character class should be regarded as such */
13669       if (swash_init_flags
13670        & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13671       {
13672        has_user_defined_property = TRUE;
13673       }
13674       else if
13675        /* We warn on matching an above-Unicode code point
13676        * if the match would return true, except don't
13677        * warn for \p{All}, which has exactly one element
13678        * = 0 */
13679        (_invlist_contains_cp(invlist, 0x110000)
13680         && (! (_invlist_len(invlist) == 1
13681          && *invlist_array(invlist) == 0)))
13682       {
13683        warn_super = TRUE;
13684       }
13685
13686
13687       /* Invert if asking for the complement */
13688       if (value == 'P') {
13689        _invlist_union_complement_2nd(properties,
13690               invlist,
13691               &properties);
13692
13693        /* The swash can't be used as-is, because we've
13694        * inverted things; delay removing it to here after
13695        * have copied its invlist above */
13696        SvREFCNT_dec_NN(swash);
13697        swash = NULL;
13698       }
13699       else {
13700        _invlist_union(properties, invlist, &properties);
13701       }
13702      }
13703      Safefree(name);
13704     }
13705     RExC_parse = e + 1;
13706     namedclass = ANYOF_UNIPROP;  /* no official name, but it's
13707             named */
13708
13709     /* \p means they want Unicode semantics */
13710     RExC_uni_semantics = 1;
13711     }
13712     break;
13713    case 'n': value = '\n';   break;
13714    case 'r': value = '\r';   break;
13715    case 't': value = '\t';   break;
13716    case 'f': value = '\f';   break;
13717    case 'b': value = '\b';   break;
13718    case 'e': value = ASCII_TO_NATIVE('\033');break;
13719    case 'a': value = '\a';                   break;
13720    case 'o':
13721     RExC_parse--; /* function expects to be pointed at the 'o' */
13722     {
13723      const char* error_msg;
13724      bool valid = grok_bslash_o(&RExC_parse,
13725            &value,
13726            &error_msg,
13727            SIZE_ONLY,   /* warnings in pass
13728                1 only */
13729            strict,
13730            silence_non_portable,
13731            UTF);
13732      if (! valid) {
13733       vFAIL(error_msg);
13734      }
13735     }
13736     if (PL_encoding && value < 0x100) {
13737      goto recode_encoding;
13738     }
13739     break;
13740    case 'x':
13741     RExC_parse--; /* function expects to be pointed at the 'x' */
13742     {
13743      const char* error_msg;
13744      bool valid = grok_bslash_x(&RExC_parse,
13745            &value,
13746            &error_msg,
13747            TRUE, /* Output warnings */
13748            strict,
13749            silence_non_portable,
13750            UTF);
13751      if (! valid) {
13752       vFAIL(error_msg);
13753      }
13754     }
13755     if (PL_encoding && value < 0x100)
13756      goto recode_encoding;
13757     break;
13758    case 'c':
13759     value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
13760     break;
13761    case '0': case '1': case '2': case '3': case '4':
13762    case '5': case '6': case '7':
13763     {
13764      /* Take 1-3 octal digits */
13765      I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13766      numlen = (strict) ? 4 : 3;
13767      value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13768      RExC_parse += numlen;
13769      if (numlen != 3) {
13770       if (strict) {
13771        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13772        vFAIL("Need exactly 3 octal digits");
13773       }
13774       else if (! SIZE_ONLY /* like \08, \178 */
13775         && numlen < 3
13776         && RExC_parse < RExC_end
13777         && isDIGIT(*RExC_parse)
13778         && ckWARN(WARN_REGEXP))
13779       {
13780        SAVEFREESV(RExC_rx_sv);
13781        reg_warn_non_literal_string(
13782         RExC_parse + 1,
13783         form_short_octal_warning(RExC_parse, numlen));
13784        (void)ReREFCNT_inc(RExC_rx_sv);
13785       }
13786      }
13787      if (PL_encoding && value < 0x100)
13788       goto recode_encoding;
13789      break;
13790     }
13791    recode_encoding:
13792     if (! RExC_override_recoding) {
13793      SV* enc = PL_encoding;
13794      value = reg_recode((const char)(U8)value, &enc);
13795      if (!enc) {
13796       if (strict) {
13797        vFAIL("Invalid escape in the specified encoding");
13798       }
13799       else if (SIZE_ONLY) {
13800        ckWARNreg(RExC_parse,
13801         "Invalid escape in the specified encoding");
13802       }
13803      }
13804      break;
13805     }
13806    default:
13807     /* Allow \_ to not give an error */
13808     if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13809      if (strict) {
13810       vFAIL2("Unrecognized escape \\%c in character class",
13811        (int)value);
13812      }
13813      else {
13814       SAVEFREESV(RExC_rx_sv);
13815       ckWARN2reg(RExC_parse,
13816        "Unrecognized escape \\%c in character class passed through",
13817        (int)value);
13818       (void)ReREFCNT_inc(RExC_rx_sv);
13819      }
13820     }
13821     break;
13822    }   /* End of switch on char following backslash */
13823   } /* end of handling backslash escape sequences */
13824 #ifdef EBCDIC
13825   else
13826    literal_endpoint++;
13827 #endif
13828
13829   /* Here, we have the current token in 'value' */
13830
13831   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13832    U8 classnum;
13833
13834    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
13835    * literal, as is the character that began the false range, i.e.
13836    * the 'a' in the examples */
13837    if (range) {
13838     if (!SIZE_ONLY) {
13839      const int w = (RExC_parse >= rangebegin)
13840         ? RExC_parse - rangebegin
13841         : 0;
13842      if (strict) {
13843       vFAIL2utf8f(
13844        "False [] range \"%"UTF8f"\"",
13845        UTF8fARG(UTF, w, rangebegin));
13846      }
13847      else {
13848       SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13849       ckWARN2reg(RExC_parse,
13850        "False [] range \"%"UTF8f"\"",
13851        UTF8fARG(UTF, w, rangebegin));
13852       (void)ReREFCNT_inc(RExC_rx_sv);
13853       cp_list = add_cp_to_invlist(cp_list, '-');
13854       cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
13855                prevvalue);
13856      }
13857     }
13858
13859     range = 0; /* this was not a true range */
13860     element_count += 2; /* So counts for three values */
13861    }
13862
13863    classnum = namedclass_to_classnum(namedclass);
13864
13865    if (LOC && namedclass < ANYOF_POSIXL_MAX
13866 #ifndef HAS_ISASCII
13867     && classnum != _CC_ASCII
13868 #endif
13869    ) {
13870     /* What the Posix classes (like \w, [:space:]) match in locale
13871     * isn't knowable under locale until actual match time.  Room
13872     * must be reserved (one time per outer bracketed class) to
13873     * store such classes.  The space will contain a bit for each
13874     * named class that is to be matched against.  This isn't
13875     * needed for \p{} and pseudo-classes, as they are not affected
13876     * by locale, and hence are dealt with separately */
13877     if (! need_class) {
13878      need_class = 1;
13879      if (SIZE_ONLY) {
13880       RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13881      }
13882      else {
13883       RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13884      }
13885      ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13886      ANYOF_POSIXL_ZERO(ret);
13887     }
13888
13889     /* Coverity thinks it is possible for this to be negative; both
13890     * jhi and khw think it's not, but be safer */
13891     assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13892      || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
13893
13894     /* See if it already matches the complement of this POSIX
13895     * class */
13896     if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13897      && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13898                ? -1
13899                : 1)))
13900     {
13901      posixl_matches_all = TRUE;
13902      break;  /* No need to continue.  Since it matches both
13903        e.g., \w and \W, it matches everything, and the
13904        bracketed class can be optimized into qr/./s */
13905     }
13906
13907     /* Add this class to those that should be checked at runtime */
13908     ANYOF_POSIXL_SET(ret, namedclass);
13909
13910     /* The above-Latin1 characters are not subject to locale rules.
13911     * Just add them, in the second pass, to the
13912     * unconditionally-matched list */
13913     if (! SIZE_ONLY) {
13914      SV* scratch_list = NULL;
13915
13916      /* Get the list of the above-Latin1 code points this
13917      * matches */
13918      _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
13919           PL_XPosix_ptrs[classnum],
13920
13921           /* Odd numbers are complements, like
13922           * NDIGIT, NASCII, ... */
13923           namedclass % 2 != 0,
13924           &scratch_list);
13925      /* Checking if 'cp_list' is NULL first saves an extra
13926      * clone.  Its reference count will be decremented at the
13927      * next union, etc, or if this is the only instance, at the
13928      * end of the routine */
13929      if (! cp_list) {
13930       cp_list = scratch_list;
13931      }
13932      else {
13933       _invlist_union(cp_list, scratch_list, &cp_list);
13934       SvREFCNT_dec_NN(scratch_list);
13935      }
13936      continue;   /* Go get next character */
13937     }
13938    }
13939    else if (! SIZE_ONLY) {
13940
13941     /* Here, not in pass1 (in that pass we skip calculating the
13942     * contents of this class), and is /l, or is a POSIX class for
13943     * which /l doesn't matter (or is a Unicode property, which is
13944     * skipped here). */
13945     if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
13946      if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13947
13948       /* Here, should be \h, \H, \v, or \V.  None of /d, /i
13949       * nor /l make a difference in what these match,
13950       * therefore we just add what they match to cp_list. */
13951       if (classnum != _CC_VERTSPACE) {
13952        assert(   namedclass == ANYOF_HORIZWS
13953         || namedclass == ANYOF_NHORIZWS);
13954
13955        /* It turns out that \h is just a synonym for
13956        * XPosixBlank */
13957        classnum = _CC_BLANK;
13958       }
13959
13960       _invlist_union_maybe_complement_2nd(
13961         cp_list,
13962         PL_XPosix_ptrs[classnum],
13963         namedclass % 2 != 0,    /* Complement if odd
13964               (NHORIZWS, NVERTWS)
13965               */
13966         &cp_list);
13967      }
13968     }
13969     else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
13970       complement and use nposixes */
13971      SV** posixes_ptr = namedclass % 2 == 0
13972          ? &posixes
13973          : &nposixes;
13974      SV** source_ptr = &PL_XPosix_ptrs[classnum];
13975      _invlist_union_maybe_complement_2nd(
13976              *posixes_ptr,
13977              *source_ptr,
13978              namedclass % 2 != 0,
13979              posixes_ptr);
13980     }
13981     continue;   /* Go get next character */
13982    }
13983   } /* end of namedclass \blah */
13984
13985   /* Here, we have a single value.  If 'range' is set, it is the ending
13986   * of a range--check its validity.  Later, we will handle each
13987   * individual code point in the range.  If 'range' isn't set, this
13988   * could be the beginning of a range, so check for that by looking
13989   * ahead to see if the next real character to be processed is the range
13990   * indicator--the minus sign */
13991
13992   if (skip_white) {
13993    RExC_parse = regpatws(pRExC_state, RExC_parse,
13994         FALSE /* means don't recognize comments */ );
13995   }
13996
13997   if (range) {
13998    if (prevvalue > value) /* b-a */ {
13999     const int w = RExC_parse - rangebegin;
14000     vFAIL2utf8f(
14001      "Invalid [] range \"%"UTF8f"\"",
14002      UTF8fARG(UTF, w, rangebegin));
14003     range = 0; /* not a valid range */
14004    }
14005   }
14006   else {
14007    prevvalue = value; /* save the beginning of the potential range */
14008    if (! stop_at_1     /* Can't be a range if parsing just one thing */
14009     && *RExC_parse == '-')
14010    {
14011     char* next_char_ptr = RExC_parse + 1;
14012     if (skip_white) {   /* Get the next real char after the '-' */
14013      next_char_ptr = regpatws(pRExC_state,
14014            RExC_parse + 1,
14015            FALSE); /* means don't recognize
14016               comments */
14017     }
14018
14019     /* If the '-' is at the end of the class (just before the ']',
14020     * it is a literal minus; otherwise it is a range */
14021     if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14022      RExC_parse = next_char_ptr;
14023
14024      /* a bad range like \w-, [:word:]- ? */
14025      if (namedclass > OOB_NAMEDCLASS) {
14026       if (strict || ckWARN(WARN_REGEXP)) {
14027        const int w =
14028         RExC_parse >= rangebegin ?
14029         RExC_parse - rangebegin : 0;
14030        if (strict) {
14031         vFAIL4("False [] range \"%*.*s\"",
14032          w, w, rangebegin);
14033        }
14034        else {
14035         vWARN4(RExC_parse,
14036          "False [] range \"%*.*s\"",
14037          w, w, rangebegin);
14038        }
14039       }
14040       if (!SIZE_ONLY) {
14041        cp_list = add_cp_to_invlist(cp_list, '-');
14042       }
14043       element_count++;
14044      } else
14045       range = 1; /* yeah, it's a range! */
14046      continue; /* but do it the next time */
14047     }
14048    }
14049   }
14050
14051   /* Here, <prevvalue> is the beginning of the range, if any; or <value>
14052   * if not */
14053
14054   /* non-Latin1 code point implies unicode semantics.  Must be set in
14055   * pass1 so is there for the whole of pass 2 */
14056   if (value > 255) {
14057    RExC_uni_semantics = 1;
14058   }
14059
14060   /* Ready to process either the single value, or the completed range.
14061   * For single-valued non-inverted ranges, we consider the possibility
14062   * of multi-char folds.  (We made a conscious decision to not do this
14063   * for the other cases because it can often lead to non-intuitive
14064   * results.  For example, you have the peculiar case that:
14065   *  "s s" =~ /^[^\xDF]+$/i => Y
14066   *  "ss"  =~ /^[^\xDF]+$/i => N
14067   *
14068   * See [perl #89750] */
14069   if (FOLD && allow_multi_folds && value == prevvalue) {
14070    if (value == LATIN_SMALL_LETTER_SHARP_S
14071     || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14072               value)))
14073    {
14074     /* Here <value> is indeed a multi-char fold.  Get what it is */
14075
14076     U8 foldbuf[UTF8_MAXBYTES_CASE];
14077     STRLEN foldlen;
14078
14079     UV folded = _to_uni_fold_flags(
14080         value,
14081         foldbuf,
14082         &foldlen,
14083         FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14084             ? FOLD_FLAGS_NOMIX_ASCII
14085             : 0)
14086         );
14087
14088     /* Here, <folded> should be the first character of the
14089     * multi-char fold of <value>, with <foldbuf> containing the
14090     * whole thing.  But, if this fold is not allowed (because of
14091     * the flags), <fold> will be the same as <value>, and should
14092     * be processed like any other character, so skip the special
14093     * handling */
14094     if (folded != value) {
14095
14096      /* Skip if we are recursed, currently parsing the class
14097      * again.  Otherwise add this character to the list of
14098      * multi-char folds. */
14099      if (! RExC_in_multi_char_class) {
14100       AV** this_array_ptr;
14101       AV* this_array;
14102       STRLEN cp_count = utf8_length(foldbuf,
14103              foldbuf + foldlen);
14104       SV* multi_fold = sv_2mortal(newSVpvs(""));
14105
14106       Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14107
14108
14109       if (! multi_char_matches) {
14110        multi_char_matches = newAV();
14111       }
14112
14113       /* <multi_char_matches> is actually an array of arrays.
14114       * There will be one or two top-level elements: [2],
14115       * and/or [3].  The [2] element is an array, each
14116       * element thereof is a character which folds to TWO
14117       * characters; [3] is for folds to THREE characters.
14118       * (Unicode guarantees a maximum of 3 characters in any
14119       * fold.)  When we rewrite the character class below,
14120       * we will do so such that the longest folds are
14121       * written first, so that it prefers the longest
14122       * matching strings first.  This is done even if it
14123       * turns out that any quantifier is non-greedy, out of
14124       * programmer laziness.  Tom Christiansen has agreed
14125       * that this is ok.  This makes the test for the
14126       * ligature 'ffi' come before the test for 'ff' */
14127       if (av_exists(multi_char_matches, cp_count)) {
14128        this_array_ptr = (AV**) av_fetch(multi_char_matches,
14129                cp_count, FALSE);
14130        this_array = *this_array_ptr;
14131       }
14132       else {
14133        this_array = newAV();
14134        av_store(multi_char_matches, cp_count,
14135          (SV*) this_array);
14136       }
14137       av_push(this_array, multi_fold);
14138      }
14139
14140      /* This element should not be processed further in this
14141      * class */
14142      element_count--;
14143      value = save_value;
14144      prevvalue = save_prevvalue;
14145      continue;
14146     }
14147    }
14148   }
14149
14150   /* Deal with this element of the class */
14151   if (! SIZE_ONLY) {
14152 #ifndef EBCDIC
14153    cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14154              prevvalue, value);
14155 #else
14156    SV* this_range = _new_invlist(1);
14157    _append_range_to_invlist(this_range, prevvalue, value);
14158
14159    /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14160    * If this range was specified using something like 'i-j', we want
14161    * to include only the 'i' and the 'j', and not anything in
14162    * between, so exclude non-ASCII, non-alphabetics from it.
14163    * However, if the range was specified with something like
14164    * [\x89-\x91] or [\x89-j], all code points within it should be
14165    * included.  literal_endpoint==2 means both ends of the range used
14166    * a literal character, not \x{foo} */
14167    if (literal_endpoint == 2
14168     && ((prevvalue >= 'a' && value <= 'z')
14169      || (prevvalue >= 'A' && value <= 'Z')))
14170    {
14171     _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
14172          &this_range);
14173
14174     /* Since this above only contains ascii, the intersection of it
14175     * with anything will still yield only ascii */
14176     _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14177          &this_range);
14178    }
14179    _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14180    literal_endpoint = 0;
14181 #endif
14182   }
14183
14184   range = 0; /* this range (if it was one) is done now */
14185  } /* End of loop through all the text within the brackets */
14186
14187  /* If anything in the class expands to more than one character, we have to
14188  * deal with them by building up a substitute parse string, and recursively
14189  * calling reg() on it, instead of proceeding */
14190  if (multi_char_matches) {
14191   SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14192   I32 cp_count;
14193   STRLEN len;
14194   char *save_end = RExC_end;
14195   char *save_parse = RExC_parse;
14196   bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14197          a "|" */
14198   I32 reg_flags;
14199
14200   assert(! invert);
14201 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14202   because too confusing */
14203   if (invert) {
14204    sv_catpv(substitute_parse, "(?:");
14205   }
14206 #endif
14207
14208   /* Look at the longest folds first */
14209   for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14210
14211    if (av_exists(multi_char_matches, cp_count)) {
14212     AV** this_array_ptr;
14213     SV* this_sequence;
14214
14215     this_array_ptr = (AV**) av_fetch(multi_char_matches,
14216             cp_count, FALSE);
14217     while ((this_sequence = av_pop(*this_array_ptr)) !=
14218                 &PL_sv_undef)
14219     {
14220      if (! first_time) {
14221       sv_catpv(substitute_parse, "|");
14222      }
14223      first_time = FALSE;
14224
14225      sv_catpv(substitute_parse, SvPVX(this_sequence));
14226     }
14227    }
14228   }
14229
14230   /* If the character class contains anything else besides these
14231   * multi-character folds, have to include it in recursive parsing */
14232   if (element_count) {
14233    sv_catpv(substitute_parse, "|[");
14234    sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14235    sv_catpv(substitute_parse, "]");
14236   }
14237
14238   sv_catpv(substitute_parse, ")");
14239 #if 0
14240   if (invert) {
14241    /* This is a way to get the parse to skip forward a whole named
14242    * sequence instead of matching the 2nd character when it fails the
14243    * first */
14244    sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14245   }
14246 #endif
14247
14248   RExC_parse = SvPV(substitute_parse, len);
14249   RExC_end = RExC_parse + len;
14250   RExC_in_multi_char_class = 1;
14251   RExC_emit = (regnode *)orig_emit;
14252
14253   ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14254
14255   *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14256
14257   RExC_parse = save_parse;
14258   RExC_end = save_end;
14259   RExC_in_multi_char_class = 0;
14260   SvREFCNT_dec_NN(multi_char_matches);
14261   return ret;
14262  }
14263
14264  /* Here, we've gone through the entire class and dealt with multi-char
14265  * folds.  We are now in a position that we can do some checks to see if we
14266  * can optimize this ANYOF node into a simpler one, even in Pass 1.
14267  * Currently we only do two checks:
14268  * 1) is in the unlikely event that the user has specified both, eg. \w and
14269  *    \W under /l, then the class matches everything.  (This optimization
14270  *    is done only to make the optimizer code run later work.)
14271  * 2) if the character class contains only a single element (including a
14272  *    single range), we see if there is an equivalent node for it.
14273  * Other checks are possible */
14274  if (! ret_invlist   /* Can't optimize if returning the constructed
14275       inversion list */
14276   && (UNLIKELY(posixl_matches_all) || element_count == 1))
14277  {
14278   U8 op = END;
14279   U8 arg = 0;
14280
14281   if (UNLIKELY(posixl_matches_all)) {
14282    op = SANY;
14283   }
14284   else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14285             \w or [:digit:] or \p{foo}
14286             */
14287
14288    /* All named classes are mapped into POSIXish nodes, with its FLAG
14289    * argument giving which class it is */
14290    switch ((I32)namedclass) {
14291     case ANYOF_UNIPROP:
14292      break;
14293
14294     /* These don't depend on the charset modifiers.  They always
14295     * match under /u rules */
14296     case ANYOF_NHORIZWS:
14297     case ANYOF_HORIZWS:
14298      namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14299      /* FALLTHROUGH */
14300
14301     case ANYOF_NVERTWS:
14302     case ANYOF_VERTWS:
14303      op = POSIXU;
14304      goto join_posix;
14305
14306     /* The actual POSIXish node for all the rest depends on the
14307     * charset modifier.  The ones in the first set depend only on
14308     * ASCII or, if available on this platform, locale */
14309     case ANYOF_ASCII:
14310     case ANYOF_NASCII:
14311 #ifdef HAS_ISASCII
14312      op = (LOC) ? POSIXL : POSIXA;
14313 #else
14314      op = POSIXA;
14315 #endif
14316      goto join_posix;
14317
14318     case ANYOF_NCASED:
14319     case ANYOF_LOWER:
14320     case ANYOF_NLOWER:
14321     case ANYOF_UPPER:
14322     case ANYOF_NUPPER:
14323      /* under /a could be alpha */
14324      if (FOLD) {
14325       if (ASCII_RESTRICTED) {
14326        namedclass = ANYOF_ALPHA + (namedclass % 2);
14327       }
14328       else if (! LOC) {
14329        break;
14330       }
14331      }
14332      /* FALLTHROUGH */
14333
14334     /* The rest have more possibilities depending on the charset.
14335     * We take advantage of the enum ordering of the charset
14336     * modifiers to get the exact node type, */
14337     default:
14338      op = POSIXD + get_regex_charset(RExC_flags);
14339      if (op > POSIXA) { /* /aa is same as /a */
14340       op = POSIXA;
14341      }
14342
14343     join_posix:
14344      /* The odd numbered ones are the complements of the
14345      * next-lower even number one */
14346      if (namedclass % 2 == 1) {
14347       invert = ! invert;
14348       namedclass--;
14349      }
14350      arg = namedclass_to_classnum(namedclass);
14351      break;
14352    }
14353   }
14354   else if (value == prevvalue) {
14355
14356    /* Here, the class consists of just a single code point */
14357
14358    if (invert) {
14359     if (! LOC && value == '\n') {
14360      op = REG_ANY; /* Optimize [^\n] */
14361      *flagp |= HASWIDTH|SIMPLE;
14362      RExC_naughty++;
14363     }
14364    }
14365    else if (value < 256 || UTF) {
14366
14367     /* Optimize a single value into an EXACTish node, but not if it
14368     * would require converting the pattern to UTF-8. */
14369     op = compute_EXACTish(pRExC_state);
14370    }
14371   } /* Otherwise is a range */
14372   else if (! LOC) {   /* locale could vary these */
14373    if (prevvalue == '0') {
14374     if (value == '9') {
14375      arg = _CC_DIGIT;
14376      op = POSIXA;
14377     }
14378    }
14379    else if (prevvalue == 'A') {
14380     if (value == 'Z'
14381 #ifdef EBCDIC
14382      && literal_endpoint == 2
14383 #endif
14384     ) {
14385      arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14386      op = POSIXA;
14387     }
14388    }
14389    else if (prevvalue == 'a') {
14390     if (value == 'z'
14391 #ifdef EBCDIC
14392      && literal_endpoint == 2
14393 #endif
14394     ) {
14395      arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14396      op = POSIXA;
14397     }
14398    }
14399   }
14400
14401   /* Here, we have changed <op> away from its initial value iff we found
14402   * an optimization */
14403   if (op != END) {
14404
14405    /* Throw away this ANYOF regnode, and emit the calculated one,
14406    * which should correspond to the beginning, not current, state of
14407    * the parse */
14408    const char * cur_parse = RExC_parse;
14409    RExC_parse = (char *)orig_parse;
14410    if ( SIZE_ONLY) {
14411     if (! LOC) {
14412
14413      /* To get locale nodes to not use the full ANYOF size would
14414      * require moving the code above that writes the portions
14415      * of it that aren't in other nodes to after this point.
14416      * e.g.  ANYOF_POSIXL_SET */
14417      RExC_size = orig_size;
14418     }
14419    }
14420    else {
14421     RExC_emit = (regnode *)orig_emit;
14422     if (PL_regkind[op] == POSIXD) {
14423      if (op == POSIXL) {
14424       RExC_contains_locale = 1;
14425      }
14426      if (invert) {
14427       op += NPOSIXD - POSIXD;
14428      }
14429     }
14430    }
14431
14432    ret = reg_node(pRExC_state, op);
14433
14434    if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14435     if (! SIZE_ONLY) {
14436      FLAGS(ret) = arg;
14437     }
14438     *flagp |= HASWIDTH|SIMPLE;
14439    }
14440    else if (PL_regkind[op] == EXACT) {
14441     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14442           TRUE /* downgradable to EXACT */
14443           );
14444    }
14445
14446    RExC_parse = (char *) cur_parse;
14447
14448    SvREFCNT_dec(posixes);
14449    SvREFCNT_dec(nposixes);
14450    SvREFCNT_dec(cp_list);
14451    SvREFCNT_dec(cp_foldable_list);
14452    return ret;
14453   }
14454  }
14455
14456  if (SIZE_ONLY)
14457   return ret;
14458  /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14459
14460  /* If folding, we calculate all characters that could fold to or from the
14461  * ones already on the list */
14462  if (cp_foldable_list) {
14463   if (FOLD) {
14464    UV start, end; /* End points of code point ranges */
14465
14466    SV* fold_intersection = NULL;
14467    SV** use_list;
14468
14469    /* Our calculated list will be for Unicode rules.  For locale
14470    * matching, we have to keep a separate list that is consulted at
14471    * runtime only when the locale indicates Unicode rules.  For
14472    * non-locale, we just use to the general list */
14473    if (LOC) {
14474     use_list = &only_utf8_locale_list;
14475    }
14476    else {
14477     use_list = &cp_list;
14478    }
14479
14480    /* Only the characters in this class that participate in folds need
14481    * be checked.  Get the intersection of this class and all the
14482    * possible characters that are foldable.  This can quickly narrow
14483    * down a large class */
14484    _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14485         &fold_intersection);
14486
14487    /* The folds for all the Latin1 characters are hard-coded into this
14488    * program, but we have to go out to disk to get the others. */
14489    if (invlist_highest(cp_foldable_list) >= 256) {
14490
14491     /* This is a hash that for a particular fold gives all
14492     * characters that are involved in it */
14493     if (! PL_utf8_foldclosures) {
14494      _load_PL_utf8_foldclosures();
14495     }
14496    }
14497
14498    /* Now look at the foldable characters in this class individually */
14499    invlist_iterinit(fold_intersection);
14500    while (invlist_iternext(fold_intersection, &start, &end)) {
14501     UV j;
14502
14503     /* Look at every character in the range */
14504     for (j = start; j <= end; j++) {
14505      U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14506      STRLEN foldlen;
14507      SV** listp;
14508
14509      if (j < 256) {
14510
14511       if (IS_IN_SOME_FOLD_L1(j)) {
14512
14513        /* ASCII is always matched; non-ASCII is matched
14514        * only under Unicode rules (which could happen
14515        * under /l if the locale is a UTF-8 one */
14516        if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14517         *use_list = add_cp_to_invlist(*use_list,
14518                PL_fold_latin1[j]);
14519        }
14520        else {
14521         depends_list =
14522         add_cp_to_invlist(depends_list,
14523             PL_fold_latin1[j]);
14524        }
14525       }
14526
14527       if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14528        && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14529       {
14530        add_above_Latin1_folds(pRExC_state,
14531             (U8) j,
14532             use_list);
14533       }
14534       continue;
14535      }
14536
14537      /* Here is an above Latin1 character.  We don't have the
14538      * rules hard-coded for it.  First, get its fold.  This is
14539      * the simple fold, as the multi-character folds have been
14540      * handled earlier and separated out */
14541      _to_uni_fold_flags(j, foldbuf, &foldlen,
14542               (ASCII_FOLD_RESTRICTED)
14543               ? FOLD_FLAGS_NOMIX_ASCII
14544               : 0);
14545
14546      /* Single character fold of above Latin1.  Add everything in
14547      * its fold closure to the list that this node should match.
14548      * The fold closures data structure is a hash with the keys
14549      * being the UTF-8 of every character that is folded to, like
14550      * 'k', and the values each an array of all code points that
14551      * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14552      * Multi-character folds are not included */
14553      if ((listp = hv_fetch(PL_utf8_foldclosures,
14554           (char *) foldbuf, foldlen, FALSE)))
14555      {
14556       AV* list = (AV*) *listp;
14557       IV k;
14558       for (k = 0; k <= av_tindex(list); k++) {
14559        SV** c_p = av_fetch(list, k, FALSE);
14560        UV c;
14561        assert(c_p);
14562
14563        c = SvUV(*c_p);
14564
14565        /* /aa doesn't allow folds between ASCII and non- */
14566        if ((ASCII_FOLD_RESTRICTED
14567         && (isASCII(c) != isASCII(j))))
14568        {
14569         continue;
14570        }
14571
14572        /* Folds under /l which cross the 255/256 boundary
14573        * are added to a separate list.  (These are valid
14574        * only when the locale is UTF-8.) */
14575        if (c < 256 && LOC) {
14576         *use_list = add_cp_to_invlist(*use_list, c);
14577         continue;
14578        }
14579
14580        if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14581        {
14582         cp_list = add_cp_to_invlist(cp_list, c);
14583        }
14584        else {
14585         /* Similarly folds involving non-ascii Latin1
14586         * characters under /d are added to their list */
14587         depends_list = add_cp_to_invlist(depends_list,
14588                 c);
14589        }
14590       }
14591      }
14592     }
14593    }
14594    SvREFCNT_dec_NN(fold_intersection);
14595   }
14596
14597   /* Now that we have finished adding all the folds, there is no reason
14598   * to keep the foldable list separate */
14599   _invlist_union(cp_list, cp_foldable_list, &cp_list);
14600   SvREFCNT_dec_NN(cp_foldable_list);
14601  }
14602
14603  /* And combine the result (if any) with any inversion list from posix
14604  * classes.  The lists are kept separate up to now because we don't want to
14605  * fold the classes (folding of those is automatically handled by the swash
14606  * fetching code) */
14607  if (posixes || nposixes) {
14608   if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14609    /* Under /a and /aa, nothing above ASCII matches these */
14610    _invlist_intersection(posixes,
14611         PL_XPosix_ptrs[_CC_ASCII],
14612         &posixes);
14613   }
14614   if (nposixes) {
14615    if (DEPENDS_SEMANTICS) {
14616     /* Under /d, everything in the upper half of the Latin1 range
14617     * matches these complements */
14618     ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
14619    }
14620    else if (AT_LEAST_ASCII_RESTRICTED) {
14621     /* Under /a and /aa, everything above ASCII matches these
14622     * complements */
14623     _invlist_union_complement_2nd(nposixes,
14624            PL_XPosix_ptrs[_CC_ASCII],
14625            &nposixes);
14626    }
14627    if (posixes) {
14628     _invlist_union(posixes, nposixes, &posixes);
14629     SvREFCNT_dec_NN(nposixes);
14630    }
14631    else {
14632     posixes = nposixes;
14633    }
14634   }
14635   if (! DEPENDS_SEMANTICS) {
14636    if (cp_list) {
14637     _invlist_union(cp_list, posixes, &cp_list);
14638     SvREFCNT_dec_NN(posixes);
14639    }
14640    else {
14641     cp_list = posixes;
14642    }
14643   }
14644   else {
14645    /* Under /d, we put into a separate list the Latin1 things that
14646    * match only when the target string is utf8 */
14647    SV* nonascii_but_latin1_properties = NULL;
14648    _invlist_intersection(posixes, PL_UpperLatin1,
14649         &nonascii_but_latin1_properties);
14650    _invlist_subtract(posixes, nonascii_but_latin1_properties,
14651        &posixes);
14652    if (cp_list) {
14653     _invlist_union(cp_list, posixes, &cp_list);
14654     SvREFCNT_dec_NN(posixes);
14655    }
14656    else {
14657     cp_list = posixes;
14658    }
14659
14660    if (depends_list) {
14661     _invlist_union(depends_list, nonascii_but_latin1_properties,
14662        &depends_list);
14663     SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14664    }
14665    else {
14666     depends_list = nonascii_but_latin1_properties;
14667    }
14668   }
14669  }
14670
14671  /* And combine the result (if any) with any inversion list from properties.
14672  * The lists are kept separate up to now so that we can distinguish the two
14673  * in regards to matching above-Unicode.  A run-time warning is generated
14674  * if a Unicode property is matched against a non-Unicode code point. But,
14675  * we allow user-defined properties to match anything, without any warning,
14676  * and we also suppress the warning if there is a portion of the character
14677  * class that isn't a Unicode property, and which matches above Unicode, \W
14678  * or [\x{110000}] for example.
14679  * (Note that in this case, unlike the Posix one above, there is no
14680  * <depends_list>, because having a Unicode property forces Unicode
14681  * semantics */
14682  if (properties) {
14683   if (cp_list) {
14684
14685    /* If it matters to the final outcome, see if a non-property
14686    * component of the class matches above Unicode.  If so, the
14687    * warning gets suppressed.  This is true even if just a single
14688    * such code point is specified, as though not strictly correct if
14689    * another such code point is matched against, the fact that they
14690    * are using above-Unicode code points indicates they should know
14691    * the issues involved */
14692    if (warn_super) {
14693     warn_super = ! (invert
14694        ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14695    }
14696
14697    _invlist_union(properties, cp_list, &cp_list);
14698    SvREFCNT_dec_NN(properties);
14699   }
14700   else {
14701    cp_list = properties;
14702   }
14703
14704   if (warn_super) {
14705    ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14706   }
14707  }
14708
14709  /* Here, we have calculated what code points should be in the character
14710  * class.
14711  *
14712  * Now we can see about various optimizations.  Fold calculation (which we
14713  * did above) needs to take place before inversion.  Otherwise /[^k]/i
14714  * would invert to include K, which under /i would match k, which it
14715  * shouldn't.  Therefore we can't invert folded locale now, as it won't be
14716  * folded until runtime */
14717
14718  /* If we didn't do folding, it's because some information isn't available
14719  * until runtime; set the run-time fold flag for these.  (We don't have to
14720  * worry about properties folding, as that is taken care of by the swash
14721  * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
14722  * locales, or the class matches at least one 0-255 range code point */
14723  if (LOC && FOLD) {
14724   if (only_utf8_locale_list) {
14725    ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14726   }
14727   else if (cp_list) { /* Look to see if there a 0-255 code point is in
14728        the list */
14729    UV start, end;
14730    invlist_iterinit(cp_list);
14731    if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14732     ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14733    }
14734    invlist_iterfinish(cp_list);
14735   }
14736  }
14737
14738  /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14739  * at compile time.  Besides not inverting folded locale now, we can't
14740  * invert if there are things such as \w, which aren't known until runtime
14741  * */
14742  if (cp_list
14743   && invert
14744   && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14745   && ! depends_list
14746   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14747  {
14748   _invlist_invert(cp_list);
14749
14750   /* Any swash can't be used as-is, because we've inverted things */
14751   if (swash) {
14752    SvREFCNT_dec_NN(swash);
14753    swash = NULL;
14754   }
14755
14756   /* Clear the invert flag since have just done it here */
14757   invert = FALSE;
14758  }
14759
14760  if (ret_invlist) {
14761   *ret_invlist = cp_list;
14762   SvREFCNT_dec(swash);
14763
14764   /* Discard the generated node */
14765   if (SIZE_ONLY) {
14766    RExC_size = orig_size;
14767   }
14768   else {
14769    RExC_emit = orig_emit;
14770   }
14771   return orig_emit;
14772  }
14773
14774  /* Some character classes are equivalent to other nodes.  Such nodes take
14775  * up less room and generally fewer operations to execute than ANYOF nodes.
14776  * Above, we checked for and optimized into some such equivalents for
14777  * certain common classes that are easy to test.  Getting to this point in
14778  * the code means that the class didn't get optimized there.  Since this
14779  * code is only executed in Pass 2, it is too late to save space--it has
14780  * been allocated in Pass 1, and currently isn't given back.  But turning
14781  * things into an EXACTish node can allow the optimizer to join it to any
14782  * adjacent such nodes.  And if the class is equivalent to things like /./,
14783  * expensive run-time swashes can be avoided.  Now that we have more
14784  * complete information, we can find things necessarily missed by the
14785  * earlier code.  I (khw) am not sure how much to look for here.  It would
14786  * be easy, but perhaps too slow, to check any candidates against all the
14787  * node types they could possibly match using _invlistEQ(). */
14788
14789  if (cp_list
14790   && ! invert
14791   && ! depends_list
14792   && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14793   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14794
14795   /* We don't optimize if we are supposed to make sure all non-Unicode
14796    * code points raise a warning, as only ANYOF nodes have this check.
14797    * */
14798   && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14799  {
14800   UV start, end;
14801   U8 op = END;  /* The optimzation node-type */
14802   const char * cur_parse= RExC_parse;
14803
14804   invlist_iterinit(cp_list);
14805   if (! invlist_iternext(cp_list, &start, &end)) {
14806
14807    /* Here, the list is empty.  This happens, for example, when a
14808    * Unicode property is the only thing in the character class, and
14809    * it doesn't match anything.  (perluniprops.pod notes such
14810    * properties) */
14811    op = OPFAIL;
14812    *flagp |= HASWIDTH|SIMPLE;
14813   }
14814   else if (start == end) {    /* The range is a single code point */
14815    if (! invlist_iternext(cp_list, &start, &end)
14816
14817      /* Don't do this optimization if it would require changing
14818      * the pattern to UTF-8 */
14819     && (start < 256 || UTF))
14820    {
14821     /* Here, the list contains a single code point.  Can optimize
14822     * into an EXACTish node */
14823
14824     value = start;
14825
14826     if (! FOLD) {
14827      op = EXACT;
14828     }
14829     else if (LOC) {
14830
14831      /* A locale node under folding with one code point can be
14832      * an EXACTFL, as its fold won't be calculated until
14833      * runtime */
14834      op = EXACTFL;
14835     }
14836     else {
14837
14838      /* Here, we are generally folding, but there is only one
14839      * code point to match.  If we have to, we use an EXACT
14840      * node, but it would be better for joining with adjacent
14841      * nodes in the optimization pass if we used the same
14842      * EXACTFish node that any such are likely to be.  We can
14843      * do this iff the code point doesn't participate in any
14844      * folds.  For example, an EXACTF of a colon is the same as
14845      * an EXACT one, since nothing folds to or from a colon. */
14846      if (value < 256) {
14847       if (IS_IN_SOME_FOLD_L1(value)) {
14848        op = EXACT;
14849       }
14850      }
14851      else {
14852       if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14853        op = EXACT;
14854       }
14855      }
14856
14857      /* If we haven't found the node type, above, it means we
14858      * can use the prevailing one */
14859      if (op == END) {
14860       op = compute_EXACTish(pRExC_state);
14861      }
14862     }
14863    }
14864   }
14865   else if (start == 0) {
14866    if (end == UV_MAX) {
14867     op = SANY;
14868     *flagp |= HASWIDTH|SIMPLE;
14869     RExC_naughty++;
14870    }
14871    else if (end == '\n' - 1
14872      && invlist_iternext(cp_list, &start, &end)
14873      && start == '\n' + 1 && end == UV_MAX)
14874    {
14875     op = REG_ANY;
14876     *flagp |= HASWIDTH|SIMPLE;
14877     RExC_naughty++;
14878    }
14879   }
14880   invlist_iterfinish(cp_list);
14881
14882   if (op != END) {
14883    RExC_parse = (char *)orig_parse;
14884    RExC_emit = (regnode *)orig_emit;
14885
14886    ret = reg_node(pRExC_state, op);
14887
14888    RExC_parse = (char *)cur_parse;
14889
14890    if (PL_regkind[op] == EXACT) {
14891     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14892           TRUE /* downgradable to EXACT */
14893           );
14894    }
14895
14896    SvREFCNT_dec_NN(cp_list);
14897    return ret;
14898   }
14899  }
14900
14901  /* Here, <cp_list> contains all the code points we can determine at
14902  * compile time that match under all conditions.  Go through it, and
14903  * for things that belong in the bitmap, put them there, and delete from
14904  * <cp_list>.  While we are at it, see if everything above 255 is in the
14905  * list, and if so, set a flag to speed up execution */
14906
14907  populate_ANYOF_from_invlist(ret, &cp_list);
14908
14909  if (invert) {
14910   ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14911  }
14912
14913  /* Here, the bitmap has been populated with all the Latin1 code points that
14914  * always match.  Can now add to the overall list those that match only
14915  * when the target string is UTF-8 (<depends_list>). */
14916  if (depends_list) {
14917   if (cp_list) {
14918    _invlist_union(cp_list, depends_list, &cp_list);
14919    SvREFCNT_dec_NN(depends_list);
14920   }
14921   else {
14922    cp_list = depends_list;
14923   }
14924   ANYOF_FLAGS(ret) |= ANYOF_UTF8;
14925  }
14926
14927  /* If there is a swash and more than one element, we can't use the swash in
14928  * the optimization below. */
14929  if (swash && element_count > 1) {
14930   SvREFCNT_dec_NN(swash);
14931   swash = NULL;
14932  }
14933
14934  set_ANYOF_arg(pRExC_state, ret, cp_list,
14935     (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14936     ? listsv : NULL,
14937     only_utf8_locale_list,
14938     swash, has_user_defined_property);
14939
14940  *flagp |= HASWIDTH|SIMPLE;
14941
14942  if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
14943   RExC_contains_locale = 1;
14944  }
14945
14946  return ret;
14947 }
14948
14949 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14950
14951 STATIC void
14952 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14953     regnode* const node,
14954     SV* const cp_list,
14955     SV* const runtime_defns,
14956     SV* const only_utf8_locale_list,
14957     SV* const swash,
14958     const bool has_user_defined_property)
14959 {
14960  /* Sets the arg field of an ANYOF-type node 'node', using information about
14961  * the node passed-in.  If there is nothing outside the node's bitmap, the
14962  * arg is set to ANYOF_NONBITMAP_EMPTY.  Otherwise, it sets the argument to
14963  * the count returned by add_data(), having allocated and stored an array,
14964  * av, that that count references, as follows:
14965  *  av[0] stores the character class description in its textual form.
14966  *        This is used later (regexec.c:Perl_regclass_swash()) to
14967  *        initialize the appropriate swash, and is also useful for dumping
14968  *        the regnode.  This is set to &PL_sv_undef if the textual
14969  *        description is not needed at run-time (as happens if the other
14970  *        elements completely define the class)
14971  *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14972  *        computed from av[0].  But if no further computation need be done,
14973  *        the swash is stored here now (and av[0] is &PL_sv_undef).
14974  *  av[2] stores the inversion list of code points that match only if the
14975  *        current locale is UTF-8
14976  *  av[3] stores the cp_list inversion list for use in addition or instead
14977  *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14978  *        (Otherwise everything needed is already in av[0] and av[1])
14979  *  av[4] is set if any component of the class is from a user-defined
14980  *        property; used only if av[3] exists */
14981
14982  UV n;
14983
14984  PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14985
14986  if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
14987   assert(! (ANYOF_FLAGS(node)
14988      & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
14989   ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14990  }
14991  else {
14992   AV * const av = newAV();
14993   SV *rv;
14994
14995   assert(ANYOF_FLAGS(node)
14996      & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
14997
14998   av_store(av, 0, (runtime_defns)
14999       ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
15000   if (swash) {
15001    assert(cp_list);
15002    av_store(av, 1, swash);
15003    SvREFCNT_dec_NN(cp_list);
15004   }
15005   else {
15006    av_store(av, 1, &PL_sv_undef);
15007    if (cp_list) {
15008     av_store(av, 3, cp_list);
15009     av_store(av, 4, newSVuv(has_user_defined_property));
15010    }
15011   }
15012
15013   if (only_utf8_locale_list) {
15014    av_store(av, 2, only_utf8_locale_list);
15015   }
15016   else {
15017    av_store(av, 2, &PL_sv_undef);
15018   }
15019
15020   rv = newRV_noinc(MUTABLE_SV(av));
15021   n = add_data(pRExC_state, STR_WITH_LEN("s"));
15022   RExC_rxi->data->data[n] = (void*)rv;
15023   ARG_SET(node, n);
15024  }
15025 }
15026
15027
15028 /* reg_skipcomment()
15029
15030    Absorbs an /x style # comment from the input stream,
15031    returning a pointer to the first character beyond the comment, or if the
15032    comment terminates the pattern without anything following it, this returns
15033    one past the final character of the pattern (in other words, RExC_end) and
15034    sets the REG_RUN_ON_COMMENT_SEEN flag.
15035
15036    Note it's the callers responsibility to ensure that we are
15037    actually in /x mode
15038
15039 */
15040
15041 PERL_STATIC_INLINE char*
15042 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15043 {
15044  PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15045
15046  assert(*p = '#');
15047
15048  while (p < RExC_end) {
15049   if (*(++p) == '\n') {
15050    return p+1;
15051   }
15052  }
15053
15054  /* we ran off the end of the pattern without ending the comment, so we have
15055  * to add an \n when wrapping */
15056  RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15057  return p;
15058 }
15059
15060 /* nextchar()
15061
15062    Advances the parse position, and optionally absorbs
15063    "whitespace" from the inputstream.
15064
15065    Without /x "whitespace" means (?#...) style comments only,
15066    with /x this means (?#...) and # comments and whitespace proper.
15067
15068    Returns the RExC_parse point from BEFORE the scan occurs.
15069
15070    This is the /x friendly way of saying RExC_parse++.
15071 */
15072
15073 STATIC char*
15074 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15075 {
15076  char* const retval = RExC_parse++;
15077
15078  PERL_ARGS_ASSERT_NEXTCHAR;
15079
15080  for (;;) {
15081   if (RExC_end - RExC_parse >= 3
15082    && *RExC_parse == '('
15083    && RExC_parse[1] == '?'
15084    && RExC_parse[2] == '#')
15085   {
15086    while (*RExC_parse != ')') {
15087     if (RExC_parse == RExC_end)
15088      FAIL("Sequence (?#... not terminated");
15089     RExC_parse++;
15090    }
15091    RExC_parse++;
15092    continue;
15093   }
15094   if (RExC_flags & RXf_PMf_EXTENDED) {
15095    char * p = regpatws(pRExC_state, RExC_parse,
15096           TRUE); /* means recognize comments */
15097    if (p != RExC_parse) {
15098     RExC_parse = p;
15099     continue;
15100    }
15101   }
15102   return retval;
15103  }
15104 }
15105
15106 /*
15107 - reg_node - emit a node
15108 */
15109 STATIC regnode *   /* Location. */
15110 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15111 {
15112  dVAR;
15113  regnode *ptr;
15114  regnode * const ret = RExC_emit;
15115  GET_RE_DEBUG_FLAGS_DECL;
15116
15117  PERL_ARGS_ASSERT_REG_NODE;
15118
15119  if (SIZE_ONLY) {
15120   SIZE_ALIGN(RExC_size);
15121   RExC_size += 1;
15122   return(ret);
15123  }
15124  if (RExC_emit >= RExC_emit_bound)
15125   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15126     op, (void*)RExC_emit, (void*)RExC_emit_bound);
15127
15128  NODE_ALIGN_FILL(ret);
15129  ptr = ret;
15130  FILL_ADVANCE_NODE(ptr, op);
15131  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
15132 #ifdef RE_TRACK_PATTERN_OFFSETS
15133  if (RExC_offsets) {         /* MJD */
15134   MJD_OFFSET_DEBUG(
15135    ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15136    "reg_node", __LINE__,
15137    PL_reg_name[op],
15138    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15139     ? "Overwriting end of array!\n" : "OK",
15140    (UV)(RExC_emit - RExC_emit_start),
15141    (UV)(RExC_parse - RExC_start),
15142    (UV)RExC_offsets[0]));
15143   Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15144  }
15145 #endif
15146  RExC_emit = ptr;
15147  return(ret);
15148 }
15149
15150 /*
15151 - reganode - emit a node with an argument
15152 */
15153 STATIC regnode *   /* Location. */
15154 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15155 {
15156  dVAR;
15157  regnode *ptr;
15158  regnode * const ret = RExC_emit;
15159  GET_RE_DEBUG_FLAGS_DECL;
15160
15161  PERL_ARGS_ASSERT_REGANODE;
15162
15163  if (SIZE_ONLY) {
15164   SIZE_ALIGN(RExC_size);
15165   RExC_size += 2;
15166   /*
15167   We can't do this:
15168
15169   assert(2==regarglen[op]+1);
15170
15171   Anything larger than this has to allocate the extra amount.
15172   If we changed this to be:
15173
15174   RExC_size += (1 + regarglen[op]);
15175
15176   then it wouldn't matter. Its not clear what side effect
15177   might come from that so its not done so far.
15178   -- dmq
15179   */
15180   return(ret);
15181  }
15182  if (RExC_emit >= RExC_emit_bound)
15183   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15184     op, (void*)RExC_emit, (void*)RExC_emit_bound);
15185
15186  NODE_ALIGN_FILL(ret);
15187  ptr = ret;
15188  FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15189  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
15190 #ifdef RE_TRACK_PATTERN_OFFSETS
15191  if (RExC_offsets) {         /* MJD */
15192   MJD_OFFSET_DEBUG(
15193    ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15194    "reganode",
15195    __LINE__,
15196    PL_reg_name[op],
15197    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15198    "Overwriting end of array!\n" : "OK",
15199    (UV)(RExC_emit - RExC_emit_start),
15200    (UV)(RExC_parse - RExC_start),
15201    (UV)RExC_offsets[0]));
15202   Set_Cur_Node_Offset;
15203  }
15204 #endif
15205  RExC_emit = ptr;
15206  return(ret);
15207 }
15208
15209 /*
15210 - reguni - emit (if appropriate) a Unicode character
15211 */
15212 PERL_STATIC_INLINE STRLEN
15213 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15214 {
15215  dVAR;
15216
15217  PERL_ARGS_ASSERT_REGUNI;
15218
15219  return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15220 }
15221
15222 /*
15223 - reginsert - insert an operator in front of already-emitted operand
15224 *
15225 * Means relocating the operand.
15226 */
15227 STATIC void
15228 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15229 {
15230  dVAR;
15231  regnode *src;
15232  regnode *dst;
15233  regnode *place;
15234  const int offset = regarglen[(U8)op];
15235  const int size = NODE_STEP_REGNODE + offset;
15236  GET_RE_DEBUG_FLAGS_DECL;
15237
15238  PERL_ARGS_ASSERT_REGINSERT;
15239  PERL_UNUSED_CONTEXT;
15240  PERL_UNUSED_ARG(depth);
15241 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15242  DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15243  if (SIZE_ONLY) {
15244   RExC_size += size;
15245   return;
15246  }
15247
15248  src = RExC_emit;
15249  RExC_emit += size;
15250  dst = RExC_emit;
15251  if (RExC_open_parens) {
15252   int paren;
15253   /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15254   for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15255    if ( RExC_open_parens[paren] >= opnd ) {
15256     /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15257     RExC_open_parens[paren] += size;
15258    } else {
15259     /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15260    }
15261    if ( RExC_close_parens[paren] >= opnd ) {
15262     /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15263     RExC_close_parens[paren] += size;
15264    } else {
15265     /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15266    }
15267   }
15268  }
15269
15270  while (src > opnd) {
15271   StructCopy(--src, --dst, regnode);
15272 #ifdef RE_TRACK_PATTERN_OFFSETS
15273   if (RExC_offsets) {     /* MJD 20010112 */
15274    MJD_OFFSET_DEBUG(
15275     ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15276     "reg_insert",
15277     __LINE__,
15278     PL_reg_name[op],
15279     (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15280      ? "Overwriting end of array!\n" : "OK",
15281     (UV)(src - RExC_emit_start),
15282     (UV)(dst - RExC_emit_start),
15283     (UV)RExC_offsets[0]));
15284    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15285    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15286   }
15287 #endif
15288  }
15289
15290
15291  place = opnd;  /* Op node, where operand used to be. */
15292 #ifdef RE_TRACK_PATTERN_OFFSETS
15293  if (RExC_offsets) {         /* MJD */
15294   MJD_OFFSET_DEBUG(
15295    ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15296    "reginsert",
15297    __LINE__,
15298    PL_reg_name[op],
15299    (UV)(place - RExC_emit_start) > RExC_offsets[0]
15300    ? "Overwriting end of array!\n" : "OK",
15301    (UV)(place - RExC_emit_start),
15302    (UV)(RExC_parse - RExC_start),
15303    (UV)RExC_offsets[0]));
15304   Set_Node_Offset(place, RExC_parse);
15305   Set_Node_Length(place, 1);
15306  }
15307 #endif
15308  src = NEXTOPER(place);
15309  FILL_ADVANCE_NODE(place, op);
15310  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
15311  Zero(src, offset, regnode);
15312 }
15313
15314 /*
15315 - regtail - set the next-pointer at the end of a node chain of p to val.
15316 - SEE ALSO: regtail_study
15317 */
15318 /* TODO: All three parms should be const */
15319 STATIC void
15320 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15321     const regnode *val,U32 depth)
15322 {
15323  dVAR;
15324  regnode *scan;
15325  GET_RE_DEBUG_FLAGS_DECL;
15326
15327  PERL_ARGS_ASSERT_REGTAIL;
15328 #ifndef DEBUGGING
15329  PERL_UNUSED_ARG(depth);
15330 #endif
15331
15332  if (SIZE_ONLY)
15333   return;
15334
15335  /* Find last node. */
15336  scan = p;
15337  for (;;) {
15338   regnode * const temp = regnext(scan);
15339   DEBUG_PARSE_r({
15340    SV * const mysv=sv_newmortal();
15341    DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15342    regprop(RExC_rx, mysv, scan, NULL);
15343    PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15344     SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15345      (temp == NULL ? "->" : ""),
15346      (temp == NULL ? PL_reg_name[OP(val)] : "")
15347    );
15348   });
15349   if (temp == NULL)
15350    break;
15351   scan = temp;
15352  }
15353
15354  if (reg_off_by_arg[OP(scan)]) {
15355   ARG_SET(scan, val - scan);
15356  }
15357  else {
15358   NEXT_OFF(scan) = val - scan;
15359  }
15360 }
15361
15362 #ifdef DEBUGGING
15363 /*
15364 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15365 - Look for optimizable sequences at the same time.
15366 - currently only looks for EXACT chains.
15367
15368 This is experimental code. The idea is to use this routine to perform
15369 in place optimizations on branches and groups as they are constructed,
15370 with the long term intention of removing optimization from study_chunk so
15371 that it is purely analytical.
15372
15373 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15374 to control which is which.
15375
15376 */
15377 /* TODO: All four parms should be const */
15378
15379 STATIC U8
15380 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15381      const regnode *val,U32 depth)
15382 {
15383  dVAR;
15384  regnode *scan;
15385  U8 exact = PSEUDO;
15386 #ifdef EXPERIMENTAL_INPLACESCAN
15387  I32 min = 0;
15388 #endif
15389  GET_RE_DEBUG_FLAGS_DECL;
15390
15391  PERL_ARGS_ASSERT_REGTAIL_STUDY;
15392
15393
15394  if (SIZE_ONLY)
15395   return exact;
15396
15397  /* Find last node. */
15398
15399  scan = p;
15400  for (;;) {
15401   regnode * const temp = regnext(scan);
15402 #ifdef EXPERIMENTAL_INPLACESCAN
15403   if (PL_regkind[OP(scan)] == EXACT) {
15404    bool unfolded_multi_char; /* Unexamined in this routine */
15405    if (join_exact(pRExC_state, scan, &min,
15406       &unfolded_multi_char, 1, val, depth+1))
15407     return EXACT;
15408   }
15409 #endif
15410   if ( exact ) {
15411    switch (OP(scan)) {
15412     case EXACT:
15413     case EXACTF:
15414     case EXACTFA_NO_TRIE:
15415     case EXACTFA:
15416     case EXACTFU:
15417     case EXACTFU_SS:
15418     case EXACTFL:
15419       if( exact == PSEUDO )
15420        exact= OP(scan);
15421       else if ( exact != OP(scan) )
15422        exact= 0;
15423     case NOTHING:
15424      break;
15425     default:
15426      exact= 0;
15427    }
15428   }
15429   DEBUG_PARSE_r({
15430    SV * const mysv=sv_newmortal();
15431    DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15432    regprop(RExC_rx, mysv, scan, NULL);
15433    PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15434     SvPV_nolen_const(mysv),
15435     REG_NODE_NUM(scan),
15436     PL_reg_name[exact]);
15437   });
15438   if (temp == NULL)
15439    break;
15440   scan = temp;
15441  }
15442  DEBUG_PARSE_r({
15443   SV * const mysv_val=sv_newmortal();
15444   DEBUG_PARSE_MSG("");
15445   regprop(RExC_rx, mysv_val, val, NULL);
15446   PerlIO_printf(Perl_debug_log,
15447      "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15448      SvPV_nolen_const(mysv_val),
15449      (IV)REG_NODE_NUM(val),
15450      (IV)(val - scan)
15451   );
15452  });
15453  if (reg_off_by_arg[OP(scan)]) {
15454   ARG_SET(scan, val - scan);
15455  }
15456  else {
15457   NEXT_OFF(scan) = val - scan;
15458  }
15459
15460  return exact;
15461 }
15462 #endif
15463
15464 /*
15465  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15466  */
15467 #ifdef DEBUGGING
15468
15469 static void
15470 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15471 {
15472  int bit;
15473  int set=0;
15474
15475  ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15476
15477  for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15478   if (flags & (1<<bit)) {
15479    if (!set++ && lead)
15480     PerlIO_printf(Perl_debug_log, "%s",lead);
15481    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15482   }
15483  }
15484  if (lead)  {
15485   if (set)
15486    PerlIO_printf(Perl_debug_log, "\n");
15487   else
15488    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15489  }
15490 }
15491
15492 static void
15493 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15494 {
15495  int bit;
15496  int set=0;
15497  regex_charset cs;
15498
15499  ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15500
15501  for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15502   if (flags & (1<<bit)) {
15503    if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
15504     continue;
15505    }
15506    if (!set++ && lead)
15507     PerlIO_printf(Perl_debug_log, "%s",lead);
15508    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15509   }
15510  }
15511  if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15512    if (!set++ && lead) {
15513     PerlIO_printf(Perl_debug_log, "%s",lead);
15514    }
15515    switch (cs) {
15516     case REGEX_UNICODE_CHARSET:
15517      PerlIO_printf(Perl_debug_log, "UNICODE");
15518      break;
15519     case REGEX_LOCALE_CHARSET:
15520      PerlIO_printf(Perl_debug_log, "LOCALE");
15521      break;
15522     case REGEX_ASCII_RESTRICTED_CHARSET:
15523      PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15524      break;
15525     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15526      PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15527      break;
15528     default:
15529      PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15530      break;
15531    }
15532  }
15533  if (lead)  {
15534   if (set)
15535    PerlIO_printf(Perl_debug_log, "\n");
15536   else
15537    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15538  }
15539 }
15540 #endif
15541
15542 void
15543 Perl_regdump(pTHX_ const regexp *r)
15544 {
15545 #ifdef DEBUGGING
15546  dVAR;
15547  SV * const sv = sv_newmortal();
15548  SV *dsv= sv_newmortal();
15549  RXi_GET_DECL(r,ri);
15550  GET_RE_DEBUG_FLAGS_DECL;
15551
15552  PERL_ARGS_ASSERT_REGDUMP;
15553
15554  (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15555
15556  /* Header fields of interest. */
15557  if (r->anchored_substr) {
15558   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15559    RE_SV_DUMPLEN(r->anchored_substr), 30);
15560   PerlIO_printf(Perl_debug_log,
15561      "anchored %s%s at %"IVdf" ",
15562      s, RE_SV_TAIL(r->anchored_substr),
15563      (IV)r->anchored_offset);
15564  } else if (r->anchored_utf8) {
15565   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15566    RE_SV_DUMPLEN(r->anchored_utf8), 30);
15567   PerlIO_printf(Perl_debug_log,
15568      "anchored utf8 %s%s at %"IVdf" ",
15569      s, RE_SV_TAIL(r->anchored_utf8),
15570      (IV)r->anchored_offset);
15571  }
15572  if (r->float_substr) {
15573   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15574    RE_SV_DUMPLEN(r->float_substr), 30);
15575   PerlIO_printf(Perl_debug_log,
15576      "floating %s%s at %"IVdf"..%"UVuf" ",
15577      s, RE_SV_TAIL(r->float_substr),
15578      (IV)r->float_min_offset, (UV)r->float_max_offset);
15579  } else if (r->float_utf8) {
15580   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15581    RE_SV_DUMPLEN(r->float_utf8), 30);
15582   PerlIO_printf(Perl_debug_log,
15583      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15584      s, RE_SV_TAIL(r->float_utf8),
15585      (IV)r->float_min_offset, (UV)r->float_max_offset);
15586  }
15587  if (r->check_substr || r->check_utf8)
15588   PerlIO_printf(Perl_debug_log,
15589      (const char *)
15590      (r->check_substr == r->float_substr
15591      && r->check_utf8 == r->float_utf8
15592      ? "(checking floating" : "(checking anchored"));
15593  if (r->intflags & PREGf_NOSCAN)
15594   PerlIO_printf(Perl_debug_log, " noscan");
15595  if (r->extflags & RXf_CHECK_ALL)
15596   PerlIO_printf(Perl_debug_log, " isall");
15597  if (r->check_substr || r->check_utf8)
15598   PerlIO_printf(Perl_debug_log, ") ");
15599
15600  if (ri->regstclass) {
15601   regprop(r, sv, ri->regstclass, NULL);
15602   PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15603  }
15604  if (r->intflags & PREGf_ANCH) {
15605   PerlIO_printf(Perl_debug_log, "anchored");
15606   if (r->intflags & PREGf_ANCH_BOL)
15607    PerlIO_printf(Perl_debug_log, "(BOL)");
15608   if (r->intflags & PREGf_ANCH_MBOL)
15609    PerlIO_printf(Perl_debug_log, "(MBOL)");
15610   if (r->intflags & PREGf_ANCH_SBOL)
15611    PerlIO_printf(Perl_debug_log, "(SBOL)");
15612   if (r->intflags & PREGf_ANCH_GPOS)
15613    PerlIO_printf(Perl_debug_log, "(GPOS)");
15614   PerlIO_putc(Perl_debug_log, ' ');
15615  }
15616  if (r->intflags & PREGf_GPOS_SEEN)
15617   PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15618  if (r->intflags & PREGf_SKIP)
15619   PerlIO_printf(Perl_debug_log, "plus ");
15620  if (r->intflags & PREGf_IMPLICIT)
15621   PerlIO_printf(Perl_debug_log, "implicit ");
15622  PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15623  if (r->extflags & RXf_EVAL_SEEN)
15624   PerlIO_printf(Perl_debug_log, "with eval ");
15625  PerlIO_printf(Perl_debug_log, "\n");
15626  DEBUG_FLAGS_r({
15627   regdump_extflags("r->extflags: ",r->extflags);
15628   regdump_intflags("r->intflags: ",r->intflags);
15629  });
15630 #else
15631  PERL_ARGS_ASSERT_REGDUMP;
15632  PERL_UNUSED_CONTEXT;
15633  PERL_UNUSED_ARG(r);
15634 #endif /* DEBUGGING */
15635 }
15636
15637 /*
15638 - regprop - printable representation of opcode, with run time support
15639 */
15640
15641 void
15642 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
15643 {
15644 #ifdef DEBUGGING
15645  dVAR;
15646  int k;
15647
15648  /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15649  static const char * const anyofs[] = {
15650 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15651  || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
15652  || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
15653  || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
15654  || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
15655  || _CC_VERTSPACE != 16
15656   #error Need to adjust order of anyofs[]
15657 #endif
15658   "\\w",
15659   "\\W",
15660   "\\d",
15661   "\\D",
15662   "[:alpha:]",
15663   "[:^alpha:]",
15664   "[:lower:]",
15665   "[:^lower:]",
15666   "[:upper:]",
15667   "[:^upper:]",
15668   "[:punct:]",
15669   "[:^punct:]",
15670   "[:print:]",
15671   "[:^print:]",
15672   "[:alnum:]",
15673   "[:^alnum:]",
15674   "[:graph:]",
15675   "[:^graph:]",
15676   "[:cased:]",
15677   "[:^cased:]",
15678   "\\s",
15679   "\\S",
15680   "[:blank:]",
15681   "[:^blank:]",
15682   "[:xdigit:]",
15683   "[:^xdigit:]",
15684   "[:space:]",
15685   "[:^space:]",
15686   "[:cntrl:]",
15687   "[:^cntrl:]",
15688   "[:ascii:]",
15689   "[:^ascii:]",
15690   "\\v",
15691   "\\V"
15692  };
15693  RXi_GET_DECL(prog,progi);
15694  GET_RE_DEBUG_FLAGS_DECL;
15695
15696  PERL_ARGS_ASSERT_REGPROP;
15697
15698  sv_setpvs(sv, "");
15699
15700  if (OP(o) > REGNODE_MAX)  /* regnode.type is unsigned */
15701   /* It would be nice to FAIL() here, but this may be called from
15702   regexec.c, and it would be hard to supply pRExC_state. */
15703   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15704            (int)OP(o), (int)REGNODE_MAX);
15705  sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15706
15707  k = PL_regkind[OP(o)];
15708
15709  if (k == EXACT) {
15710   sv_catpvs(sv, " ");
15711   /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15712   * is a crude hack but it may be the best for now since
15713   * we have no flag "this EXACTish node was UTF-8"
15714   * --jhi */
15715   pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15716     PERL_PV_ESCAPE_UNI_DETECT |
15717     PERL_PV_ESCAPE_NONASCII   |
15718     PERL_PV_PRETTY_ELLIPSES   |
15719     PERL_PV_PRETTY_LTGT       |
15720     PERL_PV_PRETTY_NOCLEAR
15721     );
15722  } else if (k == TRIE) {
15723   /* print the details of the trie in dumpuntil instead, as
15724   * progi->data isn't available here */
15725   const char op = OP(o);
15726   const U32 n = ARG(o);
15727   const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15728    (reg_ac_data *)progi->data->data[n] :
15729    NULL;
15730   const reg_trie_data * const trie
15731    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15732
15733   Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15734   DEBUG_TRIE_COMPILE_r(
15735   Perl_sv_catpvf(aTHX_ sv,
15736    "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15737    (UV)trie->startstate,
15738    (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15739    (UV)trie->wordcount,
15740    (UV)trie->minlen,
15741    (UV)trie->maxlen,
15742    (UV)TRIE_CHARCOUNT(trie),
15743    (UV)trie->uniquecharcount
15744   );
15745   );
15746   if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15747    sv_catpvs(sv, "[");
15748    (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15749             ? ANYOF_BITMAP(o)
15750             : TRIE_BITMAP(trie));
15751    sv_catpvs(sv, "]");
15752   }
15753
15754  } else if (k == CURLY) {
15755   if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15756    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15757   Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15758  }
15759  else if (k == WHILEM && o->flags)   /* Ordinal/of */
15760   Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15761  else if (k == REF || k == OPEN || k == CLOSE
15762    || k == GROUPP || OP(o)==ACCEPT)
15763  {
15764   Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
15765   if ( RXp_PAREN_NAMES(prog) ) {
15766    if ( k != REF || (OP(o) < NREF)) {
15767     AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15768     SV **name= av_fetch(list, ARG(o), 0 );
15769     if (name)
15770      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15771    }
15772    else {
15773     AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15774     SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15775     I32 *nums=(I32*)SvPVX(sv_dat);
15776     SV **name= av_fetch(list, nums[0], 0 );
15777     I32 n;
15778     if (name) {
15779      for ( n=0; n<SvIVX(sv_dat); n++ ) {
15780       Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15781          (n ? "," : ""), (IV)nums[n]);
15782      }
15783      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15784     }
15785    }
15786   }
15787   if ( k == REF && reginfo) {
15788    U32 n = ARG(o);  /* which paren pair */
15789    I32 ln = prog->offs[n].start;
15790    if (prog->lastparen < n || ln == -1)
15791     Perl_sv_catpvf(aTHX_ sv, ": FAIL");
15792    else if (ln == prog->offs[n].end)
15793     Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
15794    else {
15795     const char *s = reginfo->strbeg + ln;
15796     Perl_sv_catpvf(aTHX_ sv, ": ");
15797     Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
15798      PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
15799    }
15800   }
15801  } else if (k == GOSUB)
15802   /* Paren and offset */
15803   Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
15804  else if (k == VERB) {
15805   if (!o->flags)
15806    Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15807       SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15808  } else if (k == LOGICAL)
15809   /* 2: embedded, otherwise 1 */
15810   Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
15811  else if (k == ANYOF) {
15812   const U8 flags = ANYOF_FLAGS(o);
15813   int do_sep = 0;
15814
15815
15816   if (flags & ANYOF_LOCALE_FLAGS)
15817    sv_catpvs(sv, "{loc}");
15818   if (flags & ANYOF_LOC_FOLD)
15819    sv_catpvs(sv, "{i}");
15820   Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15821   if (flags & ANYOF_INVERT)
15822    sv_catpvs(sv, "^");
15823
15824   /* output what the standard cp 0-255 bitmap matches */
15825   do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15826
15827   /* output any special charclass tests (used entirely under use
15828   * locale) * */
15829   if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15830    int i;
15831    for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15832     if (ANYOF_POSIXL_TEST(o,i)) {
15833      sv_catpv(sv, anyofs[i]);
15834      do_sep = 1;
15835     }
15836    }
15837   }
15838
15839   if ((flags & (ANYOF_ABOVE_LATIN1_ALL
15840      |ANYOF_UTF8
15841      |ANYOF_NONBITMAP_NON_UTF8
15842      |ANYOF_LOC_FOLD)))
15843   {
15844    if (do_sep) {
15845     Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15846     if (flags & ANYOF_INVERT)
15847      /*make sure the invert info is in each */
15848      sv_catpvs(sv, "^");
15849    }
15850
15851    if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
15852     sv_catpvs(sv, "{non-utf8-latin1-all}");
15853    }
15854
15855    /* output information about the unicode matching */
15856    if (flags & ANYOF_ABOVE_LATIN1_ALL)
15857     sv_catpvs(sv, "{unicode_all}");
15858    else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
15859     SV *lv; /* Set if there is something outside the bit map. */
15860     bool byte_output = FALSE;   /* If something in the bitmap has
15861            been output */
15862     SV *only_utf8_locale;
15863
15864     /* Get the stuff that wasn't in the bitmap */
15865     (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
15866              &lv, &only_utf8_locale);
15867     if (lv && lv != &PL_sv_undef) {
15868      char *s = savesvpv(lv);
15869      char * const origs = s;
15870
15871      while (*s && *s != '\n')
15872       s++;
15873
15874      if (*s == '\n') {
15875       const char * const t = ++s;
15876
15877       if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15878        sv_catpvs(sv, "{outside bitmap}");
15879       }
15880       else {
15881        sv_catpvs(sv, "{utf8}");
15882       }
15883
15884       if (byte_output) {
15885        sv_catpvs(sv, " ");
15886       }
15887
15888       while (*s) {
15889        if (*s == '\n') {
15890
15891         /* Truncate very long output */
15892         if (s - origs > 256) {
15893          Perl_sv_catpvf(aTHX_ sv,
15894             "%.*s...",
15895             (int) (s - origs - 1),
15896             t);
15897          goto out_dump;
15898         }
15899         *s = ' ';
15900        }
15901        else if (*s == '\t') {
15902         *s = '-';
15903        }
15904        s++;
15905       }
15906       if (s[-1] == ' ')
15907        s[-1] = 0;
15908
15909       sv_catpv(sv, t);
15910      }
15911
15912     out_dump:
15913
15914      Safefree(origs);
15915      SvREFCNT_dec_NN(lv);
15916     }
15917
15918     if ((flags & ANYOF_LOC_FOLD)
15919      && only_utf8_locale
15920      && only_utf8_locale != &PL_sv_undef)
15921     {
15922      UV start, end;
15923      int max_entries = 256;
15924
15925      sv_catpvs(sv, "{utf8 locale}");
15926      invlist_iterinit(only_utf8_locale);
15927      while (invlist_iternext(only_utf8_locale,
15928            &start, &end)) {
15929       put_range(sv, start, end);
15930       max_entries --;
15931       if (max_entries < 0) {
15932        sv_catpvs(sv, "...");
15933        break;
15934       }
15935      }
15936      invlist_iterfinish(only_utf8_locale);
15937     }
15938    }
15939   }
15940
15941   Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15942  }
15943  else if (k == POSIXD || k == NPOSIXD) {
15944   U8 index = FLAGS(o) * 2;
15945   if (index < C_ARRAY_LENGTH(anyofs)) {
15946    if (*anyofs[index] != '[')  {
15947     sv_catpv(sv, "[");
15948    }
15949    sv_catpv(sv, anyofs[index]);
15950    if (*anyofs[index] != '[')  {
15951     sv_catpv(sv, "]");
15952    }
15953   }
15954   else {
15955    Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15956   }
15957  }
15958  else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15959   Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15960 #else
15961  PERL_UNUSED_CONTEXT;
15962  PERL_UNUSED_ARG(sv);
15963  PERL_UNUSED_ARG(o);
15964  PERL_UNUSED_ARG(prog);
15965  PERL_UNUSED_ARG(reginfo);
15966 #endif /* DEBUGGING */
15967 }
15968
15969
15970
15971 SV *
15972 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15973 {    /* Assume that RE_INTUIT is set */
15974  dVAR;
15975  struct regexp *const prog = ReANY(r);
15976  GET_RE_DEBUG_FLAGS_DECL;
15977
15978  PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15979  PERL_UNUSED_CONTEXT;
15980
15981  DEBUG_COMPILE_r(
15982   {
15983    const char * const s = SvPV_nolen_const(prog->check_substr
15984      ? prog->check_substr : prog->check_utf8);
15985
15986    if (!PL_colorset) reginitcolors();
15987    PerlIO_printf(Perl_debug_log,
15988      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15989      PL_colors[4],
15990      prog->check_substr ? "" : "utf8 ",
15991      PL_colors[5],PL_colors[0],
15992      s,
15993      PL_colors[1],
15994      (strlen(s) > 60 ? "..." : ""));
15995   } );
15996
15997  return prog->check_substr ? prog->check_substr : prog->check_utf8;
15998 }
15999
16000 /*
16001    pregfree()
16002
16003    handles refcounting and freeing the perl core regexp structure. When
16004    it is necessary to actually free the structure the first thing it
16005    does is call the 'free' method of the regexp_engine associated to
16006    the regexp, allowing the handling of the void *pprivate; member
16007    first. (This routine is not overridable by extensions, which is why
16008    the extensions free is called first.)
16009
16010    See regdupe and regdupe_internal if you change anything here.
16011 */
16012 #ifndef PERL_IN_XSUB_RE
16013 void
16014 Perl_pregfree(pTHX_ REGEXP *r)
16015 {
16016  SvREFCNT_dec(r);
16017 }
16018
16019 void
16020 Perl_pregfree2(pTHX_ REGEXP *rx)
16021 {
16022  dVAR;
16023  struct regexp *const r = ReANY(rx);
16024  GET_RE_DEBUG_FLAGS_DECL;
16025
16026  PERL_ARGS_ASSERT_PREGFREE2;
16027
16028  if (r->mother_re) {
16029   ReREFCNT_dec(r->mother_re);
16030  } else {
16031   CALLREGFREE_PVT(rx); /* free the private data */
16032   SvREFCNT_dec(RXp_PAREN_NAMES(r));
16033   Safefree(r->xpv_len_u.xpvlenu_pv);
16034  }
16035  if (r->substrs) {
16036   SvREFCNT_dec(r->anchored_substr);
16037   SvREFCNT_dec(r->anchored_utf8);
16038   SvREFCNT_dec(r->float_substr);
16039   SvREFCNT_dec(r->float_utf8);
16040   Safefree(r->substrs);
16041  }
16042  RX_MATCH_COPY_FREE(rx);
16043 #ifdef PERL_ANY_COW
16044  SvREFCNT_dec(r->saved_copy);
16045 #endif
16046  Safefree(r->offs);
16047  SvREFCNT_dec(r->qr_anoncv);
16048  rx->sv_u.svu_rx = 0;
16049 }
16050
16051 /*  reg_temp_copy()
16052
16053  This is a hacky workaround to the structural issue of match results
16054  being stored in the regexp structure which is in turn stored in
16055  PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16056  could be PL_curpm in multiple contexts, and could require multiple
16057  result sets being associated with the pattern simultaneously, such
16058  as when doing a recursive match with (??{$qr})
16059
16060  The solution is to make a lightweight copy of the regexp structure
16061  when a qr// is returned from the code executed by (??{$qr}) this
16062  lightweight copy doesn't actually own any of its data except for
16063  the starp/end and the actual regexp structure itself.
16064
16065 */
16066
16067
16068 REGEXP *
16069 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16070 {
16071  struct regexp *ret;
16072  struct regexp *const r = ReANY(rx);
16073  const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16074
16075  PERL_ARGS_ASSERT_REG_TEMP_COPY;
16076
16077  if (!ret_x)
16078   ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16079  else {
16080   SvOK_off((SV *)ret_x);
16081   if (islv) {
16082    /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16083    to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
16084    made both spots point to the same regexp body.) */
16085    REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16086    assert(!SvPVX(ret_x));
16087    ret_x->sv_u.svu_rx = temp->sv_any;
16088    temp->sv_any = NULL;
16089    SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16090    SvREFCNT_dec_NN(temp);
16091    /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16092    ing below will not set it. */
16093    SvCUR_set(ret_x, SvCUR(rx));
16094   }
16095  }
16096  /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16097  sv_force_normal(sv) is called.  */
16098  SvFAKE_on(ret_x);
16099  ret = ReANY(ret_x);
16100
16101  SvFLAGS(ret_x) |= SvUTF8(rx);
16102  /* We share the same string buffer as the original regexp, on which we
16103  hold a reference count, incremented when mother_re is set below.
16104  The string pointer is copied here, being part of the regexp struct.
16105  */
16106  memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16107   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16108  if (r->offs) {
16109   const I32 npar = r->nparens+1;
16110   Newx(ret->offs, npar, regexp_paren_pair);
16111   Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16112  }
16113  if (r->substrs) {
16114   Newx(ret->substrs, 1, struct reg_substr_data);
16115   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16116
16117   SvREFCNT_inc_void(ret->anchored_substr);
16118   SvREFCNT_inc_void(ret->anchored_utf8);
16119   SvREFCNT_inc_void(ret->float_substr);
16120   SvREFCNT_inc_void(ret->float_utf8);
16121
16122   /* check_substr and check_utf8, if non-NULL, point to either their
16123   anchored or float namesakes, and don't hold a second reference.  */
16124  }
16125  RX_MATCH_COPIED_off(ret_x);
16126 #ifdef PERL_ANY_COW
16127  ret->saved_copy = NULL;
16128 #endif
16129  ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16130  SvREFCNT_inc_void(ret->qr_anoncv);
16131
16132  return ret_x;
16133 }
16134 #endif
16135
16136 /* regfree_internal()
16137
16138    Free the private data in a regexp. This is overloadable by
16139    extensions. Perl takes care of the regexp structure in pregfree(),
16140    this covers the *pprivate pointer which technically perl doesn't
16141    know about, however of course we have to handle the
16142    regexp_internal structure when no extension is in use.
16143
16144    Note this is called before freeing anything in the regexp
16145    structure.
16146  */
16147
16148 void
16149 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16150 {
16151  dVAR;
16152  struct regexp *const r = ReANY(rx);
16153  RXi_GET_DECL(r,ri);
16154  GET_RE_DEBUG_FLAGS_DECL;
16155
16156  PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16157
16158  DEBUG_COMPILE_r({
16159   if (!PL_colorset)
16160    reginitcolors();
16161   {
16162    SV *dsv= sv_newmortal();
16163    RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16164     dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16165    PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16166     PL_colors[4],PL_colors[5],s);
16167   }
16168  });
16169 #ifdef RE_TRACK_PATTERN_OFFSETS
16170  if (ri->u.offsets)
16171   Safefree(ri->u.offsets);             /* 20010421 MJD */
16172 #endif
16173  if (ri->code_blocks) {
16174   int n;
16175   for (n = 0; n < ri->num_code_blocks; n++)
16176    SvREFCNT_dec(ri->code_blocks[n].src_regex);
16177   Safefree(ri->code_blocks);
16178  }
16179
16180  if (ri->data) {
16181   int n = ri->data->count;
16182
16183   while (--n >= 0) {
16184   /* If you add a ->what type here, update the comment in regcomp.h */
16185    switch (ri->data->what[n]) {
16186    case 'a':
16187    case 'r':
16188    case 's':
16189    case 'S':
16190    case 'u':
16191     SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16192     break;
16193    case 'f':
16194     Safefree(ri->data->data[n]);
16195     break;
16196    case 'l':
16197    case 'L':
16198     break;
16199    case 'T':
16200     { /* Aho Corasick add-on structure for a trie node.
16201      Used in stclass optimization only */
16202      U32 refcount;
16203      reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16204      OP_REFCNT_LOCK;
16205      refcount = --aho->refcount;
16206      OP_REFCNT_UNLOCK;
16207      if ( !refcount ) {
16208       PerlMemShared_free(aho->states);
16209       PerlMemShared_free(aho->fail);
16210       /* do this last!!!! */
16211       PerlMemShared_free(ri->data->data[n]);
16212       /* we should only ever get called once, so
16213       * assert as much, and also guard the free
16214       * which /might/ happen twice. At the least
16215       * it will make code anlyzers happy and it
16216       * doesn't cost much. - Yves */
16217       assert(ri->regstclass);
16218       if (ri->regstclass) {
16219        PerlMemShared_free(ri->regstclass);
16220        ri->regstclass = 0;
16221       }
16222      }
16223     }
16224     break;
16225    case 't':
16226     {
16227      /* trie structure. */
16228      U32 refcount;
16229      reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16230      OP_REFCNT_LOCK;
16231      refcount = --trie->refcount;
16232      OP_REFCNT_UNLOCK;
16233      if ( !refcount ) {
16234       PerlMemShared_free(trie->charmap);
16235       PerlMemShared_free(trie->states);
16236       PerlMemShared_free(trie->trans);
16237       if (trie->bitmap)
16238        PerlMemShared_free(trie->bitmap);
16239       if (trie->jump)
16240        PerlMemShared_free(trie->jump);
16241       PerlMemShared_free(trie->wordinfo);
16242       /* do this last!!!! */
16243       PerlMemShared_free(ri->data->data[n]);
16244      }
16245     }
16246     break;
16247    default:
16248     Perl_croak(aTHX_ "panic: regfree data code '%c'",
16249              ri->data->what[n]);
16250    }
16251   }
16252   Safefree(ri->data->what);
16253   Safefree(ri->data);
16254  }
16255
16256  Safefree(ri);
16257 }
16258
16259 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16260 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16261 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
16262
16263 /*
16264    re_dup - duplicate a regexp.
16265
16266    This routine is expected to clone a given regexp structure. It is only
16267    compiled under USE_ITHREADS.
16268
16269    After all of the core data stored in struct regexp is duplicated
16270    the regexp_engine.dupe method is used to copy any private data
16271    stored in the *pprivate pointer. This allows extensions to handle
16272    any duplication it needs to do.
16273
16274    See pregfree() and regfree_internal() if you change anything here.
16275 */
16276 #if defined(USE_ITHREADS)
16277 #ifndef PERL_IN_XSUB_RE
16278 void
16279 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16280 {
16281  dVAR;
16282  I32 npar;
16283  const struct regexp *r = ReANY(sstr);
16284  struct regexp *ret = ReANY(dstr);
16285
16286  PERL_ARGS_ASSERT_RE_DUP_GUTS;
16287
16288  npar = r->nparens+1;
16289  Newx(ret->offs, npar, regexp_paren_pair);
16290  Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16291
16292  if (ret->substrs) {
16293   /* Do it this way to avoid reading from *r after the StructCopy().
16294   That way, if any of the sv_dup_inc()s dislodge *r from the L1
16295   cache, it doesn't matter.  */
16296   const bool anchored = r->check_substr
16297    ? r->check_substr == r->anchored_substr
16298    : r->check_utf8 == r->anchored_utf8;
16299   Newx(ret->substrs, 1, struct reg_substr_data);
16300   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16301
16302   ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16303   ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16304   ret->float_substr = sv_dup_inc(ret->float_substr, param);
16305   ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16306
16307   /* check_substr and check_utf8, if non-NULL, point to either their
16308   anchored or float namesakes, and don't hold a second reference.  */
16309
16310   if (ret->check_substr) {
16311    if (anchored) {
16312     assert(r->check_utf8 == r->anchored_utf8);
16313     ret->check_substr = ret->anchored_substr;
16314     ret->check_utf8 = ret->anchored_utf8;
16315    } else {
16316     assert(r->check_substr == r->float_substr);
16317     assert(r->check_utf8 == r->float_utf8);
16318     ret->check_substr = ret->float_substr;
16319     ret->check_utf8 = ret->float_utf8;
16320    }
16321   } else if (ret->check_utf8) {
16322    if (anchored) {
16323     ret->check_utf8 = ret->anchored_utf8;
16324    } else {
16325     ret->check_utf8 = ret->float_utf8;
16326    }
16327   }
16328  }
16329
16330  RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16331  ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16332
16333  if (ret->pprivate)
16334   RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16335
16336  if (RX_MATCH_COPIED(dstr))
16337   ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
16338  else
16339   ret->subbeg = NULL;
16340 #ifdef PERL_ANY_COW
16341  ret->saved_copy = NULL;
16342 #endif
16343
16344  /* Whether mother_re be set or no, we need to copy the string.  We
16345  cannot refrain from copying it when the storage points directly to
16346  our mother regexp, because that's
16347    1: a buffer in a different thread
16348    2: something we no longer hold a reference on
16349    so we need to copy it locally.  */
16350  RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16351  ret->mother_re   = NULL;
16352 }
16353 #endif /* PERL_IN_XSUB_RE */
16354
16355 /*
16356    regdupe_internal()
16357
16358    This is the internal complement to regdupe() which is used to copy
16359    the structure pointed to by the *pprivate pointer in the regexp.
16360    This is the core version of the extension overridable cloning hook.
16361    The regexp structure being duplicated will be copied by perl prior
16362    to this and will be provided as the regexp *r argument, however
16363    with the /old/ structures pprivate pointer value. Thus this routine
16364    may override any copying normally done by perl.
16365
16366    It returns a pointer to the new regexp_internal structure.
16367 */
16368
16369 void *
16370 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16371 {
16372  dVAR;
16373  struct regexp *const r = ReANY(rx);
16374  regexp_internal *reti;
16375  int len;
16376  RXi_GET_DECL(r,ri);
16377
16378  PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16379
16380  len = ProgLen(ri);
16381
16382  Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16383   char, regexp_internal);
16384  Copy(ri->program, reti->program, len+1, regnode);
16385
16386  reti->num_code_blocks = ri->num_code_blocks;
16387  if (ri->code_blocks) {
16388   int n;
16389   Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16390     struct reg_code_block);
16391   Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16392     struct reg_code_block);
16393   for (n = 0; n < ri->num_code_blocks; n++)
16394    reti->code_blocks[n].src_regex = (REGEXP*)
16395      sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16396  }
16397  else
16398   reti->code_blocks = NULL;
16399
16400  reti->regstclass = NULL;
16401
16402  if (ri->data) {
16403   struct reg_data *d;
16404   const int count = ri->data->count;
16405   int i;
16406
16407   Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16408     char, struct reg_data);
16409   Newx(d->what, count, U8);
16410
16411   d->count = count;
16412   for (i = 0; i < count; i++) {
16413    d->what[i] = ri->data->what[i];
16414    switch (d->what[i]) {
16415     /* see also regcomp.h and regfree_internal() */
16416    case 'a': /* actually an AV, but the dup function is identical.  */
16417    case 'r':
16418    case 's':
16419    case 'S':
16420    case 'u': /* actually an HV, but the dup function is identical.  */
16421     d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16422     break;
16423    case 'f':
16424     /* This is cheating. */
16425     Newx(d->data[i], 1, regnode_ssc);
16426     StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16427     reti->regstclass = (regnode*)d->data[i];
16428     break;
16429    case 'T':
16430     /* Trie stclasses are readonly and can thus be shared
16431     * without duplication. We free the stclass in pregfree
16432     * when the corresponding reg_ac_data struct is freed.
16433     */
16434     reti->regstclass= ri->regstclass;
16435     /* FALLTHROUGH */
16436    case 't':
16437     OP_REFCNT_LOCK;
16438     ((reg_trie_data*)ri->data->data[i])->refcount++;
16439     OP_REFCNT_UNLOCK;
16440     /* FALLTHROUGH */
16441    case 'l':
16442    case 'L':
16443     d->data[i] = ri->data->data[i];
16444     break;
16445    default:
16446     Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16447               ri->data->what[i]);
16448    }
16449   }
16450
16451   reti->data = d;
16452  }
16453  else
16454   reti->data = NULL;
16455
16456  reti->name_list_idx = ri->name_list_idx;
16457
16458 #ifdef RE_TRACK_PATTERN_OFFSETS
16459  if (ri->u.offsets) {
16460   Newx(reti->u.offsets, 2*len+1, U32);
16461   Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16462  }
16463 #else
16464  SetProgLen(reti,len);
16465 #endif
16466
16467  return (void*)reti;
16468 }
16469
16470 #endif    /* USE_ITHREADS */
16471
16472 #ifndef PERL_IN_XSUB_RE
16473
16474 /*
16475  - regnext - dig the "next" pointer out of a node
16476  */
16477 regnode *
16478 Perl_regnext(pTHX_ regnode *p)
16479 {
16480  dVAR;
16481  I32 offset;
16482
16483  if (!p)
16484   return(NULL);
16485
16486  if (OP(p) > REGNODE_MAX) {  /* regnode.type is unsigned */
16487   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16488             (int)OP(p), (int)REGNODE_MAX);
16489  }
16490
16491  offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16492  if (offset == 0)
16493   return(NULL);
16494
16495  return(p+offset);
16496 }
16497 #endif
16498
16499 STATIC void
16500 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16501 {
16502  va_list args;
16503  STRLEN l1 = strlen(pat1);
16504  STRLEN l2 = strlen(pat2);
16505  char buf[512];
16506  SV *msv;
16507  const char *message;
16508
16509  PERL_ARGS_ASSERT_RE_CROAK2;
16510
16511  if (l1 > 510)
16512   l1 = 510;
16513  if (l1 + l2 > 510)
16514   l2 = 510 - l1;
16515  Copy(pat1, buf, l1 , char);
16516  Copy(pat2, buf + l1, l2 , char);
16517  buf[l1 + l2] = '\n';
16518  buf[l1 + l2 + 1] = '\0';
16519  va_start(args, pat2);
16520  msv = vmess(buf, &args);
16521  va_end(args);
16522  message = SvPV_const(msv,l1);
16523  if (l1 > 512)
16524   l1 = 512;
16525  Copy(message, buf, l1 , char);
16526  /* l1-1 to avoid \n */
16527  Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16528 }
16529
16530 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
16531
16532 #ifndef PERL_IN_XSUB_RE
16533 void
16534 Perl_save_re_context(pTHX)
16535 {
16536  dVAR;
16537
16538  /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16539  if (PL_curpm) {
16540   const REGEXP * const rx = PM_GETRE(PL_curpm);
16541   if (rx) {
16542    U32 i;
16543    for (i = 1; i <= RX_NPARENS(rx); i++) {
16544     char digits[TYPE_CHARS(long)];
16545     const STRLEN len = my_snprintf(digits, sizeof(digits),
16546            "%lu", (long)i);
16547     GV *const *const gvp
16548      = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16549
16550     if (gvp) {
16551      GV * const gv = *gvp;
16552      if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16553       save_scalar(gv);
16554     }
16555    }
16556   }
16557  }
16558 }
16559 #endif
16560
16561 #ifdef DEBUGGING
16562
16563 STATIC void
16564 S_put_byte(pTHX_ SV *sv, int c)
16565 {
16566  PERL_ARGS_ASSERT_PUT_BYTE;
16567
16568  if (!isPRINT(c)) {
16569   switch (c) {
16570    case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16571    case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16572    case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16573    case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16574    case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16575
16576    default:
16577     Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16578     break;
16579   }
16580  }
16581  else {
16582   const char string = c;
16583   if (c == '-' || c == ']' || c == '\\' || c == '^')
16584    sv_catpvs(sv, "\\");
16585   sv_catpvn(sv, &string, 1);
16586  }
16587 }
16588
16589 STATIC void
16590 S_put_range(pTHX_ SV *sv, UV start, UV end)
16591 {
16592
16593  /* Appends to 'sv' a displayable version of the range of code points from
16594  * 'start' to 'end' */
16595
16596  assert(start <= end);
16597
16598  PERL_ARGS_ASSERT_PUT_RANGE;
16599
16600  if (end - start < 3) {  /* Individual chars in short ranges */
16601   for (; start <= end; start++)
16602    put_byte(sv, start);
16603  }
16604  else if (   end > 255
16605    || ! isALPHANUMERIC(start)
16606    || ! isALPHANUMERIC(end)
16607    || isDIGIT(start) != isDIGIT(end)
16608    || isUPPER(start) != isUPPER(end)
16609    || isLOWER(start) != isLOWER(end)
16610
16611     /* This final test should get optimized out except on EBCDIC
16612     * platforms, where it causes ranges that cross discontinuities
16613     * like i/j to be shown as hex instead of the misleading,
16614     * e.g. H-K (since that range includes more than H, I, J, K).
16615     * */
16616    || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start))
16617  {
16618   Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
16619      start,
16620      (end < 256) ? end : 255);
16621  }
16622  else { /* Here, the ends of the range are both digits, or both uppercase,
16623    or both lowercase; and there's no discontinuity in the range
16624    (which could happen on EBCDIC platforms) */
16625   put_byte(sv, start);
16626   sv_catpvs(sv, "-");
16627   put_byte(sv, end);
16628  }
16629 }
16630
16631 STATIC bool
16632 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16633 {
16634  /* Appends to 'sv' a displayable version of the innards of the bracketed
16635  * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
16636  * output anything */
16637
16638  int i;
16639  bool has_output_anything = FALSE;
16640
16641  PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16642
16643  for (i = 0; i < 256; i++) {
16644   if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
16645
16646    /* The character at index i should be output.  Find the next
16647    * character that should NOT be output */
16648    int j;
16649    for (j = i + 1; j <= 256; j++) {
16650     if (! BITMAP_TEST((U8 *) bitmap, j)) {
16651      break;
16652     }
16653    }
16654
16655    /* Everything between them is a single range that should be output
16656    * */
16657    put_range(sv, i, j - 1);
16658    has_output_anything = TRUE;
16659    i = j;
16660   }
16661  }
16662
16663  return has_output_anything;
16664 }
16665
16666 #define CLEAR_OPTSTART \
16667  if (optstart) STMT_START {                                               \
16668   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
16669        " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16670   optstart=NULL;                                                       \
16671  } STMT_END
16672
16673 #define DUMPUNTIL(b,e)                                                       \
16674      CLEAR_OPTSTART;                                          \
16675      node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16676
16677 STATIC const regnode *
16678 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16679    const regnode *last, const regnode *plast,
16680    SV* sv, I32 indent, U32 depth)
16681 {
16682  dVAR;
16683  U8 op = PSEUDO; /* Arbitrary non-END op. */
16684  const regnode *next;
16685  const regnode *optstart= NULL;
16686
16687  RXi_GET_DECL(r,ri);
16688  GET_RE_DEBUG_FLAGS_DECL;
16689
16690  PERL_ARGS_ASSERT_DUMPUNTIL;
16691
16692 #ifdef DEBUG_DUMPUNTIL
16693  PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16694   last ? last-start : 0,plast ? plast-start : 0);
16695 #endif
16696
16697  if (plast && plast < last)
16698   last= plast;
16699
16700  while (PL_regkind[op] != END && (!last || node < last)) {
16701   assert(node);
16702   /* While that wasn't END last time... */
16703   NODE_ALIGN(node);
16704   op = OP(node);
16705   if (op == CLOSE || op == WHILEM)
16706    indent--;
16707   next = regnext((regnode *)node);
16708
16709   /* Where, what. */
16710   if (OP(node) == OPTIMIZED) {
16711    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16712     optstart = node;
16713    else
16714     goto after_print;
16715   } else
16716    CLEAR_OPTSTART;
16717
16718   regprop(r, sv, node, NULL);
16719   PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16720      (int)(2*indent + 1), "", SvPVX_const(sv));
16721
16722   if (OP(node) != OPTIMIZED) {
16723    if (next == NULL)  /* Next ptr. */
16724     PerlIO_printf(Perl_debug_log, " (0)");
16725    else if (PL_regkind[(U8)op] == BRANCH
16726      && PL_regkind[OP(next)] != BRANCH )
16727     PerlIO_printf(Perl_debug_log, " (FAIL)");
16728    else
16729     PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16730    (void)PerlIO_putc(Perl_debug_log, '\n');
16731   }
16732
16733  after_print:
16734   if (PL_regkind[(U8)op] == BRANCHJ) {
16735    assert(next);
16736    {
16737     const regnode *nnode = (OP(next) == LONGJMP
16738          ? regnext((regnode *)next)
16739          : next);
16740     if (last && nnode > last)
16741      nnode = last;
16742     DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16743    }
16744   }
16745   else if (PL_regkind[(U8)op] == BRANCH) {
16746    assert(next);
16747    DUMPUNTIL(NEXTOPER(node), next);
16748   }
16749   else if ( PL_regkind[(U8)op]  == TRIE ) {
16750    const regnode *this_trie = node;
16751    const char op = OP(node);
16752    const U32 n = ARG(node);
16753    const reg_ac_data * const ac = op>=AHOCORASICK ?
16754    (reg_ac_data *)ri->data->data[n] :
16755    NULL;
16756    const reg_trie_data * const trie =
16757     (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16758 #ifdef DEBUGGING
16759    AV *const trie_words
16760       = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16761 #endif
16762    const regnode *nextbranch= NULL;
16763    I32 word_idx;
16764    sv_setpvs(sv, "");
16765    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16766     SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16767
16768     PerlIO_printf(Perl_debug_log, "%*s%s ",
16769     (int)(2*(indent+3)), "",
16770      elem_ptr
16771      ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
16772         SvCUR(*elem_ptr), 60,
16773         PL_colors[0], PL_colors[1],
16774         (SvUTF8(*elem_ptr)
16775         ? PERL_PV_ESCAPE_UNI
16776         : 0)
16777         | PERL_PV_PRETTY_ELLIPSES
16778         | PERL_PV_PRETTY_LTGT
16779        )
16780      : "???"
16781     );
16782     if (trie->jump) {
16783      U16 dist= trie->jump[word_idx+1];
16784      PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16785        (UV)((dist ? this_trie + dist : next) - start));
16786      if (dist) {
16787       if (!nextbranch)
16788        nextbranch= this_trie + trie->jump[0];
16789       DUMPUNTIL(this_trie + dist, nextbranch);
16790      }
16791      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16792       nextbranch= regnext((regnode *)nextbranch);
16793     } else {
16794      PerlIO_printf(Perl_debug_log, "\n");
16795     }
16796    }
16797    if (last && next > last)
16798     node= last;
16799    else
16800     node= next;
16801   }
16802   else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
16803    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16804      NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16805   }
16806   else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16807    assert(next);
16808    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16809   }
16810   else if ( op == PLUS || op == STAR) {
16811    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16812   }
16813   else if (PL_regkind[(U8)op] == ANYOF) {
16814    /* arglen 1 + class block */
16815    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16816       ? ANYOF_POSIXL_SKIP
16817       : ANYOF_SKIP);
16818    node = NEXTOPER(node);
16819   }
16820   else if (PL_regkind[(U8)op] == EXACT) {
16821    /* Literal string, where present. */
16822    node += NODE_SZ_STR(node) - 1;
16823    node = NEXTOPER(node);
16824   }
16825   else {
16826    node = NEXTOPER(node);
16827    node += regarglen[(U8)op];
16828   }
16829   if (op == CURLYX || op == OPEN)
16830    indent++;
16831  }
16832  CLEAR_OPTSTART;
16833 #ifdef DEBUG_DUMPUNTIL
16834  PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16835 #endif
16836  return node;
16837 }
16838
16839 #endif /* DEBUGGING */
16840
16841 /*
16842  * Local variables:
16843  * c-indentation-style: bsd
16844  * c-basic-offset: 4
16845  * indent-tabs-mode: nil
16846  * End:
16847  *
16848  * ex: set ts=8 sts=4 sw=4 et:
16849  */