src/5020001/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  * Copyright (c) 1986 by University of Toronto.
  42  * Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  * Permission is granted to anyone to use this software for any
  45  * purpose on any computer system, and to redistribute it freely,
  46  * subject to the following restrictions:
  47  *
  48  * 1. The author is not responsible for the consequences of use of
  49  *  this software, no matter how awful, even if they arise
  50  *  from defects in it.
  51  *
  52  * 2. The origin of this software must not be misrepresented, either
  53  *  by explicit claim or by omission.
  54  *
  55  * 3. Altered versions must be plainly marked as such, and must not
  56  *  be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #undef PERL_IN_XSUB_RE
  76 #define PERL_IN_XSUB_RE 1
  77 #include "perl.h"
  78 #undef PERL_IN_XSUB_RE
  79
  80 #ifndef PERL_IN_XSUB_RE
  81 #include "re_defs.h"
  82 #endif
  83
  84 #define REG_COMP_C
  85 #ifdef PERL_IN_XSUB_RE
  86 #  include "re_comp.h"
  87 EXTERN_C const struct regexp_engine my_reg_engine;
  88 #else
  89 #  include "regcomp.h"
  90 #endif
  91
  92 #include "dquote_static.c"
  93 #include "charclass_invlists.h"
  94 #include "inline_invlist.c"
  95 #include "unicode_constants.h"
  96
  97 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  98  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  99 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 100 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 101
 102 #ifndef STATIC
 103 #define STATIC static
 104 #endif
 105
 106
 107 struct RExC_state_t {
 108  U32  flags;   /* RXf_* are we folding, multilining? */
 109  U32  pm_flags;  /* PMf_* stuff from the calling PMOP */
 110  char *precomp;  /* uncompiled string. */
 111  REGEXP *rx_sv;   /* The SV that is the regexp. */
 112  regexp *rx;                    /* perl core regexp structure */
 113  regexp_internal *rxi;           /* internal data for regexp object
 114           pprivate field */
 115  char *start;   /* Start of input for compile */
 116  char *end;   /* End of input for compile */
 117  char *parse;   /* Input-scan pointer. */
 118  SSize_t whilem_seen;  /* number of WHILEM in this expr */
 119  regnode *emit_start;  /* Start of emitted-code area */
 120  regnode *emit_bound;  /* First regnode outside of the
 121           allocated space */
 122  regnode *emit;   /* Code-emit pointer; if = &emit_dummy,
 123           implies compiling, so don't emit */
 124  regnode_ssc emit_dummy;  /* placeholder for emit to point to;
 125           large enough for the largest
 126           non-EXACTish node, so can use it as
 127           scratch in pass1 */
 128  I32  naughty;  /* How bad is this pattern? */
 129  I32  sawback;  /* Did we see \1, ...? */
 130  U32  seen;
 131  SSize_t size;   /* Code size. */
 132  I32                npar;            /* Capture buffer count, (OPEN) plus
 133           one. ("par" 0 is the whole
 134           pattern)*/
 135  I32  nestroot;  /* root parens we are in - used by
 136           accept */
 137  I32  extralen;
 138  I32  seen_zerolen;
 139  regnode **open_parens;  /* pointers to open parens */
 140  regnode **close_parens;  /* pointers to close parens */
 141  regnode *opend;   /* END node in program */
 142  I32  utf8;  /* whether the pattern is utf8 or not */
 143  I32  orig_utf8; /* whether the pattern was originally in utf8 */
 144         /* XXX use this for future optimisation of case
 145         * where pattern must be upgraded to utf8. */
 146  I32  uni_semantics; /* If a d charset modifier should use unicode
 147         rules, even if the pattern is not in
 148         utf8 */
 149  HV  *paren_names;  /* Paren names */
 150
 151  regnode **recurse;  /* Recurse regops */
 152  I32  recurse_count;  /* Number of recurse regops */
 153  U8          *study_chunk_recursed;  /* bitmap of which parens we have moved
 154           through */
 155  U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 156  I32  in_lookbehind;
 157  I32  contains_locale;
 158  I32  contains_i;
 159  I32  override_recoding;
 160  I32  in_multi_char_class;
 161  struct reg_code_block *code_blocks; /* positions of literal (?{})
 162            within pattern */
 163  int  num_code_blocks; /* size of code_blocks[] */
 164  int  code_index;  /* next code_blocks[] slot */
 165  SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 166 #ifdef ADD_TO_REGEXEC
 167  char  *starttry;  /* -Dr: where regtry was called. */
 168 #define RExC_starttry (pRExC_state->starttry)
 169 #endif
 170  SV  *runtime_code_qr; /* qr with the runtime code blocks */
 171 #ifdef DEBUGGING
 172  const char  *lastparse;
 173  I32         lastnum;
 174  AV          *paren_name_list;       /* idx -> name */
 175 #define RExC_lastparse (pRExC_state->lastparse)
 176 #define RExC_lastnum (pRExC_state->lastnum)
 177 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 178 #endif
 179 };
 180
 181 #define RExC_flags (pRExC_state->flags)
 182 #define RExC_pm_flags (pRExC_state->pm_flags)
 183 #define RExC_precomp (pRExC_state->precomp)
 184 #define RExC_rx_sv (pRExC_state->rx_sv)
 185 #define RExC_rx  (pRExC_state->rx)
 186 #define RExC_rxi (pRExC_state->rxi)
 187 #define RExC_start (pRExC_state->start)
 188 #define RExC_end (pRExC_state->end)
 189 #define RExC_parse (pRExC_state->parse)
 190 #define RExC_whilem_seen (pRExC_state->whilem_seen)
 191 #ifdef RE_TRACK_PATTERN_OFFSETS
 192 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
 193               others */
 194 #endif
 195 #define RExC_emit (pRExC_state->emit)
 196 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 197 #define RExC_emit_start (pRExC_state->emit_start)
 198 #define RExC_emit_bound (pRExC_state->emit_bound)
 199 #define RExC_naughty (pRExC_state->naughty)
 200 #define RExC_sawback (pRExC_state->sawback)
 201 #define RExC_seen (pRExC_state->seen)
 202 #define RExC_size (pRExC_state->size)
 203 #define RExC_maxlen        (pRExC_state->maxlen)
 204 #define RExC_npar (pRExC_state->npar)
 205 #define RExC_nestroot   (pRExC_state->nestroot)
 206 #define RExC_extralen (pRExC_state->extralen)
 207 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
 208 #define RExC_utf8 (pRExC_state->utf8)
 209 #define RExC_uni_semantics (pRExC_state->uni_semantics)
 210 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
 211 #define RExC_open_parens (pRExC_state->open_parens)
 212 #define RExC_close_parens (pRExC_state->close_parens)
 213 #define RExC_opend (pRExC_state->opend)
 214 #define RExC_paren_names (pRExC_state->paren_names)
 215 #define RExC_recurse (pRExC_state->recurse)
 216 #define RExC_recurse_count (pRExC_state->recurse_count)
 217 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 218 #define RExC_study_chunk_recursed_bytes  \
 219         (pRExC_state->study_chunk_recursed_bytes)
 220 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
 221 #define RExC_contains_locale (pRExC_state->contains_locale)
 222 #define RExC_contains_i (pRExC_state->contains_i)
 223 #define RExC_override_recoding (pRExC_state->override_recoding)
 224 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 225
 226
 227 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
 228 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 229   ((*s) == '{' && regcurly(s, FALSE)))
 230
 231 /*
 232  * Flags to be passed up and down.
 233  */
 234 #define WORST  0 /* Worst case. */
 235 #define HASWIDTH 0x01 /* Known to match non-null strings. */
 236
 237 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 238  * character.  (There needs to be a case: in the switch statement in regexec.c
 239  * for any node marked SIMPLE.)  Note that this is not the same thing as
 240  * REGNODE_SIMPLE */
 241 #define SIMPLE  0x02
 242 #define SPSTART  0x04 /* Starts with * or + */
 243 #define POSTPONED 0x08    /* (?1),(?&name), (??{...}) or similar */
 244 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
 245 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 246
 247 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 248
 249 /* whether trie related optimizations are enabled */
 250 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 251 #define TRIE_STUDY_OPT
 252 #define FULL_TRIE_STUDY
 253 #define TRIE_STCLASS
 254 #endif
 255
 256
 257
 258 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 259 #define PBITVAL(paren) (1 << ((paren) & 7))
 260 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 261 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 262 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 263
 264 #define REQUIRE_UTF8 STMT_START {                                       \
 265          if (!UTF) {                           \
 266           *flagp = RESTART_UTF8;            \
 267           return NULL;                      \
 268          }                                     \
 269       } STMT_END
 270
 271 /* This converts the named class defined in regcomp.h to its equivalent class
 272  * number defined in handy.h. */
 273 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 274 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 275
 276 #define _invlist_union_complement_2nd(a, b, output) \
 277       _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 278 #define _invlist_intersection_complement_2nd(a, b, output) \
 279     _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 280
 281 /* About scan_data_t.
 282
 283   During optimisation we recurse through the regexp program performing
 284   various inplace (keyhole style) optimisations. In addition study_chunk
 285   and scan_commit populate this data structure with information about
 286   what strings MUST appear in the pattern. We look for the longest
 287   string that must appear at a fixed location, and we look for the
 288   longest string that may appear at a floating location. So for instance
 289   in the pattern:
 290
 291  /FOO[xX]A.*B[xX]BAR/
 292
 293   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 294   strings (because they follow a .* construct). study_chunk will identify
 295   both FOO and BAR as being the longest fixed and floating strings respectively.
 296
 297   The strings can be composites, for instance
 298
 299  /(f)(o)(o)/
 300
 301   will result in a composite fixed substring 'foo'.
 302
 303   For each string some basic information is maintained:
 304
 305   - offset or min_offset
 306  This is the position the string must appear at, or not before.
 307  It also implicitly (when combined with minlenp) tells us how many
 308  characters must match before the string we are searching for.
 309  Likewise when combined with minlenp and the length of the string it
 310  tells us how many characters must appear after the string we have
 311  found.
 312
 313   - max_offset
 314  Only used for floating strings. This is the rightmost point that
 315  the string can appear at. If set to SSize_t_MAX it indicates that the
 316  string can occur infinitely far to the right.
 317
 318   - minlenp
 319  A pointer to the minimum number of characters of the pattern that the
 320  string was found inside. This is important as in the case of positive
 321  lookahead or positive lookbehind we can have multiple patterns
 322  involved. Consider
 323
 324  /(?=FOO).*F/
 325
 326  The minimum length of the pattern overall is 3, the minimum length
 327  of the lookahead part is 3, but the minimum length of the part that
 328  will actually match is 1. So 'FOO's minimum length is 3, but the
 329  minimum length for the F is 1. This is important as the minimum length
 330  is used to determine offsets in front of and behind the string being
 331  looked for.  Since strings can be composites this is the length of the
 332  pattern at the time it was committed with a scan_commit. Note that
 333  the length is calculated by study_chunk, so that the minimum lengths
 334  are not known until the full pattern has been compiled, thus the
 335  pointer to the value.
 336
 337   - lookbehind
 338
 339  In the case of lookbehind the string being searched for can be
 340  offset past the start point of the final matching string.
 341  If this value was just blithely removed from the min_offset it would
 342  invalidate some of the calculations for how many chars must match
 343  before or after (as they are derived from min_offset and minlen and
 344  the length of the string being searched for).
 345  When the final pattern is compiled and the data is moved from the
 346  scan_data_t structure into the regexp structure the information
 347  about lookbehind is factored in, with the information that would
 348  have been lost precalculated in the end_shift field for the
 349  associated string.
 350
 351   The fields pos_min and pos_delta are used to store the minimum offset
 352   and the delta to the maximum offset at the current point in the pattern.
 353
 354 */
 355
 356 typedef struct scan_data_t {
 357  /*I32 len_min;      unused */
 358  /*I32 len_delta;    unused */
 359  SSize_t pos_min;
 360  SSize_t pos_delta;
 361  SV *last_found;
 362  SSize_t last_end;     /* min value, <0 unless valid. */
 363  SSize_t last_start_min;
 364  SSize_t last_start_max;
 365  SV **longest;     /* Either &l_fixed, or &l_float. */
 366  SV *longest_fixed;      /* longest fixed string found in pattern */
 367  SSize_t offset_fixed;   /* offset where it starts */
 368  SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 369  I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 370  SV *longest_float;      /* longest floating string found in pattern */
 371  SSize_t offset_float_min; /* earliest point in string it can appear */
 372  SSize_t offset_float_max; /* latest point in string it can appear */
 373  SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 374  SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 375  I32 flags;
 376  I32 whilem_c;
 377  SSize_t *last_closep;
 378  regnode_ssc *start_class;
 379 } scan_data_t;
 380
 381 /* The below is perhaps overboard, but this allows us to save a test at the
 382  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 383  * and 'a' differ by a single bit; the same with the upper and lower case of
 384  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 385  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 386  * then inverts it to form a mask, with just a single 0, in the bit position
 387  * where the upper- and lowercase differ.  XXX There are about 40 other
 388  * instances in the Perl core where this micro-optimization could be used.
 389  * Should decide if maintenance cost is worse, before changing those
 390  *
 391  * Returns a boolean as to whether or not 'v' is either a lowercase or
 392  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 393  * compile-time constant, the generated code is better than some optimizing
 394  * compilers figure out, amounting to a mask and test.  The results are
 395  * meaningless if 'c' is not one of [A-Za-z] */
 396 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 397        (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 398
 399 /*
 400  * Forward declarations for pregcomp()'s friends.
 401  */
 402
 403 static const scan_data_t zero_scan_data =
 404   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 405
 406 #define SF_BEFORE_EOL  (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 407 #define SF_BEFORE_SEOL  0x0001
 408 #define SF_BEFORE_MEOL  0x0002
 409 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 410 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 411
 412 #define SF_FIX_SHIFT_EOL (+2)
 413 #define SF_FL_SHIFT_EOL  (+4)
 414
 415 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 416 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 417
 418 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 419 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 420 #define SF_IS_INF  0x0040
 421 #define SF_HAS_PAR  0x0080
 422 #define SF_IN_PAR  0x0100
 423 #define SF_HAS_EVAL  0x0200
 424 #define SCF_DO_SUBSTR  0x0400
 425 #define SCF_DO_STCLASS_AND 0x0800
 426 #define SCF_DO_STCLASS_OR 0x1000
 427 #define SCF_DO_STCLASS  (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 428 #define SCF_WHILEM_VISITED_POS 0x2000
 429
 430 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 431 #define SCF_SEEN_ACCEPT         0x8000
 432 #define SCF_TRIE_DOING_RESTUDY 0x10000
 433
 434 #define UTF cBOOL(RExC_utf8)
 435
 436 /* The enums for all these are ordered so things work out correctly */
 437 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 438 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 439              == REGEX_DEPENDS_CHARSET)
 440 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 441 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 442              >= REGEX_UNICODE_CHARSET)
 443 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 444            == REGEX_ASCII_RESTRICTED_CHARSET)
 445 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 446            >= REGEX_ASCII_RESTRICTED_CHARSET)
 447 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 448           == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 449
 450 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 451
 452 /* For programs that want to be strictly Unicode compatible by dying if any
 453  * attempt is made to match a non-Unicode code point against a Unicode
 454  * property.  */
 455 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 456
 457 #define OOB_NAMEDCLASS  -1
 458
 459 /* There is no code point that is out-of-bounds, so this is problematic.  But
 460  * its only current use is to initialize a variable that is always set before
 461  * looked at. */
 462 #define OOB_UNICODE  0xDEADBEEF
 463
 464 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 465 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 466
 467
 468 /* length of regex to show in messages that don't mark a position within */
 469 #define RegexLengthToShowInErrorMessages 127
 470
 471 /*
 472  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 473  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 474  * op/pragma/warn/regcomp.
 475  */
 476 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 477 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 478
 479 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 480       " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 481
 482 #define REPORT_LOCATION_ARGS(offset)            \
 483     UTF8fARG(UTF, offset, RExC_precomp), \
 484     UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 485
 486 /*
 487  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 488  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 489  * "...".
 490  */
 491 #define _FAIL(code) STMT_START {     \
 492  const char *ellipses = "";      \
 493  IV len = RExC_end - RExC_precomp;     \
 494                   \
 495  if (!SIZE_ONLY)       \
 496   SAVEFREESV(RExC_rx_sv);      \
 497  if (len > RegexLengthToShowInErrorMessages) {   \
 498   /* chop 10 shorter than the max, to ensure meaning of "..." */ \
 499   len = RegexLengthToShowInErrorMessages - 10;   \
 500   ellipses = "...";      \
 501  }         \
 502  code;                                                               \
 503 } STMT_END
 504
 505 #define FAIL(msg) _FAIL(       \
 506  Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",     \
 507    msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 508
 509 #define FAIL2(msg,arg) _FAIL(       \
 510  Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",     \
 511    arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 512
 513 /*
 514  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 515  */
 516 #define Simple_vFAIL(m) STMT_START {     \
 517  const IV offset = RExC_parse - RExC_precomp;   \
 518  Perl_croak(aTHX_ "%s" REPORT_LOCATION,    \
 519    m, REPORT_LOCATION_ARGS(offset)); \
 520 } STMT_END
 521
 522 /*
 523  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 524  */
 525 #define vFAIL(m) STMT_START {    \
 526  if (!SIZE_ONLY)     \
 527   SAVEFREESV(RExC_rx_sv);    \
 528  Simple_vFAIL(m);     \
 529 } STMT_END
 530
 531 /*
 532  * Like Simple_vFAIL(), but accepts two arguments.
 533  */
 534 #define Simple_vFAIL2(m,a1) STMT_START {   \
 535  const IV offset = RExC_parse - RExC_precomp;   \
 536  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,   \
 537      REPORT_LOCATION_ARGS(offset)); \
 538 } STMT_END
 539
 540 /*
 541  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 542  */
 543 #define vFAIL2(m,a1) STMT_START {   \
 544  if (!SIZE_ONLY)     \
 545   SAVEFREESV(RExC_rx_sv);    \
 546  Simple_vFAIL2(m, a1);    \
 547 } STMT_END
 548
 549
 550 /*
 551  * Like Simple_vFAIL(), but accepts three arguments.
 552  */
 553 #define Simple_vFAIL3(m, a1, a2) STMT_START {   \
 554  const IV offset = RExC_parse - RExC_precomp;  \
 555  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
 556    REPORT_LOCATION_ARGS(offset)); \
 557 } STMT_END
 558
 559 /*
 560  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 561  */
 562 #define vFAIL3(m,a1,a2) STMT_START {   \
 563  if (!SIZE_ONLY)     \
 564   SAVEFREESV(RExC_rx_sv);    \
 565  Simple_vFAIL3(m, a1, a2);    \
 566 } STMT_END
 567
 568 /*
 569  * Like Simple_vFAIL(), but accepts four arguments.
 570  */
 571 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {  \
 572  const IV offset = RExC_parse - RExC_precomp;  \
 573  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,  \
 574    REPORT_LOCATION_ARGS(offset)); \
 575 } STMT_END
 576
 577 #define vFAIL4(m,a1,a2,a3) STMT_START {   \
 578  if (!SIZE_ONLY)     \
 579   SAVEFREESV(RExC_rx_sv);    \
 580  Simple_vFAIL4(m, a1, a2, a3);   \
 581 } STMT_END
 582
 583 /* A specialized version of vFAIL2 that works with UTF8f */
 584 #define vFAIL2utf8f(m, a1) STMT_START { \
 585  const IV offset = RExC_parse - RExC_precomp;   \
 586  if (!SIZE_ONLY)                                \
 587   SAVEFREESV(RExC_rx_sv);                    \
 588  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 589    REPORT_LOCATION_ARGS(offset));         \
 590 } STMT_END
 591
 592
 593 /* m is not necessarily a "literal string", in this macro */
 594 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 595  const IV offset = loc - RExC_precomp;                               \
 596  Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 597    m, REPORT_LOCATION_ARGS(offset));       \
 598 } STMT_END
 599
 600 #define ckWARNreg(loc,m) STMT_START {     \
 601  const IV offset = loc - RExC_precomp;    \
 602  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 603    REPORT_LOCATION_ARGS(offset));  \
 604 } STMT_END
 605
 606 #define vWARN_dep(loc, m) STMT_START {            \
 607  const IV offset = loc - RExC_precomp;    \
 608  Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
 609    REPORT_LOCATION_ARGS(offset));         \
 610 } STMT_END
 611
 612 #define ckWARNdep(loc,m) STMT_START {            \
 613  const IV offset = loc - RExC_precomp;    \
 614  Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                 \
 615    m REPORT_LOCATION,      \
 616    REPORT_LOCATION_ARGS(offset));  \
 617 } STMT_END
 618
 619 #define ckWARNregdep(loc,m) STMT_START {    \
 620  const IV offset = loc - RExC_precomp;    \
 621  Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 622    m REPORT_LOCATION,      \
 623    REPORT_LOCATION_ARGS(offset));  \
 624 } STMT_END
 625
 626 #define ckWARN2reg_d(loc,m, a1) STMT_START {    \
 627  const IV offset = loc - RExC_precomp;    \
 628  Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),   \
 629    m REPORT_LOCATION,      \
 630    a1, REPORT_LOCATION_ARGS(offset)); \
 631 } STMT_END
 632
 633 #define ckWARN2reg(loc, m, a1) STMT_START {    \
 634  const IV offset = loc - RExC_precomp;    \
 635  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 636    a1, REPORT_LOCATION_ARGS(offset)); \
 637 } STMT_END
 638
 639 #define vWARN3(loc, m, a1, a2) STMT_START {    \
 640  const IV offset = loc - RExC_precomp;    \
 641  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 642    a1, a2, REPORT_LOCATION_ARGS(offset)); \
 643 } STMT_END
 644
 645 #define ckWARN3reg(loc, m, a1, a2) STMT_START {    \
 646  const IV offset = loc - RExC_precomp;    \
 647  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 648    a1, a2, REPORT_LOCATION_ARGS(offset)); \
 649 } STMT_END
 650
 651 #define vWARN4(loc, m, a1, a2, a3) STMT_START {    \
 652  const IV offset = loc - RExC_precomp;    \
 653  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 654    a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 655 } STMT_END
 656
 657 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {   \
 658  const IV offset = loc - RExC_precomp;    \
 659  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 660    a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 661 } STMT_END
 662
 663 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {   \
 664  const IV offset = loc - RExC_precomp;    \
 665  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 666    a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 667 } STMT_END
 668
 669
 670 /* Allow for side effects in s */
 671 #define REGC(c,s) STMT_START {   \
 672  if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 673 } STMT_END
 674
 675 /* Macros for recording node offsets.   20001227 mjd@plover.com
 676  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 677  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 678  * Element 0 holds the number n.
 679  * Position is 1 indexed.
 680  */
 681 #ifndef RE_TRACK_PATTERN_OFFSETS
 682 #define Set_Node_Offset_To_R(node,byte)
 683 #define Set_Node_Offset(node,byte)
 684 #define Set_Cur_Node_Offset
 685 #define Set_Node_Length_To_R(node,len)
 686 #define Set_Node_Length(node,len)
 687 #define Set_Node_Cur_Length(node,start)
 688 #define Node_Offset(n)
 689 #define Node_Length(n)
 690 #define Set_Node_Offset_Length(node,offset,len)
 691 #define ProgLen(ri) ri->u.proglen
 692 #define SetProgLen(ri,x) ri->u.proglen = x
 693 #else
 694 #define ProgLen(ri) ri->u.offsets[0]
 695 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 696 #define Set_Node_Offset_To_R(node,byte) STMT_START {   \
 697  if (! SIZE_ONLY) {       \
 698   MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",  \
 699      __LINE__, (int)(node), (int)(byte)));  \
 700   if((node) < 0) {      \
 701    Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 702           (int)(node));                  \
 703   } else {       \
 704    RExC_offsets[2*(node)-1] = (byte);    \
 705   }        \
 706  }         \
 707 } STMT_END
 708
 709 #define Set_Node_Offset(node,byte) \
 710  Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 711 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 712
 713 #define Set_Node_Length_To_R(node,len) STMT_START {   \
 714  if (! SIZE_ONLY) {       \
 715   MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",  \
 716     __LINE__, (int)(node), (int)(len)));   \
 717   if((node) < 0) {      \
 718    Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 719           (int)(node));                  \
 720   } else {       \
 721    RExC_offsets[2*(node)] = (len);    \
 722   }        \
 723  }         \
 724 } STMT_END
 725
 726 #define Set_Node_Length(node,len) \
 727  Set_Node_Length_To_R((node)-RExC_emit_start, len)
 728 #define Set_Node_Cur_Length(node, start)                \
 729  Set_Node_Length(node, RExC_parse - start)
 730
 731 /* Get offsets and lengths */
 732 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 733 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 734
 735 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
 736  Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
 737  Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
 738 } STMT_END
 739 #endif
 740
 741 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 742 #define EXPERIMENTAL_INPLACESCAN
 743 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 744
 745 #define DEBUG_RExC_seen() \
 746   DEBUG_OPTIMISE_MORE_r({                                             \
 747    PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 748                    \
 749    if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 750     PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 751                    \
 752    if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 753     PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 754                    \
 755    if (RExC_seen & REG_GPOS_SEEN)                                  \
 756     PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 757                    \
 758    if (RExC_seen & REG_CANY_SEEN)                                  \
 759     PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 760                    \
 761    if (RExC_seen & REG_RECURSE_SEEN)                               \
 762     PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 763                    \
 764    if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 765     PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 766                    \
 767    if (RExC_seen & REG_VERBARG_SEEN)                               \
 768     PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 769                    \
 770    if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 771     PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 772                    \
 773    if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 774     PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 775                    \
 776    if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 777     PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 778                    \
 779    if (RExC_seen & REG_GOSTART_SEEN)                               \
 780     PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 781                    \
 782    if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 783     PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 784                    \
 785    PerlIO_printf(Perl_debug_log,"\n");                             \
 786   });
 787
 788 #define DEBUG_STUDYDATA(str,data,depth)                              \
 789 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 790  PerlIO_printf(Perl_debug_log,                                    \
 791   "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 792   " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 793   (int)(depth)*2, "",                                          \
 794   (IV)((data)->pos_min),                                       \
 795   (IV)((data)->pos_delta),                                     \
 796   (UV)((data)->flags),                                         \
 797   (IV)((data)->whilem_c),                                      \
 798   (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 799   is_inf ? "INF " : ""                                         \
 800  );                                                               \
 801  if ((data)->last_found)                                          \
 802   PerlIO_printf(Perl_debug_log,                                \
 803    "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 804    " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 805    SvPVX_const((data)->last_found),                         \
 806    (IV)((data)->last_end),                                  \
 807    (IV)((data)->last_start_min),                            \
 808    (IV)((data)->last_start_max),                            \
 809    ((data)->longest &&                                      \
 810    (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 811    SvPVX_const((data)->longest_fixed),                      \
 812    (IV)((data)->offset_fixed),                              \
 813    ((data)->longest &&                                      \
 814    (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 815    SvPVX_const((data)->longest_float),                      \
 816    (IV)((data)->offset_float_min),                          \
 817    (IV)((data)->offset_float_max)                           \
 818   );                                                           \
 819  PerlIO_printf(Perl_debug_log,"\n");                              \
 820 });
 821
 822 /* Mark that we cannot extend a found fixed substring at this point.
 823    Update the longest found anchored substring and the longest found
 824    floating substrings if needed. */
 825
 826 STATIC void
 827 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 828      SSize_t *minlenp, int is_inf)
 829 {
 830  const STRLEN l = CHR_SVLEN(data->last_found);
 831  const STRLEN old_l = CHR_SVLEN(*data->longest);
 832  GET_RE_DEBUG_FLAGS_DECL;
 833
 834  PERL_ARGS_ASSERT_SCAN_COMMIT;
 835
 836  if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 837   SvSetMagicSV(*data->longest, data->last_found);
 838   if (*data->longest == data->longest_fixed) {
 839    data->offset_fixed = l ? data->last_start_min : data->pos_min;
 840    if (data->flags & SF_BEFORE_EOL)
 841     data->flags
 842      |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 843    else
 844     data->flags &= ~SF_FIX_BEFORE_EOL;
 845    data->minlen_fixed=minlenp;
 846    data->lookbehind_fixed=0;
 847   }
 848   else { /* *data->longest == data->longest_float */
 849    data->offset_float_min = l ? data->last_start_min : data->pos_min;
 850    data->offset_float_max = (l
 851          ? data->last_start_max
 852          : (data->pos_delta == SSize_t_MAX
 853           ? SSize_t_MAX
 854           : data->pos_min + data->pos_delta));
 855    if (is_inf
 856     || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 857     data->offset_float_max = SSize_t_MAX;
 858    if (data->flags & SF_BEFORE_EOL)
 859     data->flags
 860      |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 861    else
 862     data->flags &= ~SF_FL_BEFORE_EOL;
 863    data->minlen_float=minlenp;
 864    data->lookbehind_float=0;
 865   }
 866  }
 867  SvCUR_set(data->last_found, 0);
 868  {
 869   SV * const sv = data->last_found;
 870   if (SvUTF8(sv) && SvMAGICAL(sv)) {
 871    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 872    if (mg)
 873     mg->mg_len = 0;
 874   }
 875  }
 876  data->last_end = -1;
 877  data->flags &= ~SF_BEFORE_EOL;
 878  DEBUG_STUDYDATA("commit: ",data,0);
 879 }
 880
 881 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 882  * list that describes which code points it matches */
 883
 884 STATIC void
 885 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 886 {
 887  /* Set the SSC 'ssc' to match an empty string or any code point */
 888
 889  PERL_ARGS_ASSERT_SSC_ANYTHING;
 890
 891  assert(is_ANYOF_SYNTHETIC(ssc));
 892
 893  ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 894  _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 895  ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING;    /* Plus match empty string */
 896 }
 897
 898 STATIC int
 899 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
 900 {
 901  /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 902  * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 903  * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 904  * in any way, so there's no point in using it */
 905
 906  UV start, end;
 907  bool ret;
 908
 909  PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 910
 911  assert(is_ANYOF_SYNTHETIC(ssc));
 912
 913  if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
 914   return FALSE;
 915  }
 916
 917  /* See if the list consists solely of the range 0 - Infinity */
 918  invlist_iterinit(ssc->invlist);
 919  ret = invlist_iternext(ssc->invlist, &start, &end)
 920   && start == 0
 921   && end == UV_MAX;
 922
 923  invlist_iterfinish(ssc->invlist);
 924
 925  if (ret) {
 926   return TRUE;
 927  }
 928
 929  /* If e.g., both \w and \W are set, matches everything */
 930  if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 931   int i;
 932   for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 933    if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 934     return TRUE;
 935    }
 936   }
 937  }
 938
 939  return FALSE;
 940 }
 941
 942 STATIC void
 943 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 944 {
 945  /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 946  * string, any code point, or any posix class under locale */
 947
 948  PERL_ARGS_ASSERT_SSC_INIT;
 949
 950  Zero(ssc, 1, regnode_ssc);
 951  set_ANYOF_SYNTHETIC(ssc);
 952  ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
 953  ssc_anything(ssc);
 954
 955  /* If any portion of the regex is to operate under locale rules,
 956  * initialization includes it.  The reason this isn't done for all regexes
 957  * is that the optimizer was written under the assumption that locale was
 958  * all-or-nothing.  Given the complexity and lack of documentation in the
 959  * optimizer, and that there are inadequate test cases for locale, many
 960  * parts of it may not work properly, it is safest to avoid locale unless
 961  * necessary. */
 962  if (RExC_contains_locale) {
 963   ANYOF_POSIXL_SETALL(ssc);
 964  }
 965  else {
 966   ANYOF_POSIXL_ZERO(ssc);
 967  }
 968 }
 969
 970 STATIC int
 971 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
 972        const regnode_ssc *ssc)
 973 {
 974  /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
 975  * to the list of code points matched, and locale posix classes; hence does
 976  * not check its flags) */
 977
 978  UV start, end;
 979  bool ret;
 980
 981  PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
 982
 983  assert(is_ANYOF_SYNTHETIC(ssc));
 984
 985  invlist_iterinit(ssc->invlist);
 986  ret = invlist_iternext(ssc->invlist, &start, &end)
 987   && start == 0
 988   && end == UV_MAX;
 989
 990  invlist_iterfinish(ssc->invlist);
 991
 992  if (! ret) {
 993   return FALSE;
 994  }
 995
 996  if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
 997   return FALSE;
 998  }
 999
1000  return TRUE;
1001 }
1002
1003 STATIC SV*
1004 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1005        const regnode_charclass* const node)
1006 {
1007  /* Returns a mortal inversion list defining which code points are matched
1008  * by 'node', which is of type ANYOF.  Handles complementing the result if
1009  * appropriate.  If some code points aren't knowable at this time, the
1010  * returned list must, and will, contain every code point that is a
1011  * possibility. */
1012
1013  SV* invlist = sv_2mortal(_new_invlist(0));
1014  SV* only_utf8_locale_invlist = NULL;
1015  unsigned int i;
1016  const U32 n = ARG(node);
1017  bool new_node_has_latin1 = FALSE;
1018
1019  PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1020
1021  /* Look at the data structure created by S_set_ANYOF_arg() */
1022  if (n != ANYOF_NONBITMAP_EMPTY) {
1023   SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1024   AV * const av = MUTABLE_AV(SvRV(rv));
1025   SV **const ary = AvARRAY(av);
1026   assert(RExC_rxi->data->what[n] == 's');
1027
1028   if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1029    invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1030   }
1031   else if (ary[0] && ary[0] != &PL_sv_undef) {
1032
1033    /* Here, no compile-time swash, and there are things that won't be
1034    * known until runtime -- we have to assume it could be anything */
1035    return _add_range_to_invlist(invlist, 0, UV_MAX);
1036   }
1037   else if (ary[3] && ary[3] != &PL_sv_undef) {
1038
1039    /* Here no compile-time swash, and no run-time only data.  Use the
1040    * node's inversion list */
1041    invlist = sv_2mortal(invlist_clone(ary[3]));
1042   }
1043
1044   /* Get the code points valid only under UTF-8 locales */
1045   if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1046    && ary[2] && ary[2] != &PL_sv_undef)
1047   {
1048    only_utf8_locale_invlist = ary[2];
1049   }
1050  }
1051
1052  /* An ANYOF node contains a bitmap for the first 256 code points, and an
1053  * inversion list for the others, but if there are code points that should
1054  * match only conditionally on the target string being UTF-8, those are
1055  * placed in the inversion list, and not the bitmap.  Since there are
1056  * circumstances under which they could match, they are included in the
1057  * SSC.  But if the ANYOF node is to be inverted, we have to exclude them
1058  * here, so that when we invert below, the end result actually does include
1059  * them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We have to do this here
1060  * before we add the unconditionally matched code points */
1061  if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1062   _invlist_intersection_complement_2nd(invlist,
1063            PL_UpperLatin1,
1064            &invlist);
1065  }
1066
1067  /* Add in the points from the bit map */
1068  for (i = 0; i < 256; i++) {
1069   if (ANYOF_BITMAP_TEST(node, i)) {
1070    invlist = add_cp_to_invlist(invlist, i);
1071    new_node_has_latin1 = TRUE;
1072   }
1073  }
1074
1075  /* If this can match all upper Latin1 code points, have to add them
1076  * as well */
1077  if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
1078   _invlist_union(invlist, PL_UpperLatin1, &invlist);
1079  }
1080
1081  /* Similarly for these */
1082  if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1083   invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1084  }
1085
1086  if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1087   _invlist_invert(invlist);
1088  }
1089  else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1090
1091   /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1092   * locale.  We can skip this if there are no 0-255 at all. */
1093   _invlist_union(invlist, PL_Latin1, &invlist);
1094  }
1095
1096  /* Similarly add the UTF-8 locale possible matches.  These have to be
1097  * deferred until after the non-UTF-8 locale ones are taken care of just
1098  * above, or it leads to wrong results under ANYOF_INVERT */
1099  if (only_utf8_locale_invlist) {
1100   _invlist_union_maybe_complement_2nd(invlist,
1101            only_utf8_locale_invlist,
1102            ANYOF_FLAGS(node) & ANYOF_INVERT,
1103            &invlist);
1104  }
1105
1106  return invlist;
1107 }
1108
1109 /* These two functions currently do the exact same thing */
1110 #define ssc_init_zero  ssc_init
1111
1112 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1113 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1114
1115 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1116  * should not be inverted.  'and_with->flags & ANYOF_POSIXL' should be 0 if
1117  * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1118
1119 STATIC void
1120 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1121     const regnode_charclass *and_with)
1122 {
1123  /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1124  * another SSC or a regular ANYOF class.  Can create false positives. */
1125
1126  SV* anded_cp_list;
1127  U8  anded_flags;
1128
1129  PERL_ARGS_ASSERT_SSC_AND;
1130
1131  assert(is_ANYOF_SYNTHETIC(ssc));
1132
1133  /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1134  * the code point inversion list and just the relevant flags */
1135  if (is_ANYOF_SYNTHETIC(and_with)) {
1136   anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1137   anded_flags = ANYOF_FLAGS(and_with);
1138
1139   /* XXX This is a kludge around what appears to be deficiencies in the
1140   * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1141   * there are paths through the optimizer where it doesn't get weeded
1142   * out when it should.  And if we don't make some extra provision for
1143   * it like the code just below, it doesn't get added when it should.
1144   * This solution is to add it only when AND'ing, which is here, and
1145   * only when what is being AND'ed is the pristine, original node
1146   * matching anything.  Thus it is like adding it to ssc_anything() but
1147   * only when the result is to be AND'ed.  Probably the same solution
1148   * could be adopted for the same problem we have with /l matching,
1149   * which is solved differently in S_ssc_init(), and that would lead to
1150   * fewer false positives than that solution has.  But if this solution
1151   * creates bugs, the consequences are only that a warning isn't raised
1152   * that should be; while the consequences for having /l bugs is
1153   * incorrect matches */
1154   if (ssc_is_anything((regnode_ssc *)and_with)) {
1155    anded_flags |= ANYOF_WARN_SUPER;
1156   }
1157  }
1158  else {
1159   anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1160   anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1161  }
1162
1163  ANYOF_FLAGS(ssc) &= anded_flags;
1164
1165  /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1166  * C2 is the list of code points in 'and-with'; P2, its posix classes.
1167  * 'and_with' may be inverted.  When not inverted, we have the situation of
1168  * computing:
1169  *  (C1 | P1) & (C2 | P2)
1170  *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1171  *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1172  *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1173  *                    <=  ((C1 & C2) | P1 | P2)
1174  * Alternatively, the last few steps could be:
1175  *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1176  *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1177  *                    <=  (C1 | C2 | (P1 & P2))
1178  * We favor the second approach if either P1 or P2 is non-empty.  This is
1179  * because these components are a barrier to doing optimizations, as what
1180  * they match cannot be known until the moment of matching as they are
1181  * dependent on the current locale, 'AND"ing them likely will reduce or
1182  * eliminate them.
1183  * But we can do better if we know that C1,P1 are in their initial state (a
1184  * frequent occurrence), each matching everything:
1185  *  (<everything>) & (C2 | P2) =  C2 | P2
1186  * Similarly, if C2,P2 are in their initial state (again a frequent
1187  * occurrence), the result is a no-op
1188  *  (C1 | P1) & (<everything>) =  C1 | P1
1189  *
1190  * Inverted, we have
1191  *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1192  *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1193  *                         <=  (C1 & ~C2) | (P1 & ~P2)
1194  * */
1195
1196  if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1197   && ! is_ANYOF_SYNTHETIC(and_with))
1198  {
1199   unsigned int i;
1200
1201   ssc_intersection(ssc,
1202       anded_cp_list,
1203       FALSE /* Has already been inverted */
1204       );
1205
1206   /* If either P1 or P2 is empty, the intersection will be also; can skip
1207   * the loop */
1208   if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1209    ANYOF_POSIXL_ZERO(ssc);
1210   }
1211   else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1212
1213    /* Note that the Posix class component P from 'and_with' actually
1214    * looks like:
1215    *      P = Pa | Pb | ... | Pn
1216    * where each component is one posix class, such as in [\w\s].
1217    * Thus
1218    *      ~P = ~(Pa | Pb | ... | Pn)
1219    *         = ~Pa & ~Pb & ... & ~Pn
1220    *        <= ~Pa | ~Pb | ... | ~Pn
1221    * The last is something we can easily calculate, but unfortunately
1222    * is likely to have many false positives.  We could do better
1223    * in some (but certainly not all) instances if two classes in
1224    * P have known relationships.  For example
1225    *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1226    * So
1227    *      :lower: & :print: = :lower:
1228    * And similarly for classes that must be disjoint.  For example,
1229    * since \s and \w can have no elements in common based on rules in
1230    * the POSIX standard,
1231    *      \w & ^\S = nothing
1232    * Unfortunately, some vendor locales do not meet the Posix
1233    * standard, in particular almost everything by Microsoft.
1234    * The loop below just changes e.g., \w into \W and vice versa */
1235
1236    regnode_charclass_posixl temp;
1237    int add = 1;    /* To calculate the index of the complement */
1238
1239    ANYOF_POSIXL_ZERO(&temp);
1240    for (i = 0; i < ANYOF_MAX; i++) {
1241     assert(i % 2 != 0
1242      || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1243      || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1244
1245     if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1246      ANYOF_POSIXL_SET(&temp, i + add);
1247     }
1248     add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1249    }
1250    ANYOF_POSIXL_AND(&temp, ssc);
1251
1252   } /* else ssc already has no posixes */
1253  } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1254   in its initial state */
1255  else if (! is_ANYOF_SYNTHETIC(and_with)
1256    || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1257  {
1258   /* But if 'ssc' is in its initial state, the result is just 'and_with';
1259   * copy it over 'ssc' */
1260   if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1261    if (is_ANYOF_SYNTHETIC(and_with)) {
1262     StructCopy(and_with, ssc, regnode_ssc);
1263    }
1264    else {
1265     ssc->invlist = anded_cp_list;
1266     ANYOF_POSIXL_ZERO(ssc);
1267     if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1268      ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1269     }
1270    }
1271   }
1272   else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1273     || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1274   {
1275    /* One or the other of P1, P2 is non-empty. */
1276    if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1277     ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1278    }
1279    ssc_union(ssc, anded_cp_list, FALSE);
1280   }
1281   else { /* P1 = P2 = empty */
1282    ssc_intersection(ssc, anded_cp_list, FALSE);
1283   }
1284  }
1285 }
1286
1287 STATIC void
1288 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1289    const regnode_charclass *or_with)
1290 {
1291  /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1292  * another SSC or a regular ANYOF class.  Can create false positives if
1293  * 'or_with' is to be inverted. */
1294
1295  SV* ored_cp_list;
1296  U8 ored_flags;
1297
1298  PERL_ARGS_ASSERT_SSC_OR;
1299
1300  assert(is_ANYOF_SYNTHETIC(ssc));
1301
1302  /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1303  * the code point inversion list and just the relevant flags */
1304  if (is_ANYOF_SYNTHETIC(or_with)) {
1305   ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1306   ored_flags = ANYOF_FLAGS(or_with);
1307  }
1308  else {
1309   ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1310   ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1311  }
1312
1313  ANYOF_FLAGS(ssc) |= ored_flags;
1314
1315  /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1316  * C2 is the list of code points in 'or-with'; P2, its posix classes.
1317  * 'or_with' may be inverted.  When not inverted, we have the simple
1318  * situation of computing:
1319  *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1320  * If P1|P2 yields a situation with both a class and its complement are
1321  * set, like having both \w and \W, this matches all code points, and we
1322  * can delete these from the P component of the ssc going forward.  XXX We
1323  * might be able to delete all the P components, but I (khw) am not certain
1324  * about this, and it is better to be safe.
1325  *
1326  * Inverted, we have
1327  *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1328  *                         <=  (C1 | P1) | ~C2
1329  *                         <=  (C1 | ~C2) | P1
1330  * (which results in actually simpler code than the non-inverted case)
1331  * */
1332
1333  if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1334   && ! is_ANYOF_SYNTHETIC(or_with))
1335  {
1336   /* We ignore P2, leaving P1 going forward */
1337  }   /* else  Not inverted */
1338  else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
1339   ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1340   if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1341    unsigned int i;
1342    for (i = 0; i < ANYOF_MAX; i += 2) {
1343     if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1344     {
1345      ssc_match_all_cp(ssc);
1346      ANYOF_POSIXL_CLEAR(ssc, i);
1347      ANYOF_POSIXL_CLEAR(ssc, i+1);
1348     }
1349    }
1350   }
1351  }
1352
1353  ssc_union(ssc,
1354    ored_cp_list,
1355    FALSE /* Already has been inverted */
1356    );
1357 }
1358
1359 PERL_STATIC_INLINE void
1360 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1361 {
1362  PERL_ARGS_ASSERT_SSC_UNION;
1363
1364  assert(is_ANYOF_SYNTHETIC(ssc));
1365
1366  _invlist_union_maybe_complement_2nd(ssc->invlist,
1367           invlist,
1368           invert2nd,
1369           &ssc->invlist);
1370 }
1371
1372 PERL_STATIC_INLINE void
1373 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1374       SV* const invlist,
1375       const bool invert2nd)
1376 {
1377  PERL_ARGS_ASSERT_SSC_INTERSECTION;
1378
1379  assert(is_ANYOF_SYNTHETIC(ssc));
1380
1381  _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1382            invlist,
1383            invert2nd,
1384            &ssc->invlist);
1385 }
1386
1387 PERL_STATIC_INLINE void
1388 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1389 {
1390  PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1391
1392  assert(is_ANYOF_SYNTHETIC(ssc));
1393
1394  ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1395 }
1396
1397 PERL_STATIC_INLINE void
1398 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1399 {
1400  /* AND just the single code point 'cp' into the SSC 'ssc' */
1401
1402  SV* cp_list = _new_invlist(2);
1403
1404  PERL_ARGS_ASSERT_SSC_CP_AND;
1405
1406  assert(is_ANYOF_SYNTHETIC(ssc));
1407
1408  cp_list = add_cp_to_invlist(cp_list, cp);
1409  ssc_intersection(ssc, cp_list,
1410      FALSE /* Not inverted */
1411      );
1412  SvREFCNT_dec_NN(cp_list);
1413 }
1414
1415 PERL_STATIC_INLINE void
1416 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1417 {
1418  /* Set the SSC 'ssc' to not match any locale things */
1419
1420  PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1421
1422  assert(is_ANYOF_SYNTHETIC(ssc));
1423
1424  ANYOF_POSIXL_ZERO(ssc);
1425  ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1426 }
1427
1428 STATIC void
1429 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1430 {
1431  /* The inversion list in the SSC is marked mortal; now we need a more
1432  * permanent copy, which is stored the same way that is done in a regular
1433  * ANYOF node, with the first 256 code points in a bit map */
1434
1435  SV* invlist = invlist_clone(ssc->invlist);
1436
1437  PERL_ARGS_ASSERT_SSC_FINALIZE;
1438
1439  assert(is_ANYOF_SYNTHETIC(ssc));
1440
1441  /* The code in this file assumes that all but these flags aren't relevant
1442  * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1443  * time we reach here */
1444  assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1445
1446  populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1447
1448  set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1449         NULL, NULL, NULL, FALSE);
1450
1451  /* Make sure is clone-safe */
1452  ssc->invlist = NULL;
1453
1454  if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1455   ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
1456  }
1457
1458  assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1459 }
1460
1461 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1462 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1463 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1464 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1465        ? (TRIE_LIST_CUR( idx ) - 1)           \
1466        : 0 )
1467
1468
1469 #ifdef DEBUGGING
1470 /*
1471    dump_trie(trie,widecharmap,revcharmap)
1472    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1473    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1474
1475    These routines dump out a trie in a somewhat readable format.
1476    The _interim_ variants are used for debugging the interim
1477    tables that are used to generate the final compressed
1478    representation which is what dump_trie expects.
1479
1480    Part of the reason for their existence is to provide a form
1481    of documentation as to how the different representations function.
1482
1483 */
1484
1485 /*
1486   Dumps the final compressed table form of the trie to Perl_debug_log.
1487   Used for debugging make_trie().
1488 */
1489
1490 STATIC void
1491 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1492    AV *revcharmap, U32 depth)
1493 {
1494  U32 state;
1495  SV *sv=sv_newmortal();
1496  int colwidth= widecharmap ? 6 : 4;
1497  U16 word;
1498  GET_RE_DEBUG_FLAGS_DECL;
1499
1500  PERL_ARGS_ASSERT_DUMP_TRIE;
1501
1502  PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1503   (int)depth * 2 + 2,"",
1504   "Match","Base","Ofs" );
1505
1506  for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1507   SV ** const tmp = av_fetch( revcharmap, state, 0);
1508   if ( tmp ) {
1509    PerlIO_printf( Perl_debug_log, "%*s",
1510     colwidth,
1511     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1512        PL_colors[0], PL_colors[1],
1513        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1514        PERL_PV_ESCAPE_FIRSTCHAR
1515     )
1516    );
1517   }
1518  }
1519  PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1520   (int)depth * 2 + 2,"");
1521
1522  for( state = 0 ; state < trie->uniquecharcount ; state++ )
1523   PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1524  PerlIO_printf( Perl_debug_log, "\n");
1525
1526  for( state = 1 ; state < trie->statecount ; state++ ) {
1527   const U32 base = trie->states[ state ].trans.base;
1528
1529   PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1530          (int)depth * 2 + 2,"", (UV)state);
1531
1532   if ( trie->states[ state ].wordnum ) {
1533    PerlIO_printf( Perl_debug_log, " W%4X",
1534           trie->states[ state ].wordnum );
1535   } else {
1536    PerlIO_printf( Perl_debug_log, "%6s", "" );
1537   }
1538
1539   PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1540
1541   if ( base ) {
1542    U32 ofs = 0;
1543
1544    while( ( base + ofs  < trie->uniquecharcount ) ||
1545     ( base + ofs - trie->uniquecharcount < trie->lasttrans
1546      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1547                  != state))
1548      ofs++;
1549
1550    PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1551
1552    for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1553     if ( ( base + ofs >= trie->uniquecharcount )
1554       && ( base + ofs - trie->uniquecharcount
1555               < trie->lasttrans )
1556       && trie->trans[ base + ofs
1557          - trie->uniquecharcount ].check == state )
1558     {
1559     PerlIO_printf( Perl_debug_log, "%*"UVXf,
1560      colwidth,
1561      (UV)trie->trans[ base + ofs
1562            - trie->uniquecharcount ].next );
1563     } else {
1564      PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1565     }
1566    }
1567
1568    PerlIO_printf( Perl_debug_log, "]");
1569
1570   }
1571   PerlIO_printf( Perl_debug_log, "\n" );
1572  }
1573  PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1574         (int)depth*2, "");
1575  for (word=1; word <= trie->wordcount; word++) {
1576   PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1577    (int)word, (int)(trie->wordinfo[word].prev),
1578    (int)(trie->wordinfo[word].len));
1579  }
1580  PerlIO_printf(Perl_debug_log, "\n" );
1581 }
1582 /*
1583   Dumps a fully constructed but uncompressed trie in list form.
1584   List tries normally only are used for construction when the number of
1585   possible chars (trie->uniquecharcount) is very high.
1586   Used for debugging make_trie().
1587 */
1588 STATIC void
1589 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1590       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1591       U32 depth)
1592 {
1593  U32 state;
1594  SV *sv=sv_newmortal();
1595  int colwidth= widecharmap ? 6 : 4;
1596  GET_RE_DEBUG_FLAGS_DECL;
1597
1598  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1599
1600  /* print out the table precompression.  */
1601  PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1602   (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1603   "------:-----+-----------------\n" );
1604
1605  for( state=1 ; state < next_alloc ; state ++ ) {
1606   U16 charid;
1607
1608   PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1609    (int)depth * 2 + 2,"", (UV)state  );
1610   if ( ! trie->states[ state ].wordnum ) {
1611    PerlIO_printf( Perl_debug_log, "%5s| ","");
1612   } else {
1613    PerlIO_printf( Perl_debug_log, "W%4x| ",
1614     trie->states[ state ].wordnum
1615    );
1616   }
1617   for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1618    SV ** const tmp = av_fetch( revcharmap,
1619           TRIE_LIST_ITEM(state,charid).forid, 0);
1620    if ( tmp ) {
1621     PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1622      colwidth,
1623      pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1624        colwidth,
1625        PL_colors[0], PL_colors[1],
1626        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1627        | PERL_PV_ESCAPE_FIRSTCHAR
1628      ) ,
1629      TRIE_LIST_ITEM(state,charid).forid,
1630      (UV)TRIE_LIST_ITEM(state,charid).newstate
1631     );
1632     if (!(charid % 10))
1633      PerlIO_printf(Perl_debug_log, "\n%*s| ",
1634       (int)((depth * 2) + 14), "");
1635    }
1636   }
1637   PerlIO_printf( Perl_debug_log, "\n");
1638  }
1639 }
1640
1641 /*
1642   Dumps a fully constructed but uncompressed trie in table form.
1643   This is the normal DFA style state transition table, with a few
1644   twists to facilitate compression later.
1645   Used for debugging make_trie().
1646 */
1647 STATIC void
1648 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1649       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1650       U32 depth)
1651 {
1652  U32 state;
1653  U16 charid;
1654  SV *sv=sv_newmortal();
1655  int colwidth= widecharmap ? 6 : 4;
1656  GET_RE_DEBUG_FLAGS_DECL;
1657
1658  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1659
1660  /*
1661  print out the table precompression so that we can do a visual check
1662  that they are identical.
1663  */
1664
1665  PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1666
1667  for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1668   SV ** const tmp = av_fetch( revcharmap, charid, 0);
1669   if ( tmp ) {
1670    PerlIO_printf( Perl_debug_log, "%*s",
1671     colwidth,
1672     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1673        PL_colors[0], PL_colors[1],
1674        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1675        PERL_PV_ESCAPE_FIRSTCHAR
1676     )
1677    );
1678   }
1679  }
1680
1681  PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1682
1683  for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1684   PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1685  }
1686
1687  PerlIO_printf( Perl_debug_log, "\n" );
1688
1689  for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1690
1691   PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1692    (int)depth * 2 + 2,"",
1693    (UV)TRIE_NODENUM( state ) );
1694
1695   for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1696    UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1697    if (v)
1698     PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1699    else
1700     PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1701   }
1702   if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1703    PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1704            (UV)trie->trans[ state ].check );
1705   } else {
1706    PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1707            (UV)trie->trans[ state ].check,
1708    trie->states[ TRIE_NODENUM( state ) ].wordnum );
1709   }
1710  }
1711 }
1712
1713 #endif
1714
1715
1716 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1717   startbranch: the first branch in the whole branch sequence
1718   first      : start branch of sequence of branch-exact nodes.
1719    May be the same as startbranch
1720   last       : Thing following the last branch.
1721    May be the same as tail.
1722   tail       : item following the branch sequence
1723   count      : words in the sequence
1724   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1725   depth      : indent depth
1726
1727 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1728
1729 A trie is an N'ary tree where the branches are determined by digital
1730 decomposition of the key. IE, at the root node you look up the 1st character and
1731 follow that branch repeat until you find the end of the branches. Nodes can be
1732 marked as "accepting" meaning they represent a complete word. Eg:
1733
1734   /he|she|his|hers/
1735
1736 would convert into the following structure. Numbers represent states, letters
1737 following numbers represent valid transitions on the letter from that state, if
1738 the number is in square brackets it represents an accepting state, otherwise it
1739 will be in parenthesis.
1740
1741  +-h->+-e->[3]-+-r->(8)-+-s->[9]
1742  |    |
1743  |   (2)
1744  |    |
1745  (1)   +-i->(6)-+-s->[7]
1746  |
1747  +-s->(3)-+-h->(4)-+-e->[5]
1748
1749  Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1750
1751 This shows that when matching against the string 'hers' we will begin at state 1
1752 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1753 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1754 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1755 single traverse. We store a mapping from accepting to state to which word was
1756 matched, and then when we have multiple possibilities we try to complete the
1757 rest of the regex in the order in which they occured in the alternation.
1758
1759 The only prior NFA like behaviour that would be changed by the TRIE support is
1760 the silent ignoring of duplicate alternations which are of the form:
1761
1762  / (DUPE|DUPE) X? (?{ ... }) Y /x
1763
1764 Thus EVAL blocks following a trie may be called a different number of times with
1765 and without the optimisation. With the optimisations dupes will be silently
1766 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1767 the following demonstrates:
1768
1769  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1770
1771 which prints out 'word' three times, but
1772
1773  'words'=~/(word|word|word)(?{ print $1 })S/
1774
1775 which doesnt print it out at all. This is due to other optimisations kicking in.
1776
1777 Example of what happens on a structural level:
1778
1779 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1780
1781    1: CURLYM[1] {1,32767}(18)
1782    5:   BRANCH(8)
1783    6:     EXACT <ac>(16)
1784    8:   BRANCH(11)
1785    9:     EXACT <ad>(16)
1786   11:   BRANCH(14)
1787   12:     EXACT <ab>(16)
1788   16:   SUCCEED(0)
1789   17:   NOTHING(18)
1790   18: END(0)
1791
1792 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1793 and should turn into:
1794
1795    1: CURLYM[1] {1,32767}(18)
1796    5:   TRIE(16)
1797   [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1798   <ac>
1799   <ad>
1800   <ab>
1801   16:   SUCCEED(0)
1802   17:   NOTHING(18)
1803   18: END(0)
1804
1805 Cases where tail != last would be like /(?foo|bar)baz/:
1806
1807    1: BRANCH(4)
1808    2:   EXACT <foo>(8)
1809    4: BRANCH(7)
1810    5:   EXACT <bar>(8)
1811    7: TAIL(8)
1812    8: EXACT <baz>(10)
1813   10: END(0)
1814
1815 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1816 and would end up looking like:
1817
1818  1: TRIE(8)
1819  [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1820   <foo>
1821   <bar>
1822    7: TAIL(8)
1823    8: EXACT <baz>(10)
1824   10: END(0)
1825
1826  d = uvchr_to_utf8_flags(d, uv, 0);
1827
1828 is the recommended Unicode-aware way of saying
1829
1830  *(d++) = uv;
1831 */
1832
1833 #define TRIE_STORE_REVCHAR(val)                                            \
1834  STMT_START {                                                           \
1835   if (UTF) {          \
1836    SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1837    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);    \
1838    unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1839    SvCUR_set(zlopp, kapow - flrbbbbb);       \
1840    SvPOK_on(zlopp);         \
1841    SvUTF8_on(zlopp);         \
1842    av_push(revcharmap, zlopp);        \
1843   } else {          \
1844    char ooooff = (char)val;                                           \
1845    av_push(revcharmap, newSVpvn(&ooooff, 1));      \
1846   }           \
1847   } STMT_END
1848
1849 /* This gets the next character from the input, folding it if not already
1850  * folded. */
1851 #define TRIE_READ_CHAR STMT_START {                                           \
1852  wordlen++;                                                                \
1853  if ( UTF ) {                                                              \
1854   /* if it is UTF then it is either already folded, or does not need    \
1855   * folding */                                                         \
1856   uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
1857  }                                                                         \
1858  else if (folder == PL_fold_latin1) {                                      \
1859   /* This folder implies Unicode rules, which in the range expressible  \
1860   *  by not UTF is the lower case, with the two exceptions, one of     \
1861   *  which should have been taken care of before calling this */       \
1862   assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
1863   uvc = toLOWER_L1(*uc);                                                \
1864   if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
1865   len = 1;                                                              \
1866  } else {                                                                  \
1867   /* raw data, will be folded later if needed */                        \
1868   uvc = (U32)*uc;                                                       \
1869   len = 1;                                                              \
1870  }                                                                         \
1871 } STMT_END
1872
1873
1874
1875 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1876  if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1877   U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1878   Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1879  }                                                           \
1880  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1881  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1882  TRIE_LIST_CUR( state )++;                                   \
1883 } STMT_END
1884
1885 #define TRIE_LIST_NEW(state) STMT_START {                       \
1886  Newxz( trie->states[ state ].trans.list,               \
1887   4, reg_trie_trans_le );                                 \
1888  TRIE_LIST_CUR( state ) = 1;                                \
1889  TRIE_LIST_LEN( state ) = 4;                                \
1890 } STMT_END
1891
1892 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1893  U16 dupe= trie->states[ state ].wordnum;                    \
1894  regnode * const noper_next = regnext( noper );              \
1895                 \
1896  DEBUG_r({                                                   \
1897   /* store the word for dumping */                        \
1898   SV* tmp;                                                \
1899   if (OP(noper) != NOTHING)                               \
1900    tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1901   else                                                    \
1902    tmp = newSVpvn_utf8( "", 0, UTF );   \
1903   av_push( trie_words, tmp );                             \
1904  });                                                         \
1905                 \
1906  curword++;                                                  \
1907  trie->wordinfo[curword].prev   = 0;                         \
1908  trie->wordinfo[curword].len    = wordlen;                   \
1909  trie->wordinfo[curword].accept = state;                     \
1910                 \
1911  if ( noper_next < tail ) {                                  \
1912   if (!trie->jump)                                        \
1913    trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1914             sizeof(U16) ); \
1915   trie->jump[curword] = (U16)(noper_next - convert);      \
1916   if (!jumper)                                            \
1917    jumper = noper_next;                                \
1918   if (!nextbranch)                                        \
1919    nextbranch= regnext(cur);                           \
1920  }                                                           \
1921                 \
1922  if ( dupe ) {                                               \
1923   /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1924   /* chain, so that when the bits of chain are later    */\
1925   /* linked together, the dups appear in the chain      */\
1926   trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1927   trie->wordinfo[dupe].prev = curword;                    \
1928  } else {                                                    \
1929   /* we haven't inserted this word yet.                */ \
1930   trie->states[ state ].wordnum = curword;                \
1931  }                                                           \
1932 } STMT_END
1933
1934
1935 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)  \
1936  ( ( base + charid >=  ucharcount     \
1937   && base + charid < ubound     \
1938   && state == trie->trans[ base - ucharcount + charid ].check \
1939   && trie->trans[ base - ucharcount + charid ].next )  \
1940   ? trie->trans[ base - ucharcount + charid ].next  \
1941   : ( state==1 ? special : 0 )     \
1942  )
1943
1944 #define MADE_TRIE       1
1945 #define MADE_JUMP_TRIE  2
1946 #define MADE_EXACT_TRIE 4
1947
1948 STATIC I32
1949 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1950     regnode *first, regnode *last, regnode *tail,
1951     U32 word_count, U32 flags, U32 depth)
1952 {
1953  dVAR;
1954  /* first pass, loop through and scan words */
1955  reg_trie_data *trie;
1956  HV *widecharmap = NULL;
1957  AV *revcharmap = newAV();
1958  regnode *cur;
1959  STRLEN len = 0;
1960  UV uvc = 0;
1961  U16 curword = 0;
1962  U32 next_alloc = 0;
1963  regnode *jumper = NULL;
1964  regnode *nextbranch = NULL;
1965  regnode *convert = NULL;
1966  U32 *prev_states; /* temp array mapping each state to previous one */
1967  /* we just use folder as a flag in utf8 */
1968  const U8 * folder = NULL;
1969
1970 #ifdef DEBUGGING
1971  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1972  AV *trie_words = NULL;
1973  /* along with revcharmap, this only used during construction but both are
1974  * useful during debugging so we store them in the struct when debugging.
1975  */
1976 #else
1977  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1978  STRLEN trie_charcount=0;
1979 #endif
1980  SV *re_trie_maxbuff;
1981  GET_RE_DEBUG_FLAGS_DECL;
1982
1983  PERL_ARGS_ASSERT_MAKE_TRIE;
1984 #ifndef DEBUGGING
1985  PERL_UNUSED_ARG(depth);
1986 #endif
1987
1988  switch (flags) {
1989   case EXACT: break;
1990   case EXACTFA:
1991   case EXACTFU_SS:
1992   case EXACTFU: folder = PL_fold_latin1; break;
1993   case EXACTF:  folder = PL_fold; break;
1994   default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1995  }
1996
1997  trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1998  trie->refcount = 1;
1999  trie->startstate = 1;
2000  trie->wordcount = word_count;
2001  RExC_rxi->data->data[ data_slot ] = (void*)trie;
2002  trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2003  if (flags == EXACT)
2004   trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2005  trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2006      trie->wordcount+1, sizeof(reg_trie_wordinfo));
2007
2008  DEBUG_r({
2009   trie_words = newAV();
2010  });
2011
2012  re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2013  if (!SvIOK(re_trie_maxbuff)) {
2014   sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2015  }
2016  DEBUG_TRIE_COMPILE_r({
2017   PerlIO_printf( Perl_debug_log,
2018   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2019   (int)depth * 2 + 2, "",
2020   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2021   REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2022  });
2023
2024    /* Find the node we are going to overwrite */
2025  if ( first == startbranch && OP( last ) != BRANCH ) {
2026   /* whole branch chain */
2027   convert = first;
2028  } else {
2029   /* branch sub-chain */
2030   convert = NEXTOPER( first );
2031  }
2032
2033  /*  -- First loop and Setup --
2034
2035  We first traverse the branches and scan each word to determine if it
2036  contains widechars, and how many unique chars there are, this is
2037  important as we have to build a table with at least as many columns as we
2038  have unique chars.
2039
2040  We use an array of integers to represent the character codes 0..255
2041  (trie->charmap) and we use a an HV* to store Unicode characters. We use
2042  the native representation of the character value as the key and IV's for
2043  the coded index.
2044
2045  *TODO* If we keep track of how many times each character is used we can
2046  remap the columns so that the table compression later on is more
2047  efficient in terms of memory by ensuring the most common value is in the
2048  middle and the least common are on the outside.  IMO this would be better
2049  than a most to least common mapping as theres a decent chance the most
2050  common letter will share a node with the least common, meaning the node
2051  will not be compressible. With a middle is most common approach the worst
2052  case is when we have the least common nodes twice.
2053
2054  */
2055
2056  for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2057   regnode *noper = NEXTOPER( cur );
2058   const U8 *uc = (U8*)STRING( noper );
2059   const U8 *e  = uc + STR_LEN( noper );
2060   int foldlen = 0;
2061   U32 wordlen      = 0;         /* required init */
2062   STRLEN minchars = 0;
2063   STRLEN maxchars = 0;
2064   bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2065            bitmap?*/
2066
2067   if (OP(noper) == NOTHING) {
2068    regnode *noper_next= regnext(noper);
2069    if (noper_next != tail && OP(noper_next) == flags) {
2070     noper = noper_next;
2071     uc= (U8*)STRING(noper);
2072     e= uc + STR_LEN(noper);
2073     trie->minlen= STR_LEN(noper);
2074    } else {
2075     trie->minlen= 0;
2076     continue;
2077    }
2078   }
2079
2080   if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2081    TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2082           regardless of encoding */
2083    if (OP( noper ) == EXACTFU_SS) {
2084     /* false positives are ok, so just set this */
2085     TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2086    }
2087   }
2088   for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2089           branch */
2090    TRIE_CHARCOUNT(trie)++;
2091    TRIE_READ_CHAR;
2092
2093    /* TRIE_READ_CHAR returns the current character, or its fold if /i
2094    * is in effect.  Under /i, this character can match itself, or
2095    * anything that folds to it.  If not under /i, it can match just
2096    * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2097    * all fold to k, and all are single characters.   But some folds
2098    * expand to more than one character, so for example LATIN SMALL
2099    * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2100    * the string beginning at 'uc' is 'ffi', it could be matched by
2101    * three characters, or just by the one ligature character. (It
2102    * could also be matched by two characters: LATIN SMALL LIGATURE FF
2103    * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2104    * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2105    * match.)  The trie needs to know the minimum and maximum number
2106    * of characters that could match so that it can use size alone to
2107    * quickly reject many match attempts.  The max is simple: it is
2108    * the number of folded characters in this branch (since a fold is
2109    * never shorter than what folds to it. */
2110
2111    maxchars++;
2112
2113    /* And the min is equal to the max if not under /i (indicated by
2114    * 'folder' being NULL), or there are no multi-character folds.  If
2115    * there is a multi-character fold, the min is incremented just
2116    * once, for the character that folds to the sequence.  Each
2117    * character in the sequence needs to be added to the list below of
2118    * characters in the trie, but we count only the first towards the
2119    * min number of characters needed.  This is done through the
2120    * variable 'foldlen', which is returned by the macros that look
2121    * for these sequences as the number of bytes the sequence
2122    * occupies.  Each time through the loop, we decrement 'foldlen' by
2123    * how many bytes the current char occupies.  Only when it reaches
2124    * 0 do we increment 'minchars' or look for another multi-character
2125    * sequence. */
2126    if (folder == NULL) {
2127     minchars++;
2128    }
2129    else if (foldlen > 0) {
2130     foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2131    }
2132    else {
2133     minchars++;
2134
2135     /* See if *uc is the beginning of a multi-character fold.  If
2136     * so, we decrement the length remaining to look at, to account
2137     * for the current character this iteration.  (We can use 'uc'
2138     * instead of the fold returned by TRIE_READ_CHAR because for
2139     * non-UTF, the latin1_safe macro is smart enough to account
2140     * for all the unfolded characters, and because for UTF, the
2141     * string will already have been folded earlier in the
2142     * compilation process */
2143     if (UTF) {
2144      if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2145       foldlen -= UTF8SKIP(uc);
2146      }
2147     }
2148     else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2149      foldlen--;
2150     }
2151    }
2152
2153    /* The current character (and any potential folds) should be added
2154    * to the possible matching characters for this position in this
2155    * branch */
2156    if ( uvc < 256 ) {
2157     if ( folder ) {
2158      U8 folded= folder[ (U8) uvc ];
2159      if ( !trie->charmap[ folded ] ) {
2160       trie->charmap[ folded ]=( ++trie->uniquecharcount );
2161       TRIE_STORE_REVCHAR( folded );
2162      }
2163     }
2164     if ( !trie->charmap[ uvc ] ) {
2165      trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2166      TRIE_STORE_REVCHAR( uvc );
2167     }
2168     if ( set_bit ) {
2169      /* store the codepoint in the bitmap, and its folded
2170      * equivalent. */
2171      TRIE_BITMAP_SET(trie, uvc);
2172
2173      /* store the folded codepoint */
2174      if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2175
2176      if ( !UTF ) {
2177       /* store first byte of utf8 representation of
2178       variant codepoints */
2179       if (! UVCHR_IS_INVARIANT(uvc)) {
2180        TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2181       }
2182      }
2183      set_bit = 0; /* We've done our bit :-) */
2184     }
2185    } else {
2186
2187     /* XXX We could come up with the list of code points that fold
2188     * to this using PL_utf8_foldclosures, except not for
2189     * multi-char folds, as there may be multiple combinations
2190     * there that could work, which needs to wait until runtime to
2191     * resolve (The comment about LIGATURE FFI above is such an
2192     * example */
2193
2194     SV** svpp;
2195     if ( !widecharmap )
2196      widecharmap = newHV();
2197
2198     svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2199
2200     if ( !svpp )
2201      Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2202
2203     if ( !SvTRUE( *svpp ) ) {
2204      sv_setiv( *svpp, ++trie->uniquecharcount );
2205      TRIE_STORE_REVCHAR(uvc);
2206     }
2207    }
2208   } /* end loop through characters in this branch of the trie */
2209
2210   /* We take the min and max for this branch and combine to find the min
2211   * and max for all branches processed so far */
2212   if( cur == first ) {
2213    trie->minlen = minchars;
2214    trie->maxlen = maxchars;
2215   } else if (minchars < trie->minlen) {
2216    trie->minlen = minchars;
2217   } else if (maxchars > trie->maxlen) {
2218    trie->maxlen = maxchars;
2219   }
2220  } /* end first pass */
2221  DEBUG_TRIE_COMPILE_r(
2222   PerlIO_printf( Perl_debug_log,
2223     "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2224     (int)depth * 2 + 2,"",
2225     ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2226     (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2227     (int)trie->minlen, (int)trie->maxlen )
2228  );
2229
2230  /*
2231   We now know what we are dealing with in terms of unique chars and
2232   string sizes so we can calculate how much memory a naive
2233   representation using a flat table  will take. If it's over a reasonable
2234   limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2235   conservative but potentially much slower representation using an array
2236   of lists.
2237
2238   At the end we convert both representations into the same compressed
2239   form that will be used in regexec.c for matching with. The latter
2240   is a form that cannot be used to construct with but has memory
2241   properties similar to the list form and access properties similar
2242   to the table form making it both suitable for fast searches and
2243   small enough that its feasable to store for the duration of a program.
2244
2245   See the comment in the code where the compressed table is produced
2246   inplace from the flat tabe representation for an explanation of how
2247   the compression works.
2248
2249  */
2250
2251
2252  Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2253  prev_states[1] = 0;
2254
2255  if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2256              > SvIV(re_trie_maxbuff) )
2257  {
2258   /*
2259    Second Pass -- Array Of Lists Representation
2260
2261    Each state will be represented by a list of charid:state records
2262    (reg_trie_trans_le) the first such element holds the CUR and LEN
2263    points of the allocated array. (See defines above).
2264
2265    We build the initial structure using the lists, and then convert
2266    it into the compressed table form which allows faster lookups
2267    (but cant be modified once converted).
2268   */
2269
2270   STRLEN transcount = 1;
2271
2272   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2273    "%*sCompiling trie using list compiler\n",
2274    (int)depth * 2 + 2, ""));
2275
2276   trie->states = (reg_trie_state *)
2277    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2278         sizeof(reg_trie_state) );
2279   TRIE_LIST_NEW(1);
2280   next_alloc = 2;
2281
2282   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2283
2284    regnode *noper   = NEXTOPER( cur );
2285    U8 *uc           = (U8*)STRING( noper );
2286    const U8 *e      = uc + STR_LEN( noper );
2287    U32 state        = 1;         /* required init */
2288    U16 charid       = 0;         /* sanity init */
2289    U32 wordlen      = 0;         /* required init */
2290
2291    if (OP(noper) == NOTHING) {
2292     regnode *noper_next= regnext(noper);
2293     if (noper_next != tail && OP(noper_next) == flags) {
2294      noper = noper_next;
2295      uc= (U8*)STRING(noper);
2296      e= uc + STR_LEN(noper);
2297     }
2298    }
2299
2300    if (OP(noper) != NOTHING) {
2301     for ( ; uc < e ; uc += len ) {
2302
2303      TRIE_READ_CHAR;
2304
2305      if ( uvc < 256 ) {
2306       charid = trie->charmap[ uvc ];
2307      } else {
2308       SV** const svpp = hv_fetch( widecharmap,
2309              (char*)&uvc,
2310              sizeof( UV ),
2311              0);
2312       if ( !svpp ) {
2313        charid = 0;
2314       } else {
2315        charid=(U16)SvIV( *svpp );
2316       }
2317      }
2318      /* charid is now 0 if we dont know the char read, or
2319      * nonzero if we do */
2320      if ( charid ) {
2321
2322       U16 check;
2323       U32 newstate = 0;
2324
2325       charid--;
2326       if ( !trie->states[ state ].trans.list ) {
2327        TRIE_LIST_NEW( state );
2328       }
2329       for ( check = 1;
2330        check <= TRIE_LIST_USED( state );
2331        check++ )
2332       {
2333        if ( TRIE_LIST_ITEM( state, check ).forid
2334                  == charid )
2335        {
2336         newstate = TRIE_LIST_ITEM( state, check ).newstate;
2337         break;
2338        }
2339       }
2340       if ( ! newstate ) {
2341        newstate = next_alloc++;
2342        prev_states[newstate] = state;
2343        TRIE_LIST_PUSH( state, charid, newstate );
2344        transcount++;
2345       }
2346       state = newstate;
2347      } else {
2348       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2349      }
2350     }
2351    }
2352    TRIE_HANDLE_WORD(state);
2353
2354   } /* end second pass */
2355
2356   /* next alloc is the NEXT state to be allocated */
2357   trie->statecount = next_alloc;
2358   trie->states = (reg_trie_state *)
2359    PerlMemShared_realloc( trie->states,
2360         next_alloc
2361         * sizeof(reg_trie_state) );
2362
2363   /* and now dump it out before we compress it */
2364   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2365               revcharmap, next_alloc,
2366               depth+1)
2367   );
2368
2369   trie->trans = (reg_trie_trans *)
2370    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2371   {
2372    U32 state;
2373    U32 tp = 0;
2374    U32 zp = 0;
2375
2376
2377    for( state=1 ; state < next_alloc ; state ++ ) {
2378     U32 base=0;
2379
2380     /*
2381     DEBUG_TRIE_COMPILE_MORE_r(
2382      PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2383     );
2384     */
2385
2386     if (trie->states[state].trans.list) {
2387      U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2388      U16 maxid=minid;
2389      U16 idx;
2390
2391      for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2392       const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2393       if ( forid < minid ) {
2394        minid=forid;
2395       } else if ( forid > maxid ) {
2396        maxid=forid;
2397       }
2398      }
2399      if ( transcount < tp + maxid - minid + 1) {
2400       transcount *= 2;
2401       trie->trans = (reg_trie_trans *)
2402        PerlMemShared_realloc( trie->trans,
2403              transcount
2404              * sizeof(reg_trie_trans) );
2405       Zero( trie->trans + (transcount / 2),
2406        transcount / 2,
2407        reg_trie_trans );
2408      }
2409      base = trie->uniquecharcount + tp - minid;
2410      if ( maxid == minid ) {
2411       U32 set = 0;
2412       for ( ; zp < tp ; zp++ ) {
2413        if ( ! trie->trans[ zp ].next ) {
2414         base = trie->uniquecharcount + zp - minid;
2415         trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2416                 1).newstate;
2417         trie->trans[ zp ].check = state;
2418         set = 1;
2419         break;
2420        }
2421       }
2422       if ( !set ) {
2423        trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2424                 1).newstate;
2425        trie->trans[ tp ].check = state;
2426        tp++;
2427        zp = tp;
2428       }
2429      } else {
2430       for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2431        const U32 tid = base
2432           - trie->uniquecharcount
2433           + TRIE_LIST_ITEM( state, idx ).forid;
2434        trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2435                 idx ).newstate;
2436        trie->trans[ tid ].check = state;
2437       }
2438       tp += ( maxid - minid + 1 );
2439      }
2440      Safefree(trie->states[ state ].trans.list);
2441     }
2442     /*
2443     DEBUG_TRIE_COMPILE_MORE_r(
2444      PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2445     );
2446     */
2447     trie->states[ state ].trans.base=base;
2448    }
2449    trie->lasttrans = tp + 1;
2450   }
2451  } else {
2452   /*
2453   Second Pass -- Flat Table Representation.
2454
2455   we dont use the 0 slot of either trans[] or states[] so we add 1 to
2456   each.  We know that we will need Charcount+1 trans at most to store
2457   the data (one row per char at worst case) So we preallocate both
2458   structures assuming worst case.
2459
2460   We then construct the trie using only the .next slots of the entry
2461   structs.
2462
2463   We use the .check field of the first entry of the node temporarily
2464   to make compression both faster and easier by keeping track of how
2465   many non zero fields are in the node.
2466
2467   Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2468   transition.
2469
2470   There are two terms at use here: state as a TRIE_NODEIDX() which is
2471   a number representing the first entry of the node, and state as a
2472   TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2473   and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2474   if there are 2 entrys per node. eg:
2475
2476    A B       A B
2477   1. 2 4    1. 3 7
2478   2. 0 3    3. 0 5
2479   3. 0 0    5. 0 0
2480   4. 0 0    7. 0 0
2481
2482   The table is internally in the right hand, idx form. However as we
2483   also have to deal with the states array which is indexed by nodenum
2484   we have to use TRIE_NODENUM() to convert.
2485
2486   */
2487   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2488    "%*sCompiling trie using table compiler\n",
2489    (int)depth * 2 + 2, ""));
2490
2491   trie->trans = (reg_trie_trans *)
2492    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2493         * trie->uniquecharcount + 1,
2494         sizeof(reg_trie_trans) );
2495   trie->states = (reg_trie_state *)
2496    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2497         sizeof(reg_trie_state) );
2498   next_alloc = trie->uniquecharcount + 1;
2499
2500
2501   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2502
2503    regnode *noper   = NEXTOPER( cur );
2504    const U8 *uc     = (U8*)STRING( noper );
2505    const U8 *e      = uc + STR_LEN( noper );
2506
2507    U32 state        = 1;         /* required init */
2508
2509    U16 charid       = 0;         /* sanity init */
2510    U32 accept_state = 0;         /* sanity init */
2511
2512    U32 wordlen      = 0;         /* required init */
2513
2514    if (OP(noper) == NOTHING) {
2515     regnode *noper_next= regnext(noper);
2516     if (noper_next != tail && OP(noper_next) == flags) {
2517      noper = noper_next;
2518      uc= (U8*)STRING(noper);
2519      e= uc + STR_LEN(noper);
2520     }
2521    }
2522
2523    if ( OP(noper) != NOTHING ) {
2524     for ( ; uc < e ; uc += len ) {
2525
2526      TRIE_READ_CHAR;
2527
2528      if ( uvc < 256 ) {
2529       charid = trie->charmap[ uvc ];
2530      } else {
2531       SV* const * const svpp = hv_fetch( widecharmap,
2532               (char*)&uvc,
2533               sizeof( UV ),
2534               0);
2535       charid = svpp ? (U16)SvIV(*svpp) : 0;
2536      }
2537      if ( charid ) {
2538       charid--;
2539       if ( !trie->trans[ state + charid ].next ) {
2540        trie->trans[ state + charid ].next = next_alloc;
2541        trie->trans[ state ].check++;
2542        prev_states[TRIE_NODENUM(next_alloc)]
2543          = TRIE_NODENUM(state);
2544        next_alloc += trie->uniquecharcount;
2545       }
2546       state = trie->trans[ state + charid ].next;
2547      } else {
2548       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2549      }
2550      /* charid is now 0 if we dont know the char read, or
2551      * nonzero if we do */
2552     }
2553    }
2554    accept_state = TRIE_NODENUM( state );
2555    TRIE_HANDLE_WORD(accept_state);
2556
2557   } /* end second pass */
2558
2559   /* and now dump it out before we compress it */
2560   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2561               revcharmap,
2562               next_alloc, depth+1));
2563
2564   {
2565   /*
2566   * Inplace compress the table.*
2567
2568   For sparse data sets the table constructed by the trie algorithm will
2569   be mostly 0/FAIL transitions or to put it another way mostly empty.
2570   (Note that leaf nodes will not contain any transitions.)
2571
2572   This algorithm compresses the tables by eliminating most such
2573   transitions, at the cost of a modest bit of extra work during lookup:
2574
2575   - Each states[] entry contains a .base field which indicates the
2576   index in the state[] array wheres its transition data is stored.
2577
2578   - If .base is 0 there are no valid transitions from that node.
2579
2580   - If .base is nonzero then charid is added to it to find an entry in
2581   the trans array.
2582
2583   -If trans[states[state].base+charid].check!=state then the
2584   transition is taken to be a 0/Fail transition. Thus if there are fail
2585   transitions at the front of the node then the .base offset will point
2586   somewhere inside the previous nodes data (or maybe even into a node
2587   even earlier), but the .check field determines if the transition is
2588   valid.
2589
2590   XXX - wrong maybe?
2591   The following process inplace converts the table to the compressed
2592   table: We first do not compress the root node 1,and mark all its
2593   .check pointers as 1 and set its .base pointer as 1 as well. This
2594   allows us to do a DFA construction from the compressed table later,
2595   and ensures that any .base pointers we calculate later are greater
2596   than 0.
2597
2598   - We set 'pos' to indicate the first entry of the second node.
2599
2600   - We then iterate over the columns of the node, finding the first and
2601   last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2602   and set the .check pointers accordingly, and advance pos
2603   appropriately and repreat for the next node. Note that when we copy
2604   the next pointers we have to convert them from the original
2605   NODEIDX form to NODENUM form as the former is not valid post
2606   compression.
2607
2608   - If a node has no transitions used we mark its base as 0 and do not
2609   advance the pos pointer.
2610
2611   - If a node only has one transition we use a second pointer into the
2612   structure to fill in allocated fail transitions from other states.
2613   This pointer is independent of the main pointer and scans forward
2614   looking for null transitions that are allocated to a state. When it
2615   finds one it writes the single transition into the "hole".  If the
2616   pointer doesnt find one the single transition is appended as normal.
2617
2618   - Once compressed we can Renew/realloc the structures to release the
2619   excess space.
2620
2621   See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2622   specifically Fig 3.47 and the associated pseudocode.
2623
2624   demq
2625   */
2626   const U32 laststate = TRIE_NODENUM( next_alloc );
2627   U32 state, charid;
2628   U32 pos = 0, zp=0;
2629   trie->statecount = laststate;
2630
2631   for ( state = 1 ; state < laststate ; state++ ) {
2632    U8 flag = 0;
2633    const U32 stateidx = TRIE_NODEIDX( state );
2634    const U32 o_used = trie->trans[ stateidx ].check;
2635    U32 used = trie->trans[ stateidx ].check;
2636    trie->trans[ stateidx ].check = 0;
2637
2638    for ( charid = 0;
2639     used && charid < trie->uniquecharcount;
2640     charid++ )
2641    {
2642     if ( flag || trie->trans[ stateidx + charid ].next ) {
2643      if ( trie->trans[ stateidx + charid ].next ) {
2644       if (o_used == 1) {
2645        for ( ; zp < pos ; zp++ ) {
2646         if ( ! trie->trans[ zp ].next ) {
2647          break;
2648         }
2649        }
2650        trie->states[ state ].trans.base
2651              = zp
2652              + trie->uniquecharcount
2653              - charid ;
2654        trie->trans[ zp ].next
2655         = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2656                + charid ].next );
2657        trie->trans[ zp ].check = state;
2658        if ( ++zp > pos ) pos = zp;
2659        break;
2660       }
2661       used--;
2662      }
2663      if ( !flag ) {
2664       flag = 1;
2665       trie->states[ state ].trans.base
2666          = pos + trie->uniquecharcount - charid ;
2667      }
2668      trie->trans[ pos ].next
2669       = SAFE_TRIE_NODENUM(
2670          trie->trans[ stateidx + charid ].next );
2671      trie->trans[ pos ].check = state;
2672      pos++;
2673     }
2674    }
2675   }
2676   trie->lasttrans = pos + 1;
2677   trie->states = (reg_trie_state *)
2678    PerlMemShared_realloc( trie->states, laststate
2679         * sizeof(reg_trie_state) );
2680   DEBUG_TRIE_COMPILE_MORE_r(
2681    PerlIO_printf( Perl_debug_log,
2682     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2683     (int)depth * 2 + 2,"",
2684     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2685      + 1 ),
2686     (IV)next_alloc,
2687     (IV)pos,
2688     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2689    );
2690
2691   } /* end table compress */
2692  }
2693  DEBUG_TRIE_COMPILE_MORE_r(
2694    PerlIO_printf(Perl_debug_log,
2695     "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2696     (int)depth * 2 + 2, "",
2697     (UV)trie->statecount,
2698     (UV)trie->lasttrans)
2699  );
2700  /* resize the trans array to remove unused space */
2701  trie->trans = (reg_trie_trans *)
2702   PerlMemShared_realloc( trie->trans, trie->lasttrans
2703        * sizeof(reg_trie_trans) );
2704
2705  {   /* Modify the program and insert the new TRIE node */
2706   U8 nodetype =(U8)(flags & 0xFF);
2707   char *str=NULL;
2708
2709 #ifdef DEBUGGING
2710   regnode *optimize = NULL;
2711 #ifdef RE_TRACK_PATTERN_OFFSETS
2712
2713   U32 mjd_offset = 0;
2714   U32 mjd_nodelen = 0;
2715 #endif /* RE_TRACK_PATTERN_OFFSETS */
2716 #endif /* DEBUGGING */
2717   /*
2718   This means we convert either the first branch or the first Exact,
2719   depending on whether the thing following (in 'last') is a branch
2720   or not and whther first is the startbranch (ie is it a sub part of
2721   the alternation or is it the whole thing.)
2722   Assuming its a sub part we convert the EXACT otherwise we convert
2723   the whole branch sequence, including the first.
2724   */
2725   /* Find the node we are going to overwrite */
2726   if ( first != startbranch || OP( last ) == BRANCH ) {
2727    /* branch sub-chain */
2728    NEXT_OFF( first ) = (U16)(last - first);
2729 #ifdef RE_TRACK_PATTERN_OFFSETS
2730    DEBUG_r({
2731     mjd_offset= Node_Offset((convert));
2732     mjd_nodelen= Node_Length((convert));
2733    });
2734 #endif
2735    /* whole branch chain */
2736   }
2737 #ifdef RE_TRACK_PATTERN_OFFSETS
2738   else {
2739    DEBUG_r({
2740     const  regnode *nop = NEXTOPER( convert );
2741     mjd_offset= Node_Offset((nop));
2742     mjd_nodelen= Node_Length((nop));
2743    });
2744   }
2745   DEBUG_OPTIMISE_r(
2746    PerlIO_printf(Perl_debug_log,
2747     "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2748     (int)depth * 2 + 2, "",
2749     (UV)mjd_offset, (UV)mjd_nodelen)
2750   );
2751 #endif
2752   /* But first we check to see if there is a common prefix we can
2753   split out as an EXACT and put in front of the TRIE node.  */
2754   trie->startstate= 1;
2755   if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2756    U32 state;
2757    for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2758     U32 ofs = 0;
2759     I32 idx = -1;
2760     U32 count = 0;
2761     const U32 base = trie->states[ state ].trans.base;
2762
2763     if ( trie->states[state].wordnum )
2764       count = 1;
2765
2766     for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2767      if ( ( base + ofs >= trie->uniquecharcount ) &&
2768       ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2769       trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2770      {
2771       if ( ++count > 1 ) {
2772        SV **tmp = av_fetch( revcharmap, ofs, 0);
2773        const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2774        if ( state == 1 ) break;
2775        if ( count == 2 ) {
2776         Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2777         DEBUG_OPTIMISE_r(
2778          PerlIO_printf(Perl_debug_log,
2779           "%*sNew Start State=%"UVuf" Class: [",
2780           (int)depth * 2 + 2, "",
2781           (UV)state));
2782         if (idx >= 0) {
2783          SV ** const tmp = av_fetch( revcharmap, idx, 0);
2784          const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2785
2786          TRIE_BITMAP_SET(trie,*ch);
2787          if ( folder )
2788           TRIE_BITMAP_SET(trie, folder[ *ch ]);
2789          DEBUG_OPTIMISE_r(
2790           PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2791          );
2792         }
2793        }
2794        TRIE_BITMAP_SET(trie,*ch);
2795        if ( folder )
2796         TRIE_BITMAP_SET(trie,folder[ *ch ]);
2797        DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2798       }
2799       idx = ofs;
2800      }
2801     }
2802     if ( count == 1 ) {
2803      SV **tmp = av_fetch( revcharmap, idx, 0);
2804      STRLEN len;
2805      char *ch = SvPV( *tmp, len );
2806      DEBUG_OPTIMISE_r({
2807       SV *sv=sv_newmortal();
2808       PerlIO_printf( Perl_debug_log,
2809        "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2810        (int)depth * 2 + 2, "",
2811        (UV)state, (UV)idx,
2812        pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2813         PL_colors[0], PL_colors[1],
2814         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2815         PERL_PV_ESCAPE_FIRSTCHAR
2816        )
2817       );
2818      });
2819      if ( state==1 ) {
2820       OP( convert ) = nodetype;
2821       str=STRING(convert);
2822       STR_LEN(convert)=0;
2823      }
2824      STR_LEN(convert) += len;
2825      while (len--)
2826       *str++ = *ch++;
2827     } else {
2828 #ifdef DEBUGGING
2829      if (state>1)
2830       DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2831 #endif
2832      break;
2833     }
2834    }
2835    trie->prefixlen = (state-1);
2836    if (str) {
2837     regnode *n = convert+NODE_SZ_STR(convert);
2838     NEXT_OFF(convert) = NODE_SZ_STR(convert);
2839     trie->startstate = state;
2840     trie->minlen -= (state - 1);
2841     trie->maxlen -= (state - 1);
2842 #ifdef DEBUGGING
2843    /* At least the UNICOS C compiler choked on this
2844     * being argument to DEBUG_r(), so let's just have
2845     * it right here. */
2846    if (
2847 #ifdef PERL_EXT_RE_BUILD
2848     1
2849 #else
2850     DEBUG_r_TEST
2851 #endif
2852     ) {
2853     regnode *fix = convert;
2854     U32 word = trie->wordcount;
2855     mjd_nodelen++;
2856     Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2857     while( ++fix < n ) {
2858      Set_Node_Offset_Length(fix, 0, 0);
2859     }
2860     while (word--) {
2861      SV ** const tmp = av_fetch( trie_words, word, 0 );
2862      if (tmp) {
2863       if ( STR_LEN(convert) <= SvCUR(*tmp) )
2864        sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2865       else
2866        sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2867      }
2868     }
2869    }
2870 #endif
2871     if (trie->maxlen) {
2872      convert = n;
2873     } else {
2874      NEXT_OFF(convert) = (U16)(tail - convert);
2875      DEBUG_r(optimize= n);
2876     }
2877    }
2878   }
2879   if (!jumper)
2880    jumper = last;
2881   if ( trie->maxlen ) {
2882    NEXT_OFF( convert ) = (U16)(tail - convert);
2883    ARG_SET( convert, data_slot );
2884    /* Store the offset to the first unabsorbed branch in
2885    jump[0], which is otherwise unused by the jump logic.
2886    We use this when dumping a trie and during optimisation. */
2887    if (trie->jump)
2888     trie->jump[0] = (U16)(nextbranch - convert);
2889
2890    /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2891    *   and there is a bitmap
2892    *   and the first "jump target" node we found leaves enough room
2893    * then convert the TRIE node into a TRIEC node, with the bitmap
2894    * embedded inline in the opcode - this is hypothetically faster.
2895    */
2896    if ( !trie->states[trie->startstate].wordnum
2897     && trie->bitmap
2898     && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2899    {
2900     OP( convert ) = TRIEC;
2901     Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2902     PerlMemShared_free(trie->bitmap);
2903     trie->bitmap= NULL;
2904    } else
2905     OP( convert ) = TRIE;
2906
2907    /* store the type in the flags */
2908    convert->flags = nodetype;
2909    DEBUG_r({
2910    optimize = convert
2911      + NODE_STEP_REGNODE
2912      + regarglen[ OP( convert ) ];
2913    });
2914    /* XXX We really should free up the resource in trie now,
2915     as we won't use them - (which resources?) dmq */
2916   }
2917   /* needed for dumping*/
2918   DEBUG_r(if (optimize) {
2919    regnode *opt = convert;
2920
2921    while ( ++opt < optimize) {
2922     Set_Node_Offset_Length(opt,0,0);
2923    }
2924    /*
2925     Try to clean up some of the debris left after the
2926     optimisation.
2927    */
2928    while( optimize < jumper ) {
2929     mjd_nodelen += Node_Length((optimize));
2930     OP( optimize ) = OPTIMIZED;
2931     Set_Node_Offset_Length(optimize,0,0);
2932     optimize++;
2933    }
2934    Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2935   });
2936  } /* end node insert */
2937  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2938
2939  /*  Finish populating the prev field of the wordinfo array.  Walk back
2940  *  from each accept state until we find another accept state, and if
2941  *  so, point the first word's .prev field at the second word. If the
2942  *  second already has a .prev field set, stop now. This will be the
2943  *  case either if we've already processed that word's accept state,
2944  *  or that state had multiple words, and the overspill words were
2945  *  already linked up earlier.
2946  */
2947  {
2948   U16 word;
2949   U32 state;
2950   U16 prev;
2951
2952   for (word=1; word <= trie->wordcount; word++) {
2953    prev = 0;
2954    if (trie->wordinfo[word].prev)
2955     continue;
2956    state = trie->wordinfo[word].accept;
2957    while (state) {
2958     state = prev_states[state];
2959     if (!state)
2960      break;
2961     prev = trie->states[state].wordnum;
2962     if (prev)
2963      break;
2964    }
2965    trie->wordinfo[word].prev = prev;
2966   }
2967   Safefree(prev_states);
2968  }
2969
2970
2971  /* and now dump out the compressed format */
2972  DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2973
2974  RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2975 #ifdef DEBUGGING
2976  RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2977  RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2978 #else
2979  SvREFCNT_dec_NN(revcharmap);
2980 #endif
2981  return trie->jump
2982   ? MADE_JUMP_TRIE
2983   : trie->startstate>1
2984    ? MADE_EXACT_TRIE
2985    : MADE_TRIE;
2986 }
2987
2988 STATIC void
2989 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2990 {
2991 /* The Trie is constructed and compressed now so we can build a fail array if
2992  * it's needed
2993
2994    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2995    3.32 in the
2996    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2997    Ullman 1985/88
2998    ISBN 0-201-10088-6
2999
3000    We find the fail state for each state in the trie, this state is the longest
3001    proper suffix of the current state's 'word' that is also a proper prefix of
3002    another word in our trie. State 1 represents the word '' and is thus the
3003    default fail state. This allows the DFA not to have to restart after its
3004    tried and failed a word at a given point, it simply continues as though it
3005    had been matching the other word in the first place.
3006    Consider
3007  'abcdgu'=~/abcdefg|cdgu/
3008    When we get to 'd' we are still matching the first word, we would encounter
3009    'g' which would fail, which would bring us to the state representing 'd' in
3010    the second word where we would try 'g' and succeed, proceeding to match
3011    'cdgu'.
3012  */
3013  /* add a fail transition */
3014  const U32 trie_offset = ARG(source);
3015  reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3016  U32 *q;
3017  const U32 ucharcount = trie->uniquecharcount;
3018  const U32 numstates = trie->statecount;
3019  const U32 ubound = trie->lasttrans + ucharcount;
3020  U32 q_read = 0;
3021  U32 q_write = 0;
3022  U32 charid;
3023  U32 base = trie->states[ 1 ].trans.base;
3024  U32 *fail;
3025  reg_ac_data *aho;
3026  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3027  GET_RE_DEBUG_FLAGS_DECL;
3028
3029  PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
3030 #ifndef DEBUGGING
3031  PERL_UNUSED_ARG(depth);
3032 #endif
3033
3034
3035  ARG_SET( stclass, data_slot );
3036  aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3037  RExC_rxi->data->data[ data_slot ] = (void*)aho;
3038  aho->trie=trie_offset;
3039  aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3040  Copy( trie->states, aho->states, numstates, reg_trie_state );
3041  Newxz( q, numstates, U32);
3042  aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3043  aho->refcount = 1;
3044  fail = aho->fail;
3045  /* initialize fail[0..1] to be 1 so that we always have
3046  a valid final fail state */
3047  fail[ 0 ] = fail[ 1 ] = 1;
3048
3049  for ( charid = 0; charid < ucharcount ; charid++ ) {
3050   const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3051   if ( newstate ) {
3052    q[ q_write ] = newstate;
3053    /* set to point at the root */
3054    fail[ q[ q_write++ ] ]=1;
3055   }
3056  }
3057  while ( q_read < q_write) {
3058   const U32 cur = q[ q_read++ % numstates ];
3059   base = trie->states[ cur ].trans.base;
3060
3061   for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3062    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3063    if (ch_state) {
3064     U32 fail_state = cur;
3065     U32 fail_base;
3066     do {
3067      fail_state = fail[ fail_state ];
3068      fail_base = aho->states[ fail_state ].trans.base;
3069     } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3070
3071     fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3072     fail[ ch_state ] = fail_state;
3073     if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3074     {
3075       aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3076     }
3077     q[ q_write++ % numstates] = ch_state;
3078    }
3079   }
3080  }
3081  /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3082  when we fail in state 1, this allows us to use the
3083  charclass scan to find a valid start char. This is based on the principle
3084  that theres a good chance the string being searched contains lots of stuff
3085  that cant be a start char.
3086  */
3087  fail[ 0 ] = fail[ 1 ] = 0;
3088  DEBUG_TRIE_COMPILE_r({
3089   PerlIO_printf(Perl_debug_log,
3090      "%*sStclass Failtable (%"UVuf" states): 0",
3091      (int)(depth * 2), "", (UV)numstates
3092   );
3093   for( q_read=1; q_read<numstates; q_read++ ) {
3094    PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3095   }
3096   PerlIO_printf(Perl_debug_log, "\n");
3097  });
3098  Safefree(q);
3099  /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3100 }
3101
3102
3103 #define DEBUG_PEEP(str,scan,depth) \
3104  DEBUG_OPTIMISE_r({if (scan){ \
3105  SV * const mysv=sv_newmortal(); \
3106  regnode *Next = regnext(scan); \
3107  regprop(RExC_rx, mysv, scan, NULL); \
3108  PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3109  (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3110  Next ? (REG_NODE_NUM(Next)) : 0 ); \
3111    }});
3112
3113
3114 /* The below joins as many adjacent EXACTish nodes as possible into a single
3115  * one.  The regop may be changed if the node(s) contain certain sequences that
3116  * require special handling.  The joining is only done if:
3117  * 1) there is room in the current conglomerated node to entirely contain the
3118  *    next one.
3119  * 2) they are the exact same node type
3120  *
3121  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3122  * these get optimized out
3123  *
3124  * If a node is to match under /i (folded), the number of characters it matches
3125  * can be different than its character length if it contains a multi-character
3126  * fold.  *min_subtract is set to the total delta number of characters of the
3127  * input nodes.
3128  *
3129  * And *unfolded_multi_char is set to indicate whether or not the node contains
3130  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3131  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3132  * SMALL LETTER SHARP S, as only if the target string being matched against
3133  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3134  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3135  * whose components are all above the Latin1 range are not run-time locale
3136  * dependent, and have already been folded by the time this function is
3137  * called.)
3138  *
3139  * This is as good a place as any to discuss the design of handling these
3140  * multi-character fold sequences.  It's been wrong in Perl for a very long
3141  * time.  There are three code points in Unicode whose multi-character folds
3142  * were long ago discovered to mess things up.  The previous designs for
3143  * dealing with these involved assigning a special node for them.  This
3144  * approach doesn't always work, as evidenced by this example:
3145  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3146  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3147  * would match just the \xDF, it won't be able to handle the case where a
3148  * successful match would have to cross the node's boundary.  The new approach
3149  * that hopefully generally solves the problem generates an EXACTFU_SS node
3150  * that is "sss" in this case.
3151  *
3152  * It turns out that there are problems with all multi-character folds, and not
3153  * just these three.  Now the code is general, for all such cases.  The
3154  * approach taken is:
3155  * 1)   This routine examines each EXACTFish node that could contain multi-
3156  *      character folded sequences.  Since a single character can fold into
3157  *      such a sequence, the minimum match length for this node is less than
3158  *      the number of characters in the node.  This routine returns in
3159  *      *min_subtract how many characters to subtract from the the actual
3160  *      length of the string to get a real minimum match length; it is 0 if
3161  *      there are no multi-char foldeds.  This delta is used by the caller to
3162  *      adjust the min length of the match, and the delta between min and max,
3163  *      so that the optimizer doesn't reject these possibilities based on size
3164  *      constraints.
3165  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3166  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3167  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3168  *      there is a possible fold length change.  That means that a regular
3169  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3170  *      with length changes, and so can be processed faster.  regexec.c takes
3171  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3172  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3173  *      known until runtime).  This saves effort in regex matching.  However,
3174  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3175  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3176  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3177  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3178  *      possibilities for the non-UTF8 patterns are quite simple, except for
3179  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3180  *      members of a fold-pair, and arrays are set up for all of them so that
3181  *      the other member of the pair can be found quickly.  Code elsewhere in
3182  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3183  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3184  *      described in the next item.
3185  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3186  *      validity of the fold won't be known until runtime, and so must remain
3187  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3188  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3189  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3190  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3191  *      The reason this is a problem is that the optimizer part of regexec.c
3192  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3193  *      that a character in the pattern corresponds to at most a single
3194  *      character in the target string.  (And I do mean character, and not byte
3195  *      here, unlike other parts of the documentation that have never been
3196  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3197  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3198  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3199  *      nodes, violate the assumption, and they are the only instances where it
3200  *      is violated.  I'm reluctant to try to change the assumption, as the
3201  *      code involved is impenetrable to me (khw), so instead the code here
3202  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3203  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3204  *      boolean indicating whether or not the node contains such a fold.  When
3205  *      it is true, the caller sets a flag that later causes the optimizer in
3206  *      this file to not set values for the floating and fixed string lengths,
3207  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3208  *      assumption.  Thus, there is no optimization based on string lengths for
3209  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3210  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3211  *      assumption is wrong only in these cases is that all other non-UTF-8
3212  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3213  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3214  *      EXACTF nodes because we don't know at compile time if it actually
3215  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3216  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3217  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3218  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3219  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3220  *      string would require the pattern to be forced into UTF-8, the overhead
3221  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3222  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3223  *      locale.)
3224  *
3225  *      Similarly, the code that generates tries doesn't currently handle
3226  *      not-already-folded multi-char folds, and it looks like a pain to change
3227  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3228  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3229  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3230  *      using /iaa matching will be doing so almost entirely with ASCII
3231  *      strings, so this should rarely be encountered in practice */
3232
3233 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3234  if (PL_regkind[OP(scan)] == EXACT) \
3235   join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3236
3237 STATIC U32
3238 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3239     UV *min_subtract, bool *unfolded_multi_char,
3240     U32 flags,regnode *val, U32 depth)
3241 {
3242  /* Merge several consecutive EXACTish nodes into one. */
3243  regnode *n = regnext(scan);
3244  U32 stringok = 1;
3245  regnode *next = scan + NODE_SZ_STR(scan);
3246  U32 merged = 0;
3247  U32 stopnow = 0;
3248 #ifdef DEBUGGING
3249  regnode *stop = scan;
3250  GET_RE_DEBUG_FLAGS_DECL;
3251 #else
3252  PERL_UNUSED_ARG(depth);
3253 #endif
3254
3255  PERL_ARGS_ASSERT_JOIN_EXACT;
3256 #ifndef EXPERIMENTAL_INPLACESCAN
3257  PERL_UNUSED_ARG(flags);
3258  PERL_UNUSED_ARG(val);
3259 #endif
3260  DEBUG_PEEP("join",scan,depth);
3261
3262  /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3263  * EXACT ones that are mergeable to the current one. */
3264  while (n
3265   && (PL_regkind[OP(n)] == NOTHING
3266    || (stringok && OP(n) == OP(scan)))
3267   && NEXT_OFF(n)
3268   && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3269  {
3270
3271   if (OP(n) == TAIL || n > next)
3272    stringok = 0;
3273   if (PL_regkind[OP(n)] == NOTHING) {
3274    DEBUG_PEEP("skip:",n,depth);
3275    NEXT_OFF(scan) += NEXT_OFF(n);
3276    next = n + NODE_STEP_REGNODE;
3277 #ifdef DEBUGGING
3278    if (stringok)
3279     stop = n;
3280 #endif
3281    n = regnext(n);
3282   }
3283   else if (stringok) {
3284    const unsigned int oldl = STR_LEN(scan);
3285    regnode * const nnext = regnext(n);
3286
3287    /* XXX I (khw) kind of doubt that this works on platforms (should
3288    * Perl ever run on one) where U8_MAX is above 255 because of lots
3289    * of other assumptions */
3290    /* Don't join if the sum can't fit into a single node */
3291    if (oldl + STR_LEN(n) > U8_MAX)
3292     break;
3293
3294    DEBUG_PEEP("merg",n,depth);
3295    merged++;
3296
3297    NEXT_OFF(scan) += NEXT_OFF(n);
3298    STR_LEN(scan) += STR_LEN(n);
3299    next = n + NODE_SZ_STR(n);
3300    /* Now we can overwrite *n : */
3301    Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3302 #ifdef DEBUGGING
3303    stop = next - 1;
3304 #endif
3305    n = nnext;
3306    if (stopnow) break;
3307   }
3308
3309 #ifdef EXPERIMENTAL_INPLACESCAN
3310   if (flags && !NEXT_OFF(n)) {
3311    DEBUG_PEEP("atch", val, depth);
3312    if (reg_off_by_arg[OP(n)]) {
3313     ARG_SET(n, val - n);
3314    }
3315    else {
3316     NEXT_OFF(n) = val - n;
3317    }
3318    stopnow = 1;
3319   }
3320 #endif
3321  }
3322
3323  *min_subtract = 0;
3324  *unfolded_multi_char = FALSE;
3325
3326  /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3327  * can now analyze for sequences of problematic code points.  (Prior to
3328  * this final joining, sequences could have been split over boundaries, and
3329  * hence missed).  The sequences only happen in folding, hence for any
3330  * non-EXACT EXACTish node */
3331  if (OP(scan) != EXACT) {
3332   U8* s0 = (U8*) STRING(scan);
3333   U8* s = s0;
3334   U8* s_end = s0 + STR_LEN(scan);
3335
3336   int total_count_delta = 0;  /* Total delta number of characters that
3337          multi-char folds expand to */
3338
3339   /* One pass is made over the node's string looking for all the
3340   * possibilities.  To avoid some tests in the loop, there are two main
3341   * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3342   * non-UTF-8 */
3343   if (UTF) {
3344    U8* folded = NULL;
3345
3346    if (OP(scan) == EXACTFL) {
3347     U8 *d;
3348
3349     /* An EXACTFL node would already have been changed to another
3350     * node type unless there is at least one character in it that
3351     * is problematic; likely a character whose fold definition
3352     * won't be known until runtime, and so has yet to be folded.
3353     * For all but the UTF-8 locale, folds are 1-1 in length, but
3354     * to handle the UTF-8 case, we need to create a temporary
3355     * folded copy using UTF-8 locale rules in order to analyze it.
3356     * This is because our macros that look to see if a sequence is
3357     * a multi-char fold assume everything is folded (otherwise the
3358     * tests in those macros would be too complicated and slow).
3359     * Note that here, the non-problematic folds will have already
3360     * been done, so we can just copy such characters.  We actually
3361     * don't completely fold the EXACTFL string.  We skip the
3362     * unfolded multi-char folds, as that would just create work
3363     * below to figure out the size they already are */
3364
3365     Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3366     d = folded;
3367     while (s < s_end) {
3368      STRLEN s_len = UTF8SKIP(s);
3369      if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3370       Copy(s, d, s_len, U8);
3371       d += s_len;
3372      }
3373      else if (is_FOLDS_TO_MULTI_utf8(s)) {
3374       *unfolded_multi_char = TRUE;
3375       Copy(s, d, s_len, U8);
3376       d += s_len;
3377      }
3378      else if (isASCII(*s)) {
3379       *(d++) = toFOLD(*s);
3380      }
3381      else {
3382       STRLEN len;
3383       _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3384       d += len;
3385      }
3386      s += s_len;
3387     }
3388
3389     /* Point the remainder of the routine to look at our temporary
3390     * folded copy */
3391     s = folded;
3392     s_end = d;
3393    } /* End of creating folded copy of EXACTFL string */
3394
3395    /* Examine the string for a multi-character fold sequence.  UTF-8
3396    * patterns have all characters pre-folded by the time this code is
3397    * executed */
3398    while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3399          length sequence we are looking for is 2 */
3400    {
3401     int count = 0;  /* How many characters in a multi-char fold */
3402     int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3403     if (! len) {    /* Not a multi-char fold: get next char */
3404      s += UTF8SKIP(s);
3405      continue;
3406     }
3407
3408     /* Nodes with 'ss' require special handling, except for
3409     * EXACTFA-ish for which there is no multi-char fold to this */
3410     if (len == 2 && *s == 's' && *(s+1) == 's'
3411      && OP(scan) != EXACTFA
3412      && OP(scan) != EXACTFA_NO_TRIE)
3413     {
3414      count = 2;
3415      if (OP(scan) != EXACTFL) {
3416       OP(scan) = EXACTFU_SS;
3417      }
3418      s += 2;
3419     }
3420     else { /* Here is a generic multi-char fold. */
3421      U8* multi_end  = s + len;
3422
3423      /* Count how many characters in it.  In the case of /aa, no
3424      * folds which contain ASCII code points are allowed, so
3425      * check for those, and skip if found. */
3426      if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3427       count = utf8_length(s, multi_end);
3428       s = multi_end;
3429      }
3430      else {
3431       while (s < multi_end) {
3432        if (isASCII(*s)) {
3433         s++;
3434         goto next_iteration;
3435        }
3436        else {
3437         s += UTF8SKIP(s);
3438        }
3439        count++;
3440       }
3441      }
3442     }
3443
3444     /* The delta is how long the sequence is minus 1 (1 is how long
3445     * the character that folds to the sequence is) */
3446     total_count_delta += count - 1;
3447    next_iteration: ;
3448    }
3449
3450    /* We created a temporary folded copy of the string in EXACTFL
3451    * nodes.  Therefore we need to be sure it doesn't go below zero,
3452    * as the real string could be shorter */
3453    if (OP(scan) == EXACTFL) {
3454     int total_chars = utf8_length((U8*) STRING(scan),
3455           (U8*) STRING(scan) + STR_LEN(scan));
3456     if (total_count_delta > total_chars) {
3457      total_count_delta = total_chars;
3458     }
3459    }
3460
3461    *min_subtract += total_count_delta;
3462    Safefree(folded);
3463   }
3464   else if (OP(scan) == EXACTFA) {
3465
3466    /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3467    * fold to the ASCII range (and there are no existing ones in the
3468    * upper latin1 range).  But, as outlined in the comments preceding
3469    * this function, we need to flag any occurrences of the sharp s.
3470    * This character forbids trie formation (because of added
3471    * complexity) */
3472    while (s < s_end) {
3473     if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3474      OP(scan) = EXACTFA_NO_TRIE;
3475      *unfolded_multi_char = TRUE;
3476      break;
3477     }
3478     s++;
3479     continue;
3480    }
3481   }
3482   else {
3483
3484    /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3485    * folds that are all Latin1.  As explained in the comments
3486    * preceding this function, we look also for the sharp s in EXACTF
3487    * and EXACTFL nodes; it can be in the final position.  Otherwise
3488    * we can stop looking 1 byte earlier because have to find at least
3489    * two characters for a multi-fold */
3490    const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3491        ? s_end
3492        : s_end -1;
3493
3494    while (s < upper) {
3495     int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3496     if (! len) {    /* Not a multi-char fold. */
3497      if (*s == LATIN_SMALL_LETTER_SHARP_S
3498       && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3499      {
3500       *unfolded_multi_char = TRUE;
3501      }
3502      s++;
3503      continue;
3504     }
3505
3506     if (len == 2
3507      && isARG2_lower_or_UPPER_ARG1('s', *s)
3508      && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3509     {
3510
3511      /* EXACTF nodes need to know that the minimum length
3512      * changed so that a sharp s in the string can match this
3513      * ss in the pattern, but they remain EXACTF nodes, as they
3514      * won't match this unless the target string is is UTF-8,
3515      * which we don't know until runtime.  EXACTFL nodes can't
3516      * transform into EXACTFU nodes */
3517      if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3518       OP(scan) = EXACTFU_SS;
3519      }
3520     }
3521
3522     *min_subtract += len - 1;
3523     s += len;
3524    }
3525   }
3526  }
3527
3528 #ifdef DEBUGGING
3529  /* Allow dumping but overwriting the collection of skipped
3530  * ops and/or strings with fake optimized ops */
3531  n = scan + NODE_SZ_STR(scan);
3532  while (n <= stop) {
3533   OP(n) = OPTIMIZED;
3534   FLAGS(n) = 0;
3535   NEXT_OFF(n) = 0;
3536   n++;
3537  }
3538 #endif
3539  DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3540  return stopnow;
3541 }
3542
3543 /* REx optimizer.  Converts nodes into quicker variants "in place".
3544    Finds fixed substrings.  */
3545
3546 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3547    to the position after last scanned or to NULL. */
3548
3549 #define INIT_AND_WITHP \
3550  assert(!and_withp); \
3551  Newx(and_withp,1, regnode_ssc); \
3552  SAVEFREEPV(and_withp)
3553
3554 /* this is a chain of data about sub patterns we are processing that
3555    need to be handled separately/specially in study_chunk. Its so
3556    we can simulate recursion without losing state.  */
3557 struct scan_frame;
3558 typedef struct scan_frame {
3559  regnode *last;  /* last node to process in this frame */
3560  regnode *next;  /* next node to process when last is reached */
3561  struct scan_frame *prev; /*previous frame*/
3562  U32 prev_recursed_depth;
3563  I32 stop; /* what stopparen do we use */
3564 } scan_frame;
3565
3566
3567 STATIC SSize_t
3568 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3569       SSize_t *minlenp, SSize_t *deltap,
3570       regnode *last,
3571       scan_data_t *data,
3572       I32 stopparen,
3573       U32 recursed_depth,
3574       regnode_ssc *and_withp,
3575       U32 flags, U32 depth)
3576       /* scanp: Start here (read-write). */
3577       /* deltap: Write maxlen-minlen here. */
3578       /* last: Stop before this one. */
3579       /* data: string data about the pattern */
3580       /* stopparen: treat close N as END */
3581       /* recursed: which subroutines have we recursed into */
3582       /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3583 {
3584  dVAR;
3585  /* There must be at least this number of characters to match */
3586  SSize_t min = 0;
3587  I32 pars = 0, code;
3588  regnode *scan = *scanp, *next;
3589  SSize_t delta = 0;
3590  int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3591  int is_inf_internal = 0;  /* The studied chunk is infinite */
3592  I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3593  scan_data_t data_fake;
3594  SV *re_trie_maxbuff = NULL;
3595  regnode *first_non_open = scan;
3596  SSize_t stopmin = SSize_t_MAX;
3597  scan_frame *frame = NULL;
3598  GET_RE_DEBUG_FLAGS_DECL;
3599
3600  PERL_ARGS_ASSERT_STUDY_CHUNK;
3601
3602 #ifdef DEBUGGING
3603  StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3604 #endif
3605  if ( depth == 0 ) {
3606   while (first_non_open && OP(first_non_open) == OPEN)
3607    first_non_open=regnext(first_non_open);
3608  }
3609
3610
3611   fake_study_recurse:
3612  while ( scan && OP(scan) != END && scan < last ){
3613   UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3614         node length to get a real minimum (because
3615         the folded version may be shorter) */
3616   bool unfolded_multi_char = FALSE;
3617   /* Peephole optimizer: */
3618   DEBUG_OPTIMISE_MORE_r(
3619   {
3620    PerlIO_printf(Perl_debug_log,
3621     "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3622     ((int) depth*2), "", (long)stopparen,
3623     (unsigned long)depth, (unsigned long)recursed_depth);
3624    if (recursed_depth) {
3625     U32 i;
3626     U32 j;
3627     for ( j = 0 ; j < recursed_depth ; j++ ) {
3628      PerlIO_printf(Perl_debug_log,"[");
3629      for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3630       PerlIO_printf(Perl_debug_log,"%d",
3631        PAREN_TEST(RExC_study_chunk_recursed +
3632          (j * RExC_study_chunk_recursed_bytes), i)
3633        ? 1 : 0
3634       );
3635      PerlIO_printf(Perl_debug_log,"]");
3636     }
3637    }
3638    PerlIO_printf(Perl_debug_log,"\n");
3639   }
3640   );
3641   DEBUG_STUDYDATA("Peep:", data, depth);
3642   DEBUG_PEEP("Peep", scan, depth);
3643
3644
3645   /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3646   * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3647   * by a different invocation of reg() -- Yves
3648   */
3649   JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3650
3651   /* Follow the next-chain of the current node and optimize
3652   away all the NOTHINGs from it.  */
3653   if (OP(scan) != CURLYX) {
3654    const int max = (reg_off_by_arg[OP(scan)]
3655      ? I32_MAX
3656      /* I32 may be smaller than U16 on CRAYs! */
3657      : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3658    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3659    int noff;
3660    regnode *n = scan;
3661
3662    /* Skip NOTHING and LONGJMP. */
3663    while ((n = regnext(n))
3664     && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3665      || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3666     && off + noff < max)
3667     off += noff;
3668    if (reg_off_by_arg[OP(scan)])
3669     ARG(scan) = off;
3670    else
3671     NEXT_OFF(scan) = off;
3672   }
3673
3674
3675
3676   /* The principal pseudo-switch.  Cannot be a switch, since we
3677   look into several different things.  */
3678   if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3679     || OP(scan) == IFTHEN) {
3680    next = regnext(scan);
3681    code = OP(scan);
3682    /* demq: the op(next)==code check is to see if we have
3683    * "branch-branch" AFAICT */
3684
3685    if (OP(next) == code || code == IFTHEN) {
3686     /* NOTE - There is similar code to this block below for
3687     * handling TRIE nodes on a re-study.  If you change stuff here
3688     * check there too. */
3689     SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3690     regnode_ssc accum;
3691     regnode * const startbranch=scan;
3692
3693     if (flags & SCF_DO_SUBSTR) {
3694      /* Cannot merge strings after this. */
3695      scan_commit(pRExC_state, data, minlenp, is_inf);
3696     }
3697
3698     if (flags & SCF_DO_STCLASS)
3699      ssc_init_zero(pRExC_state, &accum);
3700
3701     while (OP(scan) == code) {
3702      SSize_t deltanext, minnext, fake;
3703      I32 f = 0;
3704      regnode_ssc this_class;
3705
3706      num++;
3707      data_fake.flags = 0;
3708      if (data) {
3709       data_fake.whilem_c = data->whilem_c;
3710       data_fake.last_closep = data->last_closep;
3711      }
3712      else
3713       data_fake.last_closep = &fake;
3714
3715      data_fake.pos_delta = delta;
3716      next = regnext(scan);
3717      scan = NEXTOPER(scan);
3718      if (code != BRANCH)
3719       scan = NEXTOPER(scan);
3720      if (flags & SCF_DO_STCLASS) {
3721       ssc_init(pRExC_state, &this_class);
3722       data_fake.start_class = &this_class;
3723       f = SCF_DO_STCLASS_AND;
3724      }
3725      if (flags & SCF_WHILEM_VISITED_POS)
3726       f |= SCF_WHILEM_VISITED_POS;
3727
3728      /* we suppose the run is continuous, last=next...*/
3729      minnext = study_chunk(pRExC_state, &scan, minlenp,
3730          &deltanext, next, &data_fake, stopparen,
3731          recursed_depth, NULL, f,depth+1);
3732      if (min1 > minnext)
3733       min1 = minnext;
3734      if (deltanext == SSize_t_MAX) {
3735       is_inf = is_inf_internal = 1;
3736       max1 = SSize_t_MAX;
3737      } else if (max1 < minnext + deltanext)
3738       max1 = minnext + deltanext;
3739      scan = next;
3740      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3741       pars++;
3742      if (data_fake.flags & SCF_SEEN_ACCEPT) {
3743       if ( stopmin > minnext)
3744        stopmin = min + min1;
3745       flags &= ~SCF_DO_SUBSTR;
3746       if (data)
3747        data->flags |= SCF_SEEN_ACCEPT;
3748      }
3749      if (data) {
3750       if (data_fake.flags & SF_HAS_EVAL)
3751        data->flags |= SF_HAS_EVAL;
3752       data->whilem_c = data_fake.whilem_c;
3753      }
3754      if (flags & SCF_DO_STCLASS)
3755       ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3756     }
3757     if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3758      min1 = 0;
3759     if (flags & SCF_DO_SUBSTR) {
3760      data->pos_min += min1;
3761      if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3762       data->pos_delta = SSize_t_MAX;
3763      else
3764       data->pos_delta += max1 - min1;
3765      if (max1 != min1 || is_inf)
3766       data->longest = &(data->longest_float);
3767     }
3768     min += min1;
3769     if (delta == SSize_t_MAX
3770     || SSize_t_MAX - delta - (max1 - min1) < 0)
3771      delta = SSize_t_MAX;
3772     else
3773      delta += max1 - min1;
3774     if (flags & SCF_DO_STCLASS_OR) {
3775      ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3776      if (min1) {
3777       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3778       flags &= ~SCF_DO_STCLASS;
3779      }
3780     }
3781     else if (flags & SCF_DO_STCLASS_AND) {
3782      if (min1) {
3783       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3784       flags &= ~SCF_DO_STCLASS;
3785      }
3786      else {
3787       /* Switch to OR mode: cache the old value of
3788       * data->start_class */
3789       INIT_AND_WITHP;
3790       StructCopy(data->start_class, and_withp, regnode_ssc);
3791       flags &= ~SCF_DO_STCLASS_AND;
3792       StructCopy(&accum, data->start_class, regnode_ssc);
3793       flags |= SCF_DO_STCLASS_OR;
3794      }
3795     }
3796
3797     if (PERL_ENABLE_TRIE_OPTIMISATION &&
3798       OP( startbranch ) == BRANCH )
3799     {
3800     /* demq.
3801
3802     Assuming this was/is a branch we are dealing with: 'scan'
3803     now points at the item that follows the branch sequence,
3804     whatever it is. We now start at the beginning of the
3805     sequence and look for subsequences of
3806
3807     BRANCH->EXACT=>x1
3808     BRANCH->EXACT=>x2
3809     tail
3810
3811     which would be constructed from a pattern like
3812     /A|LIST|OF|WORDS/
3813
3814     If we can find such a subsequence we need to turn the first
3815     element into a trie and then add the subsequent branch exact
3816     strings to the trie.
3817
3818     We have two cases
3819
3820      1. patterns where the whole set of branches can be
3821       converted.
3822
3823      2. patterns where only a subset can be converted.
3824
3825     In case 1 we can replace the whole set with a single regop
3826     for the trie. In case 2 we need to keep the start and end
3827     branches so
3828
3829      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3830      becomes BRANCH TRIE; BRANCH X;
3831
3832     There is an additional case, that being where there is a
3833     common prefix, which gets split out into an EXACT like node
3834     preceding the TRIE node.
3835
3836     If x(1..n)==tail then we can do a simple trie, if not we make
3837     a "jump" trie, such that when we match the appropriate word
3838     we "jump" to the appropriate tail node. Essentially we turn
3839     a nested if into a case structure of sorts.
3840
3841     */
3842
3843      int made=0;
3844      if (!re_trie_maxbuff) {
3845       re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3846       if (!SvIOK(re_trie_maxbuff))
3847        sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3848      }
3849      if ( SvIV(re_trie_maxbuff)>=0  ) {
3850       regnode *cur;
3851       regnode *first = (regnode *)NULL;
3852       regnode *last = (regnode *)NULL;
3853       regnode *tail = scan;
3854       U8 trietype = 0;
3855       U32 count=0;
3856
3857 #ifdef DEBUGGING
3858       SV * const mysv = sv_newmortal();   /* for dumping */
3859 #endif
3860       /* var tail is used because there may be a TAIL
3861       regop in the way. Ie, the exacts will point to the
3862       thing following the TAIL, but the last branch will
3863       point at the TAIL. So we advance tail. If we
3864       have nested (?:) we may have to move through several
3865       tails.
3866       */
3867
3868       while ( OP( tail ) == TAIL ) {
3869        /* this is the TAIL generated by (?:) */
3870        tail = regnext( tail );
3871       }
3872
3873
3874       DEBUG_TRIE_COMPILE_r({
3875        regprop(RExC_rx, mysv, tail, NULL);
3876        PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3877        (int)depth * 2 + 2, "",
3878        "Looking for TRIE'able sequences. Tail node is: ",
3879        SvPV_nolen_const( mysv )
3880        );
3881       });
3882
3883       /*
3884
3885        Step through the branches
3886         cur represents each branch,
3887         noper is the first thing to be matched as part
3888          of that branch
3889         noper_next is the regnext() of that node.
3890
3891        We normally handle a case like this
3892        /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3893        support building with NOJUMPTRIE, which restricts
3894        the trie logic to structures like /FOO|BAR/.
3895
3896        If noper is a trieable nodetype then the branch is
3897        a possible optimization target. If we are building
3898        under NOJUMPTRIE then we require that noper_next is
3899        the same as scan (our current position in the regex
3900        program).
3901
3902        Once we have two or more consecutive such branches
3903        we can create a trie of the EXACT's contents and
3904        stitch it in place into the program.
3905
3906        If the sequence represents all of the branches in
3907        the alternation we replace the entire thing with a
3908        single TRIE node.
3909
3910        Otherwise when it is a subsequence we need to
3911        stitch it in place and replace only the relevant
3912        branches. This means the first branch has to remain
3913        as it is used by the alternation logic, and its
3914        next pointer, and needs to be repointed at the item
3915        on the branch chain following the last branch we
3916        have optimized away.
3917
3918        This could be either a BRANCH, in which case the
3919        subsequence is internal, or it could be the item
3920        following the branch sequence in which case the
3921        subsequence is at the end (which does not
3922        necessarily mean the first node is the start of the
3923        alternation).
3924
3925        TRIE_TYPE(X) is a define which maps the optype to a
3926        trietype.
3927
3928         optype          |  trietype
3929         ----------------+-----------
3930         NOTHING         | NOTHING
3931         EXACT           | EXACT
3932         EXACTFU         | EXACTFU
3933         EXACTFU_SS      | EXACTFU
3934         EXACTFA         | EXACTFA
3935
3936
3937       */
3938 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3939      ( EXACT == (X) )   ? EXACT :        \
3940      ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
3941      ( EXACTFA == (X) ) ? EXACTFA :        \
3942      0 )
3943
3944       /* dont use tail as the end marker for this traverse */
3945       for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3946        regnode * const noper = NEXTOPER( cur );
3947        U8 noper_type = OP( noper );
3948        U8 noper_trietype = TRIE_TYPE( noper_type );
3949 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3950        regnode * const noper_next = regnext( noper );
3951        U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3952        U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3953 #endif
3954
3955        DEBUG_TRIE_COMPILE_r({
3956         regprop(RExC_rx, mysv, cur, NULL);
3957         PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3958         (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3959
3960         regprop(RExC_rx, mysv, noper, NULL);
3961         PerlIO_printf( Perl_debug_log, " -> %s",
3962          SvPV_nolen_const(mysv));
3963
3964         if ( noper_next ) {
3965         regprop(RExC_rx, mysv, noper_next, NULL);
3966         PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3967          SvPV_nolen_const(mysv));
3968         }
3969         PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3970         REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3971         PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3972         );
3973        });
3974
3975        /* Is noper a trieable nodetype that can be merged
3976        * with the current trie (if there is one)? */
3977        if ( noper_trietype
3978         &&
3979         (
3980           ( noper_trietype == NOTHING)
3981           || ( trietype == NOTHING )
3982           || ( trietype == noper_trietype )
3983         )
3984 #ifdef NOJUMPTRIE
3985         && noper_next == tail
3986 #endif
3987         && count < U16_MAX)
3988        {
3989         /* Handle mergable triable node Either we are
3990         * the first node in a new trieable sequence,
3991         * in which case we do some bookkeeping,
3992         * otherwise we update the end pointer. */
3993         if ( !first ) {
3994          first = cur;
3995          if ( noper_trietype == NOTHING ) {
3996 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3997           regnode * const noper_next = regnext( noper );
3998           U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3999           U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4000 #endif
4001
4002           if ( noper_next_trietype ) {
4003            trietype = noper_next_trietype;
4004           } else if (noper_next_type)  {
4005            /* a NOTHING regop is 1 regop wide.
4006            * We need at least two for a trie
4007            * so we can't merge this in */
4008            first = NULL;
4009           }
4010          } else {
4011           trietype = noper_trietype;
4012          }
4013         } else {
4014          if ( trietype == NOTHING )
4015           trietype = noper_trietype;
4016          last = cur;
4017         }
4018         if (first)
4019          count++;
4020        } /* end handle mergable triable node */
4021        else {
4022         /* handle unmergable node -
4023         * noper may either be a triable node which can
4024         * not be tried together with the current trie,
4025         * or a non triable node */
4026         if ( last ) {
4027          /* If last is set and trietype is not
4028          * NOTHING then we have found at least two
4029          * triable branch sequences in a row of a
4030          * similar trietype so we can turn them
4031          * into a trie. If/when we allow NOTHING to
4032          * start a trie sequence this condition
4033          * will be required, and it isn't expensive
4034          * so we leave it in for now. */
4035          if ( trietype && trietype != NOTHING )
4036           make_trie( pRExC_state,
4037             startbranch, first, cur, tail,
4038             count, trietype, depth+1 );
4039          last = NULL; /* note: we clear/update
4040              first, trietype etc below,
4041              so we dont do it here */
4042         }
4043         if ( noper_trietype
4044 #ifdef NOJUMPTRIE
4045          && noper_next == tail
4046 #endif
4047         ){
4048          /* noper is triable, so we can start a new
4049          * trie sequence */
4050          count = 1;
4051          first = cur;
4052          trietype = noper_trietype;
4053         } else if (first) {
4054          /* if we already saw a first but the
4055          * current node is not triable then we have
4056          * to reset the first information. */
4057          count = 0;
4058          first = NULL;
4059          trietype = 0;
4060         }
4061        } /* end handle unmergable node */
4062       } /* loop over branches */
4063       DEBUG_TRIE_COMPILE_r({
4064        regprop(RExC_rx, mysv, cur, NULL);
4065        PerlIO_printf( Perl_debug_log,
4066        "%*s- %s (%d) <SCAN FINISHED>\n",
4067        (int)depth * 2 + 2,
4068        "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4069
4070       });
4071       if ( last && trietype ) {
4072        if ( trietype != NOTHING ) {
4073         /* the last branch of the sequence was part of
4074         * a trie, so we have to construct it here
4075         * outside of the loop */
4076         made= make_trie( pRExC_state, startbranch,
4077             first, scan, tail, count,
4078             trietype, depth+1 );
4079 #ifdef TRIE_STUDY_OPT
4080         if ( ((made == MADE_EXACT_TRIE &&
4081          startbranch == first)
4082          || ( first_non_open == first )) &&
4083          depth==0 ) {
4084          flags |= SCF_TRIE_RESTUDY;
4085          if ( startbranch == first
4086           && scan == tail )
4087          {
4088           RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4089          }
4090         }
4091 #endif
4092        } else {
4093         /* at this point we know whatever we have is a
4094         * NOTHING sequence/branch AND if 'startbranch'
4095         * is 'first' then we can turn the whole thing
4096         * into a NOTHING
4097         */
4098         if ( startbranch == first ) {
4099          regnode *opt;
4100          /* the entire thing is a NOTHING sequence,
4101          * something like this: (?:|) So we can
4102          * turn it into a plain NOTHING op. */
4103          DEBUG_TRIE_COMPILE_r({
4104           regprop(RExC_rx, mysv, cur, NULL);
4105           PerlIO_printf( Perl_debug_log,
4106           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4107           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4108
4109          });
4110          OP(startbranch)= NOTHING;
4111          NEXT_OFF(startbranch)= tail - startbranch;
4112          for ( opt= startbranch + 1; opt < tail ; opt++ )
4113           OP(opt)= OPTIMIZED;
4114         }
4115        }
4116       } /* end if ( last) */
4117      } /* TRIE_MAXBUF is non zero */
4118
4119     } /* do trie */
4120
4121    }
4122    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4123     scan = NEXTOPER(NEXTOPER(scan));
4124    } else   /* single branch is optimized. */
4125     scan = NEXTOPER(scan);
4126    continue;
4127   } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4128    scan_frame *newframe = NULL;
4129    I32 paren;
4130    regnode *start;
4131    regnode *end;
4132    U32 my_recursed_depth= recursed_depth;
4133
4134    if (OP(scan) != SUSPEND) {
4135     /* set the pointer */
4136     if (OP(scan) == GOSUB) {
4137      paren = ARG(scan);
4138      RExC_recurse[ARG2L(scan)] = scan;
4139      start = RExC_open_parens[paren-1];
4140      end   = RExC_close_parens[paren-1];
4141     } else {
4142      paren = 0;
4143      start = RExC_rxi->program + 1;
4144      end   = RExC_opend;
4145     }
4146     if (!recursed_depth
4147      ||
4148      !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4149     ) {
4150      if (!recursed_depth) {
4151       Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4152      } else {
4153       Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4154        RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4155        RExC_study_chunk_recursed_bytes, U8);
4156      }
4157      /* we havent recursed into this paren yet, so recurse into it */
4158      DEBUG_STUDYDATA("set:", data,depth);
4159      PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4160      my_recursed_depth= recursed_depth + 1;
4161      Newx(newframe,1,scan_frame);
4162     } else {
4163      DEBUG_STUDYDATA("inf:", data,depth);
4164      /* some form of infinite recursion, assume infinite length
4165      * */
4166      if (flags & SCF_DO_SUBSTR) {
4167       scan_commit(pRExC_state, data, minlenp, is_inf);
4168       data->longest = &(data->longest_float);
4169      }
4170      is_inf = is_inf_internal = 1;
4171      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4172       ssc_anything(data->start_class);
4173      flags &= ~SCF_DO_STCLASS;
4174     }
4175    } else {
4176     Newx(newframe,1,scan_frame);
4177     paren = stopparen;
4178     start = scan+2;
4179     end = regnext(scan);
4180    }
4181    if (newframe) {
4182     assert(start);
4183     assert(end);
4184     SAVEFREEPV(newframe);
4185     newframe->next = regnext(scan);
4186     newframe->last = last;
4187     newframe->stop = stopparen;
4188     newframe->prev = frame;
4189     newframe->prev_recursed_depth = recursed_depth;
4190
4191     DEBUG_STUDYDATA("frame-new:",data,depth);
4192     DEBUG_PEEP("fnew", scan, depth);
4193
4194     frame = newframe;
4195     scan =  start;
4196     stopparen = paren;
4197     last = end;
4198     depth = depth + 1;
4199     recursed_depth= my_recursed_depth;
4200
4201     continue;
4202    }
4203   }
4204   else if (OP(scan) == EXACT) {
4205    SSize_t l = STR_LEN(scan);
4206    UV uc;
4207    if (UTF) {
4208     const U8 * const s = (U8*)STRING(scan);
4209     uc = utf8_to_uvchr_buf(s, s + l, NULL);
4210     l = utf8_length(s, s + l);
4211    } else {
4212     uc = *((U8*)STRING(scan));
4213    }
4214    min += l;
4215    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4216     /* The code below prefers earlier match for fixed
4217     offset, later match for variable offset.  */
4218     if (data->last_end == -1) { /* Update the start info. */
4219      data->last_start_min = data->pos_min;
4220      data->last_start_max = is_inf
4221       ? SSize_t_MAX : data->pos_min + data->pos_delta;
4222     }
4223     sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4224     if (UTF)
4225      SvUTF8_on(data->last_found);
4226     {
4227      SV * const sv = data->last_found;
4228      MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4229       mg_find(sv, PERL_MAGIC_utf8) : NULL;
4230      if (mg && mg->mg_len >= 0)
4231       mg->mg_len += utf8_length((U8*)STRING(scan),
4232            (U8*)STRING(scan)+STR_LEN(scan));
4233     }
4234     data->last_end = data->pos_min + l;
4235     data->pos_min += l; /* As in the first entry. */
4236     data->flags &= ~SF_BEFORE_EOL;
4237    }
4238
4239    /* ANDing the code point leaves at most it, and not in locale, and
4240    * can't match null string */
4241    if (flags & SCF_DO_STCLASS_AND) {
4242     ssc_cp_and(data->start_class, uc);
4243     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4244     ssc_clear_locale(data->start_class);
4245    }
4246    else if (flags & SCF_DO_STCLASS_OR) {
4247     ssc_add_cp(data->start_class, uc);
4248     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4249
4250     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4251     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4252    }
4253    flags &= ~SCF_DO_STCLASS;
4254   }
4255   else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
4256    SSize_t l = STR_LEN(scan);
4257    UV uc = *((U8*)STRING(scan));
4258    SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4259              separate code points */
4260
4261    /* Search for fixed substrings supports EXACT only. */
4262    if (flags & SCF_DO_SUBSTR) {
4263     assert(data);
4264     scan_commit(pRExC_state, data, minlenp, is_inf);
4265    }
4266    if (UTF) {
4267     const U8 * const s = (U8 *)STRING(scan);
4268     uc = utf8_to_uvchr_buf(s, s + l, NULL);
4269     l = utf8_length(s, s + l);
4270    }
4271    if (unfolded_multi_char) {
4272     RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4273    }
4274    min += l - min_subtract;
4275    assert (min >= 0);
4276    delta += min_subtract;
4277    if (flags & SCF_DO_SUBSTR) {
4278     data->pos_min += l - min_subtract;
4279     if (data->pos_min < 0) {
4280      data->pos_min = 0;
4281     }
4282     data->pos_delta += min_subtract;
4283     if (min_subtract) {
4284      data->longest = &(data->longest_float);
4285     }
4286    }
4287    if (OP(scan) == EXACTFL) {
4288
4289     /* We don't know what the folds are; it could be anything. XXX
4290     * Actually, we only support UTF-8 encoding for code points
4291     * above Latin1, so we could know what those folds are. */
4292     EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4293              0,
4294              UV_MAX);
4295    }
4296    else {  /* Non-locale EXACTFish */
4297     EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4298     if (flags & SCF_DO_STCLASS_AND) {
4299      ssc_clear_locale(data->start_class);
4300     }
4301     if (uc < 256) { /* We know what the Latin1 folds are ... */
4302      if (IS_IN_SOME_FOLD_L1(uc)) {   /* For instance, we
4303              know if anything folds
4304              with this */
4305       EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4306               PL_fold_latin1[uc]);
4307       if (OP(scan) != EXACTFA) { /* The folds below aren't
4308              legal under /iaa */
4309        if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
4310         EXACTF_invlist
4311          = add_cp_to_invlist(EXACTF_invlist,
4312             LATIN_SMALL_LETTER_SHARP_S);
4313        }
4314        else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
4315         EXACTF_invlist
4316          = add_cp_to_invlist(EXACTF_invlist, 's');
4317         EXACTF_invlist
4318          = add_cp_to_invlist(EXACTF_invlist, 'S');
4319        }
4320       }
4321
4322       /* We also know if there are above-Latin1 code points
4323       * that fold to this (none legal for ASCII and /iaa) */
4324       if ((! isASCII(uc) || OP(scan) != EXACTFA)
4325        && HAS_NONLATIN1_FOLD_CLOSURE(uc))
4326       {
4327        /* XXX We could know exactly what does fold to this
4328        * if the reverse folds are loaded, as currently in
4329        * S_regclass() */
4330        _invlist_union(EXACTF_invlist,
4331           PL_AboveLatin1,
4332           &EXACTF_invlist);
4333       }
4334      }
4335     }
4336     else {  /* Non-locale, above Latin1.  XXX We don't currently
4337       know what participates in folds with this, so have
4338       to assume anything could */
4339
4340      /* XXX We could know exactly what does fold to this if the
4341      * reverse folds are loaded, as currently in S_regclass().
4342      * But we do know that under /iaa nothing in the ASCII
4343      * range can participate */
4344      if (OP(scan) == EXACTFA) {
4345       _invlist_union_complement_2nd(EXACTF_invlist,
4346              PL_XPosix_ptrs[_CC_ASCII],
4347              &EXACTF_invlist);
4348      }
4349      else {
4350       EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4351                0, UV_MAX);
4352      }
4353     }
4354    }
4355    if (flags & SCF_DO_STCLASS_AND) {
4356     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4357     ANYOF_POSIXL_ZERO(data->start_class);
4358     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4359    }
4360    else if (flags & SCF_DO_STCLASS_OR) {
4361     ssc_union(data->start_class, EXACTF_invlist, FALSE);
4362     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4363
4364     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4365     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4366    }
4367    flags &= ~SCF_DO_STCLASS;
4368    SvREFCNT_dec(EXACTF_invlist);
4369   }
4370   else if (REGNODE_VARIES(OP(scan))) {
4371    SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4372    I32 fl = 0, f = flags;
4373    regnode * const oscan = scan;
4374    regnode_ssc this_class;
4375    regnode_ssc *oclass = NULL;
4376    I32 next_is_eval = 0;
4377
4378    switch (PL_regkind[OP(scan)]) {
4379    case WHILEM:  /* End of (?:...)* . */
4380     scan = NEXTOPER(scan);
4381     goto finish;
4382    case PLUS:
4383     if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4384      next = NEXTOPER(scan);
4385      if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4386       mincount = 1;
4387       maxcount = REG_INFTY;
4388       next = regnext(scan);
4389       scan = NEXTOPER(scan);
4390       goto do_curly;
4391      }
4392     }
4393     if (flags & SCF_DO_SUBSTR)
4394      data->pos_min++;
4395     min++;
4396     /* Fall through. */
4397    case STAR:
4398     if (flags & SCF_DO_STCLASS) {
4399      mincount = 0;
4400      maxcount = REG_INFTY;
4401      next = regnext(scan);
4402      scan = NEXTOPER(scan);
4403      goto do_curly;
4404     }
4405     if (flags & SCF_DO_SUBSTR) {
4406      scan_commit(pRExC_state, data, minlenp, is_inf);
4407      /* Cannot extend fixed substrings */
4408      data->longest = &(data->longest_float);
4409     }
4410     is_inf = is_inf_internal = 1;
4411     scan = regnext(scan);
4412     goto optimize_curly_tail;
4413    case CURLY:
4414     if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4415      && (scan->flags == stopparen))
4416     {
4417      mincount = 1;
4418      maxcount = 1;
4419     } else {
4420      mincount = ARG1(scan);
4421      maxcount = ARG2(scan);
4422     }
4423     next = regnext(scan);
4424     if (OP(scan) == CURLYX) {
4425      I32 lp = (data ? *(data->last_closep) : 0);
4426      scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4427     }
4428     scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4429     next_is_eval = (OP(scan) == EVAL);
4430    do_curly:
4431     if (flags & SCF_DO_SUBSTR) {
4432      if (mincount == 0)
4433       scan_commit(pRExC_state, data, minlenp, is_inf);
4434      /* Cannot extend fixed substrings */
4435      pos_before = data->pos_min;
4436     }
4437     if (data) {
4438      fl = data->flags;
4439      data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4440      if (is_inf)
4441       data->flags |= SF_IS_INF;
4442     }
4443     if (flags & SCF_DO_STCLASS) {
4444      ssc_init(pRExC_state, &this_class);
4445      oclass = data->start_class;
4446      data->start_class = &this_class;
4447      f |= SCF_DO_STCLASS_AND;
4448      f &= ~SCF_DO_STCLASS_OR;
4449     }
4450     /* Exclude from super-linear cache processing any {n,m}
4451     regops for which the combination of input pos and regex
4452     pos is not enough information to determine if a match
4453     will be possible.
4454
4455     For example, in the regex /foo(bar\s*){4,8}baz/ with the
4456     regex pos at the \s*, the prospects for a match depend not
4457     only on the input position but also on how many (bar\s*)
4458     repeats into the {4,8} we are. */
4459    if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4460      f &= ~SCF_WHILEM_VISITED_POS;
4461
4462     /* This will finish on WHILEM, setting scan, or on NULL: */
4463     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4464         last, data, stopparen, recursed_depth, NULL,
4465         (mincount == 0
4466         ? (f & ~SCF_DO_SUBSTR)
4467         : f)
4468         ,depth+1);
4469
4470     if (flags & SCF_DO_STCLASS)
4471      data->start_class = oclass;
4472     if (mincount == 0 || minnext == 0) {
4473      if (flags & SCF_DO_STCLASS_OR) {
4474       ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4475      }
4476      else if (flags & SCF_DO_STCLASS_AND) {
4477       /* Switch to OR mode: cache the old value of
4478       * data->start_class */
4479       INIT_AND_WITHP;
4480       StructCopy(data->start_class, and_withp, regnode_ssc);
4481       flags &= ~SCF_DO_STCLASS_AND;
4482       StructCopy(&this_class, data->start_class, regnode_ssc);
4483       flags |= SCF_DO_STCLASS_OR;
4484       ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4485      }
4486     } else {  /* Non-zero len */
4487      if (flags & SCF_DO_STCLASS_OR) {
4488       ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4489       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4490      }
4491      else if (flags & SCF_DO_STCLASS_AND)
4492       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4493      flags &= ~SCF_DO_STCLASS;
4494     }
4495     if (!scan)   /* It was not CURLYX, but CURLY. */
4496      scan = next;
4497     if (!(flags & SCF_TRIE_DOING_RESTUDY)
4498      /* ? quantifier ok, except for (?{ ... }) */
4499      && (next_is_eval || !(mincount == 0 && maxcount == 1))
4500      && (minnext == 0) && (deltanext == 0)
4501      && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4502      && maxcount <= REG_INFTY/3) /* Complement check for big
4503             count */
4504     {
4505      /* Fatal warnings may leak the regexp without this: */
4506      SAVEFREESV(RExC_rx_sv);
4507      ckWARNreg(RExC_parse,
4508        "Quantifier unexpected on zero-length expression");
4509      (void)ReREFCNT_inc(RExC_rx_sv);
4510     }
4511
4512     min += minnext * mincount;
4513     is_inf_internal |= deltanext == SSize_t_MAX
4514       || (maxcount == REG_INFTY && minnext + deltanext > 0);
4515     is_inf |= is_inf_internal;
4516     if (is_inf) {
4517      delta = SSize_t_MAX;
4518     } else {
4519      delta += (minnext + deltanext) * maxcount
4520        - minnext * mincount;
4521     }
4522     /* Try powerful optimization CURLYX => CURLYN. */
4523     if (  OP(oscan) == CURLYX && data
4524      && data->flags & SF_IN_PAR
4525      && !(data->flags & SF_HAS_EVAL)
4526      && !deltanext && minnext == 1 ) {
4527      /* Try to optimize to CURLYN.  */
4528      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4529      regnode * const nxt1 = nxt;
4530 #ifdef DEBUGGING
4531      regnode *nxt2;
4532 #endif
4533
4534      /* Skip open. */
4535      nxt = regnext(nxt);
4536      if (!REGNODE_SIMPLE(OP(nxt))
4537       && !(PL_regkind[OP(nxt)] == EXACT
4538        && STR_LEN(nxt) == 1))
4539       goto nogo;
4540 #ifdef DEBUGGING
4541      nxt2 = nxt;
4542 #endif
4543      nxt = regnext(nxt);
4544      if (OP(nxt) != CLOSE)
4545       goto nogo;
4546      if (RExC_open_parens) {
4547       RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4548       RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4549      }
4550      /* Now we know that nxt2 is the only contents: */
4551      oscan->flags = (U8)ARG(nxt);
4552      OP(oscan) = CURLYN;
4553      OP(nxt1) = NOTHING; /* was OPEN. */
4554
4555 #ifdef DEBUGGING
4556      OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4557      NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4558      NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4559      OP(nxt) = OPTIMIZED; /* was CLOSE. */
4560      OP(nxt + 1) = OPTIMIZED; /* was count. */
4561      NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4562 #endif
4563     }
4564    nogo:
4565
4566     /* Try optimization CURLYX => CURLYM. */
4567     if (  OP(oscan) == CURLYX && data
4568      && !(data->flags & SF_HAS_PAR)
4569      && !(data->flags & SF_HAS_EVAL)
4570      && !deltanext /* atom is fixed width */
4571      && minnext != 0 /* CURLYM can't handle zero width */
4572
4573       /* Nor characters whose fold at run-time may be
4574       * multi-character */
4575      && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4576     ) {
4577      /* XXXX How to optimize if data == 0? */
4578      /* Optimize to a simpler form.  */
4579      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4580      regnode *nxt2;
4581
4582      OP(oscan) = CURLYM;
4583      while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4584        && (OP(nxt2) != WHILEM))
4585       nxt = nxt2;
4586      OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4587      /* Need to optimize away parenths. */
4588      if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4589       /* Set the parenth number.  */
4590       regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4591
4592       oscan->flags = (U8)ARG(nxt);
4593       if (RExC_open_parens) {
4594        RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4595        RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4596       }
4597       OP(nxt1) = OPTIMIZED; /* was OPEN. */
4598       OP(nxt) = OPTIMIZED; /* was CLOSE. */
4599
4600 #ifdef DEBUGGING
4601       OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4602       OP(nxt + 1) = OPTIMIZED; /* was count. */
4603       NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4604       NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4605 #endif
4606 #if 0
4607       while ( nxt1 && (OP(nxt1) != WHILEM)) {
4608        regnode *nnxt = regnext(nxt1);
4609        if (nnxt == nxt) {
4610         if (reg_off_by_arg[OP(nxt1)])
4611          ARG_SET(nxt1, nxt2 - nxt1);
4612         else if (nxt2 - nxt1 < U16_MAX)
4613          NEXT_OFF(nxt1) = nxt2 - nxt1;
4614         else
4615          OP(nxt) = NOTHING; /* Cannot beautify */
4616        }
4617        nxt1 = nnxt;
4618       }
4619 #endif
4620       /* Optimize again: */
4621       study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4622          NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4623      }
4624      else
4625       oscan->flags = 0;
4626     }
4627     else if ((OP(oscan) == CURLYX)
4628       && (flags & SCF_WHILEM_VISITED_POS)
4629       /* See the comment on a similar expression above.
4630        However, this time it's not a subexpression
4631        we care about, but the expression itself. */
4632       && (maxcount == REG_INFTY)
4633       && data && ++data->whilem_c < 16) {
4634      /* This stays as CURLYX, we can put the count/of pair. */
4635      /* Find WHILEM (as in regexec.c) */
4636      regnode *nxt = oscan + NEXT_OFF(oscan);
4637
4638      if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4639       nxt += ARG(nxt);
4640      PREVOPER(nxt)->flags = (U8)(data->whilem_c
4641       | (RExC_whilem_seen << 4)); /* On WHILEM */
4642     }
4643     if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4644      pars++;
4645     if (flags & SCF_DO_SUBSTR) {
4646      SV *last_str = NULL;
4647      STRLEN last_chrs = 0;
4648      int counted = mincount != 0;
4649
4650      if (data->last_end > 0 && mincount != 0) { /* Ends with a
4651                 string. */
4652       SSize_t b = pos_before >= data->last_start_min
4653        ? pos_before : data->last_start_min;
4654       STRLEN l;
4655       const char * const s = SvPV_const(data->last_found, l);
4656       SSize_t old = b - data->last_start_min;
4657
4658       if (UTF)
4659        old = utf8_hop((U8*)s, old) - (U8*)s;
4660       l -= old;
4661       /* Get the added string: */
4662       last_str = newSVpvn_utf8(s  + old, l, UTF);
4663       last_chrs = UTF ? utf8_length((U8*)(s + old),
4664            (U8*)(s + old + l)) : l;
4665       if (deltanext == 0 && pos_before == b) {
4666        /* What was added is a constant string */
4667        if (mincount > 1) {
4668
4669         SvGROW(last_str, (mincount * l) + 1);
4670         repeatcpy(SvPVX(last_str) + l,
4671           SvPVX_const(last_str), l,
4672           mincount - 1);
4673         SvCUR_set(last_str, SvCUR(last_str) * mincount);
4674         /* Add additional parts. */
4675         SvCUR_set(data->last_found,
4676           SvCUR(data->last_found) - l);
4677         sv_catsv(data->last_found, last_str);
4678         {
4679          SV * sv = data->last_found;
4680          MAGIC *mg =
4681           SvUTF8(sv) && SvMAGICAL(sv) ?
4682           mg_find(sv, PERL_MAGIC_utf8) : NULL;
4683          if (mg && mg->mg_len >= 0)
4684           mg->mg_len += last_chrs * (mincount-1);
4685         }
4686         last_chrs *= mincount;
4687         data->last_end += l * (mincount - 1);
4688        }
4689       } else {
4690        /* start offset must point into the last copy */
4691        data->last_start_min += minnext * (mincount - 1);
4692        data->last_start_max += is_inf ? SSize_t_MAX
4693         : (maxcount - 1) * (minnext + data->pos_delta);
4694       }
4695      }
4696      /* It is counted once already... */
4697      data->pos_min += minnext * (mincount - counted);
4698 #if 0
4699 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4700        " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4701        " maxcount=%"UVdf" mincount=%"UVdf"\n",
4702  (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4703  (UV)mincount);
4704 if (deltanext != SSize_t_MAX)
4705 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4706  (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4707   - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4708 #endif
4709      if (deltanext == SSize_t_MAX
4710       || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4711       data->pos_delta = SSize_t_MAX;
4712      else
4713       data->pos_delta += - counted * deltanext +
4714       (minnext + deltanext) * maxcount - minnext * mincount;
4715      if (mincount != maxcount) {
4716       /* Cannot extend fixed substrings found inside
4717        the group.  */
4718       scan_commit(pRExC_state, data, minlenp, is_inf);
4719       if (mincount && last_str) {
4720        SV * const sv = data->last_found;
4721        MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4722         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4723
4724        if (mg)
4725         mg->mg_len = -1;
4726        sv_setsv(sv, last_str);
4727        data->last_end = data->pos_min;
4728        data->last_start_min = data->pos_min - last_chrs;
4729        data->last_start_max = is_inf
4730         ? SSize_t_MAX
4731         : data->pos_min + data->pos_delta - last_chrs;
4732       }
4733       data->longest = &(data->longest_float);
4734      }
4735      SvREFCNT_dec(last_str);
4736     }
4737     if (data && (fl & SF_HAS_EVAL))
4738      data->flags |= SF_HAS_EVAL;
4739    optimize_curly_tail:
4740     if (OP(oscan) != CURLYX) {
4741      while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4742       && NEXT_OFF(next))
4743       NEXT_OFF(oscan) += NEXT_OFF(next);
4744     }
4745     continue;
4746
4747    default:
4748 #ifdef DEBUGGING
4749     Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4750                  OP(scan));
4751 #endif
4752    case REF:
4753    case CLUMP:
4754     if (flags & SCF_DO_SUBSTR) {
4755      /* Cannot expect anything... */
4756      scan_commit(pRExC_state, data, minlenp, is_inf);
4757      data->longest = &(data->longest_float);
4758     }
4759     is_inf = is_inf_internal = 1;
4760     if (flags & SCF_DO_STCLASS_OR) {
4761      if (OP(scan) == CLUMP) {
4762       /* Actually is any start char, but very few code points
4763       * aren't start characters */
4764       ssc_match_all_cp(data->start_class);
4765      }
4766      else {
4767       ssc_anything(data->start_class);
4768      }
4769     }
4770     flags &= ~SCF_DO_STCLASS;
4771     break;
4772    }
4773   }
4774   else if (OP(scan) == LNBREAK) {
4775    if (flags & SCF_DO_STCLASS) {
4776      if (flags & SCF_DO_STCLASS_AND) {
4777      ssc_intersection(data->start_class,
4778          PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4779      ssc_clear_locale(data->start_class);
4780      ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4781     }
4782     else if (flags & SCF_DO_STCLASS_OR) {
4783      ssc_union(data->start_class,
4784        PL_XPosix_ptrs[_CC_VERTSPACE],
4785        FALSE);
4786      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4787
4788      /* See commit msg for
4789      * 749e076fceedeb708a624933726e7989f2302f6a */
4790      ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4791     }
4792     flags &= ~SCF_DO_STCLASS;
4793    }
4794    min++;
4795    delta++;    /* Because of the 2 char string cr-lf */
4796    if (flags & SCF_DO_SUBSTR) {
4797     /* Cannot expect anything... */
4798     scan_commit(pRExC_state, data, minlenp, is_inf);
4799      data->pos_min += 1;
4800     data->pos_delta += 1;
4801     data->longest = &(data->longest_float);
4802     }
4803   }
4804   else if (REGNODE_SIMPLE(OP(scan))) {
4805
4806    if (flags & SCF_DO_SUBSTR) {
4807     scan_commit(pRExC_state, data, minlenp, is_inf);
4808     data->pos_min++;
4809    }
4810    min++;
4811    if (flags & SCF_DO_STCLASS) {
4812     bool invert = 0;
4813     SV* my_invlist = sv_2mortal(_new_invlist(0));
4814     U8 namedclass;
4815
4816     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4817     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4818
4819     /* Some of the logic below assumes that switching
4820     locale on will only add false positives. */
4821     switch (OP(scan)) {
4822
4823     default:
4824 #ifdef DEBUGGING
4825     Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4826                  OP(scan));
4827 #endif
4828     case CANY:
4829     case SANY:
4830      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4831       ssc_match_all_cp(data->start_class);
4832      break;
4833
4834     case REG_ANY:
4835      {
4836       SV* REG_ANY_invlist = _new_invlist(2);
4837       REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4838                '\n');
4839       if (flags & SCF_DO_STCLASS_OR) {
4840        ssc_union(data->start_class,
4841          REG_ANY_invlist,
4842          TRUE /* TRUE => invert, hence all but \n
4843            */
4844          );
4845       }
4846       else if (flags & SCF_DO_STCLASS_AND) {
4847        ssc_intersection(data->start_class,
4848            REG_ANY_invlist,
4849            TRUE  /* TRUE => invert */
4850            );
4851        ssc_clear_locale(data->start_class);
4852       }
4853       SvREFCNT_dec_NN(REG_ANY_invlist);
4854      }
4855      break;
4856
4857     case ANYOF:
4858      if (flags & SCF_DO_STCLASS_AND)
4859       ssc_and(pRExC_state, data->start_class,
4860         (regnode_charclass *) scan);
4861      else
4862       ssc_or(pRExC_state, data->start_class,
4863               (regnode_charclass *) scan);
4864      break;
4865
4866     case NPOSIXL:
4867      invert = 1;
4868      /* FALL THROUGH */
4869
4870     case POSIXL:
4871      namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4872      if (flags & SCF_DO_STCLASS_AND) {
4873       bool was_there = cBOOL(
4874           ANYOF_POSIXL_TEST(data->start_class,
4875                 namedclass));
4876       ANYOF_POSIXL_ZERO(data->start_class);
4877       if (was_there) {    /* Do an AND */
4878        ANYOF_POSIXL_SET(data->start_class, namedclass);
4879       }
4880       /* No individual code points can now match */
4881       data->start_class->invlist
4882             = sv_2mortal(_new_invlist(0));
4883      }
4884      else {
4885       int complement = namedclass + ((invert) ? -1 : 1);
4886
4887       assert(flags & SCF_DO_STCLASS_OR);
4888
4889       /* If the complement of this class was already there,
4890       * the result is that they match all code points,
4891       * (\d + \D == everything).  Remove the classes from
4892       * future consideration.  Locale is not relevant in
4893       * this case */
4894       if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4895        ssc_match_all_cp(data->start_class);
4896        ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4897        ANYOF_POSIXL_CLEAR(data->start_class, complement);
4898       }
4899       else {  /* The usual case; just add this class to the
4900         existing set */
4901        ANYOF_POSIXL_SET(data->start_class, namedclass);
4902       }
4903      }
4904      break;
4905
4906     case NPOSIXA:   /* For these, we always know the exact set of
4907         what's matched */
4908      invert = 1;
4909      /* FALL THROUGH */
4910     case POSIXA:
4911      if (FLAGS(scan) == _CC_ASCII) {
4912       my_invlist = PL_XPosix_ptrs[_CC_ASCII];
4913      }
4914      else {
4915       _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
4916            PL_XPosix_ptrs[_CC_ASCII],
4917            &my_invlist);
4918      }
4919      goto join_posix;
4920
4921     case NPOSIXD:
4922     case NPOSIXU:
4923      invert = 1;
4924      /* FALL THROUGH */
4925     case POSIXD:
4926     case POSIXU:
4927      my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
4928
4929      /* NPOSIXD matches all upper Latin1 code points unless the
4930      * target string being matched is UTF-8, which is
4931      * unknowable until match time.  Since we are going to
4932      * invert, we want to get rid of all of them so that the
4933      * inversion will match all */
4934      if (OP(scan) == NPOSIXD) {
4935       _invlist_subtract(my_invlist, PL_UpperLatin1,
4936           &my_invlist);
4937      }
4938
4939     join_posix:
4940
4941      if (flags & SCF_DO_STCLASS_AND) {
4942       ssc_intersection(data->start_class, my_invlist, invert);
4943       ssc_clear_locale(data->start_class);
4944      }
4945      else {
4946       assert(flags & SCF_DO_STCLASS_OR);
4947       ssc_union(data->start_class, my_invlist, invert);
4948      }
4949     }
4950     if (flags & SCF_DO_STCLASS_OR)
4951      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4952     flags &= ~SCF_DO_STCLASS;
4953    }
4954   }
4955   else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4956    data->flags |= (OP(scan) == MEOL
4957        ? SF_BEFORE_MEOL
4958        : SF_BEFORE_SEOL);
4959    scan_commit(pRExC_state, data, minlenp, is_inf);
4960
4961   }
4962   else if (  PL_regkind[OP(scan)] == BRANCHJ
4963     /* Lookbehind, or need to calculate parens/evals/stclass: */
4964     && (scan->flags || data || (flags & SCF_DO_STCLASS))
4965     && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4966    if ( OP(scan) == UNLESSM &&
4967     scan->flags == 0 &&
4968     OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4969     OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4970    ) {
4971     regnode *opt;
4972     regnode *upto= regnext(scan);
4973     DEBUG_PARSE_r({
4974      SV * const mysv_val=sv_newmortal();
4975      DEBUG_STUDYDATA("OPFAIL",data,depth);
4976
4977      /*DEBUG_PARSE_MSG("opfail");*/
4978      regprop(RExC_rx, mysv_val, upto, NULL);
4979      PerlIO_printf(Perl_debug_log,
4980       "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4981       SvPV_nolen_const(mysv_val),
4982       (IV)REG_NODE_NUM(upto),
4983       (IV)(upto - scan)
4984      );
4985     });
4986     OP(scan) = OPFAIL;
4987     NEXT_OFF(scan) = upto - scan;
4988     for (opt= scan + 1; opt < upto ; opt++)
4989      OP(opt) = OPTIMIZED;
4990     scan= upto;
4991     continue;
4992    }
4993    if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4994     || OP(scan) == UNLESSM )
4995    {
4996     /* Negative Lookahead/lookbehind
4997     In this case we can't do fixed string optimisation.
4998     */
4999
5000     SSize_t deltanext, minnext, fake = 0;
5001     regnode *nscan;
5002     regnode_ssc intrnl;
5003     int f = 0;
5004
5005     data_fake.flags = 0;
5006     if (data) {
5007      data_fake.whilem_c = data->whilem_c;
5008      data_fake.last_closep = data->last_closep;
5009     }
5010     else
5011      data_fake.last_closep = &fake;
5012     data_fake.pos_delta = delta;
5013     if ( flags & SCF_DO_STCLASS && !scan->flags
5014      && OP(scan) == IFMATCH ) { /* Lookahead */
5015      ssc_init(pRExC_state, &intrnl);
5016      data_fake.start_class = &intrnl;
5017      f |= SCF_DO_STCLASS_AND;
5018     }
5019     if (flags & SCF_WHILEM_VISITED_POS)
5020      f |= SCF_WHILEM_VISITED_POS;
5021     next = regnext(scan);
5022     nscan = NEXTOPER(NEXTOPER(scan));
5023     minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5024          last, &data_fake, stopparen,
5025          recursed_depth, NULL, f, depth+1);
5026     if (scan->flags) {
5027      if (deltanext) {
5028       FAIL("Variable length lookbehind not implemented");
5029      }
5030      else if (minnext > (I32)U8_MAX) {
5031       FAIL2("Lookbehind longer than %"UVuf" not implemented",
5032        (UV)U8_MAX);
5033      }
5034      scan->flags = (U8)minnext;
5035     }
5036     if (data) {
5037      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5038       pars++;
5039      if (data_fake.flags & SF_HAS_EVAL)
5040       data->flags |= SF_HAS_EVAL;
5041      data->whilem_c = data_fake.whilem_c;
5042     }
5043     if (f & SCF_DO_STCLASS_AND) {
5044      if (flags & SCF_DO_STCLASS_OR) {
5045       /* OR before, AND after: ideally we would recurse with
5046       * data_fake to get the AND applied by study of the
5047       * remainder of the pattern, and then derecurse;
5048       * *** HACK *** for now just treat as "no information".
5049       * See [perl #56690].
5050       */
5051       ssc_init(pRExC_state, data->start_class);
5052      }  else {
5053       /* AND before and after: combine and continue.  These
5054       * assertions are zero-length, so can match an EMPTY
5055       * string */
5056       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5057       ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
5058      }
5059     }
5060    }
5061 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5062    else {
5063     /* Positive Lookahead/lookbehind
5064     In this case we can do fixed string optimisation,
5065     but we must be careful about it. Note in the case of
5066     lookbehind the positions will be offset by the minimum
5067     length of the pattern, something we won't know about
5068     until after the recurse.
5069     */
5070     SSize_t deltanext, fake = 0;
5071     regnode *nscan;
5072     regnode_ssc intrnl;
5073     int f = 0;
5074     /* We use SAVEFREEPV so that when the full compile
5075      is finished perl will clean up the allocated
5076      minlens when it's all done. This way we don't
5077      have to worry about freeing them when we know
5078      they wont be used, which would be a pain.
5079     */
5080     SSize_t *minnextp;
5081     Newx( minnextp, 1, SSize_t );
5082     SAVEFREEPV(minnextp);
5083
5084     if (data) {
5085      StructCopy(data, &data_fake, scan_data_t);
5086      if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5087       f |= SCF_DO_SUBSTR;
5088       if (scan->flags)
5089        scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5090       data_fake.last_found=newSVsv(data->last_found);
5091      }
5092     }
5093     else
5094      data_fake.last_closep = &fake;
5095     data_fake.flags = 0;
5096     data_fake.pos_delta = delta;
5097     if (is_inf)
5098      data_fake.flags |= SF_IS_INF;
5099     if ( flags & SCF_DO_STCLASS && !scan->flags
5100      && OP(scan) == IFMATCH ) { /* Lookahead */
5101      ssc_init(pRExC_state, &intrnl);
5102      data_fake.start_class = &intrnl;
5103      f |= SCF_DO_STCLASS_AND;
5104     }
5105     if (flags & SCF_WHILEM_VISITED_POS)
5106      f |= SCF_WHILEM_VISITED_POS;
5107     next = regnext(scan);
5108     nscan = NEXTOPER(NEXTOPER(scan));
5109
5110     *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5111           &deltanext, last, &data_fake,
5112           stopparen, recursed_depth, NULL,
5113           f,depth+1);
5114     if (scan->flags) {
5115      if (deltanext) {
5116       FAIL("Variable length lookbehind not implemented");
5117      }
5118      else if (*minnextp > (I32)U8_MAX) {
5119       FAIL2("Lookbehind longer than %"UVuf" not implemented",
5120        (UV)U8_MAX);
5121      }
5122      scan->flags = (U8)*minnextp;
5123     }
5124
5125     *minnextp += min;
5126
5127     if (f & SCF_DO_STCLASS_AND) {
5128      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5129      ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
5130     }
5131     if (data) {
5132      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5133       pars++;
5134      if (data_fake.flags & SF_HAS_EVAL)
5135       data->flags |= SF_HAS_EVAL;
5136      data->whilem_c = data_fake.whilem_c;
5137      if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5138       if (RExC_rx->minlen<*minnextp)
5139        RExC_rx->minlen=*minnextp;
5140       scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5141       SvREFCNT_dec_NN(data_fake.last_found);
5142
5143       if ( data_fake.minlen_fixed != minlenp )
5144       {
5145        data->offset_fixed= data_fake.offset_fixed;
5146        data->minlen_fixed= data_fake.minlen_fixed;
5147        data->lookbehind_fixed+= scan->flags;
5148       }
5149       if ( data_fake.minlen_float != minlenp )
5150       {
5151        data->minlen_float= data_fake.minlen_float;
5152        data->offset_float_min=data_fake.offset_float_min;
5153        data->offset_float_max=data_fake.offset_float_max;
5154        data->lookbehind_float+= scan->flags;
5155       }
5156      }
5157     }
5158    }
5159 #endif
5160   }
5161   else if (OP(scan) == OPEN) {
5162    if (stopparen != (I32)ARG(scan))
5163     pars++;
5164   }
5165   else if (OP(scan) == CLOSE) {
5166    if (stopparen == (I32)ARG(scan)) {
5167     break;
5168    }
5169    if ((I32)ARG(scan) == is_par) {
5170     next = regnext(scan);
5171
5172     if ( next && (OP(next) != WHILEM) && next < last)
5173      is_par = 0;  /* Disable optimization */
5174    }
5175    if (data)
5176     *(data->last_closep) = ARG(scan);
5177   }
5178   else if (OP(scan) == EVAL) {
5179     if (data)
5180      data->flags |= SF_HAS_EVAL;
5181   }
5182   else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5183    if (flags & SCF_DO_SUBSTR) {
5184     scan_commit(pRExC_state, data, minlenp, is_inf);
5185     flags &= ~SCF_DO_SUBSTR;
5186    }
5187    if (data && OP(scan)==ACCEPT) {
5188     data->flags |= SCF_SEEN_ACCEPT;
5189     if (stopmin > min)
5190      stopmin = min;
5191    }
5192   }
5193   else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5194   {
5195     if (flags & SCF_DO_SUBSTR) {
5196      scan_commit(pRExC_state, data, minlenp, is_inf);
5197      data->longest = &(data->longest_float);
5198     }
5199     is_inf = is_inf_internal = 1;
5200     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5201      ssc_anything(data->start_class);
5202     flags &= ~SCF_DO_STCLASS;
5203   }
5204   else if (OP(scan) == GPOS) {
5205    if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5206     !(delta || is_inf || (data && data->pos_delta)))
5207    {
5208     if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5209      RExC_rx->intflags |= PREGf_ANCH_GPOS;
5210     if (RExC_rx->gofs < (STRLEN)min)
5211      RExC_rx->gofs = min;
5212    } else {
5213     RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5214     RExC_rx->gofs = 0;
5215    }
5216   }
5217 #ifdef TRIE_STUDY_OPT
5218 #ifdef FULL_TRIE_STUDY
5219   else if (PL_regkind[OP(scan)] == TRIE) {
5220    /* NOTE - There is similar code to this block above for handling
5221    BRANCH nodes on the initial study.  If you change stuff here
5222    check there too. */
5223    regnode *trie_node= scan;
5224    regnode *tail= regnext(scan);
5225    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5226    SSize_t max1 = 0, min1 = SSize_t_MAX;
5227    regnode_ssc accum;
5228
5229    if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5230     /* Cannot merge strings after this. */
5231     scan_commit(pRExC_state, data, minlenp, is_inf);
5232    }
5233    if (flags & SCF_DO_STCLASS)
5234     ssc_init_zero(pRExC_state, &accum);
5235
5236    if (!trie->jump) {
5237     min1= trie->minlen;
5238     max1= trie->maxlen;
5239    } else {
5240     const regnode *nextbranch= NULL;
5241     U32 word;
5242
5243     for ( word=1 ; word <= trie->wordcount ; word++)
5244     {
5245      SSize_t deltanext=0, minnext=0, f = 0, fake;
5246      regnode_ssc this_class;
5247
5248      data_fake.flags = 0;
5249      if (data) {
5250       data_fake.whilem_c = data->whilem_c;
5251       data_fake.last_closep = data->last_closep;
5252      }
5253      else
5254       data_fake.last_closep = &fake;
5255      data_fake.pos_delta = delta;
5256      if (flags & SCF_DO_STCLASS) {
5257       ssc_init(pRExC_state, &this_class);
5258       data_fake.start_class = &this_class;
5259       f = SCF_DO_STCLASS_AND;
5260      }
5261      if (flags & SCF_WHILEM_VISITED_POS)
5262       f |= SCF_WHILEM_VISITED_POS;
5263
5264      if (trie->jump[word]) {
5265       if (!nextbranch)
5266        nextbranch = trie_node + trie->jump[0];
5267       scan= trie_node + trie->jump[word];
5268       /* We go from the jump point to the branch that follows
5269       it. Note this means we need the vestigal unused
5270       branches even though they arent otherwise used. */
5271       minnext = study_chunk(pRExC_state, &scan, minlenp,
5272        &deltanext, (regnode *)nextbranch, &data_fake,
5273        stopparen, recursed_depth, NULL, f,depth+1);
5274      }
5275      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5276       nextbranch= regnext((regnode*)nextbranch);
5277
5278      if (min1 > (SSize_t)(minnext + trie->minlen))
5279       min1 = minnext + trie->minlen;
5280      if (deltanext == SSize_t_MAX) {
5281       is_inf = is_inf_internal = 1;
5282       max1 = SSize_t_MAX;
5283      } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5284       max1 = minnext + deltanext + trie->maxlen;
5285
5286      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5287       pars++;
5288      if (data_fake.flags & SCF_SEEN_ACCEPT) {
5289       if ( stopmin > min + min1)
5290        stopmin = min + min1;
5291       flags &= ~SCF_DO_SUBSTR;
5292       if (data)
5293        data->flags |= SCF_SEEN_ACCEPT;
5294      }
5295      if (data) {
5296       if (data_fake.flags & SF_HAS_EVAL)
5297        data->flags |= SF_HAS_EVAL;
5298       data->whilem_c = data_fake.whilem_c;
5299      }
5300      if (flags & SCF_DO_STCLASS)
5301       ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5302     }
5303    }
5304    if (flags & SCF_DO_SUBSTR) {
5305     data->pos_min += min1;
5306     data->pos_delta += max1 - min1;
5307     if (max1 != min1 || is_inf)
5308      data->longest = &(data->longest_float);
5309    }
5310    min += min1;
5311    delta += max1 - min1;
5312    if (flags & SCF_DO_STCLASS_OR) {
5313     ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5314     if (min1) {
5315      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5316      flags &= ~SCF_DO_STCLASS;
5317     }
5318    }
5319    else if (flags & SCF_DO_STCLASS_AND) {
5320     if (min1) {
5321      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5322      flags &= ~SCF_DO_STCLASS;
5323     }
5324     else {
5325      /* Switch to OR mode: cache the old value of
5326      * data->start_class */
5327      INIT_AND_WITHP;
5328      StructCopy(data->start_class, and_withp, regnode_ssc);
5329      flags &= ~SCF_DO_STCLASS_AND;
5330      StructCopy(&accum, data->start_class, regnode_ssc);
5331      flags |= SCF_DO_STCLASS_OR;
5332     }
5333    }
5334    scan= tail;
5335    continue;
5336   }
5337 #else
5338   else if (PL_regkind[OP(scan)] == TRIE) {
5339    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5340    U8*bang=NULL;
5341
5342    min += trie->minlen;
5343    delta += (trie->maxlen - trie->minlen);
5344    flags &= ~SCF_DO_STCLASS; /* xxx */
5345    if (flags & SCF_DO_SUBSTR) {
5346     /* Cannot expect anything... */
5347     scan_commit(pRExC_state, data, minlenp, is_inf);
5348      data->pos_min += trie->minlen;
5349      data->pos_delta += (trie->maxlen - trie->minlen);
5350     if (trie->maxlen != trie->minlen)
5351      data->longest = &(data->longest_float);
5352     }
5353     if (trie->jump) /* no more substrings -- for now /grr*/
5354    flags &= ~SCF_DO_SUBSTR;
5355   }
5356 #endif /* old or new */
5357 #endif /* TRIE_STUDY_OPT */
5358
5359   /* Else: zero-length, ignore. */
5360   scan = regnext(scan);
5361  }
5362  /* If we are exiting a recursion we can unset its recursed bit
5363  * and allow ourselves to enter it again - no danger of an
5364  * infinite loop there.
5365  if (stopparen > -1 && recursed) {
5366   DEBUG_STUDYDATA("unset:", data,depth);
5367   PAREN_UNSET( recursed, stopparen);
5368  }
5369  */
5370  if (frame) {
5371   DEBUG_STUDYDATA("frame-end:",data,depth);
5372   DEBUG_PEEP("fend", scan, depth);
5373   /* restore previous context */
5374   last = frame->last;
5375   scan = frame->next;
5376   stopparen = frame->stop;
5377   recursed_depth = frame->prev_recursed_depth;
5378   depth = depth - 1;
5379
5380   frame = frame->prev;
5381   goto fake_study_recurse;
5382  }
5383
5384   finish:
5385  assert(!frame);
5386  DEBUG_STUDYDATA("pre-fin:",data,depth);
5387
5388  *scanp = scan;
5389  *deltap = is_inf_internal ? SSize_t_MAX : delta;
5390
5391  if (flags & SCF_DO_SUBSTR && is_inf)
5392   data->pos_delta = SSize_t_MAX - data->pos_min;
5393  if (is_par > (I32)U8_MAX)
5394   is_par = 0;
5395  if (is_par && pars==1 && data) {
5396   data->flags |= SF_IN_PAR;
5397   data->flags &= ~SF_HAS_PAR;
5398  }
5399  else if (pars && data) {
5400   data->flags |= SF_HAS_PAR;
5401   data->flags &= ~SF_IN_PAR;
5402  }
5403  if (flags & SCF_DO_STCLASS_OR)
5404   ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5405  if (flags & SCF_TRIE_RESTUDY)
5406   data->flags |=  SCF_TRIE_RESTUDY;
5407
5408  DEBUG_STUDYDATA("post-fin:",data,depth);
5409
5410  {
5411   SSize_t final_minlen= min < stopmin ? min : stopmin;
5412
5413   if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5414    RExC_maxlen = final_minlen + delta;
5415   }
5416   return final_minlen;
5417  }
5418  /* not-reached */
5419 }
5420
5421 STATIC U32
5422 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5423 {
5424  U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5425
5426  PERL_ARGS_ASSERT_ADD_DATA;
5427
5428  Renewc(RExC_rxi->data,
5429   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5430   char, struct reg_data);
5431  if(count)
5432   Renew(RExC_rxi->data->what, count + n, U8);
5433  else
5434   Newx(RExC_rxi->data->what, n, U8);
5435  RExC_rxi->data->count = count + n;
5436  Copy(s, RExC_rxi->data->what + count, n, U8);
5437  return count;
5438 }
5439
5440 /*XXX: todo make this not included in a non debugging perl */
5441 #ifndef PERL_IN_XSUB_RE
5442 void
5443 Perl_reginitcolors(pTHX)
5444 {
5445  dVAR;
5446  const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5447  if (s) {
5448   char *t = savepv(s);
5449   int i = 0;
5450   PL_colors[0] = t;
5451   while (++i < 6) {
5452    t = strchr(t, '\t');
5453    if (t) {
5454     *t = '\0';
5455     PL_colors[i] = ++t;
5456    }
5457    else
5458     PL_colors[i] = t = (char *)"";
5459   }
5460  } else {
5461   int i = 0;
5462   while (i < 6)
5463    PL_colors[i++] = (char *)"";
5464  }
5465  PL_colorset = 1;
5466 }
5467 #endif
5468
5469
5470 #ifdef TRIE_STUDY_OPT
5471 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5472  STMT_START {                                            \
5473   if (                                                \
5474    (data.flags & SCF_TRIE_RESTUDY)               \
5475    && ! restudied++                              \
5476   ) {                                                 \
5477    dOsomething;                                    \
5478    goto reStudy;                                   \
5479   }                                                   \
5480  } STMT_END
5481 #else
5482 #define CHECK_RESTUDY_GOTO_butfirst
5483 #endif
5484
5485 /*
5486  * pregcomp - compile a regular expression into internal code
5487  *
5488  * Decides which engine's compiler to call based on the hint currently in
5489  * scope
5490  */
5491
5492 #ifndef PERL_IN_XSUB_RE
5493
5494 /* return the currently in-scope regex engine (or the default if none)  */
5495
5496 regexp_engine const *
5497 Perl_current_re_engine(pTHX)
5498 {
5499  dVAR;
5500
5501  if (IN_PERL_COMPILETIME) {
5502   HV * const table = GvHV(PL_hintgv);
5503   SV **ptr;
5504
5505   if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5506    return &reh_regexp_engine;
5507   ptr = hv_fetchs(table, "regcomp", FALSE);
5508   if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5509    return &reh_regexp_engine;
5510   return INT2PTR(regexp_engine*,SvIV(*ptr));
5511  }
5512  else {
5513   SV *ptr;
5514   if (!PL_curcop->cop_hints_hash)
5515    return &reh_regexp_engine;
5516   ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5517   if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5518    return &reh_regexp_engine;
5519   return INT2PTR(regexp_engine*,SvIV(ptr));
5520  }
5521 }
5522
5523
5524 REGEXP *
5525 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5526 {
5527  dVAR;
5528  regexp_engine const *eng = current_re_engine();
5529  GET_RE_DEBUG_FLAGS_DECL;
5530
5531  PERL_ARGS_ASSERT_PREGCOMP;
5532
5533  /* Dispatch a request to compile a regexp to correct regexp engine. */
5534  DEBUG_COMPILE_r({
5535   PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5536       PTR2UV(eng));
5537  });
5538  return CALLREGCOMP_ENG(eng, pattern, flags);
5539 }
5540 #endif
5541
5542 /* public(ish) entry point for the perl core's own regex compiling code.
5543  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5544  * pattern rather than a list of OPs, and uses the internal engine rather
5545  * than the current one */
5546
5547 REGEXP *
5548 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5549 {
5550  SV *pat = pattern; /* defeat constness! */
5551  PERL_ARGS_ASSERT_RE_COMPILE;
5552  return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5553 #ifdef PERL_IN_XSUB_RE
5554         &my_reg_engine,
5555 #else
5556         &reh_regexp_engine,
5557 #endif
5558         NULL, NULL, rx_flags, 0);
5559 }
5560
5561
5562 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5563  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5564  * point to the realloced string and length.
5565  *
5566  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5567  * stuff added */
5568
5569 static void
5570 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5571      char **pat_p, STRLEN *plen_p, int num_code_blocks)
5572 {
5573  U8 *const src = (U8*)*pat_p;
5574  U8 *dst;
5575  int n=0;
5576  STRLEN s = 0, d = 0;
5577  bool do_end = 0;
5578  GET_RE_DEBUG_FLAGS_DECL;
5579
5580  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5581   "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5582
5583  Newx(dst, *plen_p * 2 + 1, U8);
5584
5585  while (s < *plen_p) {
5586   if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5587    dst[d]   = src[s];
5588   else {
5589    dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5590    dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
5591   }
5592   if (n < num_code_blocks) {
5593    if (!do_end && pRExC_state->code_blocks[n].start == s) {
5594     pRExC_state->code_blocks[n].start = d;
5595     assert(dst[d] == '(');
5596     do_end = 1;
5597    }
5598    else if (do_end && pRExC_state->code_blocks[n].end == s) {
5599     pRExC_state->code_blocks[n].end = d;
5600     assert(dst[d] == ')');
5601     do_end = 0;
5602     n++;
5603    }
5604   }
5605   s++;
5606   d++;
5607  }
5608  dst[d] = '\0';
5609  *plen_p = d;
5610  *pat_p = (char*) dst;
5611  SAVEFREEPV(*pat_p);
5612  RExC_orig_utf8 = RExC_utf8 = 1;
5613 }
5614
5615
5616
5617 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5618  * while recording any code block indices, and handling overloading,
5619  * nested qr// objects etc.  If pat is null, it will allocate a new
5620  * string, or just return the first arg, if there's only one.
5621  *
5622  * Returns the malloced/updated pat.
5623  * patternp and pat_count is the array of SVs to be concatted;
5624  * oplist is the optional list of ops that generated the SVs;
5625  * recompile_p is a pointer to a boolean that will be set if
5626  *   the regex will need to be recompiled.
5627  * delim, if non-null is an SV that will be inserted between each element
5628  */
5629
5630 static SV*
5631 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5632     SV *pat, SV ** const patternp, int pat_count,
5633     OP *oplist, bool *recompile_p, SV *delim)
5634 {
5635  SV **svp;
5636  int n = 0;
5637  bool use_delim = FALSE;
5638  bool alloced = FALSE;
5639
5640  /* if we know we have at least two args, create an empty string,
5641  * then concatenate args to that. For no args, return an empty string */
5642  if (!pat && pat_count != 1) {
5643   pat = newSVpvn("", 0);
5644   SAVEFREESV(pat);
5645   alloced = TRUE;
5646  }
5647
5648  for (svp = patternp; svp < patternp + pat_count; svp++) {
5649   SV *sv;
5650   SV *rx  = NULL;
5651   STRLEN orig_patlen = 0;
5652   bool code = 0;
5653   SV *msv = use_delim ? delim : *svp;
5654   if (!msv) msv = &PL_sv_undef;
5655
5656   /* if we've got a delimiter, we go round the loop twice for each
5657   * svp slot (except the last), using the delimiter the second
5658   * time round */
5659   if (use_delim) {
5660    svp--;
5661    use_delim = FALSE;
5662   }
5663   else if (delim)
5664    use_delim = TRUE;
5665
5666   if (SvTYPE(msv) == SVt_PVAV) {
5667    /* we've encountered an interpolated array within
5668    * the pattern, e.g. /...@a..../. Expand the list of elements,
5669    * then recursively append elements.
5670    * The code in this block is based on S_pushav() */
5671
5672    AV *const av = (AV*)msv;
5673    const SSize_t maxarg = AvFILL(av) + 1;
5674    SV **array;
5675
5676    if (oplist) {
5677     assert(oplist->op_type == OP_PADAV
5678      || oplist->op_type == OP_RV2AV);
5679     oplist = oplist->op_sibling;;
5680    }
5681
5682    if (SvRMAGICAL(av)) {
5683     SSize_t i;
5684
5685     Newx(array, maxarg, SV*);
5686     SAVEFREEPV(array);
5687     for (i=0; i < maxarg; i++) {
5688      SV ** const svp = av_fetch(av, i, FALSE);
5689      array[i] = svp ? *svp : &PL_sv_undef;
5690     }
5691    }
5692    else
5693     array = AvARRAY(av);
5694
5695    pat = S_concat_pat(aTHX_ pRExC_state, pat,
5696         array, maxarg, NULL, recompile_p,
5697         /* $" */
5698         GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5699
5700    continue;
5701   }
5702
5703
5704   /* we make the assumption here that each op in the list of
5705   * op_siblings maps to one SV pushed onto the stack,
5706   * except for code blocks, with have both an OP_NULL and
5707   * and OP_CONST.
5708   * This allows us to match up the list of SVs against the
5709   * list of OPs to find the next code block.
5710   *
5711   * Note that       PUSHMARK PADSV PADSV ..
5712   * is optimised to
5713   *                 PADRANGE PADSV  PADSV  ..
5714   * so the alignment still works. */
5715
5716   if (oplist) {
5717    if (oplist->op_type == OP_NULL
5718     && (oplist->op_flags & OPf_SPECIAL))
5719    {
5720     assert(n < pRExC_state->num_code_blocks);
5721     pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5722     pRExC_state->code_blocks[n].block = oplist;
5723     pRExC_state->code_blocks[n].src_regex = NULL;
5724     n++;
5725     code = 1;
5726     oplist = oplist->op_sibling; /* skip CONST */
5727     assert(oplist);
5728    }
5729    oplist = oplist->op_sibling;;
5730   }
5731
5732   /* apply magic and QR overloading to arg */
5733
5734   SvGETMAGIC(msv);
5735   if (SvROK(msv) && SvAMAGIC(msv)) {
5736    SV *sv = AMG_CALLunary(msv, regexp_amg);
5737    if (sv) {
5738     if (SvROK(sv))
5739      sv = SvRV(sv);
5740     if (SvTYPE(sv) != SVt_REGEXP)
5741      Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5742     msv = sv;
5743    }
5744   }
5745
5746   /* try concatenation overload ... */
5747   if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5748     (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5749   {
5750    sv_setsv(pat, sv);
5751    /* overloading involved: all bets are off over literal
5752    * code. Pretend we haven't seen it */
5753    pRExC_state->num_code_blocks -= n;
5754    n = 0;
5755   }
5756   else  {
5757    /* ... or failing that, try "" overload */
5758    while (SvAMAGIC(msv)
5759      && (sv = AMG_CALLunary(msv, string_amg))
5760      && sv != msv
5761      &&  !(   SvROK(msv)
5762       && SvROK(sv)
5763       && SvRV(msv) == SvRV(sv))
5764    ) {
5765     msv = sv;
5766     SvGETMAGIC(msv);
5767    }
5768    if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5769     msv = SvRV(msv);
5770
5771    if (pat) {
5772     /* this is a partially unrolled
5773     *     sv_catsv_nomg(pat, msv);
5774     * that allows us to adjust code block indices if
5775     * needed */
5776     STRLEN dlen;
5777     char *dst = SvPV_force_nomg(pat, dlen);
5778     orig_patlen = dlen;
5779     if (SvUTF8(msv) && !SvUTF8(pat)) {
5780      S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5781      sv_setpvn(pat, dst, dlen);
5782      SvUTF8_on(pat);
5783     }
5784     sv_catsv_nomg(pat, msv);
5785     rx = msv;
5786    }
5787    else
5788     pat = msv;
5789
5790    if (code)
5791     pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5792   }
5793
5794   /* extract any code blocks within any embedded qr//'s */
5795   if (rx && SvTYPE(rx) == SVt_REGEXP
5796    && RX_ENGINE((REGEXP*)rx)->op_comp)
5797   {
5798
5799    RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5800    if (ri->num_code_blocks) {
5801     int i;
5802     /* the presence of an embedded qr// with code means
5803     * we should always recompile: the text of the
5804     * qr// may not have changed, but it may be a
5805     * different closure than last time */
5806     *recompile_p = 1;
5807     Renew(pRExC_state->code_blocks,
5808      pRExC_state->num_code_blocks + ri->num_code_blocks,
5809      struct reg_code_block);
5810     pRExC_state->num_code_blocks += ri->num_code_blocks;
5811
5812     for (i=0; i < ri->num_code_blocks; i++) {
5813      struct reg_code_block *src, *dst;
5814      STRLEN offset =  orig_patlen
5815       + ReANY((REGEXP *)rx)->pre_prefix;
5816      assert(n < pRExC_state->num_code_blocks);
5817      src = &ri->code_blocks[i];
5818      dst = &pRExC_state->code_blocks[n];
5819      dst->start     = src->start + offset;
5820      dst->end     = src->end   + offset;
5821      dst->block     = src->block;
5822      dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5823            src->src_regex
5824             ? src->src_regex
5825             : (REGEXP*)rx);
5826      n++;
5827     }
5828    }
5829   }
5830  }
5831  /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5832  if (alloced)
5833   SvSETMAGIC(pat);
5834
5835  return pat;
5836 }
5837
5838
5839
5840 /* see if there are any run-time code blocks in the pattern.
5841  * False positives are allowed */
5842
5843 static bool
5844 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5845      char *pat, STRLEN plen)
5846 {
5847  int n = 0;
5848  STRLEN s;
5849
5850  for (s = 0; s < plen; s++) {
5851   if (n < pRExC_state->num_code_blocks
5852    && s == pRExC_state->code_blocks[n].start)
5853   {
5854    s = pRExC_state->code_blocks[n].end;
5855    n++;
5856    continue;
5857   }
5858   /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5859   * positives here */
5860   if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5861    (pat[s+2] == '{'
5862     || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5863   )
5864    return 1;
5865  }
5866  return 0;
5867 }
5868
5869 /* Handle run-time code blocks. We will already have compiled any direct
5870  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5871  * copy of it, but with any literal code blocks blanked out and
5872  * appropriate chars escaped; then feed it into
5873  *
5874  *    eval "qr'modified_pattern'"
5875  *
5876  * For example,
5877  *
5878  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5879  *
5880  * becomes
5881  *
5882  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5883  *
5884  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5885  * and merge them with any code blocks of the original regexp.
5886  *
5887  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5888  * instead, just save the qr and return FALSE; this tells our caller that
5889  * the original pattern needs upgrading to utf8.
5890  */
5891
5892 static bool
5893 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5894  char *pat, STRLEN plen)
5895 {
5896  SV *qr;
5897
5898  GET_RE_DEBUG_FLAGS_DECL;
5899
5900  if (pRExC_state->runtime_code_qr) {
5901   /* this is the second time we've been called; this should
5902   * only happen if the main pattern got upgraded to utf8
5903   * during compilation; re-use the qr we compiled first time
5904   * round (which should be utf8 too)
5905   */
5906   qr = pRExC_state->runtime_code_qr;
5907   pRExC_state->runtime_code_qr = NULL;
5908   assert(RExC_utf8 && SvUTF8(qr));
5909  }
5910  else {
5911   int n = 0;
5912   STRLEN s;
5913   char *p, *newpat;
5914   int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5915   SV *sv, *qr_ref;
5916   dSP;
5917
5918   /* determine how many extra chars we need for ' and \ escaping */
5919   for (s = 0; s < plen; s++) {
5920    if (pat[s] == '\'' || pat[s] == '\\')
5921     newlen++;
5922   }
5923
5924   Newx(newpat, newlen, char);
5925   p = newpat;
5926   *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5927
5928   for (s = 0; s < plen; s++) {
5929    if (n < pRExC_state->num_code_blocks
5930     && s == pRExC_state->code_blocks[n].start)
5931    {
5932     /* blank out literal code block */
5933     assert(pat[s] == '(');
5934     while (s <= pRExC_state->code_blocks[n].end) {
5935      *p++ = '_';
5936      s++;
5937     }
5938     s--;
5939     n++;
5940     continue;
5941    }
5942    if (pat[s] == '\'' || pat[s] == '\\')
5943     *p++ = '\\';
5944    *p++ = pat[s];
5945   }
5946   *p++ = '\'';
5947   if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5948    *p++ = 'x';
5949   *p++ = '\0';
5950   DEBUG_COMPILE_r({
5951    PerlIO_printf(Perl_debug_log,
5952     "%sre-parsing pattern for runtime code:%s %s\n",
5953     PL_colors[4],PL_colors[5],newpat);
5954   });
5955
5956   sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5957   Safefree(newpat);
5958
5959   ENTER;
5960   SAVETMPS;
5961   save_re_context();
5962   PUSHSTACKi(PERLSI_REQUIRE);
5963   /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5964   * parsing qr''; normally only q'' does this. It also alters
5965   * hints handling */
5966   eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5967   SvREFCNT_dec_NN(sv);
5968   SPAGAIN;
5969   qr_ref = POPs;
5970   PUTBACK;
5971   {
5972    SV * const errsv = ERRSV;
5973    if (SvTRUE_NN(errsv))
5974    {
5975     Safefree(pRExC_state->code_blocks);
5976     /* use croak_sv ? */
5977     Perl_croak_nocontext("%"SVf, SVfARG(errsv));
5978    }
5979   }
5980   assert(SvROK(qr_ref));
5981   qr = SvRV(qr_ref);
5982   assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5983   /* the leaving below frees the tmp qr_ref.
5984   * Give qr a life of its own */
5985   SvREFCNT_inc(qr);
5986   POPSTACK;
5987   FREETMPS;
5988   LEAVE;
5989
5990  }
5991
5992  if (!RExC_utf8 && SvUTF8(qr)) {
5993   /* first time through; the pattern got upgraded; save the
5994   * qr for the next time through */
5995   assert(!pRExC_state->runtime_code_qr);
5996   pRExC_state->runtime_code_qr = qr;
5997   return 0;
5998  }
5999
6000
6001  /* extract any code blocks within the returned qr//  */
6002
6003
6004  /* merge the main (r1) and run-time (r2) code blocks into one */
6005  {
6006   RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6007   struct reg_code_block *new_block, *dst;
6008   RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6009   int i1 = 0, i2 = 0;
6010
6011   if (!r2->num_code_blocks) /* we guessed wrong */
6012   {
6013    SvREFCNT_dec_NN(qr);
6014    return 1;
6015   }
6016
6017   Newx(new_block,
6018    r1->num_code_blocks + r2->num_code_blocks,
6019    struct reg_code_block);
6020   dst = new_block;
6021
6022   while (    i1 < r1->num_code_blocks
6023     || i2 < r2->num_code_blocks)
6024   {
6025    struct reg_code_block *src;
6026    bool is_qr = 0;
6027
6028    if (i1 == r1->num_code_blocks) {
6029     src = &r2->code_blocks[i2++];
6030     is_qr = 1;
6031    }
6032    else if (i2 == r2->num_code_blocks)
6033     src = &r1->code_blocks[i1++];
6034    else if (  r1->code_blocks[i1].start
6035      < r2->code_blocks[i2].start)
6036    {
6037     src = &r1->code_blocks[i1++];
6038     assert(src->end < r2->code_blocks[i2].start);
6039    }
6040    else {
6041     assert(  r1->code_blocks[i1].start
6042      > r2->code_blocks[i2].start);
6043     src = &r2->code_blocks[i2++];
6044     is_qr = 1;
6045     assert(src->end < r1->code_blocks[i1].start);
6046    }
6047
6048    assert(pat[src->start] == '(');
6049    assert(pat[src->end]   == ')');
6050    dst->start     = src->start;
6051    dst->end     = src->end;
6052    dst->block     = src->block;
6053    dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6054          : src->src_regex;
6055    dst++;
6056   }
6057   r1->num_code_blocks += r2->num_code_blocks;
6058   Safefree(r1->code_blocks);
6059   r1->code_blocks = new_block;
6060  }
6061
6062  SvREFCNT_dec_NN(qr);
6063  return 1;
6064 }
6065
6066
6067 STATIC bool
6068 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6069      SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6070      SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6071      STRLEN longest_length, bool eol, bool meol)
6072 {
6073  /* This is the common code for setting up the floating and fixed length
6074  * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6075  * as to whether succeeded or not */
6076
6077  I32 t;
6078  SSize_t ml;
6079
6080  if (! (longest_length
6081   || (eol /* Can't have SEOL and MULTI */
6082    && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6083   )
6084    /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6085   || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6086  {
6087   return FALSE;
6088  }
6089
6090  /* copy the information about the longest from the reg_scan_data
6091   over to the program. */
6092  if (SvUTF8(sv_longest)) {
6093   *rx_utf8 = sv_longest;
6094   *rx_substr = NULL;
6095  } else {
6096   *rx_substr = sv_longest;
6097   *rx_utf8 = NULL;
6098  }
6099  /* end_shift is how many chars that must be matched that
6100   follow this item. We calculate it ahead of time as once the
6101   lookbehind offset is added in we lose the ability to correctly
6102   calculate it.*/
6103  ml = minlen ? *(minlen) : (SSize_t)longest_length;
6104  *rx_end_shift = ml - offset
6105   - longest_length + (SvTAIL(sv_longest) != 0)
6106   + lookbehind;
6107
6108  t = (eol/* Can't have SEOL and MULTI */
6109   && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6110  fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6111
6112  return TRUE;
6113 }
6114
6115 /*
6116  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6117  * regular expression into internal code.
6118  * The pattern may be passed either as:
6119  *    a list of SVs (patternp plus pat_count)
6120  *    a list of OPs (expr)
6121  * If both are passed, the SV list is used, but the OP list indicates
6122  * which SVs are actually pre-compiled code blocks
6123  *
6124  * The SVs in the list have magic and qr overloading applied to them (and
6125  * the list may be modified in-place with replacement SVs in the latter
6126  * case).
6127  *
6128  * If the pattern hasn't changed from old_re, then old_re will be
6129  * returned.
6130  *
6131  * eng is the current engine. If that engine has an op_comp method, then
6132  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6133  * do the initial concatenation of arguments and pass on to the external
6134  * engine.
6135  *
6136  * If is_bare_re is not null, set it to a boolean indicating whether the
6137  * arg list reduced (after overloading) to a single bare regex which has
6138  * been returned (i.e. /$qr/).
6139  *
6140  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6141  *
6142  * pm_flags contains the PMf_* flags, typically based on those from the
6143  * pm_flags field of the related PMOP. Currently we're only interested in
6144  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6145  *
6146  * We can't allocate space until we know how big the compiled form will be,
6147  * but we can't compile it (and thus know how big it is) until we've got a
6148  * place to put the code.  So we cheat:  we compile it twice, once with code
6149  * generation turned off and size counting turned on, and once "for real".
6150  * This also means that we don't allocate space until we are sure that the
6151  * thing really will compile successfully, and we never have to move the
6152  * code and thus invalidate pointers into it.  (Note that it has to be in
6153  * one piece because free() must be able to free it all.) [NB: not true in perl]
6154  *
6155  * Beware that the optimization-preparation code in here knows about some
6156  * of the structure of the compiled regexp.  [I'll say.]
6157  */
6158
6159 REGEXP *
6160 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6161      OP *expr, const regexp_engine* eng, REGEXP *old_re,
6162      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6163 {
6164  dVAR;
6165  REGEXP *rx;
6166  struct regexp *r;
6167  regexp_internal *ri;
6168  STRLEN plen;
6169  char *exp;
6170  regnode *scan;
6171  I32 flags;
6172  SSize_t minlen = 0;
6173  U32 rx_flags;
6174  SV *pat;
6175  SV *code_blocksv = NULL;
6176  SV** new_patternp = patternp;
6177
6178  /* these are all flags - maybe they should be turned
6179  * into a single int with different bit masks */
6180  I32 sawlookahead = 0;
6181  I32 sawplus = 0;
6182  I32 sawopen = 0;
6183  I32 sawminmod = 0;
6184
6185  regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6186  bool recompile = 0;
6187  bool runtime_code = 0;
6188  scan_data_t data;
6189  RExC_state_t RExC_state;
6190  RExC_state_t * const pRExC_state = &RExC_state;
6191 #ifdef TRIE_STUDY_OPT
6192  int restudied = 0;
6193  RExC_state_t copyRExC_state;
6194 #endif
6195  GET_RE_DEBUG_FLAGS_DECL;
6196
6197  PERL_ARGS_ASSERT_RE_OP_COMPILE;
6198
6199  DEBUG_r(if (!PL_colorset) reginitcolors());
6200
6201 #ifndef PERL_IN_XSUB_RE
6202  /* Initialize these here instead of as-needed, as is quick and avoids
6203  * having to test them each time otherwise */
6204  if (! PL_AboveLatin1) {
6205   PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6206   PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6207   PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6208   PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6209   PL_HasMultiCharFold =
6210      _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6211  }
6212 #endif
6213
6214  pRExC_state->code_blocks = NULL;
6215  pRExC_state->num_code_blocks = 0;
6216
6217  if (is_bare_re)
6218   *is_bare_re = FALSE;
6219
6220  if (expr && (expr->op_type == OP_LIST ||
6221     (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6222   /* allocate code_blocks if needed */
6223   OP *o;
6224   int ncode = 0;
6225
6226   for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6227    if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6228     ncode++; /* count of DO blocks */
6229   if (ncode) {
6230    pRExC_state->num_code_blocks = ncode;
6231    Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6232   }
6233  }
6234
6235  if (!pat_count) {
6236   /* compile-time pattern with just OP_CONSTs and DO blocks */
6237
6238   int n;
6239   OP *o;
6240
6241   /* find how many CONSTs there are */
6242   assert(expr);
6243   n = 0;
6244   if (expr->op_type == OP_CONST)
6245    n = 1;
6246   else
6247    for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6248     if (o->op_type == OP_CONST)
6249      n++;
6250    }
6251
6252   /* fake up an SV array */
6253
6254   assert(!new_patternp);
6255   Newx(new_patternp, n, SV*);
6256   SAVEFREEPV(new_patternp);
6257   pat_count = n;
6258
6259   n = 0;
6260   if (expr->op_type == OP_CONST)
6261    new_patternp[n] = cSVOPx_sv(expr);
6262   else
6263    for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6264     if (o->op_type == OP_CONST)
6265      new_patternp[n++] = cSVOPo_sv;
6266    }
6267
6268  }
6269
6270  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6271   "Assembling pattern from %d elements%s\n", pat_count,
6272    orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6273
6274  /* set expr to the first arg op */
6275
6276  if (pRExC_state->num_code_blocks
6277   && expr->op_type != OP_CONST)
6278  {
6279    expr = cLISTOPx(expr)->op_first;
6280    assert(   expr->op_type == OP_PUSHMARK
6281     || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6282     || expr->op_type == OP_PADRANGE);
6283    expr = expr->op_sibling;
6284  }
6285
6286  pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6287       expr, &recompile, NULL);
6288
6289  /* handle bare (possibly after overloading) regex: foo =~ $re */
6290  {
6291   SV *re = pat;
6292   if (SvROK(re))
6293    re = SvRV(re);
6294   if (SvTYPE(re) == SVt_REGEXP) {
6295    if (is_bare_re)
6296     *is_bare_re = TRUE;
6297    SvREFCNT_inc(re);
6298    Safefree(pRExC_state->code_blocks);
6299    DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6300     "Precompiled pattern%s\n",
6301      orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6302
6303    return (REGEXP*)re;
6304   }
6305  }
6306
6307  exp = SvPV_nomg(pat, plen);
6308
6309  if (!eng->op_comp) {
6310   if ((SvUTF8(pat) && IN_BYTES)
6311     || SvGMAGICAL(pat) || SvAMAGIC(pat))
6312   {
6313    /* make a temporary copy; either to convert to bytes,
6314    * or to avoid repeating get-magic / overloaded stringify */
6315    pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6316           (IN_BYTES ? 0 : SvUTF8(pat)));
6317   }
6318   Safefree(pRExC_state->code_blocks);
6319   return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6320  }
6321
6322  /* ignore the utf8ness if the pattern is 0 length */
6323  RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6324  RExC_uni_semantics = 0;
6325  RExC_contains_locale = 0;
6326  RExC_contains_i = 0;
6327  pRExC_state->runtime_code_qr = NULL;
6328
6329  DEBUG_COMPILE_r({
6330    SV *dsv= sv_newmortal();
6331    RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6332    PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6333       PL_colors[4],PL_colors[5],s);
6334   });
6335
6336   redo_first_pass:
6337  /* we jump here if we upgrade the pattern to utf8 and have to
6338  * recompile */
6339
6340  if ((pm_flags & PMf_USE_RE_EVAL)
6341     /* this second condition covers the non-regex literal case,
6342     * i.e.  $foo =~ '(?{})'. */
6343     || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6344  )
6345   runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6346
6347  /* return old regex if pattern hasn't changed */
6348  /* XXX: note in the below we have to check the flags as well as the
6349  * pattern.
6350  *
6351  * Things get a touch tricky as we have to compare the utf8 flag
6352  * independently from the compile flags.  */
6353
6354  if (   old_re
6355   && !recompile
6356   && !!RX_UTF8(old_re) == !!RExC_utf8
6357   && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6358   && RX_PRECOMP(old_re)
6359   && RX_PRELEN(old_re) == plen
6360   && memEQ(RX_PRECOMP(old_re), exp, plen)
6361   && !runtime_code /* with runtime code, always recompile */ )
6362  {
6363   Safefree(pRExC_state->code_blocks);
6364   return old_re;
6365  }
6366
6367  rx_flags = orig_rx_flags;
6368
6369  if (rx_flags & PMf_FOLD) {
6370   RExC_contains_i = 1;
6371  }
6372  if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6373
6374   /* Set to use unicode semantics if the pattern is in utf8 and has the
6375   * 'depends' charset specified, as it means unicode when utf8  */
6376   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6377  }
6378
6379  RExC_precomp = exp;
6380  RExC_flags = rx_flags;
6381  RExC_pm_flags = pm_flags;
6382
6383  if (runtime_code) {
6384   if (TAINTING_get && TAINT_get)
6385    Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6386
6387   if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6388    /* whoops, we have a non-utf8 pattern, whilst run-time code
6389    * got compiled as utf8. Try again with a utf8 pattern */
6390    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6391          pRExC_state->num_code_blocks);
6392    goto redo_first_pass;
6393   }
6394  }
6395  assert(!pRExC_state->runtime_code_qr);
6396
6397  RExC_sawback = 0;
6398
6399  RExC_seen = 0;
6400  RExC_maxlen = 0;
6401  RExC_in_lookbehind = 0;
6402  RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6403  RExC_extralen = 0;
6404  RExC_override_recoding = 0;
6405  RExC_in_multi_char_class = 0;
6406
6407  /* First pass: determine size, legality. */
6408  RExC_parse = exp;
6409  RExC_start = exp;
6410  RExC_end = exp + plen;
6411  RExC_naughty = 0;
6412  RExC_npar = 1;
6413  RExC_nestroot = 0;
6414  RExC_size = 0L;
6415  RExC_emit = (regnode *) &RExC_emit_dummy;
6416  RExC_whilem_seen = 0;
6417  RExC_open_parens = NULL;
6418  RExC_close_parens = NULL;
6419  RExC_opend = NULL;
6420  RExC_paren_names = NULL;
6421 #ifdef DEBUGGING
6422  RExC_paren_name_list = NULL;
6423 #endif
6424  RExC_recurse = NULL;
6425  RExC_study_chunk_recursed = NULL;
6426  RExC_study_chunk_recursed_bytes= 0;
6427  RExC_recurse_count = 0;
6428  pRExC_state->code_index = 0;
6429
6430 #if 0 /* REGC() is (currently) a NOP at the first pass.
6431  * Clever compilers notice this and complain. --jhi */
6432  REGC((U8)REG_MAGIC, (char*)RExC_emit);
6433 #endif
6434  DEBUG_PARSE_r(
6435   PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6436   RExC_lastnum=0;
6437   RExC_lastparse=NULL;
6438  );
6439  /* reg may croak on us, not giving us a chance to free
6440  pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6441  need it to survive as long as the regexp (qr/(?{})/).
6442  We must check that code_blocksv is not already set, because we may
6443  have jumped back to restart the sizing pass. */
6444  if (pRExC_state->code_blocks && !code_blocksv) {
6445   code_blocksv = newSV_type(SVt_PV);
6446   SAVEFREESV(code_blocksv);
6447   SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6448   SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6449  }
6450  if (reg(pRExC_state, 0, &flags,1) == NULL) {
6451   /* It's possible to write a regexp in ascii that represents Unicode
6452   codepoints outside of the byte range, such as via \x{100}. If we
6453   detect such a sequence we have to convert the entire pattern to utf8
6454   and then recompile, as our sizing calculation will have been based
6455   on 1 byte == 1 character, but we will need to use utf8 to encode
6456   at least some part of the pattern, and therefore must convert the whole
6457   thing.
6458   -- dmq */
6459   if (flags & RESTART_UTF8) {
6460    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6461          pRExC_state->num_code_blocks);
6462    goto redo_first_pass;
6463   }
6464   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6465  }
6466  if (code_blocksv)
6467   SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6468
6469  DEBUG_PARSE_r({
6470   PerlIO_printf(Perl_debug_log,
6471    "Required size %"IVdf" nodes\n"
6472    "Starting second pass (creation)\n",
6473    (IV)RExC_size);
6474   RExC_lastnum=0;
6475   RExC_lastparse=NULL;
6476  });
6477
6478  /* The first pass could have found things that force Unicode semantics */
6479  if ((RExC_utf8 || RExC_uni_semantics)
6480   && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6481  {
6482   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6483  }
6484
6485  /* Small enough for pointer-storage convention?
6486  If extralen==0, this means that we will not need long jumps. */
6487  if (RExC_size >= 0x10000L && RExC_extralen)
6488   RExC_size += RExC_extralen;
6489  else
6490   RExC_extralen = 0;
6491  if (RExC_whilem_seen > 15)
6492   RExC_whilem_seen = 15;
6493
6494  /* Allocate space and zero-initialize. Note, the two step process
6495  of zeroing when in debug mode, thus anything assigned has to
6496  happen after that */
6497  rx = (REGEXP*) newSV_type(SVt_REGEXP);
6498  r = ReANY(rx);
6499  Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6500   char, regexp_internal);
6501  if ( r == NULL || ri == NULL )
6502   FAIL("Regexp out of space");
6503 #ifdef DEBUGGING
6504  /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6505  Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6506   char);
6507 #else
6508  /* bulk initialize base fields with 0. */
6509  Zero(ri, sizeof(regexp_internal), char);
6510 #endif
6511
6512  /* non-zero initialization begins here */
6513  RXi_SET( r, ri );
6514  r->engine= eng;
6515  r->extflags = rx_flags;
6516  RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6517
6518  if (pm_flags & PMf_IS_QR) {
6519   ri->code_blocks = pRExC_state->code_blocks;
6520   ri->num_code_blocks = pRExC_state->num_code_blocks;
6521  }
6522  else
6523  {
6524   int n;
6525   for (n = 0; n < pRExC_state->num_code_blocks; n++)
6526    if (pRExC_state->code_blocks[n].src_regex)
6527     SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6528   SAVEFREEPV(pRExC_state->code_blocks);
6529  }
6530
6531  {
6532   bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6533   bool has_charset = (get_regex_charset(r->extflags)
6534              != REGEX_DEPENDS_CHARSET);
6535
6536   /* The caret is output if there are any defaults: if not all the STD
6537   * flags are set, or if no character set specifier is needed */
6538   bool has_default =
6539      (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6540      || ! has_charset);
6541   bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6542             == REG_RUN_ON_COMMENT_SEEN);
6543   U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6544        >> RXf_PMf_STD_PMMOD_SHIFT);
6545   const char *fptr = STD_PAT_MODS;        /*"msix"*/
6546   char *p;
6547   /* Allocate for the worst case, which is all the std flags are turned
6548   * on.  If more precision is desired, we could do a population count of
6549   * the flags set.  This could be done with a small lookup table, or by
6550   * shifting, masking and adding, or even, when available, assembly
6551   * language for a machine-language population count.
6552   * We never output a minus, as all those are defaults, so are
6553   * covered by the caret */
6554   const STRLEN wraplen = plen + has_p + has_runon
6555    + has_default       /* If needs a caret */
6556
6557     /* If needs a character set specifier */
6558    + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6559    + (sizeof(STD_PAT_MODS) - 1)
6560    + (sizeof("(?:)") - 1);
6561
6562   Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6563   r->xpv_len_u.xpvlenu_pv = p;
6564   if (RExC_utf8)
6565    SvFLAGS(rx) |= SVf_UTF8;
6566   *p++='('; *p++='?';
6567
6568   /* If a default, cover it using the caret */
6569   if (has_default) {
6570    *p++= DEFAULT_PAT_MOD;
6571   }
6572   if (has_charset) {
6573    STRLEN len;
6574    const char* const name = get_regex_charset_name(r->extflags, &len);
6575    Copy(name, p, len, char);
6576    p += len;
6577   }
6578   if (has_p)
6579    *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6580   {
6581    char ch;
6582    while((ch = *fptr++)) {
6583     if(reganch & 1)
6584      *p++ = ch;
6585     reganch >>= 1;
6586    }
6587   }
6588
6589   *p++ = ':';
6590   Copy(RExC_precomp, p, plen, char);
6591   assert ((RX_WRAPPED(rx) - p) < 16);
6592   r->pre_prefix = p - RX_WRAPPED(rx);
6593   p += plen;
6594   if (has_runon)
6595    *p++ = '\n';
6596   *p++ = ')';
6597   *p = 0;
6598   SvCUR_set(rx, p - RX_WRAPPED(rx));
6599  }
6600
6601  r->intflags = 0;
6602  r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6603
6604  /* setup various meta data about recursion, this all requires
6605  * RExC_npar to be correctly set, and a bit later on we clear it */
6606  if (RExC_seen & REG_RECURSE_SEEN) {
6607   Newxz(RExC_open_parens, RExC_npar,regnode *);
6608   SAVEFREEPV(RExC_open_parens);
6609   Newxz(RExC_close_parens,RExC_npar,regnode *);
6610   SAVEFREEPV(RExC_close_parens);
6611  }
6612  if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6613   /* Note, RExC_npar is 1 + the number of parens in a pattern.
6614   * So its 1 if there are no parens. */
6615   RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6616           ((RExC_npar & 0x07) != 0);
6617   Newx(RExC_study_chunk_recursed,
6618    RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6619   SAVEFREEPV(RExC_study_chunk_recursed);
6620  }
6621
6622  /* Useful during FAIL. */
6623 #ifdef RE_TRACK_PATTERN_OFFSETS
6624  Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6625  DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6626       "%s %"UVuf" bytes for offset annotations.\n",
6627       ri->u.offsets ? "Got" : "Couldn't get",
6628       (UV)((2*RExC_size+1) * sizeof(U32))));
6629 #endif
6630  SetProgLen(ri,RExC_size);
6631  RExC_rx_sv = rx;
6632  RExC_rx = r;
6633  RExC_rxi = ri;
6634  REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
6635
6636  /* Second pass: emit code. */
6637  RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6638  RExC_pm_flags = pm_flags;
6639  RExC_parse = exp;
6640  RExC_end = exp + plen;
6641  RExC_naughty = 0;
6642  RExC_npar = 1;
6643  RExC_emit_start = ri->program;
6644  RExC_emit = ri->program;
6645  RExC_emit_bound = ri->program + RExC_size + 1;
6646  pRExC_state->code_index = 0;
6647
6648  REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6649  if (reg(pRExC_state, 0, &flags,1) == NULL) {
6650   ReREFCNT_dec(rx);
6651   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6652  }
6653  /* XXXX To minimize changes to RE engine we always allocate
6654  3-units-long substrs field. */
6655  Newx(r->substrs, 1, struct reg_substr_data);
6656  if (RExC_recurse_count) {
6657   Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6658   SAVEFREEPV(RExC_recurse);
6659  }
6660
6661 reStudy:
6662  r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6663  Zero(r->substrs, 1, struct reg_substr_data);
6664  if (RExC_study_chunk_recursed)
6665   Zero(RExC_study_chunk_recursed,
6666    RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6667
6668 #ifdef TRIE_STUDY_OPT
6669  if (!restudied) {
6670   StructCopy(&zero_scan_data, &data, scan_data_t);
6671   copyRExC_state = RExC_state;
6672  } else {
6673   U32 seen=RExC_seen;
6674   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6675
6676   RExC_state = copyRExC_state;
6677   if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6678    RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6679   else
6680    RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6681   StructCopy(&zero_scan_data, &data, scan_data_t);
6682  }
6683 #else
6684  StructCopy(&zero_scan_data, &data, scan_data_t);
6685 #endif
6686
6687  /* Dig out information for optimizations. */
6688  r->extflags = RExC_flags; /* was pm_op */
6689  /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6690
6691  if (UTF)
6692   SvUTF8_on(rx); /* Unicode in it? */
6693  ri->regstclass = NULL;
6694  if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6695   r->intflags |= PREGf_NAUGHTY;
6696  scan = ri->program + 1;  /* First BRANCH. */
6697
6698  /* testing for BRANCH here tells us whether there is "must appear"
6699  data in the pattern. If there is then we can use it for optimisations */
6700  if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
6701             */
6702   SSize_t fake;
6703   STRLEN longest_float_length, longest_fixed_length;
6704   regnode_ssc ch_class; /* pointed to by data */
6705   int stclass_flag;
6706   SSize_t last_close = 0; /* pointed to by data */
6707   regnode *first= scan;
6708   regnode *first_next= regnext(first);
6709   /*
6710   * Skip introductions and multiplicators >= 1
6711   * so that we can extract the 'meat' of the pattern that must
6712   * match in the large if() sequence following.
6713   * NOTE that EXACT is NOT covered here, as it is normally
6714   * picked up by the optimiser separately.
6715   *
6716   * This is unfortunate as the optimiser isnt handling lookahead
6717   * properly currently.
6718   *
6719   */
6720   while ((OP(first) == OPEN && (sawopen = 1)) ||
6721    /* An OR of *one* alternative - should not happen now. */
6722    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6723    /* for now we can't handle lookbehind IFMATCH*/
6724    (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6725    (OP(first) == PLUS) ||
6726    (OP(first) == MINMOD) ||
6727    /* An {n,m} with n>0 */
6728    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6729    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6730   {
6731     /*
6732     * the only op that could be a regnode is PLUS, all the rest
6733     * will be regnode_1 or regnode_2.
6734     *
6735     * (yves doesn't think this is true)
6736     */
6737     if (OP(first) == PLUS)
6738      sawplus = 1;
6739     else {
6740      if (OP(first) == MINMOD)
6741       sawminmod = 1;
6742      first += regarglen[OP(first)];
6743     }
6744     first = NEXTOPER(first);
6745     first_next= regnext(first);
6746   }
6747
6748   /* Starting-point info. */
6749  again:
6750   DEBUG_PEEP("first:",first,0);
6751   /* Ignore EXACT as we deal with it later. */
6752   if (PL_regkind[OP(first)] == EXACT) {
6753    if (OP(first) == EXACT)
6754     NOOP; /* Empty, get anchored substr later. */
6755    else
6756     ri->regstclass = first;
6757   }
6758 #ifdef TRIE_STCLASS
6759   else if (PL_regkind[OP(first)] == TRIE &&
6760     ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6761   {
6762    regnode *trie_op;
6763    /* this can happen only on restudy */
6764    if ( OP(first) == TRIE ) {
6765     struct regnode_1 *trieop = (struct regnode_1 *)
6766      PerlMemShared_calloc(1, sizeof(struct regnode_1));
6767     StructCopy(first,trieop,struct regnode_1);
6768     trie_op=(regnode *)trieop;
6769    } else {
6770     struct regnode_charclass *trieop = (struct regnode_charclass *)
6771      PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6772     StructCopy(first,trieop,struct regnode_charclass);
6773     trie_op=(regnode *)trieop;
6774    }
6775    OP(trie_op)+=2;
6776    make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6777    ri->regstclass = trie_op;
6778   }
6779 #endif
6780   else if (REGNODE_SIMPLE(OP(first)))
6781    ri->regstclass = first;
6782   else if (PL_regkind[OP(first)] == BOUND ||
6783     PL_regkind[OP(first)] == NBOUND)
6784    ri->regstclass = first;
6785   else if (PL_regkind[OP(first)] == BOL) {
6786    r->intflags |= (OP(first) == MBOL
6787       ? PREGf_ANCH_MBOL
6788       : (OP(first) == SBOL
6789        ? PREGf_ANCH_SBOL
6790        : PREGf_ANCH_BOL));
6791    first = NEXTOPER(first);
6792    goto again;
6793   }
6794   else if (OP(first) == GPOS) {
6795    r->intflags |= PREGf_ANCH_GPOS;
6796    first = NEXTOPER(first);
6797    goto again;
6798   }
6799   else if ((!sawopen || !RExC_sawback) &&
6800    (OP(first) == STAR &&
6801    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6802    !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
6803   {
6804    /* turn .* into ^.* with an implied $*=1 */
6805    const int type =
6806     (OP(NEXTOPER(first)) == REG_ANY)
6807      ? PREGf_ANCH_MBOL
6808      : PREGf_ANCH_SBOL;
6809    r->intflags |= (type | PREGf_IMPLICIT);
6810    first = NEXTOPER(first);
6811    goto again;
6812   }
6813   if (sawplus && !sawminmod && !sawlookahead
6814    && (!sawopen || !RExC_sawback)
6815    && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6816    /* x+ must match at the 1st pos of run of x's */
6817    r->intflags |= PREGf_SKIP;
6818
6819   /* Scan is after the zeroth branch, first is atomic matcher. */
6820 #ifdef TRIE_STUDY_OPT
6821   DEBUG_PARSE_r(
6822    if (!restudied)
6823     PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6824        (IV)(first - scan + 1))
6825   );
6826 #else
6827   DEBUG_PARSE_r(
6828    PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6829     (IV)(first - scan + 1))
6830   );
6831 #endif
6832
6833
6834   /*
6835   * If there's something expensive in the r.e., find the
6836   * longest literal string that must appear and make it the
6837   * regmust.  Resolve ties in favor of later strings, since
6838   * the regstart check works with the beginning of the r.e.
6839   * and avoiding duplication strengthens checking.  Not a
6840   * strong reason, but sufficient in the absence of others.
6841   * [Now we resolve ties in favor of the earlier string if
6842   * it happens that c_offset_min has been invalidated, since the
6843   * earlier string may buy us something the later one won't.]
6844   */
6845
6846   data.longest_fixed = newSVpvs("");
6847   data.longest_float = newSVpvs("");
6848   data.last_found = newSVpvs("");
6849   data.longest = &(data.longest_fixed);
6850   ENTER_with_name("study_chunk");
6851   SAVEFREESV(data.longest_fixed);
6852   SAVEFREESV(data.longest_float);
6853   SAVEFREESV(data.last_found);
6854   first = scan;
6855   if (!ri->regstclass) {
6856    ssc_init(pRExC_state, &ch_class);
6857    data.start_class = &ch_class;
6858    stclass_flag = SCF_DO_STCLASS_AND;
6859   } else    /* XXXX Check for BOUND? */
6860    stclass_flag = 0;
6861   data.last_closep = &last_close;
6862
6863   DEBUG_RExC_seen();
6864   minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6865        scan + RExC_size, /* Up to end */
6866    &data, -1, 0, NULL,
6867    SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6868       | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6869    0);
6870
6871
6872   CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6873
6874
6875   if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6876    && data.last_start_min == 0 && data.last_end > 0
6877    && !RExC_seen_zerolen
6878    && !(RExC_seen & REG_VERBARG_SEEN)
6879    && !(RExC_seen & REG_GPOS_SEEN)
6880   ){
6881    r->extflags |= RXf_CHECK_ALL;
6882   }
6883   scan_commit(pRExC_state, &data,&minlen,0);
6884
6885   longest_float_length = CHR_SVLEN(data.longest_float);
6886
6887   if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6888     && data.offset_fixed == data.offset_float_min
6889     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6890    && S_setup_longest (aTHX_ pRExC_state,
6891          data.longest_float,
6892          &(r->float_utf8),
6893          &(r->float_substr),
6894          &(r->float_end_shift),
6895          data.lookbehind_float,
6896          data.offset_float_min,
6897          data.minlen_float,
6898          longest_float_length,
6899          cBOOL(data.flags & SF_FL_BEFORE_EOL),
6900          cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6901   {
6902    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6903    r->float_max_offset = data.offset_float_max;
6904    if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6905     r->float_max_offset -= data.lookbehind_float;
6906    SvREFCNT_inc_simple_void_NN(data.longest_float);
6907   }
6908   else {
6909    r->float_substr = r->float_utf8 = NULL;
6910    longest_float_length = 0;
6911   }
6912
6913   longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6914
6915   if (S_setup_longest (aTHX_ pRExC_state,
6916         data.longest_fixed,
6917         &(r->anchored_utf8),
6918         &(r->anchored_substr),
6919         &(r->anchored_end_shift),
6920         data.lookbehind_fixed,
6921         data.offset_fixed,
6922         data.minlen_fixed,
6923         longest_fixed_length,
6924         cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6925         cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6926   {
6927    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6928    SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6929   }
6930   else {
6931    r->anchored_substr = r->anchored_utf8 = NULL;
6932    longest_fixed_length = 0;
6933   }
6934   LEAVE_with_name("study_chunk");
6935
6936   if (ri->regstclass
6937    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6938    ri->regstclass = NULL;
6939
6940   if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6941    && stclass_flag
6942    && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6943    && !ssc_is_anything(data.start_class))
6944   {
6945    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6946
6947    ssc_finalize(pRExC_state, data.start_class);
6948
6949    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6950    StructCopy(data.start_class,
6951      (regnode_ssc*)RExC_rxi->data->data[n],
6952      regnode_ssc);
6953    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6954    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6955    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6956      regprop(r, sv, (regnode*)data.start_class, NULL);
6957      PerlIO_printf(Perl_debug_log,
6958          "synthetic stclass \"%s\".\n",
6959          SvPVX_const(sv));});
6960    data.start_class = NULL;
6961   }
6962
6963   /* A temporary algorithm prefers floated substr to fixed one to dig
6964   * more info. */
6965   if (longest_fixed_length > longest_float_length) {
6966    r->substrs->check_ix = 0;
6967    r->check_end_shift = r->anchored_end_shift;
6968    r->check_substr = r->anchored_substr;
6969    r->check_utf8 = r->anchored_utf8;
6970    r->check_offset_min = r->check_offset_max = r->anchored_offset;
6971    if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
6972     r->intflags |= PREGf_NOSCAN;
6973   }
6974   else {
6975    r->substrs->check_ix = 1;
6976    r->check_end_shift = r->float_end_shift;
6977    r->check_substr = r->float_substr;
6978    r->check_utf8 = r->float_utf8;
6979    r->check_offset_min = r->float_min_offset;
6980    r->check_offset_max = r->float_max_offset;
6981   }
6982   if ((r->check_substr || r->check_utf8) ) {
6983    r->extflags |= RXf_USE_INTUIT;
6984    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6985     r->extflags |= RXf_INTUIT_TAIL;
6986   }
6987   r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
6988
6989   /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6990   if ( (STRLEN)minlen < longest_float_length )
6991    minlen= longest_float_length;
6992   if ( (STRLEN)minlen < longest_fixed_length )
6993    minlen= longest_fixed_length;
6994   */
6995  }
6996  else {
6997   /* Several toplevels. Best we can is to set minlen. */
6998   SSize_t fake;
6999   regnode_ssc ch_class;
7000   SSize_t last_close = 0;
7001
7002   DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7003
7004   scan = ri->program + 1;
7005   ssc_init(pRExC_state, &ch_class);
7006   data.start_class = &ch_class;
7007   data.last_closep = &last_close;
7008
7009   DEBUG_RExC_seen();
7010   minlen = study_chunk(pRExC_state,
7011    &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7012    SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7013              ? SCF_TRIE_DOING_RESTUDY
7014              : 0),
7015    0);
7016
7017   CHECK_RESTUDY_GOTO_butfirst(NOOP);
7018
7019   r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7020     = r->float_substr = r->float_utf8 = NULL;
7021
7022   if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7023    && ! ssc_is_anything(data.start_class))
7024   {
7025    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7026
7027    ssc_finalize(pRExC_state, data.start_class);
7028
7029    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7030    StructCopy(data.start_class,
7031      (regnode_ssc*)RExC_rxi->data->data[n],
7032      regnode_ssc);
7033    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7034    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7035    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7036      regprop(r, sv, (regnode*)data.start_class, NULL);
7037      PerlIO_printf(Perl_debug_log,
7038          "synthetic stclass \"%s\".\n",
7039          SvPVX_const(sv));});
7040    data.start_class = NULL;
7041   }
7042  }
7043
7044  if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7045   r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7046   r->maxlen = REG_INFTY;
7047  }
7048  else {
7049   r->maxlen = RExC_maxlen;
7050  }
7051
7052  /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7053  the "real" pattern. */
7054  DEBUG_OPTIMISE_r({
7055   PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
7056      (IV)minlen, (IV)r->minlen, RExC_maxlen);
7057  });
7058  r->minlenret = minlen;
7059  if (r->minlen < minlen)
7060   r->minlen = minlen;
7061
7062  if (RExC_seen & REG_GPOS_SEEN)
7063   r->intflags |= PREGf_GPOS_SEEN;
7064  if (RExC_seen & REG_LOOKBEHIND_SEEN)
7065   r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7066             lookbehind */
7067  if (pRExC_state->num_code_blocks)
7068   r->extflags |= RXf_EVAL_SEEN;
7069  if (RExC_seen & REG_CANY_SEEN)
7070   r->intflags |= PREGf_CANY_SEEN;
7071  if (RExC_seen & REG_VERBARG_SEEN)
7072  {
7073   r->intflags |= PREGf_VERBARG_SEEN;
7074   r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7075  }
7076  if (RExC_seen & REG_CUTGROUP_SEEN)
7077   r->intflags |= PREGf_CUTGROUP_SEEN;
7078  if (pm_flags & PMf_USE_RE_EVAL)
7079   r->intflags |= PREGf_USE_RE_EVAL;
7080  if (RExC_paren_names)
7081   RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7082  else
7083   RXp_PAREN_NAMES(r) = NULL;
7084
7085  /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7086  * so it can be used in pp.c */
7087  if (r->intflags & PREGf_ANCH)
7088   r->extflags |= RXf_IS_ANCHORED;
7089
7090
7091  {
7092   /* this is used to identify "special" patterns that might result
7093   * in Perl NOT calling the regex engine and instead doing the match "itself",
7094   * particularly special cases in split//. By having the regex compiler
7095   * do this pattern matching at a regop level (instead of by inspecting the pattern)
7096   * we avoid weird issues with equivalent patterns resulting in different behavior,
7097   * AND we allow non Perl engines to get the same optimizations by the setting the
7098   * flags appropriately - Yves */
7099   regnode *first = ri->program + 1;
7100   U8 fop = OP(first);
7101   regnode *next = NEXTOPER(first);
7102   U8 nop = OP(next);
7103
7104   if (PL_regkind[fop] == NOTHING && nop == END)
7105    r->extflags |= RXf_NULL;
7106   else if (PL_regkind[fop] == BOL && nop == END)
7107    r->extflags |= RXf_START_ONLY;
7108   else if (fop == PLUS
7109     && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7110     && OP(regnext(first)) == END)
7111    r->extflags |= RXf_WHITE;
7112   else if ( r->extflags & RXf_SPLIT
7113     && fop == EXACT
7114     && STR_LEN(first) == 1
7115     && *(STRING(first)) == ' '
7116     && OP(regnext(first)) == END )
7117    r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7118
7119  }
7120
7121  if (RExC_contains_locale) {
7122   RXp_EXTFLAGS(r) |= RXf_TAINTED;
7123  }
7124
7125 #ifdef DEBUGGING
7126  if (RExC_paren_names) {
7127   ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7128   ri->data->data[ri->name_list_idx]
7129         = (void*)SvREFCNT_inc(RExC_paren_name_list);
7130  } else
7131 #endif
7132   ri->name_list_idx = 0;
7133
7134  if (RExC_recurse_count) {
7135   for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7136    const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7137    ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7138   }
7139  }
7140  Newxz(r->offs, RExC_npar, regexp_paren_pair);
7141  /* assume we don't need to swap parens around before we match */
7142
7143  DEBUG_DUMP_r({
7144   DEBUG_RExC_seen();
7145   PerlIO_printf(Perl_debug_log,"Final program:\n");
7146   regdump(r);
7147  });
7148 #ifdef RE_TRACK_PATTERN_OFFSETS
7149  DEBUG_OFFSETS_r(if (ri->u.offsets) {
7150   const STRLEN len = ri->u.offsets[0];
7151   STRLEN i;
7152   GET_RE_DEBUG_FLAGS_DECL;
7153   PerlIO_printf(Perl_debug_log,
7154      "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7155   for (i = 1; i <= len; i++) {
7156    if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7157     PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7158     (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7159    }
7160   PerlIO_printf(Perl_debug_log, "\n");
7161  });
7162 #endif
7163
7164 #ifdef USE_ITHREADS
7165  /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7166  * by setting the regexp SV to readonly-only instead. If the
7167  * pattern's been recompiled, the USEDness should remain. */
7168  if (old_re && SvREADONLY(old_re))
7169   SvREADONLY_on(rx);
7170 #endif
7171  return rx;
7172 }
7173
7174
7175 SV*
7176 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7177      const U32 flags)
7178 {
7179  PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7180
7181  PERL_UNUSED_ARG(value);
7182
7183  if (flags & RXapif_FETCH) {
7184   return reg_named_buff_fetch(rx, key, flags);
7185  } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7186   Perl_croak_no_modify();
7187   return NULL;
7188  } else if (flags & RXapif_EXISTS) {
7189   return reg_named_buff_exists(rx, key, flags)
7190    ? &PL_sv_yes
7191    : &PL_sv_no;
7192  } else if (flags & RXapif_REGNAMES) {
7193   return reg_named_buff_all(rx, flags);
7194  } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7195   return reg_named_buff_scalar(rx, flags);
7196  } else {
7197   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7198   return NULL;
7199  }
7200 }
7201
7202 SV*
7203 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7204       const U32 flags)
7205 {
7206  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7207  PERL_UNUSED_ARG(lastkey);
7208
7209  if (flags & RXapif_FIRSTKEY)
7210   return reg_named_buff_firstkey(rx, flags);
7211  else if (flags & RXapif_NEXTKEY)
7212   return reg_named_buff_nextkey(rx, flags);
7213  else {
7214   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7215            (int)flags);
7216   return NULL;
7217  }
7218 }
7219
7220 SV*
7221 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7222       const U32 flags)
7223 {
7224  AV *retarray = NULL;
7225  SV *ret;
7226  struct regexp *const rx = ReANY(r);
7227
7228  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7229
7230  if (flags & RXapif_ALL)
7231   retarray=newAV();
7232
7233  if (rx && RXp_PAREN_NAMES(rx)) {
7234   HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7235   if (he_str) {
7236    IV i;
7237    SV* sv_dat=HeVAL(he_str);
7238    I32 *nums=(I32*)SvPVX(sv_dat);
7239    for ( i=0; i<SvIVX(sv_dat); i++ ) {
7240     if ((I32)(rx->nparens) >= nums[i]
7241      && rx->offs[nums[i]].start != -1
7242      && rx->offs[nums[i]].end != -1)
7243     {
7244      ret = newSVpvs("");
7245      CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7246      if (!retarray)
7247       return ret;
7248     } else {
7249      if (retarray)
7250       ret = newSVsv(&PL_sv_undef);
7251     }
7252     if (retarray)
7253      av_push(retarray, ret);
7254    }
7255    if (retarray)
7256     return newRV_noinc(MUTABLE_SV(retarray));
7257   }
7258  }
7259  return NULL;
7260 }
7261
7262 bool
7263 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7264       const U32 flags)
7265 {
7266  struct regexp *const rx = ReANY(r);
7267
7268  PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7269
7270  if (rx && RXp_PAREN_NAMES(rx)) {
7271   if (flags & RXapif_ALL) {
7272    return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7273   } else {
7274    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7275    if (sv) {
7276     SvREFCNT_dec_NN(sv);
7277     return TRUE;
7278    } else {
7279     return FALSE;
7280    }
7281   }
7282  } else {
7283   return FALSE;
7284  }
7285 }
7286
7287 SV*
7288 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7289 {
7290  struct regexp *const rx = ReANY(r);
7291
7292  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7293
7294  if ( rx && RXp_PAREN_NAMES(rx) ) {
7295   (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7296
7297   return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7298  } else {
7299   return FALSE;
7300  }
7301 }
7302
7303 SV*
7304 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7305 {
7306  struct regexp *const rx = ReANY(r);
7307  GET_RE_DEBUG_FLAGS_DECL;
7308
7309  PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7310
7311  if (rx && RXp_PAREN_NAMES(rx)) {
7312   HV *hv = RXp_PAREN_NAMES(rx);
7313   HE *temphe;
7314   while ( (temphe = hv_iternext_flags(hv,0)) ) {
7315    IV i;
7316    IV parno = 0;
7317    SV* sv_dat = HeVAL(temphe);
7318    I32 *nums = (I32*)SvPVX(sv_dat);
7319    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7320     if ((I32)(rx->lastparen) >= nums[i] &&
7321      rx->offs[nums[i]].start != -1 &&
7322      rx->offs[nums[i]].end != -1)
7323     {
7324      parno = nums[i];
7325      break;
7326     }
7327    }
7328    if (parno || flags & RXapif_ALL) {
7329     return newSVhek(HeKEY_hek(temphe));
7330    }
7331   }
7332  }
7333  return NULL;
7334 }
7335
7336 SV*
7337 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7338 {
7339  SV *ret;
7340  AV *av;
7341  SSize_t length;
7342  struct regexp *const rx = ReANY(r);
7343
7344  PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7345
7346  if (rx && RXp_PAREN_NAMES(rx)) {
7347   if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7348    return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7349   } else if (flags & RXapif_ONE) {
7350    ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7351    av = MUTABLE_AV(SvRV(ret));
7352    length = av_tindex(av);
7353    SvREFCNT_dec_NN(ret);
7354    return newSViv(length + 1);
7355   } else {
7356    Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7357             (int)flags);
7358    return NULL;
7359   }
7360  }
7361  return &PL_sv_undef;
7362 }
7363
7364 SV*
7365 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7366 {
7367  struct regexp *const rx = ReANY(r);
7368  AV *av = newAV();
7369
7370  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7371
7372  if (rx && RXp_PAREN_NAMES(rx)) {
7373   HV *hv= RXp_PAREN_NAMES(rx);
7374   HE *temphe;
7375   (void)hv_iterinit(hv);
7376   while ( (temphe = hv_iternext_flags(hv,0)) ) {
7377    IV i;
7378    IV parno = 0;
7379    SV* sv_dat = HeVAL(temphe);
7380    I32 *nums = (I32*)SvPVX(sv_dat);
7381    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7382     if ((I32)(rx->lastparen) >= nums[i] &&
7383      rx->offs[nums[i]].start != -1 &&
7384      rx->offs[nums[i]].end != -1)
7385     {
7386      parno = nums[i];
7387      break;
7388     }
7389    }
7390    if (parno || flags & RXapif_ALL) {
7391     av_push(av, newSVhek(HeKEY_hek(temphe)));
7392    }
7393   }
7394  }
7395
7396  return newRV_noinc(MUTABLE_SV(av));
7397 }
7398
7399 void
7400 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7401        SV * const sv)
7402 {
7403  struct regexp *const rx = ReANY(r);
7404  char *s = NULL;
7405  SSize_t i = 0;
7406  SSize_t s1, t1;
7407  I32 n = paren;
7408
7409  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7410
7411  if (      n == RX_BUFF_IDX_CARET_PREMATCH
7412   || n == RX_BUFF_IDX_CARET_FULLMATCH
7413   || n == RX_BUFF_IDX_CARET_POSTMATCH
7414  )
7415  {
7416   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7417   if (!keepcopy) {
7418    /* on something like
7419    *    $r = qr/.../;
7420    *    /$qr/p;
7421    * the KEEPCOPY is set on the PMOP rather than the regex */
7422    if (PL_curpm && r == PM_GETRE(PL_curpm))
7423     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7424   }
7425   if (!keepcopy)
7426    goto ret_undef;
7427  }
7428
7429  if (!rx->subbeg)
7430   goto ret_undef;
7431
7432  if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7433   /* no need to distinguish between them any more */
7434   n = RX_BUFF_IDX_FULLMATCH;
7435
7436  if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7437   && rx->offs[0].start != -1)
7438  {
7439   /* $`, ${^PREMATCH} */
7440   i = rx->offs[0].start;
7441   s = rx->subbeg;
7442  }
7443  else
7444  if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7445   && rx->offs[0].end != -1)
7446  {
7447   /* $', ${^POSTMATCH} */
7448   s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7449   i = rx->sublen + rx->suboffset - rx->offs[0].end;
7450  }
7451  else
7452  if ( 0 <= n && n <= (I32)rx->nparens &&
7453   (s1 = rx->offs[n].start) != -1 &&
7454   (t1 = rx->offs[n].end) != -1)
7455  {
7456   /* $&, ${^MATCH},  $1 ... */
7457   i = t1 - s1;
7458   s = rx->subbeg + s1 - rx->suboffset;
7459  } else {
7460   goto ret_undef;
7461  }
7462
7463  assert(s >= rx->subbeg);
7464  assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7465  if (i >= 0) {
7466 #ifdef NO_TAINT_SUPPORT
7467   sv_setpvn(sv, s, i);
7468 #else
7469   const int oldtainted = TAINT_get;
7470   TAINT_NOT;
7471   sv_setpvn(sv, s, i);
7472   TAINT_set(oldtainted);
7473 #endif
7474   if ( (rx->intflags & PREGf_CANY_SEEN)
7475    ? (RXp_MATCH_UTF8(rx)
7476       && (!i || is_utf8_string((U8*)s, i)))
7477    : (RXp_MATCH_UTF8(rx)) )
7478   {
7479    SvUTF8_on(sv);
7480   }
7481   else
7482    SvUTF8_off(sv);
7483   if (TAINTING_get) {
7484    if (RXp_MATCH_TAINTED(rx)) {
7485     if (SvTYPE(sv) >= SVt_PVMG) {
7486      MAGIC* const mg = SvMAGIC(sv);
7487      MAGIC* mgt;
7488      TAINT;
7489      SvMAGIC_set(sv, mg->mg_moremagic);
7490      SvTAINT(sv);
7491      if ((mgt = SvMAGIC(sv))) {
7492       mg->mg_moremagic = mgt;
7493       SvMAGIC_set(sv, mg);
7494      }
7495     } else {
7496      TAINT;
7497      SvTAINT(sv);
7498     }
7499    } else
7500     SvTAINTED_off(sv);
7501   }
7502  } else {
7503  ret_undef:
7504   sv_setsv(sv,&PL_sv_undef);
7505   return;
7506  }
7507 }
7508
7509 void
7510 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7511               SV const * const value)
7512 {
7513  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7514
7515  PERL_UNUSED_ARG(rx);
7516  PERL_UNUSED_ARG(paren);
7517  PERL_UNUSED_ARG(value);
7518
7519  if (!PL_localizing)
7520   Perl_croak_no_modify();
7521 }
7522
7523 I32
7524 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7525        const I32 paren)
7526 {
7527  struct regexp *const rx = ReANY(r);
7528  I32 i;
7529  I32 s1, t1;
7530
7531  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7532
7533  if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7534   || paren == RX_BUFF_IDX_CARET_FULLMATCH
7535   || paren == RX_BUFF_IDX_CARET_POSTMATCH
7536  )
7537  {
7538   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7539   if (!keepcopy) {
7540    /* on something like
7541    *    $r = qr/.../;
7542    *    /$qr/p;
7543    * the KEEPCOPY is set on the PMOP rather than the regex */
7544    if (PL_curpm && r == PM_GETRE(PL_curpm))
7545     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7546   }
7547   if (!keepcopy)
7548    goto warn_undef;
7549  }
7550
7551  /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7552  switch (paren) {
7553  case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7554  case RX_BUFF_IDX_PREMATCH:       /* $` */
7555   if (rx->offs[0].start != -1) {
7556       i = rx->offs[0].start;
7557       if (i > 0) {
7558         s1 = 0;
7559         t1 = i;
7560         goto getlen;
7561       }
7562    }
7563   return 0;
7564
7565  case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7566  case RX_BUFF_IDX_POSTMATCH:       /* $' */
7567    if (rx->offs[0].end != -1) {
7568       i = rx->sublen - rx->offs[0].end;
7569       if (i > 0) {
7570         s1 = rx->offs[0].end;
7571         t1 = rx->sublen;
7572         goto getlen;
7573       }
7574    }
7575   return 0;
7576
7577  default: /* $& / ${^MATCH}, $1, $2, ... */
7578    if (paren <= (I32)rx->nparens &&
7579    (s1 = rx->offs[paren].start) != -1 &&
7580    (t1 = rx->offs[paren].end) != -1)
7581    {
7582    i = t1 - s1;
7583    goto getlen;
7584   } else {
7585   warn_undef:
7586    if (ckWARN(WARN_UNINITIALIZED))
7587     report_uninit((const SV *)sv);
7588    return 0;
7589   }
7590  }
7591   getlen:
7592  if (i > 0 && RXp_MATCH_UTF8(rx)) {
7593   const char * const s = rx->subbeg - rx->suboffset + s1;
7594   const U8 *ep;
7595   STRLEN el;
7596
7597   i = t1 - s1;
7598   if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7599       i = el;
7600  }
7601  return i;
7602 }
7603
7604 SV*
7605 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7606 {
7607  PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7608   PERL_UNUSED_ARG(rx);
7609   if (0)
7610    return NULL;
7611   else
7612    return newSVpvs("Regexp");
7613 }
7614
7615 /* Scans the name of a named buffer from the pattern.
7616  * If flags is REG_RSN_RETURN_NULL returns null.
7617  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7618  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7619  * to the parsed name as looked up in the RExC_paren_names hash.
7620  * If there is an error throws a vFAIL().. type exception.
7621  */
7622
7623 #define REG_RSN_RETURN_NULL    0
7624 #define REG_RSN_RETURN_NAME    1
7625 #define REG_RSN_RETURN_DATA    2
7626
7627 STATIC SV*
7628 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7629 {
7630  char *name_start = RExC_parse;
7631
7632  PERL_ARGS_ASSERT_REG_SCAN_NAME;
7633
7634  assert (RExC_parse <= RExC_end);
7635  if (RExC_parse == RExC_end) NOOP;
7636  else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7637   /* skip IDFIRST by using do...while */
7638   if (UTF)
7639    do {
7640     RExC_parse += UTF8SKIP(RExC_parse);
7641    } while (isWORDCHAR_utf8((U8*)RExC_parse));
7642   else
7643    do {
7644     RExC_parse++;
7645    } while (isWORDCHAR(*RExC_parse));
7646  } else {
7647   RExC_parse++; /* so the <- from the vFAIL is after the offending
7648       character */
7649   vFAIL("Group name must start with a non-digit word character");
7650  }
7651  if ( flags ) {
7652   SV* sv_name
7653    = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7654        SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7655   if ( flags == REG_RSN_RETURN_NAME)
7656    return sv_name;
7657   else if (flags==REG_RSN_RETURN_DATA) {
7658    HE *he_str = NULL;
7659    SV *sv_dat = NULL;
7660    if ( ! sv_name )      /* should not happen*/
7661     Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7662    if (RExC_paren_names)
7663     he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7664    if ( he_str )
7665     sv_dat = HeVAL(he_str);
7666    if ( ! sv_dat )
7667     vFAIL("Reference to nonexistent named group");
7668    return sv_dat;
7669   }
7670   else {
7671    Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7672      (unsigned long) flags);
7673   }
7674   assert(0); /* NOT REACHED */
7675  }
7676  return NULL;
7677 }
7678
7679 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7680  int rem=(int)(RExC_end - RExC_parse);                       \
7681  int cut;                                                    \
7682  int num;                                                    \
7683  int iscut=0;                                                \
7684  if (rem>10) {                                               \
7685   rem=10;                                                 \
7686   iscut=1;                                                \
7687  }                                                           \
7688  cut=10-rem;                                                 \
7689  if (RExC_lastparse!=RExC_parse)                             \
7690   PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
7691    rem, RExC_parse,                                    \
7692    cut + 4,                                            \
7693    iscut ? "..." : "<"                                 \
7694   );                                                      \
7695  else                                                        \
7696   PerlIO_printf(Perl_debug_log,"%16s","");                \
7697                 \
7698  if (SIZE_ONLY)                                              \
7699  num = RExC_size + 1;                                     \
7700  else                                                        \
7701  num=REG_NODE_NUM(RExC_emit);                             \
7702  if (RExC_lastnum!=num)                                      \
7703  PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7704  else                                                        \
7705  PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7706  PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7707   (int)((depth*2)), "",                                   \
7708   (funcname)                                              \
7709  );                                                          \
7710  RExC_lastnum=num;                                           \
7711  RExC_lastparse=RExC_parse;                                  \
7712 })
7713
7714
7715
7716 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7717  DEBUG_PARSE_MSG((funcname));                            \
7718  PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7719 })
7720 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7721  DEBUG_PARSE_MSG((funcname));                            \
7722  PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7723 })
7724
7725 /* This section of code defines the inversion list object and its methods.  The
7726  * interfaces are highly subject to change, so as much as possible is static to
7727  * this file.  An inversion list is here implemented as a malloc'd C UV array
7728  * as an SVt_INVLIST scalar.
7729  *
7730  * An inversion list for Unicode is an array of code points, sorted by ordinal
7731  * number.  The zeroth element is the first code point in the list.  The 1th
7732  * element is the first element beyond that not in the list.  In other words,
7733  * the first range is
7734  *  invlist[0]..(invlist[1]-1)
7735  * The other ranges follow.  Thus every element whose index is divisible by two
7736  * marks the beginning of a range that is in the list, and every element not
7737  * divisible by two marks the beginning of a range not in the list.  A single
7738  * element inversion list that contains the single code point N generally
7739  * consists of two elements
7740  *  invlist[0] == N
7741  *  invlist[1] == N+1
7742  * (The exception is when N is the highest representable value on the
7743  * machine, in which case the list containing just it would be a single
7744  * element, itself.  By extension, if the last range in the list extends to
7745  * infinity, then the first element of that range will be in the inversion list
7746  * at a position that is divisible by two, and is the final element in the
7747  * list.)
7748  * Taking the complement (inverting) an inversion list is quite simple, if the
7749  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7750  * This implementation reserves an element at the beginning of each inversion
7751  * list to always contain 0; there is an additional flag in the header which
7752  * indicates if the list begins at the 0, or is offset to begin at the next
7753  * element.
7754  *
7755  * More about inversion lists can be found in "Unicode Demystified"
7756  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7757  * More will be coming when functionality is added later.
7758  *
7759  * The inversion list data structure is currently implemented as an SV pointing
7760  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7761  * array of UV whose memory management is automatically handled by the existing
7762  * facilities for SV's.
7763  *
7764  * Some of the methods should always be private to the implementation, and some
7765  * should eventually be made public */
7766
7767 /* The header definitions are in F<inline_invlist.c> */
7768
7769 PERL_STATIC_INLINE UV*
7770 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7771 {
7772  /* Returns a pointer to the first element in the inversion list's array.
7773  * This is called upon initialization of an inversion list.  Where the
7774  * array begins depends on whether the list has the code point U+0000 in it
7775  * or not.  The other parameter tells it whether the code that follows this
7776  * call is about to put a 0 in the inversion list or not.  The first
7777  * element is either the element reserved for 0, if TRUE, or the element
7778  * after it, if FALSE */
7779
7780  bool* offset = get_invlist_offset_addr(invlist);
7781  UV* zero_addr = (UV *) SvPVX(invlist);
7782
7783  PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7784
7785  /* Must be empty */
7786  assert(! _invlist_len(invlist));
7787
7788  *zero_addr = 0;
7789
7790  /* 1^1 = 0; 1^0 = 1 */
7791  *offset = 1 ^ will_have_0;
7792  return zero_addr + *offset;
7793 }
7794
7795 PERL_STATIC_INLINE UV*
7796 S_invlist_array(pTHX_ SV* const invlist)
7797 {
7798  /* Returns the pointer to the inversion list's array.  Every time the
7799  * length changes, this needs to be called in case malloc or realloc moved
7800  * it */
7801
7802  PERL_ARGS_ASSERT_INVLIST_ARRAY;
7803
7804  /* Must not be empty.  If these fail, you probably didn't check for <len>
7805  * being non-zero before trying to get the array */
7806  assert(_invlist_len(invlist));
7807
7808  /* The very first element always contains zero, The array begins either
7809  * there, or if the inversion list is offset, at the element after it.
7810  * The offset header field determines which; it contains 0 or 1 to indicate
7811  * how much additionally to add */
7812  assert(0 == *(SvPVX(invlist)));
7813  return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7814 }
7815
7816 PERL_STATIC_INLINE void
7817 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7818 {
7819  /* Sets the current number of elements stored in the inversion list.
7820  * Updates SvCUR correspondingly */
7821
7822  PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7823
7824  assert(SvTYPE(invlist) == SVt_INVLIST);
7825
7826  SvCUR_set(invlist,
7827    (len == 0)
7828    ? 0
7829    : TO_INTERNAL_SIZE(len + offset));
7830  assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7831 }
7832
7833 PERL_STATIC_INLINE IV*
7834 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7835 {
7836  /* Return the address of the IV that is reserved to hold the cached index
7837  * */
7838
7839  PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7840
7841  assert(SvTYPE(invlist) == SVt_INVLIST);
7842
7843  return &(((XINVLIST*) SvANY(invlist))->prev_index);
7844 }
7845
7846 PERL_STATIC_INLINE IV
7847 S_invlist_previous_index(pTHX_ SV* const invlist)
7848 {
7849  /* Returns cached index of previous search */
7850
7851  PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7852
7853  return *get_invlist_previous_index_addr(invlist);
7854 }
7855
7856 PERL_STATIC_INLINE void
7857 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7858 {
7859  /* Caches <index> for later retrieval */
7860
7861  PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7862
7863  assert(index == 0 || index < (int) _invlist_len(invlist));
7864
7865  *get_invlist_previous_index_addr(invlist) = index;
7866 }
7867
7868 PERL_STATIC_INLINE UV
7869 S_invlist_max(pTHX_ SV* const invlist)
7870 {
7871  /* Returns the maximum number of elements storable in the inversion list's
7872  * array, without having to realloc() */
7873
7874  PERL_ARGS_ASSERT_INVLIST_MAX;
7875
7876  assert(SvTYPE(invlist) == SVt_INVLIST);
7877
7878  /* Assumes worst case, in which the 0 element is not counted in the
7879  * inversion list, so subtracts 1 for that */
7880  return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7881   ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7882   : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7883 }
7884
7885 #ifndef PERL_IN_XSUB_RE
7886 SV*
7887 Perl__new_invlist(pTHX_ IV initial_size)
7888 {
7889
7890  /* Return a pointer to a newly constructed inversion list, with enough
7891  * space to store 'initial_size' elements.  If that number is negative, a
7892  * system default is used instead */
7893
7894  SV* new_list;
7895
7896  if (initial_size < 0) {
7897   initial_size = 10;
7898  }
7899
7900  /* Allocate the initial space */
7901  new_list = newSV_type(SVt_INVLIST);
7902
7903  /* First 1 is in case the zero element isn't in the list; second 1 is for
7904  * trailing NUL */
7905  SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7906  invlist_set_len(new_list, 0, 0);
7907
7908  /* Force iterinit() to be used to get iteration to work */
7909  *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7910
7911  *get_invlist_previous_index_addr(new_list) = 0;
7912
7913  return new_list;
7914 }
7915
7916 SV*
7917 Perl__new_invlist_C_array(pTHX_ const UV* const list)
7918 {
7919  /* Return a pointer to a newly constructed inversion list, initialized to
7920  * point to <list>, which has to be in the exact correct inversion list
7921  * form, including internal fields.  Thus this is a dangerous routine that
7922  * should not be used in the wrong hands.  The passed in 'list' contains
7923  * several header fields at the beginning that are not part of the
7924  * inversion list body proper */
7925
7926  const STRLEN length = (STRLEN) list[0];
7927  const UV version_id =          list[1];
7928  const bool offset   =    cBOOL(list[2]);
7929 #define HEADER_LENGTH 3
7930  /* If any of the above changes in any way, you must change HEADER_LENGTH
7931  * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7932  *      perl -E 'say int(rand 2**31-1)'
7933  */
7934 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7935           data structure type, so that one being
7936           passed in can be validated to be an
7937           inversion list of the correct vintage.
7938          */
7939
7940  SV* invlist = newSV_type(SVt_INVLIST);
7941
7942  PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7943
7944  if (version_id != INVLIST_VERSION_ID) {
7945   Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7946  }
7947
7948  /* The generated array passed in includes header elements that aren't part
7949  * of the list proper, so start it just after them */
7950  SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7951
7952  SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7953        shouldn't touch it */
7954
7955  *(get_invlist_offset_addr(invlist)) = offset;
7956
7957  /* The 'length' passed to us is the physical number of elements in the
7958  * inversion list.  But if there is an offset the logical number is one
7959  * less than that */
7960  invlist_set_len(invlist, length  - offset, offset);
7961
7962  invlist_set_previous_index(invlist, 0);
7963
7964  /* Initialize the iteration pointer. */
7965  invlist_iterfinish(invlist);
7966
7967  SvREADONLY_on(invlist);
7968
7969  return invlist;
7970 }
7971 #endif /* ifndef PERL_IN_XSUB_RE */
7972
7973 STATIC void
7974 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7975 {
7976  /* Grow the maximum size of an inversion list */
7977
7978  PERL_ARGS_ASSERT_INVLIST_EXTEND;
7979
7980  assert(SvTYPE(invlist) == SVt_INVLIST);
7981
7982  /* Add one to account for the zero element at the beginning which may not
7983  * be counted by the calling parameters */
7984  SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7985 }
7986
7987 PERL_STATIC_INLINE void
7988 S_invlist_trim(pTHX_ SV* const invlist)
7989 {
7990  PERL_ARGS_ASSERT_INVLIST_TRIM;
7991
7992  assert(SvTYPE(invlist) == SVt_INVLIST);
7993
7994  /* Change the length of the inversion list to how many entries it currently
7995  * has */
7996  SvPV_shrink_to_cur((SV *) invlist);
7997 }
7998
7999 STATIC void
8000 S__append_range_to_invlist(pTHX_ SV* const invlist,
8001         const UV start, const UV end)
8002 {
8003    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8004  * the end of the inversion list.  The range must be above any existing
8005  * ones. */
8006
8007  UV* array;
8008  UV max = invlist_max(invlist);
8009  UV len = _invlist_len(invlist);
8010  bool offset;
8011
8012  PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8013
8014  if (len == 0) { /* Empty lists must be initialized */
8015   offset = start != 0;
8016   array = _invlist_array_init(invlist, ! offset);
8017  }
8018  else {
8019   /* Here, the existing list is non-empty. The current max entry in the
8020   * list is generally the first value not in the set, except when the
8021   * set extends to the end of permissible values, in which case it is
8022   * the first entry in that final set, and so this call is an attempt to
8023   * append out-of-order */
8024
8025   UV final_element = len - 1;
8026   array = invlist_array(invlist);
8027   if (array[final_element] > start
8028    || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8029   {
8030    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8031      array[final_element], start,
8032      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8033   }
8034
8035   /* Here, it is a legal append.  If the new range begins with the first
8036   * value not in the set, it is extending the set, so the new first
8037   * value not in the set is one greater than the newly extended range.
8038   * */
8039   offset = *get_invlist_offset_addr(invlist);
8040   if (array[final_element] == start) {
8041    if (end != UV_MAX) {
8042     array[final_element] = end + 1;
8043    }
8044    else {
8045     /* But if the end is the maximum representable on the machine,
8046     * just let the range that this would extend to have no end */
8047     invlist_set_len(invlist, len - 1, offset);
8048    }
8049    return;
8050   }
8051  }
8052
8053  /* Here the new range doesn't extend any existing set.  Add it */
8054
8055  len += 2; /* Includes an element each for the start and end of range */
8056
8057  /* If wll overflow the existing space, extend, which may cause the array to
8058  * be moved */
8059  if (max < len) {
8060   invlist_extend(invlist, len);
8061
8062   /* Have to set len here to avoid assert failure in invlist_array() */
8063   invlist_set_len(invlist, len, offset);
8064
8065   array = invlist_array(invlist);
8066  }
8067  else {
8068   invlist_set_len(invlist, len, offset);
8069  }
8070
8071  /* The next item on the list starts the range, the one after that is
8072  * one past the new range.  */
8073  array[len - 2] = start;
8074  if (end != UV_MAX) {
8075   array[len - 1] = end + 1;
8076  }
8077  else {
8078   /* But if the end is the maximum representable on the machine, just let
8079   * the range have no end */
8080   invlist_set_len(invlist, len - 1, offset);
8081  }
8082 }
8083
8084 #ifndef PERL_IN_XSUB_RE
8085
8086 IV
8087 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
8088 {
8089  /* Searches the inversion list for the entry that contains the input code
8090  * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8091  * return value is the index into the list's array of the range that
8092  * contains <cp> */
8093
8094  IV low = 0;
8095  IV mid;
8096  IV high = _invlist_len(invlist);
8097  const IV highest_element = high - 1;
8098  const UV* array;
8099
8100  PERL_ARGS_ASSERT__INVLIST_SEARCH;
8101
8102  /* If list is empty, return failure. */
8103  if (high == 0) {
8104   return -1;
8105  }
8106
8107  /* (We can't get the array unless we know the list is non-empty) */
8108  array = invlist_array(invlist);
8109
8110  mid = invlist_previous_index(invlist);
8111  assert(mid >=0 && mid <= highest_element);
8112
8113  /* <mid> contains the cache of the result of the previous call to this
8114  * function (0 the first time).  See if this call is for the same result,
8115  * or if it is for mid-1.  This is under the theory that calls to this
8116  * function will often be for related code points that are near each other.
8117  * And benchmarks show that caching gives better results.  We also test
8118  * here if the code point is within the bounds of the list.  These tests
8119  * replace others that would have had to be made anyway to make sure that
8120  * the array bounds were not exceeded, and these give us extra information
8121  * at the same time */
8122  if (cp >= array[mid]) {
8123   if (cp >= array[highest_element]) {
8124    return highest_element;
8125   }
8126
8127   /* Here, array[mid] <= cp < array[highest_element].  This means that
8128   * the final element is not the answer, so can exclude it; it also
8129   * means that <mid> is not the final element, so can refer to 'mid + 1'
8130   * safely */
8131   if (cp < array[mid + 1]) {
8132    return mid;
8133   }
8134   high--;
8135   low = mid + 1;
8136  }
8137  else { /* cp < aray[mid] */
8138   if (cp < array[0]) { /* Fail if outside the array */
8139    return -1;
8140   }
8141   high = mid;
8142   if (cp >= array[mid - 1]) {
8143    goto found_entry;
8144   }
8145  }
8146
8147  /* Binary search.  What we are looking for is <i> such that
8148  * array[i] <= cp < array[i+1]
8149  * The loop below converges on the i+1.  Note that there may not be an
8150  * (i+1)th element in the array, and things work nonetheless */
8151  while (low < high) {
8152   mid = (low + high) / 2;
8153   assert(mid <= highest_element);
8154   if (array[mid] <= cp) { /* cp >= array[mid] */
8155    low = mid + 1;
8156
8157    /* We could do this extra test to exit the loop early.
8158    if (cp < array[low]) {
8159     return mid;
8160    }
8161    */
8162   }
8163   else { /* cp < array[mid] */
8164    high = mid;
8165   }
8166  }
8167
8168   found_entry:
8169  high--;
8170  invlist_set_previous_index(invlist, high);
8171  return high;
8172 }
8173
8174 void
8175 Perl__invlist_populate_swatch(pTHX_ SV* const invlist,
8176          const UV start, const UV end, U8* swatch)
8177 {
8178  /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8179  * but is used when the swash has an inversion list.  This makes this much
8180  * faster, as it uses a binary search instead of a linear one.  This is
8181  * intimately tied to that function, and perhaps should be in utf8.c,
8182  * except it is intimately tied to inversion lists as well.  It assumes
8183  * that <swatch> is all 0's on input */
8184
8185  UV current = start;
8186  const IV len = _invlist_len(invlist);
8187  IV i;
8188  const UV * array;
8189
8190  PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8191
8192  if (len == 0) { /* Empty inversion list */
8193   return;
8194  }
8195
8196  array = invlist_array(invlist);
8197
8198  /* Find which element it is */
8199  i = _invlist_search(invlist, start);
8200
8201  /* We populate from <start> to <end> */
8202  while (current < end) {
8203   UV upper;
8204
8205   /* The inversion list gives the results for every possible code point
8206   * after the first one in the list.  Only those ranges whose index is
8207   * even are ones that the inversion list matches.  For the odd ones,
8208   * and if the initial code point is not in the list, we have to skip
8209   * forward to the next element */
8210   if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8211    i++;
8212    if (i >= len) { /* Finished if beyond the end of the array */
8213     return;
8214    }
8215    current = array[i];
8216    if (current >= end) {   /* Finished if beyond the end of what we
8217          are populating */
8218     if (LIKELY(end < UV_MAX)) {
8219      return;
8220     }
8221
8222     /* We get here when the upper bound is the maximum
8223     * representable on the machine, and we are looking for just
8224     * that code point.  Have to special case it */
8225     i = len;
8226     goto join_end_of_list;
8227    }
8228   }
8229   assert(current >= start);
8230
8231   /* The current range ends one below the next one, except don't go past
8232   * <end> */
8233   i++;
8234   upper = (i < len && array[i] < end) ? array[i] : end;
8235
8236   /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8237   * for each code point in it */
8238   for (; current < upper; current++) {
8239    const STRLEN offset = (STRLEN)(current - start);
8240    swatch[offset >> 3] |= 1 << (offset & 7);
8241   }
8242
8243  join_end_of_list:
8244
8245   /* Quit if at the end of the list */
8246   if (i >= len) {
8247
8248    /* But first, have to deal with the highest possible code point on
8249    * the platform.  The previous code assumes that <end> is one
8250    * beyond where we want to populate, but that is impossible at the
8251    * platform's infinity, so have to handle it specially */
8252    if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8253    {
8254     const STRLEN offset = (STRLEN)(end - start);
8255     swatch[offset >> 3] |= 1 << (offset & 7);
8256    }
8257    return;
8258   }
8259
8260   /* Advance to the next range, which will be for code points not in the
8261   * inversion list */
8262   current = array[i];
8263  }
8264
8265  return;
8266 }
8267
8268 void
8269 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8270           const bool complement_b, SV** output)
8271 {
8272  /* Take the union of two inversion lists and point <output> to it.  *output
8273  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8274  * the reference count to that list will be decremented if not already a
8275  * temporary (mortal); otherwise *output will be made correspondingly
8276  * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8277  * second list is returned.  If <complement_b> is TRUE, the union is taken
8278  * of the complement (inversion) of <b> instead of b itself.
8279  *
8280  * The basis for this comes from "Unicode Demystified" Chapter 13 by
8281  * Richard Gillam, published by Addison-Wesley, and explained at some
8282  * length there.  The preface says to incorporate its examples into your
8283  * code at your own risk.
8284  *
8285  * The algorithm is like a merge sort.
8286  *
8287  * XXX A potential performance improvement is to keep track as we go along
8288  * if only one of the inputs contributes to the result, meaning the other
8289  * is a subset of that one.  In that case, we can skip the final copy and
8290  * return the larger of the input lists, but then outside code might need
8291  * to keep track of whether to free the input list or not */
8292
8293  const UV* array_a;    /* a's array */
8294  const UV* array_b;
8295  UV len_a;     /* length of a's array */
8296  UV len_b;
8297
8298  SV* u;   /* the resulting union */
8299  UV* array_u;
8300  UV len_u;
8301
8302  UV i_a = 0;      /* current index into a's array */
8303  UV i_b = 0;
8304  UV i_u = 0;
8305
8306  /* running count, as explained in the algorithm source book; items are
8307  * stopped accumulating and are output when the count changes to/from 0.
8308  * The count is incremented when we start a range that's in the set, and
8309  * decremented when we start a range that's not in the set.  So its range
8310  * is 0 to 2.  Only when the count is zero is something not in the set.
8311  */
8312  UV count = 0;
8313
8314  PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8315  assert(a != b);
8316
8317  /* If either one is empty, the union is the other one */
8318  if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8319   bool make_temp = FALSE; /* Should we mortalize the result? */
8320
8321   if (*output == a) {
8322    if (a != NULL) {
8323     if (! (make_temp = cBOOL(SvTEMP(a)))) {
8324      SvREFCNT_dec_NN(a);
8325     }
8326    }
8327   }
8328   if (*output != b) {
8329    *output = invlist_clone(b);
8330    if (complement_b) {
8331     _invlist_invert(*output);
8332    }
8333   } /* else *output already = b; */
8334
8335   if (make_temp) {
8336    sv_2mortal(*output);
8337   }
8338   return;
8339  }
8340  else if ((len_b = _invlist_len(b)) == 0) {
8341   bool make_temp = FALSE;
8342   if (*output == b) {
8343    if (! (make_temp = cBOOL(SvTEMP(b)))) {
8344     SvREFCNT_dec_NN(b);
8345    }
8346   }
8347
8348   /* The complement of an empty list is a list that has everything in it,
8349   * so the union with <a> includes everything too */
8350   if (complement_b) {
8351    if (a == *output) {
8352     if (! (make_temp = cBOOL(SvTEMP(a)))) {
8353      SvREFCNT_dec_NN(a);
8354     }
8355    }
8356    *output = _new_invlist(1);
8357    _append_range_to_invlist(*output, 0, UV_MAX);
8358   }
8359   else if (*output != a) {
8360    *output = invlist_clone(a);
8361   }
8362   /* else *output already = a; */
8363
8364   if (make_temp) {
8365    sv_2mortal(*output);
8366   }
8367   return;
8368  }
8369
8370  /* Here both lists exist and are non-empty */
8371  array_a = invlist_array(a);
8372  array_b = invlist_array(b);
8373
8374  /* If are to take the union of 'a' with the complement of b, set it
8375  * up so are looking at b's complement. */
8376  if (complement_b) {
8377
8378   /* To complement, we invert: if the first element is 0, remove it.  To
8379   * do this, we just pretend the array starts one later */
8380   if (array_b[0] == 0) {
8381    array_b++;
8382    len_b--;
8383   }
8384   else {
8385
8386    /* But if the first element is not zero, we pretend the list starts
8387    * at the 0 that is always stored immediately before the array. */
8388    array_b--;
8389    len_b++;
8390   }
8391  }
8392
8393  /* Size the union for the worst case: that the sets are completely
8394  * disjoint */
8395  u = _new_invlist(len_a + len_b);
8396
8397  /* Will contain U+0000 if either component does */
8398  array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8399          || (len_b > 0 && array_b[0] == 0));
8400
8401  /* Go through each list item by item, stopping when exhausted one of
8402  * them */
8403  while (i_a < len_a && i_b < len_b) {
8404   UV cp;     /* The element to potentially add to the union's array */
8405   bool cp_in_set;   /* is it in the the input list's set or not */
8406
8407   /* We need to take one or the other of the two inputs for the union.
8408   * Since we are merging two sorted lists, we take the smaller of the
8409   * next items.  In case of a tie, we take the one that is in its set
8410   * first.  If we took one not in the set first, it would decrement the
8411   * count, possibly to 0 which would cause it to be output as ending the
8412   * range, and the next time through we would take the same number, and
8413   * output it again as beginning the next range.  By doing it the
8414   * opposite way, there is no possibility that the count will be
8415   * momentarily decremented to 0, and thus the two adjoining ranges will
8416   * be seamlessly merged.  (In a tie and both are in the set or both not
8417   * in the set, it doesn't matter which we take first.) */
8418   if (array_a[i_a] < array_b[i_b]
8419    || (array_a[i_a] == array_b[i_b]
8420     && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8421   {
8422    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8423    cp= array_a[i_a++];
8424   }
8425   else {
8426    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8427    cp = array_b[i_b++];
8428   }
8429
8430   /* Here, have chosen which of the two inputs to look at.  Only output
8431   * if the running count changes to/from 0, which marks the
8432   * beginning/end of a range in that's in the set */
8433   if (cp_in_set) {
8434    if (count == 0) {
8435     array_u[i_u++] = cp;
8436    }
8437    count++;
8438   }
8439   else {
8440    count--;
8441    if (count == 0) {
8442     array_u[i_u++] = cp;
8443    }
8444   }
8445  }
8446
8447  /* Here, we are finished going through at least one of the lists, which
8448  * means there is something remaining in at most one.  We check if the list
8449  * that hasn't been exhausted is positioned such that we are in the middle
8450  * of a range in its set or not.  (i_a and i_b point to the element beyond
8451  * the one we care about.) If in the set, we decrement 'count'; if 0, there
8452  * is potentially more to output.
8453  * There are four cases:
8454  * 1) Both weren't in their sets, count is 0, and remains 0.  What's left
8455  *    in the union is entirely from the non-exhausted set.
8456  * 2) Both were in their sets, count is 2.  Nothing further should
8457  *    be output, as everything that remains will be in the exhausted
8458  *    list's set, hence in the union; decrementing to 1 but not 0 insures
8459  *    that
8460  * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8461  *    Nothing further should be output because the union includes
8462  *    everything from the exhausted set.  Not decrementing ensures that.
8463  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8464  *    decrementing to 0 insures that we look at the remainder of the
8465  *    non-exhausted set */
8466  if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8467   || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8468  {
8469   count--;
8470  }
8471
8472  /* The final length is what we've output so far, plus what else is about to
8473  * be output.  (If 'count' is non-zero, then the input list we exhausted
8474  * has everything remaining up to the machine's limit in its set, and hence
8475  * in the union, so there will be no further output. */
8476  len_u = i_u;
8477  if (count == 0) {
8478   /* At most one of the subexpressions will be non-zero */
8479   len_u += (len_a - i_a) + (len_b - i_b);
8480  }
8481
8482  /* Set result to final length, which can change the pointer to array_u, so
8483  * re-find it */
8484  if (len_u != _invlist_len(u)) {
8485   invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8486   invlist_trim(u);
8487   array_u = invlist_array(u);
8488  }
8489
8490  /* When 'count' is 0, the list that was exhausted (if one was shorter than
8491  * the other) ended with everything above it not in its set.  That means
8492  * that the remaining part of the union is precisely the same as the
8493  * non-exhausted list, so can just copy it unchanged.  (If both list were
8494  * exhausted at the same time, then the operations below will be both 0.)
8495  */
8496  if (count == 0) {
8497   IV copy_count; /* At most one will have a non-zero copy count */
8498   if ((copy_count = len_a - i_a) > 0) {
8499    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8500   }
8501   else if ((copy_count = len_b - i_b) > 0) {
8502    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8503   }
8504  }
8505
8506  /*  We may be removing a reference to one of the inputs.  If so, the output
8507  *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8508  *  count decremented) */
8509  if (a == *output || b == *output) {
8510   assert(! invlist_is_iterating(*output));
8511   if ((SvTEMP(*output))) {
8512    sv_2mortal(u);
8513   }
8514   else {
8515    SvREFCNT_dec_NN(*output);
8516   }
8517  }
8518
8519  *output = u;
8520
8521  return;
8522 }
8523
8524 void
8525 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8526            const bool complement_b, SV** i)
8527 {
8528  /* Take the intersection of two inversion lists and point <i> to it.  *i
8529  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8530  * the reference count to that list will be decremented if not already a
8531  * temporary (mortal); otherwise *i will be made correspondingly mortal.
8532  * The first list, <a>, may be NULL, in which case an empty list is
8533  * returned.  If <complement_b> is TRUE, the result will be the
8534  * intersection of <a> and the complement (or inversion) of <b> instead of
8535  * <b> directly.
8536  *
8537  * The basis for this comes from "Unicode Demystified" Chapter 13 by
8538  * Richard Gillam, published by Addison-Wesley, and explained at some
8539  * length there.  The preface says to incorporate its examples into your
8540  * code at your own risk.  In fact, it had bugs
8541  *
8542  * The algorithm is like a merge sort, and is essentially the same as the
8543  * union above
8544  */
8545
8546  const UV* array_a;  /* a's array */
8547  const UV* array_b;
8548  UV len_a; /* length of a's array */
8549  UV len_b;
8550
8551  SV* r;       /* the resulting intersection */
8552  UV* array_r;
8553  UV len_r;
8554
8555  UV i_a = 0;      /* current index into a's array */
8556  UV i_b = 0;
8557  UV i_r = 0;
8558
8559  /* running count, as explained in the algorithm source book; items are
8560  * stopped accumulating and are output when the count changes to/from 2.
8561  * The count is incremented when we start a range that's in the set, and
8562  * decremented when we start a range that's not in the set.  So its range
8563  * is 0 to 2.  Only when the count is 2 is something in the intersection.
8564  */
8565  UV count = 0;
8566
8567  PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8568  assert(a != b);
8569
8570  /* Special case if either one is empty */
8571  len_a = (a == NULL) ? 0 : _invlist_len(a);
8572  if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8573   bool make_temp = FALSE;
8574
8575   if (len_a != 0 && complement_b) {
8576
8577    /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8578    * be empty.  Here, also we are using 'b's complement, which hence
8579    * must be every possible code point.  Thus the intersection is
8580    * simply 'a'. */
8581    if (*i != a) {
8582     if (*i == b) {
8583      if (! (make_temp = cBOOL(SvTEMP(b)))) {
8584       SvREFCNT_dec_NN(b);
8585      }
8586     }
8587
8588     *i = invlist_clone(a);
8589    }
8590    /* else *i is already 'a' */
8591
8592    if (make_temp) {
8593     sv_2mortal(*i);
8594    }
8595    return;
8596   }
8597
8598   /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8599   * intersection must be empty */
8600   if (*i == a) {
8601    if (! (make_temp = cBOOL(SvTEMP(a)))) {
8602     SvREFCNT_dec_NN(a);
8603    }
8604   }
8605   else if (*i == b) {
8606    if (! (make_temp = cBOOL(SvTEMP(b)))) {
8607     SvREFCNT_dec_NN(b);
8608    }
8609   }
8610   *i = _new_invlist(0);
8611   if (make_temp) {
8612    sv_2mortal(*i);
8613   }
8614
8615   return;
8616  }
8617
8618  /* Here both lists exist and are non-empty */
8619  array_a = invlist_array(a);
8620  array_b = invlist_array(b);
8621
8622  /* If are to take the intersection of 'a' with the complement of b, set it
8623  * up so are looking at b's complement. */
8624  if (complement_b) {
8625
8626   /* To complement, we invert: if the first element is 0, remove it.  To
8627   * do this, we just pretend the array starts one later */
8628   if (array_b[0] == 0) {
8629    array_b++;
8630    len_b--;
8631   }
8632   else {
8633
8634    /* But if the first element is not zero, we pretend the list starts
8635    * at the 0 that is always stored immediately before the array. */
8636    array_b--;
8637    len_b++;
8638   }
8639  }
8640
8641  /* Size the intersection for the worst case: that the intersection ends up
8642  * fragmenting everything to be completely disjoint */
8643  r= _new_invlist(len_a + len_b);
8644
8645  /* Will contain U+0000 iff both components do */
8646  array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8647          && len_b > 0 && array_b[0] == 0);
8648
8649  /* Go through each list item by item, stopping when exhausted one of
8650  * them */
8651  while (i_a < len_a && i_b < len_b) {
8652   UV cp;     /* The element to potentially add to the intersection's
8653      array */
8654   bool cp_in_set; /* Is it in the input list's set or not */
8655
8656   /* We need to take one or the other of the two inputs for the
8657   * intersection.  Since we are merging two sorted lists, we take the
8658   * smaller of the next items.  In case of a tie, we take the one that
8659   * is not in its set first (a difference from the union algorithm).  If
8660   * we took one in the set first, it would increment the count, possibly
8661   * to 2 which would cause it to be output as starting a range in the
8662   * intersection, and the next time through we would take that same
8663   * number, and output it again as ending the set.  By doing it the
8664   * opposite of this, there is no possibility that the count will be
8665   * momentarily incremented to 2.  (In a tie and both are in the set or
8666   * both not in the set, it doesn't matter which we take first.) */
8667   if (array_a[i_a] < array_b[i_b]
8668    || (array_a[i_a] == array_b[i_b]
8669     && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8670   {
8671    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8672    cp= array_a[i_a++];
8673   }
8674   else {
8675    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8676    cp= array_b[i_b++];
8677   }
8678
8679   /* Here, have chosen which of the two inputs to look at.  Only output
8680   * if the running count changes to/from 2, which marks the
8681   * beginning/end of a range that's in the intersection */
8682   if (cp_in_set) {
8683    count++;
8684    if (count == 2) {
8685     array_r[i_r++] = cp;
8686    }
8687   }
8688   else {
8689    if (count == 2) {
8690     array_r[i_r++] = cp;
8691    }
8692    count--;
8693   }
8694  }
8695
8696  /* Here, we are finished going through at least one of the lists, which
8697  * means there is something remaining in at most one.  We check if the list
8698  * that has been exhausted is positioned such that we are in the middle
8699  * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8700  * the ones we care about.)  There are four cases:
8701  * 1) Both weren't in their sets, count is 0, and remains 0.  There's
8702  *    nothing left in the intersection.
8703  * 2) Both were in their sets, count is 2 and perhaps is incremented to
8704  *    above 2.  What should be output is exactly that which is in the
8705  *    non-exhausted set, as everything it has is also in the intersection
8706  *    set, and everything it doesn't have can't be in the intersection
8707  * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8708  *    gets incremented to 2.  Like the previous case, the intersection is
8709  *    everything that remains in the non-exhausted set.
8710  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8711  *    remains 1.  And the intersection has nothing more. */
8712  if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8713   || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8714  {
8715   count++;
8716  }
8717
8718  /* The final length is what we've output so far plus what else is in the
8719  * intersection.  At most one of the subexpressions below will be non-zero
8720  * */
8721  len_r = i_r;
8722  if (count >= 2) {
8723   len_r += (len_a - i_a) + (len_b - i_b);
8724  }
8725
8726  /* Set result to final length, which can change the pointer to array_r, so
8727  * re-find it */
8728  if (len_r != _invlist_len(r)) {
8729   invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8730   invlist_trim(r);
8731   array_r = invlist_array(r);
8732  }
8733
8734  /* Finish outputting any remaining */
8735  if (count >= 2) { /* At most one will have a non-zero copy count */
8736   IV copy_count;
8737   if ((copy_count = len_a - i_a) > 0) {
8738    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8739   }
8740   else if ((copy_count = len_b - i_b) > 0) {
8741    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8742   }
8743  }
8744
8745  /*  We may be removing a reference to one of the inputs.  If so, the output
8746  *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8747  *  count decremented) */
8748  if (a == *i || b == *i) {
8749   assert(! invlist_is_iterating(*i));
8750   if (SvTEMP(*i)) {
8751    sv_2mortal(r);
8752   }
8753   else {
8754    SvREFCNT_dec_NN(*i);
8755   }
8756  }
8757
8758  *i = r;
8759
8760  return;
8761 }
8762
8763 SV*
8764 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8765 {
8766  /* Add the range from 'start' to 'end' inclusive to the inversion list's
8767  * set.  A pointer to the inversion list is returned.  This may actually be
8768  * a new list, in which case the passed in one has been destroyed.  The
8769  * passed in inversion list can be NULL, in which case a new one is created
8770  * with just the one range in it */
8771
8772  SV* range_invlist;
8773  UV len;
8774
8775  if (invlist == NULL) {
8776   invlist = _new_invlist(2);
8777   len = 0;
8778  }
8779  else {
8780   len = _invlist_len(invlist);
8781  }
8782
8783  /* If comes after the final entry actually in the list, can just append it
8784  * to the end, */
8785  if (len == 0
8786   || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8787    && start >= invlist_array(invlist)[len - 1]))
8788  {
8789   _append_range_to_invlist(invlist, start, end);
8790   return invlist;
8791  }
8792
8793  /* Here, can't just append things, create and return a new inversion list
8794  * which is the union of this range and the existing inversion list */
8795  range_invlist = _new_invlist(2);
8796  _append_range_to_invlist(range_invlist, start, end);
8797
8798  _invlist_union(invlist, range_invlist, &invlist);
8799
8800  /* The temporary can be freed */
8801  SvREFCNT_dec_NN(range_invlist);
8802
8803  return invlist;
8804 }
8805
8806 SV*
8807 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8808         UV** other_elements_ptr)
8809 {
8810  /* Create and return an inversion list whose contents are to be populated
8811  * by the caller.  The caller gives the number of elements (in 'size') and
8812  * the very first element ('element0').  This function will set
8813  * '*other_elements_ptr' to an array of UVs, where the remaining elements
8814  * are to be placed.
8815  *
8816  * Obviously there is some trust involved that the caller will properly
8817  * fill in the other elements of the array.
8818  *
8819  * (The first element needs to be passed in, as the underlying code does
8820  * things differently depending on whether it is zero or non-zero) */
8821
8822  SV* invlist = _new_invlist(size);
8823  bool offset;
8824
8825  PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8826
8827  _append_range_to_invlist(invlist, element0, element0);
8828  offset = *get_invlist_offset_addr(invlist);
8829
8830  invlist_set_len(invlist, size, offset);
8831  *other_elements_ptr = invlist_array(invlist) + 1;
8832  return invlist;
8833 }
8834
8835 #endif
8836
8837 PERL_STATIC_INLINE SV*
8838 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8839  return _add_range_to_invlist(invlist, cp, cp);
8840 }
8841
8842 #ifndef PERL_IN_XSUB_RE
8843 void
8844 Perl__invlist_invert(pTHX_ SV* const invlist)
8845 {
8846  /* Complement the input inversion list.  This adds a 0 if the list didn't
8847  * have a zero; removes it otherwise.  As described above, the data
8848  * structure is set up so that this is very efficient */
8849
8850  PERL_ARGS_ASSERT__INVLIST_INVERT;
8851
8852  assert(! invlist_is_iterating(invlist));
8853
8854  /* The inverse of matching nothing is matching everything */
8855  if (_invlist_len(invlist) == 0) {
8856   _append_range_to_invlist(invlist, 0, UV_MAX);
8857   return;
8858  }
8859
8860  *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8861 }
8862
8863 #endif
8864
8865 PERL_STATIC_INLINE SV*
8866 S_invlist_clone(pTHX_ SV* const invlist)
8867 {
8868
8869  /* Return a new inversion list that is a copy of the input one, which is
8870  * unchanged.  The new list will not be mortal even if the old one was. */
8871
8872  /* Need to allocate extra space to accommodate Perl's addition of a
8873  * trailing NUL to SvPV's, since it thinks they are always strings */
8874  SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8875  STRLEN physical_length = SvCUR(invlist);
8876  bool offset = *(get_invlist_offset_addr(invlist));
8877
8878  PERL_ARGS_ASSERT_INVLIST_CLONE;
8879
8880  *(get_invlist_offset_addr(new_invlist)) = offset;
8881  invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8882  Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8883
8884  return new_invlist;
8885 }
8886
8887 PERL_STATIC_INLINE STRLEN*
8888 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8889 {
8890  /* Return the address of the UV that contains the current iteration
8891  * position */
8892
8893  PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8894
8895  assert(SvTYPE(invlist) == SVt_INVLIST);
8896
8897  return &(((XINVLIST*) SvANY(invlist))->iterator);
8898 }
8899
8900 PERL_STATIC_INLINE void
8901 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8902 {
8903  PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8904
8905  *get_invlist_iter_addr(invlist) = 0;
8906 }
8907
8908 PERL_STATIC_INLINE void
8909 S_invlist_iterfinish(pTHX_ SV* invlist)
8910 {
8911  /* Terminate iterator for invlist.  This is to catch development errors.
8912  * Any iteration that is interrupted before completed should call this
8913  * function.  Functions that add code points anywhere else but to the end
8914  * of an inversion list assert that they are not in the middle of an
8915  * iteration.  If they were, the addition would make the iteration
8916  * problematical: if the iteration hadn't reached the place where things
8917  * were being added, it would be ok */
8918
8919  PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8920
8921  *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8922 }
8923
8924 STATIC bool
8925 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8926 {
8927  /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8928  * This call sets in <*start> and <*end>, the next range in <invlist>.
8929  * Returns <TRUE> if successful and the next call will return the next
8930  * range; <FALSE> if was already at the end of the list.  If the latter,
8931  * <*start> and <*end> are unchanged, and the next call to this function
8932  * will start over at the beginning of the list */
8933
8934  STRLEN* pos = get_invlist_iter_addr(invlist);
8935  UV len = _invlist_len(invlist);
8936  UV *array;
8937
8938  PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8939
8940  if (*pos >= len) {
8941   *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8942   return FALSE;
8943  }
8944
8945  array = invlist_array(invlist);
8946
8947  *start = array[(*pos)++];
8948
8949  if (*pos >= len) {
8950   *end = UV_MAX;
8951  }
8952  else {
8953   *end = array[(*pos)++] - 1;
8954  }
8955
8956  return TRUE;
8957 }
8958
8959 PERL_STATIC_INLINE bool
8960 S_invlist_is_iterating(pTHX_ SV* const invlist)
8961 {
8962  PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8963
8964  return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8965 }
8966
8967 PERL_STATIC_INLINE UV
8968 S_invlist_highest(pTHX_ SV* const invlist)
8969 {
8970  /* Returns the highest code point that matches an inversion list.  This API
8971  * has an ambiguity, as it returns 0 under either the highest is actually
8972  * 0, or if the list is empty.  If this distinction matters to you, check
8973  * for emptiness before calling this function */
8974
8975  UV len = _invlist_len(invlist);
8976  UV *array;
8977
8978  PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8979
8980  if (len == 0) {
8981   return 0;
8982  }
8983
8984  array = invlist_array(invlist);
8985
8986  /* The last element in the array in the inversion list always starts a
8987  * range that goes to infinity.  That range may be for code points that are
8988  * matched in the inversion list, or it may be for ones that aren't
8989  * matched.  In the latter case, the highest code point in the set is one
8990  * less than the beginning of this range; otherwise it is the final element
8991  * of this range: infinity */
8992  return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8993   ? UV_MAX
8994   : array[len - 1] - 1;
8995 }
8996
8997 #ifndef PERL_IN_XSUB_RE
8998 SV *
8999 Perl__invlist_contents(pTHX_ SV* const invlist)
9000 {
9001  /* Get the contents of an inversion list into a string SV so that they can
9002  * be printed out.  It uses the format traditionally done for debug tracing
9003  */
9004
9005  UV start, end;
9006  SV* output = newSVpvs("\n");
9007
9008  PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9009
9010  assert(! invlist_is_iterating(invlist));
9011
9012  invlist_iterinit(invlist);
9013  while (invlist_iternext(invlist, &start, &end)) {
9014   if (end == UV_MAX) {
9015    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9016   }
9017   else if (end != start) {
9018    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9019      start,       end);
9020   }
9021   else {
9022    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9023   }
9024  }
9025
9026  return output;
9027 }
9028 #endif
9029
9030 #ifndef PERL_IN_XSUB_RE
9031 void
9032 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9033       const char * const indent, SV* const invlist)
9034 {
9035  /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9036  * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9037  * the string 'indent'.  The output looks like this:
9038   [0] 0x000A .. 0x000D
9039   [2] 0x0085
9040   [4] 0x2028 .. 0x2029
9041   [6] 0x3104 .. INFINITY
9042  * This means that the first range of code points matched by the list are
9043  * 0xA through 0xD; the second range contains only the single code point
9044  * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9045  * are used to define each range (except if the final range extends to
9046  * infinity, only a single element is needed).  The array index of the
9047  * first element for the corresponding range is given in brackets. */
9048
9049  UV start, end;
9050  STRLEN count = 0;
9051
9052  PERL_ARGS_ASSERT__INVLIST_DUMP;
9053
9054  if (invlist_is_iterating(invlist)) {
9055   Perl_dump_indent(aTHX_ level, file,
9056    "%sCan't dump inversion list because is in middle of iterating\n",
9057    indent);
9058   return;
9059  }
9060
9061  invlist_iterinit(invlist);
9062  while (invlist_iternext(invlist, &start, &end)) {
9063   if (end == UV_MAX) {
9064    Perl_dump_indent(aTHX_ level, file,
9065          "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9066         indent, (UV)count, start);
9067   }
9068   else if (end != start) {
9069    Perl_dump_indent(aTHX_ level, file,
9070          "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9071         indent, (UV)count, start,         end);
9072   }
9073   else {
9074    Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9075            indent, (UV)count, start);
9076   }
9077   count += 2;
9078  }
9079 }
9080 #endif
9081
9082 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9083 bool
9084 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9085 {
9086  /* Return a boolean as to if the two passed in inversion lists are
9087  * identical.  The final argument, if TRUE, says to take the complement of
9088  * the second inversion list before doing the comparison */
9089
9090  const UV* array_a = invlist_array(a);
9091  const UV* array_b = invlist_array(b);
9092  UV len_a = _invlist_len(a);
9093  UV len_b = _invlist_len(b);
9094
9095  UV i = 0;      /* current index into the arrays */
9096  bool retval = TRUE;     /* Assume are identical until proven otherwise */
9097
9098  PERL_ARGS_ASSERT__INVLISTEQ;
9099
9100  /* If are to compare 'a' with the complement of b, set it
9101  * up so are looking at b's complement. */
9102  if (complement_b) {
9103
9104   /* The complement of nothing is everything, so <a> would have to have
9105   * just one element, starting at zero (ending at infinity) */
9106   if (len_b == 0) {
9107    return (len_a == 1 && array_a[0] == 0);
9108   }
9109   else if (array_b[0] == 0) {
9110
9111    /* Otherwise, to complement, we invert.  Here, the first element is
9112    * 0, just remove it.  To do this, we just pretend the array starts
9113    * one later */
9114
9115    array_b++;
9116    len_b--;
9117   }
9118   else {
9119
9120    /* But if the first element is not zero, we pretend the list starts
9121    * at the 0 that is always stored immediately before the array. */
9122    array_b--;
9123    len_b++;
9124   }
9125  }
9126
9127  /* Make sure that the lengths are the same, as well as the final element
9128  * before looping through the remainder.  (Thus we test the length, final,
9129  * and first elements right off the bat) */
9130  if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9131   retval = FALSE;
9132  }
9133  else for (i = 0; i < len_a - 1; i++) {
9134   if (array_a[i] != array_b[i]) {
9135    retval = FALSE;
9136    break;
9137   }
9138  }
9139
9140  return retval;
9141 }
9142 #endif
9143
9144 #undef HEADER_LENGTH
9145 #undef TO_INTERNAL_SIZE
9146 #undef FROM_INTERNAL_SIZE
9147 #undef INVLIST_VERSION_ID
9148
9149 /* End of inversion list object */
9150
9151 STATIC void
9152 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9153 {
9154  /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9155  * constructs, and updates RExC_flags with them.  On input, RExC_parse
9156  * should point to the first flag; it is updated on output to point to the
9157  * final ')' or ':'.  There needs to be at least one flag, or this will
9158  * abort */
9159
9160  /* for (?g), (?gc), and (?o) warnings; warning
9161  about (?c) will warn about (?g) -- japhy    */
9162
9163 #define WASTED_O  0x01
9164 #define WASTED_G  0x02
9165 #define WASTED_C  0x04
9166 #define WASTED_GC (WASTED_G|WASTED_C)
9167  I32 wastedflags = 0x00;
9168  U32 posflags = 0, negflags = 0;
9169  U32 *flagsp = &posflags;
9170  char has_charset_modifier = '\0';
9171  regex_charset cs;
9172  bool has_use_defaults = FALSE;
9173  const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9174
9175  PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9176
9177  /* '^' as an initial flag sets certain defaults */
9178  if (UCHARAT(RExC_parse) == '^') {
9179   RExC_parse++;
9180   has_use_defaults = TRUE;
9181   STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9182   set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9183           ? REGEX_UNICODE_CHARSET
9184           : REGEX_DEPENDS_CHARSET);
9185  }
9186
9187  cs = get_regex_charset(RExC_flags);
9188  if (cs == REGEX_DEPENDS_CHARSET
9189   && (RExC_utf8 || RExC_uni_semantics))
9190  {
9191   cs = REGEX_UNICODE_CHARSET;
9192  }
9193
9194  while (*RExC_parse) {
9195   /* && strchr("iogcmsx", *RExC_parse) */
9196   /* (?g), (?gc) and (?o) are useless here
9197   and must be globally applied -- japhy */
9198   switch (*RExC_parse) {
9199
9200    /* Code for the imsx flags */
9201    CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9202
9203    case LOCALE_PAT_MOD:
9204     if (has_charset_modifier) {
9205      goto excess_modifier;
9206     }
9207     else if (flagsp == &negflags) {
9208      goto neg_modifier;
9209     }
9210     cs = REGEX_LOCALE_CHARSET;
9211     has_charset_modifier = LOCALE_PAT_MOD;
9212     break;
9213    case UNICODE_PAT_MOD:
9214     if (has_charset_modifier) {
9215      goto excess_modifier;
9216     }
9217     else if (flagsp == &negflags) {
9218      goto neg_modifier;
9219     }
9220     cs = REGEX_UNICODE_CHARSET;
9221     has_charset_modifier = UNICODE_PAT_MOD;
9222     break;
9223    case ASCII_RESTRICT_PAT_MOD:
9224     if (flagsp == &negflags) {
9225      goto neg_modifier;
9226     }
9227     if (has_charset_modifier) {
9228      if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9229       goto excess_modifier;
9230      }
9231      /* Doubled modifier implies more restricted */
9232      cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9233     }
9234     else {
9235      cs = REGEX_ASCII_RESTRICTED_CHARSET;
9236     }
9237     has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9238     break;
9239    case DEPENDS_PAT_MOD:
9240     if (has_use_defaults) {
9241      goto fail_modifiers;
9242     }
9243     else if (flagsp == &negflags) {
9244      goto neg_modifier;
9245     }
9246     else if (has_charset_modifier) {
9247      goto excess_modifier;
9248     }
9249
9250     /* The dual charset means unicode semantics if the
9251     * pattern (or target, not known until runtime) are
9252     * utf8, or something in the pattern indicates unicode
9253     * semantics */
9254     cs = (RExC_utf8 || RExC_uni_semantics)
9255      ? REGEX_UNICODE_CHARSET
9256      : REGEX_DEPENDS_CHARSET;
9257     has_charset_modifier = DEPENDS_PAT_MOD;
9258     break;
9259    excess_modifier:
9260     RExC_parse++;
9261     if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9262      vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9263     }
9264     else if (has_charset_modifier == *(RExC_parse - 1)) {
9265      vFAIL2("Regexp modifier \"%c\" may not appear twice",
9266           *(RExC_parse - 1));
9267     }
9268     else {
9269      vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9270     }
9271     /*NOTREACHED*/
9272    neg_modifier:
9273     RExC_parse++;
9274     vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9275          *(RExC_parse - 1));
9276     /*NOTREACHED*/
9277    case ONCE_PAT_MOD: /* 'o' */
9278    case GLOBAL_PAT_MOD: /* 'g' */
9279     if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9280      const I32 wflagbit = *RExC_parse == 'o'
9281           ? WASTED_O
9282           : WASTED_G;
9283      if (! (wastedflags & wflagbit) ) {
9284       wastedflags |= wflagbit;
9285       /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9286       vWARN5(
9287        RExC_parse + 1,
9288        "Useless (%s%c) - %suse /%c modifier",
9289        flagsp == &negflags ? "?-" : "?",
9290        *RExC_parse,
9291        flagsp == &negflags ? "don't " : "",
9292        *RExC_parse
9293       );
9294      }
9295     }
9296     break;
9297
9298    case CONTINUE_PAT_MOD: /* 'c' */
9299     if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9300      if (! (wastedflags & WASTED_C) ) {
9301       wastedflags |= WASTED_GC;
9302       /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9303       vWARN3(
9304        RExC_parse + 1,
9305        "Useless (%sc) - %suse /gc modifier",
9306        flagsp == &negflags ? "?-" : "?",
9307        flagsp == &negflags ? "don't " : ""
9308       );
9309      }
9310     }
9311     break;
9312    case KEEPCOPY_PAT_MOD: /* 'p' */
9313     if (flagsp == &negflags) {
9314      if (SIZE_ONLY)
9315       ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9316     } else {
9317      *flagsp |= RXf_PMf_KEEPCOPY;
9318     }
9319     break;
9320    case '-':
9321     /* A flag is a default iff it is following a minus, so
9322     * if there is a minus, it means will be trying to
9323     * re-specify a default which is an error */
9324     if (has_use_defaults || flagsp == &negflags) {
9325      goto fail_modifiers;
9326     }
9327     flagsp = &negflags;
9328     wastedflags = 0;  /* reset so (?g-c) warns twice */
9329     break;
9330    case ':':
9331    case ')':
9332     RExC_flags |= posflags;
9333     RExC_flags &= ~negflags;
9334     set_regex_charset(&RExC_flags, cs);
9335     if (RExC_flags & RXf_PMf_FOLD) {
9336      RExC_contains_i = 1;
9337     }
9338     return;
9339     /*NOTREACHED*/
9340    default:
9341    fail_modifiers:
9342     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9343     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9344     vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9345      UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9346     /*NOTREACHED*/
9347   }
9348
9349   ++RExC_parse;
9350  }
9351 }
9352
9353 /*
9354  - reg - regular expression, i.e. main body or parenthesized thing
9355  *
9356  * Caller must absorb opening parenthesis.
9357  *
9358  * Combining parenthesis handling with the base level of regular expression
9359  * is a trifle forced, but the need to tie the tails of the branches to what
9360  * follows makes it hard to avoid.
9361  */
9362 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9363 #ifdef DEBUGGING
9364 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9365 #else
9366 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9367 #endif
9368
9369 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9370    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9371    needs to be restarted.
9372    Otherwise would only return NULL if regbranch() returns NULL, which
9373    cannot happen.  */
9374 STATIC regnode *
9375 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9376  /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9377  * 2 is like 1, but indicates that nextchar() has been called to advance
9378  * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9379  * this flag alerts us to the need to check for that */
9380 {
9381  dVAR;
9382  regnode *ret;  /* Will be the head of the group. */
9383  regnode *br;
9384  regnode *lastbr;
9385  regnode *ender = NULL;
9386  I32 parno = 0;
9387  I32 flags;
9388  U32 oregflags = RExC_flags;
9389  bool have_branch = 0;
9390  bool is_open = 0;
9391  I32 freeze_paren = 0;
9392  I32 after_freeze = 0;
9393
9394  char * parse_start = RExC_parse; /* MJD */
9395  char * const oregcomp_parse = RExC_parse;
9396
9397  GET_RE_DEBUG_FLAGS_DECL;
9398
9399  PERL_ARGS_ASSERT_REG;
9400  DEBUG_PARSE("reg ");
9401
9402  *flagp = 0;    /* Tentatively. */
9403
9404
9405  /* Make an OPEN node, if parenthesized. */
9406  if (paren) {
9407
9408   /* Under /x, space and comments can be gobbled up between the '(' and
9409   * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9410   * intervening space, as the sequence is a token, and a token should be
9411   * indivisible */
9412   bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9413
9414   if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9415    char *start_verb = RExC_parse;
9416    STRLEN verb_len = 0;
9417    char *start_arg = NULL;
9418    unsigned char op = 0;
9419    int argok = 1;
9420    int internal_argval = 0; /* internal_argval is only useful if
9421           !argok */
9422
9423    if (has_intervening_patws && SIZE_ONLY) {
9424     ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
9425    }
9426    while ( *RExC_parse && *RExC_parse != ')' ) {
9427     if ( *RExC_parse == ':' ) {
9428      start_arg = RExC_parse + 1;
9429      break;
9430     }
9431     RExC_parse++;
9432    }
9433    ++start_verb;
9434    verb_len = RExC_parse - start_verb;
9435    if ( start_arg ) {
9436     RExC_parse++;
9437     while ( *RExC_parse && *RExC_parse != ')' )
9438      RExC_parse++;
9439     if ( *RExC_parse != ')' )
9440      vFAIL("Unterminated verb pattern argument");
9441     if ( RExC_parse == start_arg )
9442      start_arg = NULL;
9443    } else {
9444     if ( *RExC_parse != ')' )
9445      vFAIL("Unterminated verb pattern");
9446    }
9447
9448    switch ( *start_verb ) {
9449    case 'A':  /* (*ACCEPT) */
9450     if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9451      op = ACCEPT;
9452      internal_argval = RExC_nestroot;
9453     }
9454     break;
9455    case 'C':  /* (*COMMIT) */
9456     if ( memEQs(start_verb,verb_len,"COMMIT") )
9457      op = COMMIT;
9458     break;
9459    case 'F':  /* (*FAIL) */
9460     if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9461      op = OPFAIL;
9462      argok = 0;
9463     }
9464     break;
9465    case ':':  /* (*:NAME) */
9466    case 'M':  /* (*MARK:NAME) */
9467     if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9468      op = MARKPOINT;
9469      argok = -1;
9470     }
9471     break;
9472    case 'P':  /* (*PRUNE) */
9473     if ( memEQs(start_verb,verb_len,"PRUNE") )
9474      op = PRUNE;
9475     break;
9476    case 'S':   /* (*SKIP) */
9477     if ( memEQs(start_verb,verb_len,"SKIP") )
9478      op = SKIP;
9479     break;
9480    case 'T':  /* (*THEN) */
9481     /* [19:06] <TimToady> :: is then */
9482     if ( memEQs(start_verb,verb_len,"THEN") ) {
9483      op = CUTGROUP;
9484      RExC_seen |= REG_CUTGROUP_SEEN;
9485     }
9486     break;
9487    }
9488    if ( ! op ) {
9489     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9490     vFAIL2utf8f(
9491      "Unknown verb pattern '%"UTF8f"'",
9492      UTF8fARG(UTF, verb_len, start_verb));
9493    }
9494    if ( argok ) {
9495     if ( start_arg && internal_argval ) {
9496      vFAIL3("Verb pattern '%.*s' may not have an argument",
9497       verb_len, start_verb);
9498     } else if ( argok < 0 && !start_arg ) {
9499      vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9500       verb_len, start_verb);
9501     } else {
9502      ret = reganode(pRExC_state, op, internal_argval);
9503      if ( ! internal_argval && ! SIZE_ONLY ) {
9504       if (start_arg) {
9505        SV *sv = newSVpvn( start_arg,
9506            RExC_parse - start_arg);
9507        ARG(ret) = add_data( pRExC_state,
9508             STR_WITH_LEN("S"));
9509        RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9510        ret->flags = 0;
9511       } else {
9512        ret->flags = 1;
9513       }
9514      }
9515     }
9516     if (!internal_argval)
9517      RExC_seen |= REG_VERBARG_SEEN;
9518    } else if ( start_arg ) {
9519     vFAIL3("Verb pattern '%.*s' may not have an argument",
9520       verb_len, start_verb);
9521    } else {
9522     ret = reg_node(pRExC_state, op);
9523    }
9524    nextchar(pRExC_state);
9525    return ret;
9526   }
9527   else if (*RExC_parse == '?') { /* (?...) */
9528    bool is_logical = 0;
9529    const char * const seqstart = RExC_parse;
9530    if (has_intervening_patws && SIZE_ONLY) {
9531     ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
9532    }
9533
9534    RExC_parse++;
9535    paren = *RExC_parse++;
9536    ret = NULL;   /* For look-ahead/behind. */
9537    switch (paren) {
9538
9539    case 'P': /* (?P...) variants for those used to PCRE/Python */
9540     paren = *RExC_parse++;
9541     if ( paren == '<')         /* (?P<...>) named capture */
9542      goto named_capture;
9543     else if (paren == '>') {   /* (?P>name) named recursion */
9544      goto named_recursion;
9545     }
9546     else if (paren == '=') {   /* (?P=...)  named backref */
9547      /* this pretty much dupes the code for \k<NAME> in
9548      * regatom(), if you change this make sure you change that
9549      * */
9550      char* name_start = RExC_parse;
9551      U32 num = 0;
9552      SV *sv_dat = reg_scan_name(pRExC_state,
9553       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9554      if (RExC_parse == name_start || *RExC_parse != ')')
9555       /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9556       vFAIL2("Sequence %.3s... not terminated",parse_start);
9557
9558      if (!SIZE_ONLY) {
9559       num = add_data( pRExC_state, STR_WITH_LEN("S"));
9560       RExC_rxi->data->data[num]=(void*)sv_dat;
9561       SvREFCNT_inc_simple_void(sv_dat);
9562      }
9563      RExC_sawback = 1;
9564      ret = reganode(pRExC_state,
9565         ((! FOLD)
9566          ? NREF
9567          : (ASCII_FOLD_RESTRICTED)
9568          ? NREFFA
9569          : (AT_LEAST_UNI_SEMANTICS)
9570           ? NREFFU
9571           : (LOC)
9572           ? NREFFL
9573           : NREFF),
9574          num);
9575      *flagp |= HASWIDTH;
9576
9577      Set_Node_Offset(ret, parse_start+1);
9578      Set_Node_Cur_Length(ret, parse_start);
9579
9580      nextchar(pRExC_state);
9581      return ret;
9582     }
9583     RExC_parse++;
9584     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9585     vFAIL3("Sequence (%.*s...) not recognized",
9586         RExC_parse-seqstart, seqstart);
9587     /*NOTREACHED*/
9588    case '<':           /* (?<...) */
9589     if (*RExC_parse == '!')
9590      paren = ',';
9591     else if (*RExC_parse != '=')
9592    named_capture:
9593     {               /* (?<...>) */
9594      char *name_start;
9595      SV *svname;
9596      paren= '>';
9597    case '\'':          /* (?'...') */
9598       name_start= RExC_parse;
9599       svname = reg_scan_name(pRExC_state,
9600       SIZE_ONLY    /* reverse test from the others */
9601       ? REG_RSN_RETURN_NAME
9602       : REG_RSN_RETURN_NULL);
9603      if (RExC_parse == name_start || *RExC_parse != paren)
9604       vFAIL2("Sequence (?%c... not terminated",
9605        paren=='>' ? '<' : paren);
9606      if (SIZE_ONLY) {
9607       HE *he_str;
9608       SV *sv_dat = NULL;
9609       if (!svname) /* shouldn't happen */
9610        Perl_croak(aTHX_
9611         "panic: reg_scan_name returned NULL");
9612       if (!RExC_paren_names) {
9613        RExC_paren_names= newHV();
9614        sv_2mortal(MUTABLE_SV(RExC_paren_names));
9615 #ifdef DEBUGGING
9616        RExC_paren_name_list= newAV();
9617        sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9618 #endif
9619       }
9620       he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9621       if ( he_str )
9622        sv_dat = HeVAL(he_str);
9623       if ( ! sv_dat ) {
9624        /* croak baby croak */
9625        Perl_croak(aTHX_
9626         "panic: paren_name hash element allocation failed");
9627       } else if ( SvPOK(sv_dat) ) {
9628        /* (?|...) can mean we have dupes so scan to check
9629        its already been stored. Maybe a flag indicating
9630        we are inside such a construct would be useful,
9631        but the arrays are likely to be quite small, so
9632        for now we punt -- dmq */
9633        IV count = SvIV(sv_dat);
9634        I32 *pv = (I32*)SvPVX(sv_dat);
9635        IV i;
9636        for ( i = 0 ; i < count ; i++ ) {
9637         if ( pv[i] == RExC_npar ) {
9638          count = 0;
9639          break;
9640         }
9641        }
9642        if ( count ) {
9643         pv = (I32*)SvGROW(sv_dat,
9644             SvCUR(sv_dat) + sizeof(I32)+1);
9645         SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9646         pv[count] = RExC_npar;
9647         SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9648        }
9649       } else {
9650        (void)SvUPGRADE(sv_dat,SVt_PVNV);
9651        sv_setpvn(sv_dat, (char *)&(RExC_npar),
9652                 sizeof(I32));
9653        SvIOK_on(sv_dat);
9654        SvIV_set(sv_dat, 1);
9655       }
9656 #ifdef DEBUGGING
9657       /* Yes this does cause a memory leak in debugging Perls
9658       * */
9659       if (!av_store(RExC_paren_name_list,
9660          RExC_npar, SvREFCNT_inc(svname)))
9661        SvREFCNT_dec_NN(svname);
9662 #endif
9663
9664       /*sv_dump(sv_dat);*/
9665      }
9666      nextchar(pRExC_state);
9667      paren = 1;
9668      goto capturing_parens;
9669     }
9670     RExC_seen |= REG_LOOKBEHIND_SEEN;
9671     RExC_in_lookbehind++;
9672     RExC_parse++;
9673    case '=':           /* (?=...) */
9674     RExC_seen_zerolen++;
9675     break;
9676    case '!':           /* (?!...) */
9677     RExC_seen_zerolen++;
9678     if (*RExC_parse == ')') {
9679      ret=reg_node(pRExC_state, OPFAIL);
9680      nextchar(pRExC_state);
9681      return ret;
9682     }
9683     break;
9684    case '|':           /* (?|...) */
9685     /* branch reset, behave like a (?:...) except that
9686     buffers in alternations share the same numbers */
9687     paren = ':';
9688     after_freeze = freeze_paren = RExC_npar;
9689     break;
9690    case ':':           /* (?:...) */
9691    case '>':           /* (?>...) */
9692     break;
9693    case '$':           /* (?$...) */
9694    case '@':           /* (?@...) */
9695     vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9696     break;
9697    case '#':           /* (?#...) */
9698     /* XXX As soon as we disallow separating the '?' and '*' (by
9699     * spaces or (?#...) comment), it is believed that this case
9700     * will be unreachable and can be removed.  See
9701     * [perl #117327] */
9702     while (*RExC_parse && *RExC_parse != ')')
9703      RExC_parse++;
9704     if (*RExC_parse != ')')
9705      FAIL("Sequence (?#... not terminated");
9706     nextchar(pRExC_state);
9707     *flagp = TRYAGAIN;
9708     return NULL;
9709    case '0' :           /* (?0) */
9710    case 'R' :           /* (?R) */
9711     if (*RExC_parse != ')')
9712      FAIL("Sequence (?R) not terminated");
9713     ret = reg_node(pRExC_state, GOSTART);
9714      RExC_seen |= REG_GOSTART_SEEN;
9715     *flagp |= POSTPONED;
9716     nextchar(pRExC_state);
9717     return ret;
9718     /*notreached*/
9719    { /* named and numeric backreferences */
9720     I32 num;
9721    case '&':            /* (?&NAME) */
9722     parse_start = RExC_parse - 1;
9723    named_recursion:
9724     {
9725       SV *sv_dat = reg_scan_name(pRExC_state,
9726        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9727       num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9728     }
9729     if (RExC_parse == RExC_end || *RExC_parse != ')')
9730      vFAIL("Sequence (?&... not terminated");
9731     goto gen_recurse_regop;
9732     assert(0); /* NOT REACHED */
9733    case '+':
9734     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9735      RExC_parse++;
9736      vFAIL("Illegal pattern");
9737     }
9738     goto parse_recursion;
9739     /* NOT REACHED*/
9740    case '-': /* (?-1) */
9741     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9742      RExC_parse--; /* rewind to let it be handled later */
9743      goto parse_flags;
9744     }
9745     /*FALLTHROUGH */
9746    case '1': case '2': case '3': case '4': /* (?1) */
9747    case '5': case '6': case '7': case '8': case '9':
9748     RExC_parse--;
9749    parse_recursion:
9750     num = atoi(RExC_parse);
9751     parse_start = RExC_parse - 1; /* MJD */
9752     if (*RExC_parse == '-')
9753      RExC_parse++;
9754     while (isDIGIT(*RExC_parse))
9755       RExC_parse++;
9756     if (*RExC_parse!=')')
9757      vFAIL("Expecting close bracket");
9758
9759    gen_recurse_regop:
9760     if ( paren == '-' ) {
9761      /*
9762      Diagram of capture buffer numbering.
9763      Top line is the normal capture buffer numbers
9764      Bottom line is the negative indexing as from
9765      the X (the (?-2))
9766
9767      +   1 2    3 4 5 X          6 7
9768      /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9769      -   5 4    3 2 1 X          x x
9770
9771      */
9772      num = RExC_npar + num;
9773      if (num < 1)  {
9774       RExC_parse++;
9775       vFAIL("Reference to nonexistent group");
9776      }
9777     } else if ( paren == '+' ) {
9778      num = RExC_npar + num - 1;
9779     }
9780
9781     ret = reganode(pRExC_state, GOSUB, num);
9782     if (!SIZE_ONLY) {
9783      if (num > (I32)RExC_rx->nparens) {
9784       RExC_parse++;
9785       vFAIL("Reference to nonexistent group");
9786      }
9787      ARG2L_SET( ret, RExC_recurse_count++);
9788      RExC_emit++;
9789      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9790       "Recurse #%"UVuf" to %"IVdf"\n",
9791        (UV)ARG(ret), (IV)ARG2L(ret)));
9792     } else {
9793      RExC_size++;
9794      }
9795      RExC_seen |= REG_RECURSE_SEEN;
9796     Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9797     Set_Node_Offset(ret, parse_start); /* MJD */
9798
9799     *flagp |= POSTPONED;
9800     nextchar(pRExC_state);
9801     return ret;
9802    } /* named and numeric backreferences */
9803    assert(0); /* NOT REACHED */
9804
9805    case '?':           /* (??...) */
9806     is_logical = 1;
9807     if (*RExC_parse != '{') {
9808      RExC_parse++;
9809      /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9810      vFAIL2utf8f(
9811       "Sequence (%"UTF8f"...) not recognized",
9812       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9813      /*NOTREACHED*/
9814     }
9815     *flagp |= POSTPONED;
9816     paren = *RExC_parse++;
9817     /* FALL THROUGH */
9818    case '{':           /* (?{...}) */
9819    {
9820     U32 n = 0;
9821     struct reg_code_block *cb;
9822
9823     RExC_seen_zerolen++;
9824
9825     if (   !pRExC_state->num_code_blocks
9826      || pRExC_state->code_index >= pRExC_state->num_code_blocks
9827      || pRExC_state->code_blocks[pRExC_state->code_index].start
9828       != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9829        - RExC_start)
9830     ) {
9831      if (RExC_pm_flags & PMf_USE_RE_EVAL)
9832       FAIL("panic: Sequence (?{...}): no code block found\n");
9833      FAIL("Eval-group not allowed at runtime, use re 'eval'");
9834     }
9835     /* this is a pre-compiled code block (?{...}) */
9836     cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9837     RExC_parse = RExC_start + cb->end;
9838     if (!SIZE_ONLY) {
9839      OP *o = cb->block;
9840      if (cb->src_regex) {
9841       n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9842       RExC_rxi->data->data[n] =
9843        (void*)SvREFCNT_inc((SV*)cb->src_regex);
9844       RExC_rxi->data->data[n+1] = (void*)o;
9845      }
9846      else {
9847       n = add_data(pRExC_state,
9848        (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9849       RExC_rxi->data->data[n] = (void*)o;
9850      }
9851     }
9852     pRExC_state->code_index++;
9853     nextchar(pRExC_state);
9854
9855     if (is_logical) {
9856      regnode *eval;
9857      ret = reg_node(pRExC_state, LOGICAL);
9858      eval = reganode(pRExC_state, EVAL, n);
9859      if (!SIZE_ONLY) {
9860       ret->flags = 2;
9861       /* for later propagation into (??{}) return value */
9862       eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9863      }
9864      REGTAIL(pRExC_state, ret, eval);
9865      /* deal with the length of this later - MJD */
9866      return ret;
9867     }
9868     ret = reganode(pRExC_state, EVAL, n);
9869     Set_Node_Length(ret, RExC_parse - parse_start + 1);
9870     Set_Node_Offset(ret, parse_start);
9871     return ret;
9872    }
9873    case '(':           /* (?(?{...})...) and (?(?=...)...) */
9874    {
9875     int is_define= 0;
9876     if (RExC_parse[0] == '?') {        /* (?(?...)) */
9877      if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9878       || RExC_parse[1] == '<'
9879       || RExC_parse[1] == '{') { /* Lookahead or eval. */
9880       I32 flag;
9881       regnode *tail;
9882
9883       ret = reg_node(pRExC_state, LOGICAL);
9884       if (!SIZE_ONLY)
9885        ret->flags = 1;
9886
9887       tail = reg(pRExC_state, 1, &flag, depth+1);
9888       if (flag & RESTART_UTF8) {
9889        *flagp = RESTART_UTF8;
9890        return NULL;
9891       }
9892       REGTAIL(pRExC_state, ret, tail);
9893       goto insert_if;
9894      }
9895     }
9896     else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9897       || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9898     {
9899      char ch = RExC_parse[0] == '<' ? '>' : '\'';
9900      char *name_start= RExC_parse++;
9901      U32 num = 0;
9902      SV *sv_dat=reg_scan_name(pRExC_state,
9903       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9904      if (RExC_parse == name_start || *RExC_parse != ch)
9905       vFAIL2("Sequence (?(%c... not terminated",
9906        (ch == '>' ? '<' : ch));
9907      RExC_parse++;
9908      if (!SIZE_ONLY) {
9909       num = add_data( pRExC_state, STR_WITH_LEN("S"));
9910       RExC_rxi->data->data[num]=(void*)sv_dat;
9911       SvREFCNT_inc_simple_void(sv_dat);
9912      }
9913      ret = reganode(pRExC_state,NGROUPP,num);
9914      goto insert_if_check_paren;
9915     }
9916     else if (RExC_parse[0] == 'D' &&
9917       RExC_parse[1] == 'E' &&
9918       RExC_parse[2] == 'F' &&
9919       RExC_parse[3] == 'I' &&
9920       RExC_parse[4] == 'N' &&
9921       RExC_parse[5] == 'E')
9922     {
9923      ret = reganode(pRExC_state,DEFINEP,0);
9924      RExC_parse +=6 ;
9925      is_define = 1;
9926      goto insert_if_check_paren;
9927     }
9928     else if (RExC_parse[0] == 'R') {
9929      RExC_parse++;
9930      parno = 0;
9931      if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9932       parno = atoi(RExC_parse++);
9933       while (isDIGIT(*RExC_parse))
9934        RExC_parse++;
9935      } else if (RExC_parse[0] == '&') {
9936       SV *sv_dat;
9937       RExC_parse++;
9938       sv_dat = reg_scan_name(pRExC_state,
9939        SIZE_ONLY
9940        ? REG_RSN_RETURN_NULL
9941        : REG_RSN_RETURN_DATA);
9942        parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9943      }
9944      ret = reganode(pRExC_state,INSUBP,parno);
9945      goto insert_if_check_paren;
9946     }
9947     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9948      /* (?(1)...) */
9949      char c;
9950      char *tmp;
9951      parno = atoi(RExC_parse++);
9952
9953      while (isDIGIT(*RExC_parse))
9954       RExC_parse++;
9955      ret = reganode(pRExC_state, GROUPP, parno);
9956
9957     insert_if_check_paren:
9958      if (*(tmp = nextchar(pRExC_state)) != ')') {
9959       /* nextchar also skips comments, so undo its work
9960       * and skip over the the next character.
9961       */
9962       RExC_parse = tmp;
9963       RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9964       vFAIL("Switch condition not recognized");
9965      }
9966     insert_if:
9967      REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9968      br = regbranch(pRExC_state, &flags, 1,depth+1);
9969      if (br == NULL) {
9970       if (flags & RESTART_UTF8) {
9971        *flagp = RESTART_UTF8;
9972        return NULL;
9973       }
9974       FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9975        (UV) flags);
9976      } else
9977       REGTAIL(pRExC_state, br, reganode(pRExC_state,
9978               LONGJMP, 0));
9979      c = *nextchar(pRExC_state);
9980      if (flags&HASWIDTH)
9981       *flagp |= HASWIDTH;
9982      if (c == '|') {
9983       if (is_define)
9984        vFAIL("(?(DEFINE)....) does not allow branches");
9985
9986       /* Fake one for optimizer.  */
9987       lastbr = reganode(pRExC_state, IFTHEN, 0);
9988
9989       if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9990        if (flags & RESTART_UTF8) {
9991         *flagp = RESTART_UTF8;
9992         return NULL;
9993        }
9994        FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9995         (UV) flags);
9996       }
9997       REGTAIL(pRExC_state, ret, lastbr);
9998       if (flags&HASWIDTH)
9999        *flagp |= HASWIDTH;
10000       c = *nextchar(pRExC_state);
10001      }
10002      else
10003       lastbr = NULL;
10004      if (c != ')')
10005       vFAIL("Switch (?(condition)... contains too many branches");
10006      ender = reg_node(pRExC_state, TAIL);
10007      REGTAIL(pRExC_state, br, ender);
10008      if (lastbr) {
10009       REGTAIL(pRExC_state, lastbr, ender);
10010       REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10011      }
10012      else
10013       REGTAIL(pRExC_state, ret, ender);
10014      RExC_size++; /* XXX WHY do we need this?!!
10015          For large programs it seems to be required
10016          but I can't figure out why. -- dmq*/
10017      return ret;
10018     }
10019     else {
10020      RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10021      vFAIL("Unknown switch condition (?(...))");
10022     }
10023    }
10024    case '[':           /* (?[ ... ]) */
10025     return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10026           oregcomp_parse);
10027    case 0:
10028     RExC_parse--; /* for vFAIL to print correctly */
10029     vFAIL("Sequence (? incomplete");
10030     break;
10031    default: /* e.g., (?i) */
10032     --RExC_parse;
10033    parse_flags:
10034     parse_lparen_question_flags(pRExC_state);
10035     if (UCHARAT(RExC_parse) != ':') {
10036      nextchar(pRExC_state);
10037      *flagp = TRYAGAIN;
10038      return NULL;
10039     }
10040     paren = ':';
10041     nextchar(pRExC_state);
10042     ret = NULL;
10043     goto parse_rest;
10044    } /* end switch */
10045   }
10046   else {                  /* (...) */
10047   capturing_parens:
10048    parno = RExC_npar;
10049    RExC_npar++;
10050
10051    ret = reganode(pRExC_state, OPEN, parno);
10052    if (!SIZE_ONLY ){
10053     if (!RExC_nestroot)
10054      RExC_nestroot = parno;
10055     if (RExC_seen & REG_RECURSE_SEEN
10056      && !RExC_open_parens[parno-1])
10057     {
10058      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10059       "Setting open paren #%"IVdf" to %d\n",
10060       (IV)parno, REG_NODE_NUM(ret)));
10061      RExC_open_parens[parno-1]= ret;
10062     }
10063    }
10064    Set_Node_Length(ret, 1); /* MJD */
10065    Set_Node_Offset(ret, RExC_parse); /* MJD */
10066    is_open = 1;
10067   }
10068  }
10069  else                        /* ! paren */
10070   ret = NULL;
10071
10072    parse_rest:
10073  /* Pick up the branches, linking them together. */
10074  parse_start = RExC_parse;   /* MJD */
10075  br = regbranch(pRExC_state, &flags, 1,depth+1);
10076
10077  /*     branch_len = (paren != 0); */
10078
10079  if (br == NULL) {
10080   if (flags & RESTART_UTF8) {
10081    *flagp = RESTART_UTF8;
10082    return NULL;
10083   }
10084   FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10085  }
10086  if (*RExC_parse == '|') {
10087   if (!SIZE_ONLY && RExC_extralen) {
10088    reginsert(pRExC_state, BRANCHJ, br, depth+1);
10089   }
10090   else {                  /* MJD */
10091    reginsert(pRExC_state, BRANCH, br, depth+1);
10092    Set_Node_Length(br, paren != 0);
10093    Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10094   }
10095   have_branch = 1;
10096   if (SIZE_ONLY)
10097    RExC_extralen += 1;  /* For BRANCHJ-BRANCH. */
10098  }
10099  else if (paren == ':') {
10100   *flagp |= flags&SIMPLE;
10101  }
10102  if (is_open) {    /* Starts with OPEN. */
10103   REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10104  }
10105  else if (paren != '?')  /* Not Conditional */
10106   ret = br;
10107  *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10108  lastbr = br;
10109  while (*RExC_parse == '|') {
10110   if (!SIZE_ONLY && RExC_extralen) {
10111    ender = reganode(pRExC_state, LONGJMP,0);
10112
10113    /* Append to the previous. */
10114    REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10115   }
10116   if (SIZE_ONLY)
10117    RExC_extralen += 2;  /* Account for LONGJMP. */
10118   nextchar(pRExC_state);
10119   if (freeze_paren) {
10120    if (RExC_npar > after_freeze)
10121     after_freeze = RExC_npar;
10122    RExC_npar = freeze_paren;
10123   }
10124   br = regbranch(pRExC_state, &flags, 0, depth+1);
10125
10126   if (br == NULL) {
10127    if (flags & RESTART_UTF8) {
10128     *flagp = RESTART_UTF8;
10129     return NULL;
10130    }
10131    FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10132   }
10133   REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10134   lastbr = br;
10135   *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10136  }
10137
10138  if (have_branch || paren != ':') {
10139   /* Make a closing node, and hook it on the end. */
10140   switch (paren) {
10141   case ':':
10142    ender = reg_node(pRExC_state, TAIL);
10143    break;
10144   case 1: case 2:
10145    ender = reganode(pRExC_state, CLOSE, parno);
10146    if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10147     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10148       "Setting close paren #%"IVdf" to %d\n",
10149       (IV)parno, REG_NODE_NUM(ender)));
10150     RExC_close_parens[parno-1]= ender;
10151     if (RExC_nestroot == parno)
10152      RExC_nestroot = 0;
10153    }
10154    Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10155    Set_Node_Length(ender,1); /* MJD */
10156    break;
10157   case '<':
10158   case ',':
10159   case '=':
10160   case '!':
10161    *flagp &= ~HASWIDTH;
10162    /* FALL THROUGH */
10163   case '>':
10164    ender = reg_node(pRExC_state, SUCCEED);
10165    break;
10166   case 0:
10167    ender = reg_node(pRExC_state, END);
10168    if (!SIZE_ONLY) {
10169     assert(!RExC_opend); /* there can only be one! */
10170     RExC_opend = ender;
10171    }
10172    break;
10173   }
10174   DEBUG_PARSE_r(if (!SIZE_ONLY) {
10175    SV * const mysv_val1=sv_newmortal();
10176    SV * const mysv_val2=sv_newmortal();
10177    DEBUG_PARSE_MSG("lsbr");
10178    regprop(RExC_rx, mysv_val1, lastbr, NULL);
10179    regprop(RExC_rx, mysv_val2, ender, NULL);
10180    PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10181       SvPV_nolen_const(mysv_val1),
10182       (IV)REG_NODE_NUM(lastbr),
10183       SvPV_nolen_const(mysv_val2),
10184       (IV)REG_NODE_NUM(ender),
10185       (IV)(ender - lastbr)
10186    );
10187   });
10188   REGTAIL(pRExC_state, lastbr, ender);
10189
10190   if (have_branch && !SIZE_ONLY) {
10191    char is_nothing= 1;
10192    if (depth==1)
10193     RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10194
10195    /* Hook the tails of the branches to the closing node. */
10196    for (br = ret; br; br = regnext(br)) {
10197     const U8 op = PL_regkind[OP(br)];
10198     if (op == BRANCH) {
10199      REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10200      if ( OP(NEXTOPER(br)) != NOTHING
10201       || regnext(NEXTOPER(br)) != ender)
10202       is_nothing= 0;
10203     }
10204     else if (op == BRANCHJ) {
10205      REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10206      /* for now we always disable this optimisation * /
10207      if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10208       || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10209      */
10210       is_nothing= 0;
10211     }
10212    }
10213    if (is_nothing) {
10214     br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10215     DEBUG_PARSE_r(if (!SIZE_ONLY) {
10216      SV * const mysv_val1=sv_newmortal();
10217      SV * const mysv_val2=sv_newmortal();
10218      DEBUG_PARSE_MSG("NADA");
10219      regprop(RExC_rx, mysv_val1, ret, NULL);
10220      regprop(RExC_rx, mysv_val2, ender, NULL);
10221      PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10222         SvPV_nolen_const(mysv_val1),
10223         (IV)REG_NODE_NUM(ret),
10224         SvPV_nolen_const(mysv_val2),
10225         (IV)REG_NODE_NUM(ender),
10226         (IV)(ender - ret)
10227      );
10228     });
10229     OP(br)= NOTHING;
10230     if (OP(ender) == TAIL) {
10231      NEXT_OFF(br)= 0;
10232      RExC_emit= br + 1;
10233     } else {
10234      regnode *opt;
10235      for ( opt= br + 1; opt < ender ; opt++ )
10236       OP(opt)= OPTIMIZED;
10237      NEXT_OFF(br)= ender - br;
10238     }
10239    }
10240   }
10241  }
10242
10243  {
10244   const char *p;
10245   static const char parens[] = "=!<,>";
10246
10247   if (paren && (p = strchr(parens, paren))) {
10248    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10249    int flag = (p - parens) > 1;
10250
10251    if (paren == '>')
10252     node = SUSPEND, flag = 0;
10253    reginsert(pRExC_state, node,ret, depth+1);
10254    Set_Node_Cur_Length(ret, parse_start);
10255    Set_Node_Offset(ret, parse_start + 1);
10256    ret->flags = flag;
10257    REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10258   }
10259  }
10260
10261  /* Check for proper termination. */
10262  if (paren) {
10263   /* restore original flags, but keep (?p) */
10264   RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10265   if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10266    RExC_parse = oregcomp_parse;
10267    vFAIL("Unmatched (");
10268   }
10269  }
10270  else if (!paren && RExC_parse < RExC_end) {
10271   if (*RExC_parse == ')') {
10272    RExC_parse++;
10273    vFAIL("Unmatched )");
10274   }
10275   else
10276    FAIL("Junk on end of regexp"); /* "Can't happen". */
10277   assert(0); /* NOTREACHED */
10278  }
10279
10280  if (RExC_in_lookbehind) {
10281   RExC_in_lookbehind--;
10282  }
10283  if (after_freeze > RExC_npar)
10284   RExC_npar = after_freeze;
10285  return(ret);
10286 }
10287
10288 /*
10289  - regbranch - one alternative of an | operator
10290  *
10291  * Implements the concatenation operator.
10292  *
10293  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10294  * restarted.
10295  */
10296 STATIC regnode *
10297 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10298 {
10299  dVAR;
10300  regnode *ret;
10301  regnode *chain = NULL;
10302  regnode *latest;
10303  I32 flags = 0, c = 0;
10304  GET_RE_DEBUG_FLAGS_DECL;
10305
10306  PERL_ARGS_ASSERT_REGBRANCH;
10307
10308  DEBUG_PARSE("brnc");
10309
10310  if (first)
10311   ret = NULL;
10312  else {
10313   if (!SIZE_ONLY && RExC_extralen)
10314    ret = reganode(pRExC_state, BRANCHJ,0);
10315   else {
10316    ret = reg_node(pRExC_state, BRANCH);
10317    Set_Node_Length(ret, 1);
10318   }
10319  }
10320
10321  if (!first && SIZE_ONLY)
10322   RExC_extralen += 1;   /* BRANCHJ */
10323
10324  *flagp = WORST;   /* Tentatively. */
10325
10326  RExC_parse--;
10327  nextchar(pRExC_state);
10328  while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10329   flags &= ~TRYAGAIN;
10330   latest = regpiece(pRExC_state, &flags,depth+1);
10331   if (latest == NULL) {
10332    if (flags & TRYAGAIN)
10333     continue;
10334    if (flags & RESTART_UTF8) {
10335     *flagp = RESTART_UTF8;
10336     return NULL;
10337    }
10338    FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10339   }
10340   else if (ret == NULL)
10341    ret = latest;
10342   *flagp |= flags&(HASWIDTH|POSTPONED);
10343   if (chain == NULL)  /* First piece. */
10344    *flagp |= flags&SPSTART;
10345   else {
10346    RExC_naughty++;
10347    REGTAIL(pRExC_state, chain, latest);
10348   }
10349   chain = latest;
10350   c++;
10351  }
10352  if (chain == NULL) { /* Loop ran zero times. */
10353   chain = reg_node(pRExC_state, NOTHING);
10354   if (ret == NULL)
10355    ret = chain;
10356  }
10357  if (c == 1) {
10358   *flagp |= flags&SIMPLE;
10359  }
10360
10361  return ret;
10362 }
10363
10364 /*
10365  - regpiece - something followed by possible [*+?]
10366  *
10367  * Note that the branching code sequences used for ? and the general cases
10368  * of * and + are somewhat optimized:  they use the same NOTHING node as
10369  * both the endmarker for their branch list and the body of the last branch.
10370  * It might seem that this node could be dispensed with entirely, but the
10371  * endmarker role is not redundant.
10372  *
10373  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10374  * TRYAGAIN.
10375  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10376  * restarted.
10377  */
10378 STATIC regnode *
10379 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10380 {
10381  dVAR;
10382  regnode *ret;
10383  char op;
10384  char *next;
10385  I32 flags;
10386  const char * const origparse = RExC_parse;
10387  I32 min;
10388  I32 max = REG_INFTY;
10389 #ifdef RE_TRACK_PATTERN_OFFSETS
10390  char *parse_start;
10391 #endif
10392  const char *maxpos = NULL;
10393
10394  /* Save the original in case we change the emitted regop to a FAIL. */
10395  regnode * const orig_emit = RExC_emit;
10396
10397  GET_RE_DEBUG_FLAGS_DECL;
10398
10399  PERL_ARGS_ASSERT_REGPIECE;
10400
10401  DEBUG_PARSE("piec");
10402
10403  ret = regatom(pRExC_state, &flags,depth+1);
10404  if (ret == NULL) {
10405   if (flags & (TRYAGAIN|RESTART_UTF8))
10406    *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10407   else
10408    FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10409   return(NULL);
10410  }
10411
10412  op = *RExC_parse;
10413
10414  if (op == '{' && regcurly(RExC_parse, FALSE)) {
10415   maxpos = NULL;
10416 #ifdef RE_TRACK_PATTERN_OFFSETS
10417   parse_start = RExC_parse; /* MJD */
10418 #endif
10419   next = RExC_parse + 1;
10420   while (isDIGIT(*next) || *next == ',') {
10421    if (*next == ',') {
10422     if (maxpos)
10423      break;
10424     else
10425      maxpos = next;
10426    }
10427    next++;
10428   }
10429   if (*next == '}') {  /* got one */
10430    if (!maxpos)
10431     maxpos = next;
10432    RExC_parse++;
10433    min = atoi(RExC_parse);
10434    if (*maxpos == ',')
10435     maxpos++;
10436    else
10437     maxpos = RExC_parse;
10438    max = atoi(maxpos);
10439    if (!max && *maxpos != '0')
10440     max = REG_INFTY;  /* meaning "infinity" */
10441    else if (max >= REG_INFTY)
10442     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10443    RExC_parse = next;
10444    nextchar(pRExC_state);
10445    if (max < min) {    /* If can't match, warn and optimize to fail
10446         unconditionally */
10447     if (SIZE_ONLY) {
10448      ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10449
10450      /* We can't back off the size because we have to reserve
10451      * enough space for all the things we are about to throw
10452      * away, but we can shrink it by the ammount we are about
10453      * to re-use here */
10454      RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10455     }
10456     else {
10457      RExC_emit = orig_emit;
10458     }
10459     ret = reg_node(pRExC_state, OPFAIL);
10460     return ret;
10461    }
10462    else if (min == max
10463      && RExC_parse < RExC_end
10464      && (*RExC_parse == '?' || *RExC_parse == '+'))
10465    {
10466     if (SIZE_ONLY) {
10467      ckWARN2reg(RExC_parse + 1,
10468        "Useless use of greediness modifier '%c'",
10469        *RExC_parse);
10470     }
10471     /* Absorb the modifier, so later code doesn't see nor use
10472      * it */
10473     nextchar(pRExC_state);
10474    }
10475
10476   do_curly:
10477    if ((flags&SIMPLE)) {
10478     RExC_naughty += 2 + RExC_naughty / 2;
10479     reginsert(pRExC_state, CURLY, ret, depth+1);
10480     Set_Node_Offset(ret, parse_start+1); /* MJD */
10481     Set_Node_Cur_Length(ret, parse_start);
10482    }
10483    else {
10484     regnode * const w = reg_node(pRExC_state, WHILEM);
10485
10486     w->flags = 0;
10487     REGTAIL(pRExC_state, ret, w);
10488     if (!SIZE_ONLY && RExC_extralen) {
10489      reginsert(pRExC_state, LONGJMP,ret, depth+1);
10490      reginsert(pRExC_state, NOTHING,ret, depth+1);
10491      NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10492     }
10493     reginsert(pRExC_state, CURLYX,ret, depth+1);
10494         /* MJD hk */
10495     Set_Node_Offset(ret, parse_start+1);
10496     Set_Node_Length(ret,
10497         op == '{' ? (RExC_parse - parse_start) : 1);
10498
10499     if (!SIZE_ONLY && RExC_extralen)
10500      NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10501     REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10502     if (SIZE_ONLY)
10503      RExC_whilem_seen++, RExC_extralen += 3;
10504     RExC_naughty += 4 + RExC_naughty; /* compound interest */
10505    }
10506    ret->flags = 0;
10507
10508    if (min > 0)
10509     *flagp = WORST;
10510    if (max > 0)
10511     *flagp |= HASWIDTH;
10512    if (!SIZE_ONLY) {
10513     ARG1_SET(ret, (U16)min);
10514     ARG2_SET(ret, (U16)max);
10515    }
10516    if (max == REG_INFTY)
10517     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10518
10519    goto nest_check;
10520   }
10521  }
10522
10523  if (!ISMULT1(op)) {
10524   *flagp = flags;
10525   return(ret);
10526  }
10527
10528 #if 0    /* Now runtime fix should be reliable. */
10529
10530  /* if this is reinstated, don't forget to put this back into perldiag:
10531
10532    =item Regexp *+ operand could be empty at {#} in regex m/%s/
10533
10534   (F) The part of the regexp subject to either the * or + quantifier
10535   could match an empty string. The {#} shows in the regular
10536   expression about where the problem was discovered.
10537
10538  */
10539
10540  if (!(flags&HASWIDTH) && op != '?')
10541  vFAIL("Regexp *+ operand could be empty");
10542 #endif
10543
10544 #ifdef RE_TRACK_PATTERN_OFFSETS
10545  parse_start = RExC_parse;
10546 #endif
10547  nextchar(pRExC_state);
10548
10549  *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10550
10551  if (op == '*' && (flags&SIMPLE)) {
10552   reginsert(pRExC_state, STAR, ret, depth+1);
10553   ret->flags = 0;
10554   RExC_naughty += 4;
10555   RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10556  }
10557  else if (op == '*') {
10558   min = 0;
10559   goto do_curly;
10560  }
10561  else if (op == '+' && (flags&SIMPLE)) {
10562   reginsert(pRExC_state, PLUS, ret, depth+1);
10563   ret->flags = 0;
10564   RExC_naughty += 3;
10565   RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10566  }
10567  else if (op == '+') {
10568   min = 1;
10569   goto do_curly;
10570  }
10571  else if (op == '?') {
10572   min = 0; max = 1;
10573   goto do_curly;
10574  }
10575   nest_check:
10576  if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10577   SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10578   ckWARN2reg(RExC_parse,
10579     "%"UTF8f" matches null string many times",
10580     UTF8fARG(UTF, (RExC_parse >= origparse
10581         ? RExC_parse - origparse
10582         : 0),
10583     origparse));
10584   (void)ReREFCNT_inc(RExC_rx_sv);
10585  }
10586
10587  if (RExC_parse < RExC_end && *RExC_parse == '?') {
10588   nextchar(pRExC_state);
10589   reginsert(pRExC_state, MINMOD, ret, depth+1);
10590   REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10591  }
10592  else
10593  if (RExC_parse < RExC_end && *RExC_parse == '+') {
10594   regnode *ender;
10595   nextchar(pRExC_state);
10596   ender = reg_node(pRExC_state, SUCCEED);
10597   REGTAIL(pRExC_state, ret, ender);
10598   reginsert(pRExC_state, SUSPEND, ret, depth+1);
10599   ret->flags = 0;
10600   ender = reg_node(pRExC_state, TAIL);
10601   REGTAIL(pRExC_state, ret, ender);
10602  }
10603
10604  if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10605   RExC_parse++;
10606   vFAIL("Nested quantifiers");
10607  }
10608
10609  return(ret);
10610 }
10611
10612 STATIC bool
10613 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10614      UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10615      const bool strict   /* Apply stricter parsing rules? */
10616  )
10617 {
10618
10619  /* This is expected to be called by a parser routine that has recognized '\N'
10620    and needs to handle the rest. RExC_parse is expected to point at the first
10621    char following the N at the time of the call.  On successful return,
10622    RExC_parse has been updated to point to just after the sequence identified
10623    by this routine, and <*flagp> has been updated.
10624
10625    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10626    character class.
10627
10628    \N may begin either a named sequence, or if outside a character class, mean
10629    to match a non-newline.  For non single-quoted regexes, the tokenizer has
10630    attempted to decide which, and in the case of a named sequence, converted it
10631    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10632    where c1... are the characters in the sequence.  For single-quoted regexes,
10633    the tokenizer passes the \N sequence through unchanged; this code will not
10634    attempt to determine this nor expand those, instead raising a syntax error.
10635    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10636    or there is no '}', it signals that this \N occurrence means to match a
10637    non-newline.
10638
10639    Only the \N{U+...} form should occur in a character class, for the same
10640    reason that '.' inside a character class means to just match a period: it
10641    just doesn't make sense.
10642
10643    The function raises an error (via vFAIL), and doesn't return for various
10644    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
10645    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10646    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10647    only possible if node_p is non-NULL.
10648
10649
10650    If <valuep> is non-null, it means the caller can accept an input sequence
10651    consisting of a just a single code point; <*valuep> is set to that value
10652    if the input is such.
10653
10654    If <node_p> is non-null it signifies that the caller can accept any other
10655    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
10656    is set as follows:
10657  1) \N means not-a-NL: points to a newly created REG_ANY node;
10658  2) \N{}:              points to a new NOTHING node;
10659  3) otherwise:         points to a new EXACT node containing the resolved
10660       string.
10661    Note that FALSE is returned for single code point sequences if <valuep> is
10662    null.
10663  */
10664
10665  char * endbrace;    /* '}' following the name */
10666  char* p;
10667  char *endchar; /* Points to '.' or '}' ending cur char in the input
10668       stream */
10669  bool has_multiple_chars; /* true if the input stream contains a sequence of
10670         more than one character */
10671
10672  GET_RE_DEBUG_FLAGS_DECL;
10673
10674  PERL_ARGS_ASSERT_GROK_BSLASH_N;
10675
10676  GET_RE_DEBUG_FLAGS;
10677
10678  assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
10679
10680  /* The [^\n] meaning of \N ignores spaces and comments under the /x
10681  * modifier.  The other meaning does not, so use a temporary until we find
10682  * out which we are being called with */
10683  p = (RExC_flags & RXf_PMf_EXTENDED)
10684   ? regwhite( pRExC_state, RExC_parse )
10685   : RExC_parse;
10686
10687  /* Disambiguate between \N meaning a named character versus \N meaning
10688  * [^\n].  The former is assumed when it can't be the latter. */
10689  if (*p != '{' || regcurly(p, FALSE)) {
10690   RExC_parse = p;
10691   if (! node_p) {
10692    /* no bare \N allowed in a charclass */
10693    if (in_char_class) {
10694     vFAIL("\\N in a character class must be a named character: \\N{...}");
10695    }
10696    return FALSE;
10697   }
10698   RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
10699       current char */
10700   nextchar(pRExC_state);
10701   *node_p = reg_node(pRExC_state, REG_ANY);
10702   *flagp |= HASWIDTH|SIMPLE;
10703   RExC_naughty++;
10704   Set_Node_Length(*node_p, 1); /* MJD */
10705   return TRUE;
10706  }
10707
10708  /* Here, we have decided it should be a named character or sequence */
10709
10710  /* The test above made sure that the next real character is a '{', but
10711  * under the /x modifier, it could be separated by space (or a comment and
10712  * \n) and this is not allowed (for consistency with \x{...} and the
10713  * tokenizer handling of \N{NAME}). */
10714  if (*RExC_parse != '{') {
10715   vFAIL("Missing braces on \\N{}");
10716  }
10717
10718  RExC_parse++; /* Skip past the '{' */
10719
10720  if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10721   || ! (endbrace == RExC_parse  /* nothing between the {} */
10722    || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below
10723             */
10724     && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10725              */
10726  {
10727   if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
10728   vFAIL("\\N{NAME} must be resolved by the lexer");
10729  }
10730
10731  if (endbrace == RExC_parse) {   /* empty: \N{} */
10732   bool ret = TRUE;
10733   if (node_p) {
10734    *node_p = reg_node(pRExC_state,NOTHING);
10735   }
10736   else if (in_char_class) {
10737    if (SIZE_ONLY && in_char_class) {
10738     if (strict) {
10739      RExC_parse++;   /* Position after the "}" */
10740      vFAIL("Zero length \\N{}");
10741     }
10742     else {
10743      ckWARNreg(RExC_parse,
10744        "Ignoring zero length \\N{} in character class");
10745     }
10746    }
10747    ret = FALSE;
10748   }
10749   else {
10750    return FALSE;
10751   }
10752   nextchar(pRExC_state);
10753   return ret;
10754  }
10755
10756  RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10757  RExC_parse += 2; /* Skip past the 'U+' */
10758
10759  endchar = RExC_parse + strcspn(RExC_parse, ".}");
10760
10761  /* Code points are separated by dots.  If none, there is only one code
10762  * point, and is terminated by the brace */
10763  has_multiple_chars = (endchar < endbrace);
10764
10765  if (valuep && (! has_multiple_chars || in_char_class)) {
10766   /* We only pay attention to the first char of
10767   multichar strings being returned in char classes. I kinda wonder
10768   if this makes sense as it does change the behaviour
10769   from earlier versions, OTOH that behaviour was broken
10770   as well. XXX Solution is to recharacterize as
10771   [rest-of-class]|multi1|multi2... */
10772
10773   STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10774   I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10775    | PERL_SCAN_DISALLOW_PREFIX
10776    | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10777
10778   *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10779
10780   /* The tokenizer should have guaranteed validity, but it's possible to
10781   * bypass it by using single quoting, so check */
10782   if (length_of_hex == 0
10783    || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10784   {
10785    RExC_parse += length_of_hex; /* Includes all the valid */
10786    RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
10787        ? UTF8SKIP(RExC_parse)
10788        : 1;
10789    /* Guard against malformed utf8 */
10790    if (RExC_parse >= endchar) {
10791     RExC_parse = endchar;
10792    }
10793    vFAIL("Invalid hexadecimal number in \\N{U+...}");
10794   }
10795
10796   if (in_char_class && has_multiple_chars) {
10797    if (strict) {
10798     RExC_parse = endbrace;
10799     vFAIL("\\N{} in character class restricted to one character");
10800    }
10801    else {
10802     ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10803    }
10804   }
10805
10806   RExC_parse = endbrace + 1;
10807  }
10808  else if (! node_p || ! has_multiple_chars) {
10809
10810   /* Here, the input is legal, but not according to the caller's
10811   * options.  We fail without advancing the parse, so that the
10812   * caller can try again */
10813   RExC_parse = p;
10814   return FALSE;
10815  }
10816  else {
10817
10818   /* What is done here is to convert this to a sub-pattern of the form
10819   * (?:\x{char1}\x{char2}...)
10820   * and then call reg recursively.  That way, it retains its atomicness,
10821   * while not having to worry about special handling that some code
10822   * points may have.  toke.c has converted the original Unicode values
10823   * to native, so that we can just pass on the hex values unchanged.  We
10824   * do have to set a flag to keep recoding from happening in the
10825   * recursion */
10826
10827   SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10828   STRLEN len;
10829   char *orig_end = RExC_end;
10830   I32 flags;
10831
10832   while (RExC_parse < endbrace) {
10833
10834    /* Convert to notation the rest of the code understands */
10835    sv_catpv(substitute_parse, "\\x{");
10836    sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10837    sv_catpv(substitute_parse, "}");
10838
10839    /* Point to the beginning of the next character in the sequence. */
10840    RExC_parse = endchar + 1;
10841    endchar = RExC_parse + strcspn(RExC_parse, ".}");
10842   }
10843   sv_catpv(substitute_parse, ")");
10844
10845   RExC_parse = SvPV(substitute_parse, len);
10846
10847   /* Don't allow empty number */
10848   if (len < 8) {
10849    vFAIL("Invalid hexadecimal number in \\N{U+...}");
10850   }
10851   RExC_end = RExC_parse + len;
10852
10853   /* The values are Unicode, and therefore not subject to recoding */
10854   RExC_override_recoding = 1;
10855
10856   if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10857    if (flags & RESTART_UTF8) {
10858     *flagp = RESTART_UTF8;
10859     return FALSE;
10860    }
10861    FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10862     (UV) flags);
10863   }
10864   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10865
10866   RExC_parse = endbrace;
10867   RExC_end = orig_end;
10868   RExC_override_recoding = 0;
10869
10870   nextchar(pRExC_state);
10871  }
10872
10873  return TRUE;
10874 }
10875
10876
10877 /*
10878  * reg_recode
10879  *
10880  * It returns the code point in utf8 for the value in *encp.
10881  *    value: a code value in the source encoding
10882  *    encp:  a pointer to an Encode object
10883  *
10884  * If the result from Encode is not a single character,
10885  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10886  */
10887 STATIC UV
10888 S_reg_recode(pTHX_ const char value, SV **encp)
10889 {
10890  STRLEN numlen = 1;
10891  SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10892  const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10893  const STRLEN newlen = SvCUR(sv);
10894  UV uv = UNICODE_REPLACEMENT;
10895
10896  PERL_ARGS_ASSERT_REG_RECODE;
10897
10898  if (newlen)
10899   uv = SvUTF8(sv)
10900    ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10901    : *(U8*)s;
10902
10903  if (!newlen || numlen != newlen) {
10904   uv = UNICODE_REPLACEMENT;
10905   *encp = NULL;
10906  }
10907  return uv;
10908 }
10909
10910 PERL_STATIC_INLINE U8
10911 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10912 {
10913  U8 op;
10914
10915  PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10916
10917  if (! FOLD) {
10918   return EXACT;
10919  }
10920
10921  op = get_regex_charset(RExC_flags);
10922  if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10923   op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10924     been, so there is no hole */
10925  }
10926
10927  return op + EXACTF;
10928 }
10929
10930 PERL_STATIC_INLINE void
10931 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
10932       regnode *node, I32* flagp, STRLEN len, UV code_point,
10933       bool downgradable)
10934 {
10935  /* This knows the details about sizing an EXACTish node, setting flags for
10936  * it (by setting <*flagp>, and potentially populating it with a single
10937  * character.
10938  *
10939  * If <len> (the length in bytes) is non-zero, this function assumes that
10940  * the node has already been populated, and just does the sizing.  In this
10941  * case <code_point> should be the final code point that has already been
10942  * placed into the node.  This value will be ignored except that under some
10943  * circumstances <*flagp> is set based on it.
10944  *
10945  * If <len> is zero, the function assumes that the node is to contain only
10946  * the single character given by <code_point> and calculates what <len>
10947  * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10948  * additionally will populate the node's STRING with <code_point> or its
10949  * fold if folding.
10950  *
10951  * In both cases <*flagp> is appropriately set
10952  *
10953  * It knows that under FOLD, the Latin Sharp S and UTF characters above
10954  * 255, must be folded (the former only when the rules indicate it can
10955  * match 'ss')
10956  *
10957  * When it does the populating, it looks at the flag 'downgradable'.  If
10958  * true with a node that folds, it checks if the single code point
10959  * participates in a fold, and if not downgrades the node to an EXACT.
10960  * This helps the optimizer */
10961
10962  bool len_passed_in = cBOOL(len != 0);
10963  U8 character[UTF8_MAXBYTES_CASE+1];
10964
10965  PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10966
10967  /* Don't bother to check for downgrading in PASS1, as it doesn't make any
10968  * sizing difference, and is extra work that is thrown away */
10969  if (downgradable && ! PASS2) {
10970   downgradable = FALSE;
10971  }
10972
10973  if (! len_passed_in) {
10974   if (UTF) {
10975    if (UNI_IS_INVARIANT(code_point)) {
10976     if (LOC || ! FOLD) {    /* /l defers folding until runtime */
10977      *character = (U8) code_point;
10978     }
10979     else { /* Here is /i and not /l (toFOLD() is defined on just
10980       ASCII, which isn't the same thing as INVARIANT on
10981       EBCDIC, but it works there, as the extra invariants
10982       fold to themselves) */
10983      *character = toFOLD((U8) code_point);
10984
10985      /* We can downgrade to an EXACT node if this character
10986      * isn't a folding one.  Note that this assumes that
10987      * nothing above Latin1 folds to some other invariant than
10988      * one of these alphabetics; otherwise we would also have
10989      * to check:
10990      *  && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
10991      *      || ASCII_FOLD_RESTRICTED))
10992      */
10993      if (downgradable && PL_fold[code_point] == code_point) {
10994       OP(node) = EXACT;
10995      }
10996     }
10997     len = 1;
10998    }
10999    else if (FOLD && (! LOC
11000        || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11001    {   /* Folding, and ok to do so now */
11002     UV folded = _to_uni_fold_flags(
11003         code_point,
11004         character,
11005         &len,
11006         FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11007              ? FOLD_FLAGS_NOMIX_ASCII
11008              : 0));
11009     if (downgradable
11010      && folded == code_point
11011      && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11012     {
11013      OP(node) = EXACT;
11014     }
11015    }
11016    else if (code_point <= MAX_UTF8_TWO_BYTE) {
11017
11018     /* Not folding this cp, and can output it directly */
11019     *character = UTF8_TWO_BYTE_HI(code_point);
11020     *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11021     len = 2;
11022    }
11023    else {
11024     uvchr_to_utf8( character, code_point);
11025     len = UTF8SKIP(character);
11026    }
11027   } /* Else pattern isn't UTF8.  */
11028   else if (! FOLD) {
11029    *character = (U8) code_point;
11030    len = 1;
11031   } /* Else is folded non-UTF8 */
11032   else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11033
11034    /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11035    * comments at join_exact()); */
11036    *character = (U8) code_point;
11037    len = 1;
11038
11039    /* Can turn into an EXACT node if we know the fold at compile time,
11040    * and it folds to itself and doesn't particpate in other folds */
11041    if (downgradable
11042     && ! LOC
11043     && PL_fold_latin1[code_point] == code_point
11044     && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11045      || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11046    {
11047     OP(node) = EXACT;
11048    }
11049   } /* else is Sharp s.  May need to fold it */
11050   else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11051    *character = 's';
11052    *(character + 1) = 's';
11053    len = 2;
11054   }
11055   else {
11056    *character = LATIN_SMALL_LETTER_SHARP_S;
11057    len = 1;
11058   }
11059  }
11060
11061  if (SIZE_ONLY) {
11062   RExC_size += STR_SZ(len);
11063  }
11064  else {
11065   RExC_emit += STR_SZ(len);
11066   STR_LEN(node) = len;
11067   if (! len_passed_in) {
11068    Copy((char *) character, STRING(node), len, char);
11069   }
11070  }
11071
11072  *flagp |= HASWIDTH;
11073
11074  /* A single character node is SIMPLE, except for the special-cased SHARP S
11075  * under /di. */
11076  if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11077   && (code_point != LATIN_SMALL_LETTER_SHARP_S
11078    || ! FOLD || ! DEPENDS_SEMANTICS))
11079  {
11080   *flagp |= SIMPLE;
11081  }
11082
11083  /* The OP may not be well defined in PASS1 */
11084  if (PASS2 && OP(node) == EXACTFL) {
11085   RExC_contains_locale = 1;
11086  }
11087 }
11088
11089
11090 /* return atoi(p), unless it's too big to sensibly be a backref,
11091  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11092
11093 static I32
11094 S_backref_value(char *p)
11095 {
11096  char *q = p;
11097
11098  for (;isDIGIT(*q); q++); /* calculate length of num */
11099  if (q - p == 0 || q - p > 9)
11100   return I32_MAX;
11101  return atoi(p);
11102 }
11103
11104
11105 /*
11106  - regatom - the lowest level
11107
11108    Try to identify anything special at the start of the pattern. If there
11109    is, then handle it as required. This may involve generating a single regop,
11110    such as for an assertion; or it may involve recursing, such as to
11111    handle a () structure.
11112
11113    If the string doesn't start with something special then we gobble up
11114    as much literal text as we can.
11115
11116    Once we have been able to handle whatever type of thing started the
11117    sequence, we return.
11118
11119    Note: we have to be careful with escapes, as they can be both literal
11120    and special, and in the case of \10 and friends, context determines which.
11121
11122    A summary of the code structure is:
11123
11124    switch (first_byte) {
11125   cases for each special:
11126    handle this special;
11127    break;
11128   case '\\':
11129    switch (2nd byte) {
11130     cases for each unambiguous special:
11131      handle this special;
11132      break;
11133     cases for each ambigous special/literal:
11134      disambiguate;
11135      if (special)  handle here
11136      else goto defchar;
11137     default: // unambiguously literal:
11138      goto defchar;
11139    }
11140   default:  // is a literal char
11141    // FALL THROUGH
11142   defchar:
11143    create EXACTish node for literal;
11144    while (more input and node isn't full) {
11145     switch (input_byte) {
11146     cases for each special;
11147      make sure parse pointer is set so that the next call to
11148       regatom will see this special first
11149      goto loopdone; // EXACTish node terminated by prev. char
11150     default:
11151      append char to EXACTISH node;
11152     }
11153     get next input byte;
11154    }
11155   loopdone:
11156    }
11157    return the generated node;
11158
11159    Specifically there are two separate switches for handling
11160    escape sequences, with the one for handling literal escapes requiring
11161    a dummy entry for all of the special escapes that are actually handled
11162    by the other.
11163
11164    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11165    TRYAGAIN.
11166    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11167    restarted.
11168    Otherwise does not return NULL.
11169 */
11170
11171 STATIC regnode *
11172 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11173 {
11174  dVAR;
11175  regnode *ret = NULL;
11176  I32 flags = 0;
11177  char *parse_start = RExC_parse;
11178  U8 op;
11179  int invert = 0;
11180
11181  GET_RE_DEBUG_FLAGS_DECL;
11182
11183  *flagp = WORST;  /* Tentatively. */
11184
11185  DEBUG_PARSE("atom");
11186
11187  PERL_ARGS_ASSERT_REGATOM;
11188
11189 tryagain:
11190  switch ((U8)*RExC_parse) {
11191  case '^':
11192   RExC_seen_zerolen++;
11193   nextchar(pRExC_state);
11194   if (RExC_flags & RXf_PMf_MULTILINE)
11195    ret = reg_node(pRExC_state, MBOL);
11196   else if (RExC_flags & RXf_PMf_SINGLELINE)
11197    ret = reg_node(pRExC_state, SBOL);
11198   else
11199    ret = reg_node(pRExC_state, BOL);
11200   Set_Node_Length(ret, 1); /* MJD */
11201   break;
11202  case '$':
11203   nextchar(pRExC_state);
11204   if (*RExC_parse)
11205    RExC_seen_zerolen++;
11206   if (RExC_flags & RXf_PMf_MULTILINE)
11207    ret = reg_node(pRExC_state, MEOL);
11208   else if (RExC_flags & RXf_PMf_SINGLELINE)
11209    ret = reg_node(pRExC_state, SEOL);
11210   else
11211    ret = reg_node(pRExC_state, EOL);
11212   Set_Node_Length(ret, 1); /* MJD */
11213   break;
11214  case '.':
11215   nextchar(pRExC_state);
11216   if (RExC_flags & RXf_PMf_SINGLELINE)
11217    ret = reg_node(pRExC_state, SANY);
11218   else
11219    ret = reg_node(pRExC_state, REG_ANY);
11220   *flagp |= HASWIDTH|SIMPLE;
11221   RExC_naughty++;
11222   Set_Node_Length(ret, 1); /* MJD */
11223   break;
11224  case '[':
11225  {
11226   char * const oregcomp_parse = ++RExC_parse;
11227   ret = regclass(pRExC_state, flagp,depth+1,
11228      FALSE, /* means parse the whole char class */
11229      TRUE, /* allow multi-char folds */
11230      FALSE, /* don't silence non-portable warnings. */
11231      NULL);
11232   if (*RExC_parse != ']') {
11233    RExC_parse = oregcomp_parse;
11234    vFAIL("Unmatched [");
11235   }
11236   if (ret == NULL) {
11237    if (*flagp & RESTART_UTF8)
11238     return NULL;
11239    FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11240     (UV) *flagp);
11241   }
11242   nextchar(pRExC_state);
11243   Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11244   break;
11245  }
11246  case '(':
11247   nextchar(pRExC_state);
11248   ret = reg(pRExC_state, 2, &flags,depth+1);
11249   if (ret == NULL) {
11250     if (flags & TRYAGAIN) {
11251      if (RExC_parse == RExC_end) {
11252       /* Make parent create an empty node if needed. */
11253       *flagp |= TRYAGAIN;
11254       return(NULL);
11255      }
11256      goto tryagain;
11257     }
11258     if (flags & RESTART_UTF8) {
11259      *flagp = RESTART_UTF8;
11260      return NULL;
11261     }
11262     FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11263                 (UV) flags);
11264   }
11265   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11266   break;
11267  case '|':
11268  case ')':
11269   if (flags & TRYAGAIN) {
11270    *flagp |= TRYAGAIN;
11271    return NULL;
11272   }
11273   vFAIL("Internal urp");
11274         /* Supposed to be caught earlier. */
11275   break;
11276  case '{':
11277   if (!regcurly(RExC_parse, FALSE)) {
11278    RExC_parse++;
11279    goto defchar;
11280   }
11281   /* FALL THROUGH */
11282  case '?':
11283  case '+':
11284  case '*':
11285   RExC_parse++;
11286   vFAIL("Quantifier follows nothing");
11287   break;
11288  case '\\':
11289   /* Special Escapes
11290
11291   This switch handles escape sequences that resolve to some kind
11292   of special regop and not to literal text. Escape sequnces that
11293   resolve to literal text are handled below in the switch marked
11294   "Literal Escapes".
11295
11296   Every entry in this switch *must* have a corresponding entry
11297   in the literal escape switch. However, the opposite is not
11298   required, as the default for this switch is to jump to the
11299   literal text handling code.
11300   */
11301   switch ((U8)*++RExC_parse) {
11302    U8 arg;
11303   /* Special Escapes */
11304   case 'A':
11305    RExC_seen_zerolen++;
11306    ret = reg_node(pRExC_state, SBOL);
11307    *flagp |= SIMPLE;
11308    goto finish_meta_pat;
11309   case 'G':
11310    ret = reg_node(pRExC_state, GPOS);
11311    RExC_seen |= REG_GPOS_SEEN;
11312    *flagp |= SIMPLE;
11313    goto finish_meta_pat;
11314   case 'K':
11315    RExC_seen_zerolen++;
11316    ret = reg_node(pRExC_state, KEEPS);
11317    *flagp |= SIMPLE;
11318    /* XXX:dmq : disabling in-place substitution seems to
11319    * be necessary here to avoid cases of memory corruption, as
11320    * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11321    */
11322    RExC_seen |= REG_LOOKBEHIND_SEEN;
11323    goto finish_meta_pat;
11324   case 'Z':
11325    ret = reg_node(pRExC_state, SEOL);
11326    *flagp |= SIMPLE;
11327    RExC_seen_zerolen++;  /* Do not optimize RE away */
11328    goto finish_meta_pat;
11329   case 'z':
11330    ret = reg_node(pRExC_state, EOS);
11331    *flagp |= SIMPLE;
11332    RExC_seen_zerolen++;  /* Do not optimize RE away */
11333    goto finish_meta_pat;
11334   case 'C':
11335    ret = reg_node(pRExC_state, CANY);
11336    RExC_seen |= REG_CANY_SEEN;
11337    *flagp |= HASWIDTH|SIMPLE;
11338    goto finish_meta_pat;
11339   case 'X':
11340    ret = reg_node(pRExC_state, CLUMP);
11341    *flagp |= HASWIDTH;
11342    goto finish_meta_pat;
11343
11344   case 'W':
11345    invert = 1;
11346    /* FALLTHROUGH */
11347   case 'w':
11348    arg = ANYOF_WORDCHAR;
11349    goto join_posix;
11350
11351   case 'b':
11352    RExC_seen_zerolen++;
11353    RExC_seen |= REG_LOOKBEHIND_SEEN;
11354    op = BOUND + get_regex_charset(RExC_flags);
11355    if (op > BOUNDA) {  /* /aa is same as /a */
11356     op = BOUNDA;
11357    }
11358    else if (op == BOUNDL) {
11359     RExC_contains_locale = 1;
11360    }
11361    ret = reg_node(pRExC_state, op);
11362    FLAGS(ret) = get_regex_charset(RExC_flags);
11363    *flagp |= SIMPLE;
11364    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11365     /* diag_listed_as: Use "%s" instead of "%s" */
11366     vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11367    }
11368    goto finish_meta_pat;
11369   case 'B':
11370    RExC_seen_zerolen++;
11371    RExC_seen |= REG_LOOKBEHIND_SEEN;
11372    op = NBOUND + get_regex_charset(RExC_flags);
11373    if (op > NBOUNDA) { /* /aa is same as /a */
11374     op = NBOUNDA;
11375    }
11376    else if (op == NBOUNDL) {
11377     RExC_contains_locale = 1;
11378    }
11379    ret = reg_node(pRExC_state, op);
11380    FLAGS(ret) = get_regex_charset(RExC_flags);
11381    *flagp |= SIMPLE;
11382    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11383     /* diag_listed_as: Use "%s" instead of "%s" */
11384     vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11385    }
11386    goto finish_meta_pat;
11387
11388   case 'D':
11389    invert = 1;
11390    /* FALLTHROUGH */
11391   case 'd':
11392    arg = ANYOF_DIGIT;
11393    goto join_posix;
11394
11395   case 'R':
11396    ret = reg_node(pRExC_state, LNBREAK);
11397    *flagp |= HASWIDTH|SIMPLE;
11398    goto finish_meta_pat;
11399
11400   case 'H':
11401    invert = 1;
11402    /* FALLTHROUGH */
11403   case 'h':
11404    arg = ANYOF_BLANK;
11405    op = POSIXU;
11406    goto join_posix_op_known;
11407
11408   case 'V':
11409    invert = 1;
11410    /* FALLTHROUGH */
11411   case 'v':
11412    arg = ANYOF_VERTWS;
11413    op = POSIXU;
11414    goto join_posix_op_known;
11415
11416   case 'S':
11417    invert = 1;
11418    /* FALLTHROUGH */
11419   case 's':
11420    arg = ANYOF_SPACE;
11421
11422   join_posix:
11423
11424    op = POSIXD + get_regex_charset(RExC_flags);
11425    if (op > POSIXA) {  /* /aa is same as /a */
11426     op = POSIXA;
11427    }
11428    else if (op == POSIXL) {
11429     RExC_contains_locale = 1;
11430    }
11431
11432   join_posix_op_known:
11433
11434    if (invert) {
11435     op += NPOSIXD - POSIXD;
11436    }
11437
11438    ret = reg_node(pRExC_state, op);
11439    if (! SIZE_ONLY) {
11440     FLAGS(ret) = namedclass_to_classnum(arg);
11441    }
11442
11443    *flagp |= HASWIDTH|SIMPLE;
11444    /* FALL THROUGH */
11445
11446   finish_meta_pat:
11447    nextchar(pRExC_state);
11448    Set_Node_Length(ret, 2); /* MJD */
11449    break;
11450   case 'p':
11451   case 'P':
11452    {
11453 #ifdef DEBUGGING
11454     char* parse_start = RExC_parse - 2;
11455 #endif
11456
11457     RExC_parse--;
11458
11459     ret = regclass(pRExC_state, flagp,depth+1,
11460        TRUE, /* means just parse this element */
11461        FALSE, /* don't allow multi-char folds */
11462        FALSE, /* don't silence non-portable warnings.
11463           It would be a bug if these returned
11464           non-portables */
11465        NULL);
11466     /* regclass() can only return RESTART_UTF8 if multi-char folds
11467     are allowed.  */
11468     if (!ret)
11469      FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11470       (UV) *flagp);
11471
11472     RExC_parse--;
11473
11474     Set_Node_Offset(ret, parse_start + 2);
11475     Set_Node_Cur_Length(ret, parse_start);
11476     nextchar(pRExC_state);
11477    }
11478    break;
11479   case 'N':
11480    /* Handle \N and \N{NAME} with multiple code points here and not
11481    * below because it can be multicharacter. join_exact() will join
11482    * them up later on.  Also this makes sure that things like
11483    * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11484    * The options to the grok function call causes it to fail if the
11485    * sequence is just a single code point.  We then go treat it as
11486    * just another character in the current EXACT node, and hence it
11487    * gets uniform treatment with all the other characters.  The
11488    * special treatment for quantifiers is not needed for such single
11489    * character sequences */
11490    ++RExC_parse;
11491    if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11492         FALSE /* not strict */ )) {
11493     if (*flagp & RESTART_UTF8)
11494      return NULL;
11495     RExC_parse--;
11496     goto defchar;
11497    }
11498    break;
11499   case 'k':    /* Handle \k<NAME> and \k'NAME' */
11500   parse_named_seq:
11501   {
11502    char ch= RExC_parse[1];
11503    if (ch != '<' && ch != '\'' && ch != '{') {
11504     RExC_parse++;
11505     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11506     vFAIL2("Sequence %.2s... not terminated",parse_start);
11507    } else {
11508     /* this pretty much dupes the code for (?P=...) in reg(), if
11509     you change this make sure you change that */
11510     char* name_start = (RExC_parse += 2);
11511     U32 num = 0;
11512     SV *sv_dat = reg_scan_name(pRExC_state,
11513      SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11514     ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11515     if (RExC_parse == name_start || *RExC_parse != ch)
11516      /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11517      vFAIL2("Sequence %.3s... not terminated",parse_start);
11518
11519     if (!SIZE_ONLY) {
11520      num = add_data( pRExC_state, STR_WITH_LEN("S"));
11521      RExC_rxi->data->data[num]=(void*)sv_dat;
11522      SvREFCNT_inc_simple_void(sv_dat);
11523     }
11524
11525     RExC_sawback = 1;
11526     ret = reganode(pRExC_state,
11527        ((! FOLD)
11528         ? NREF
11529         : (ASCII_FOLD_RESTRICTED)
11530         ? NREFFA
11531         : (AT_LEAST_UNI_SEMANTICS)
11532          ? NREFFU
11533          : (LOC)
11534          ? NREFFL
11535          : NREFF),
11536         num);
11537     *flagp |= HASWIDTH;
11538
11539     /* override incorrect value set in reganode MJD */
11540     Set_Node_Offset(ret, parse_start+1);
11541     Set_Node_Cur_Length(ret, parse_start);
11542     nextchar(pRExC_state);
11543
11544    }
11545    break;
11546   }
11547   case 'g':
11548   case '1': case '2': case '3': case '4':
11549   case '5': case '6': case '7': case '8': case '9':
11550    {
11551     I32 num;
11552     bool hasbrace = 0;
11553
11554     if (*RExC_parse == 'g') {
11555      bool isrel = 0;
11556
11557      RExC_parse++;
11558      if (*RExC_parse == '{') {
11559       RExC_parse++;
11560       hasbrace = 1;
11561      }
11562      if (*RExC_parse == '-') {
11563       RExC_parse++;
11564       isrel = 1;
11565      }
11566      if (hasbrace && !isDIGIT(*RExC_parse)) {
11567       if (isrel) RExC_parse--;
11568       RExC_parse -= 2;
11569       goto parse_named_seq;
11570      }
11571
11572      num = S_backref_value(RExC_parse);
11573      if (num == 0)
11574       vFAIL("Reference to invalid group 0");
11575      else if (num == I32_MAX) {
11576       if (isDIGIT(*RExC_parse))
11577        vFAIL("Reference to nonexistent group");
11578       else
11579        vFAIL("Unterminated \\g... pattern");
11580      }
11581
11582      if (isrel) {
11583       num = RExC_npar - num;
11584       if (num < 1)
11585        vFAIL("Reference to nonexistent or unclosed group");
11586      }
11587     }
11588     else {
11589      num = S_backref_value(RExC_parse);
11590      /* bare \NNN might be backref or octal - if it is larger than or equal
11591      * RExC_npar then it is assumed to be and octal escape.
11592      * Note RExC_npar is +1 from the actual number of parens*/
11593      if (num == I32_MAX || (num > 9 && num >= RExC_npar
11594        && *RExC_parse != '8' && *RExC_parse != '9'))
11595      {
11596       /* Probably a character specified in octal, e.g. \35 */
11597       goto defchar;
11598      }
11599     }
11600
11601     /* at this point RExC_parse definitely points to a backref
11602     * number */
11603     {
11604 #ifdef RE_TRACK_PATTERN_OFFSETS
11605      char * const parse_start = RExC_parse - 1; /* MJD */
11606 #endif
11607      while (isDIGIT(*RExC_parse))
11608       RExC_parse++;
11609      if (hasbrace) {
11610       if (*RExC_parse != '}')
11611        vFAIL("Unterminated \\g{...} pattern");
11612       RExC_parse++;
11613      }
11614      if (!SIZE_ONLY) {
11615       if (num > (I32)RExC_rx->nparens)
11616        vFAIL("Reference to nonexistent group");
11617      }
11618      RExC_sawback = 1;
11619      ret = reganode(pRExC_state,
11620         ((! FOLD)
11621          ? REF
11622          : (ASCII_FOLD_RESTRICTED)
11623          ? REFFA
11624          : (AT_LEAST_UNI_SEMANTICS)
11625           ? REFFU
11626           : (LOC)
11627           ? REFFL
11628           : REFF),
11629          num);
11630      *flagp |= HASWIDTH;
11631
11632      /* override incorrect value set in reganode MJD */
11633      Set_Node_Offset(ret, parse_start+1);
11634      Set_Node_Cur_Length(ret, parse_start);
11635      RExC_parse--;
11636      nextchar(pRExC_state);
11637     }
11638    }
11639    break;
11640   case '\0':
11641    if (RExC_parse >= RExC_end)
11642     FAIL("Trailing \\");
11643    /* FALL THROUGH */
11644   default:
11645    /* Do not generate "unrecognized" warnings here, we fall
11646    back into the quick-grab loop below */
11647    parse_start--;
11648    goto defchar;
11649   }
11650   break;
11651
11652  case '#':
11653   if (RExC_flags & RXf_PMf_EXTENDED) {
11654    if ( reg_skipcomment( pRExC_state ) )
11655     goto tryagain;
11656   }
11657   /* FALL THROUGH */
11658
11659  default:
11660
11661    parse_start = RExC_parse - 1;
11662
11663    RExC_parse++;
11664
11665   defchar: {
11666    STRLEN len = 0;
11667    UV ender = 0;
11668    char *p;
11669    char *s;
11670 #define MAX_NODE_STRING_SIZE 127
11671    char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11672    char *s0;
11673    U8 upper_parse = MAX_NODE_STRING_SIZE;
11674    U8 node_type = compute_EXACTish(pRExC_state);
11675    bool next_is_quantifier;
11676    char * oldp = NULL;
11677
11678    /* We can convert EXACTF nodes to EXACTFU if they contain only
11679    * characters that match identically regardless of the target
11680    * string's UTF8ness.  The reason to do this is that EXACTF is not
11681    * trie-able, EXACTFU is.
11682    *
11683    * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11684    * contain only above-Latin1 characters (hence must be in UTF8),
11685    * which don't participate in folds with Latin1-range characters,
11686    * as the latter's folds aren't known until runtime.  (We don't
11687    * need to figure this out until pass 2) */
11688    bool maybe_exactfu = PASS2
11689        && (node_type == EXACTF || node_type == EXACTFL);
11690
11691    /* If a folding node contains only code points that don't
11692    * participate in folds, it can be changed into an EXACT node,
11693    * which allows the optimizer more things to look for */
11694    bool maybe_exact;
11695
11696    ret = reg_node(pRExC_state, node_type);
11697
11698    /* In pass1, folded, we use a temporary buffer instead of the
11699    * actual node, as the node doesn't exist yet */
11700    s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11701
11702    s0 = s;
11703
11704   reparse:
11705
11706    /* We do the EXACTFish to EXACT node only if folding.  (And we
11707    * don't need to figure this out until pass 2) */
11708    maybe_exact = FOLD && PASS2;
11709
11710    /* XXX The node can hold up to 255 bytes, yet this only goes to
11711    * 127.  I (khw) do not know why.  Keeping it somewhat less than
11712    * 255 allows us to not have to worry about overflow due to
11713    * converting to utf8 and fold expansion, but that value is
11714    * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
11715    * split up by this limit into a single one using the real max of
11716    * 255.  Even at 127, this breaks under rare circumstances.  If
11717    * folding, we do not want to split a node at a character that is a
11718    * non-final in a multi-char fold, as an input string could just
11719    * happen to want to match across the node boundary.  The join
11720    * would solve that problem if the join actually happens.  But a
11721    * series of more than two nodes in a row each of 127 would cause
11722    * the first join to succeed to get to 254, but then there wouldn't
11723    * be room for the next one, which could at be one of those split
11724    * multi-char folds.  I don't know of any fool-proof solution.  One
11725    * could back off to end with only a code point that isn't such a
11726    * non-final, but it is possible for there not to be any in the
11727    * entire node. */
11728    for (p = RExC_parse - 1;
11729     len < upper_parse && p < RExC_end;
11730     len++)
11731    {
11732     oldp = p;
11733
11734     if (RExC_flags & RXf_PMf_EXTENDED)
11735      p = regwhite( pRExC_state, p );
11736     switch ((U8)*p) {
11737     case '^':
11738     case '$':
11739     case '.':
11740     case '[':
11741     case '(':
11742     case ')':
11743     case '|':
11744      goto loopdone;
11745     case '\\':
11746      /* Literal Escapes Switch
11747
11748      This switch is meant to handle escape sequences that
11749      resolve to a literal character.
11750
11751      Every escape sequence that represents something
11752      else, like an assertion or a char class, is handled
11753      in the switch marked 'Special Escapes' above in this
11754      routine, but also has an entry here as anything that
11755      isn't explicitly mentioned here will be treated as
11756      an unescaped equivalent literal.
11757      */
11758
11759      switch ((U8)*++p) {
11760      /* These are all the special escapes. */
11761      case 'A':             /* Start assertion */
11762      case 'b': case 'B':   /* Word-boundary assertion*/
11763      case 'C':             /* Single char !DANGEROUS! */
11764      case 'd': case 'D':   /* digit class */
11765      case 'g': case 'G':   /* generic-backref, pos assertion */
11766      case 'h': case 'H':   /* HORIZWS */
11767      case 'k': case 'K':   /* named backref, keep marker */
11768      case 'p': case 'P':   /* Unicode property */
11769        case 'R':   /* LNBREAK */
11770      case 's': case 'S':   /* space class */
11771      case 'v': case 'V':   /* VERTWS */
11772      case 'w': case 'W':   /* word class */
11773      case 'X':             /* eXtended Unicode "combining
11774            character sequence" */
11775      case 'z': case 'Z':   /* End of line/string assertion */
11776       --p;
11777       goto loopdone;
11778
11779      /* Anything after here is an escape that resolves to a
11780      literal. (Except digits, which may or may not)
11781      */
11782      case 'n':
11783       ender = '\n';
11784       p++;
11785       break;
11786      case 'N': /* Handle a single-code point named character. */
11787       /* The options cause it to fail if a multiple code
11788       * point sequence.  Handle those in the switch() above
11789       * */
11790       RExC_parse = p + 1;
11791       if (! grok_bslash_N(pRExC_state, NULL, &ender,
11792            flagp, depth, FALSE,
11793            FALSE /* not strict */ ))
11794       {
11795        if (*flagp & RESTART_UTF8)
11796         FAIL("panic: grok_bslash_N set RESTART_UTF8");
11797        RExC_parse = p = oldp;
11798        goto loopdone;
11799       }
11800       p = RExC_parse;
11801       if (ender > 0xff) {
11802        REQUIRE_UTF8;
11803       }
11804       break;
11805      case 'r':
11806       ender = '\r';
11807       p++;
11808       break;
11809      case 't':
11810       ender = '\t';
11811       p++;
11812       break;
11813      case 'f':
11814       ender = '\f';
11815       p++;
11816       break;
11817      case 'e':
11818       ender = ASCII_TO_NATIVE('\033');
11819       p++;
11820       break;
11821      case 'a':
11822       ender = '\a';
11823       p++;
11824       break;
11825      case 'o':
11826       {
11827        UV result;
11828        const char* error_msg;
11829
11830        bool valid = grok_bslash_o(&p,
11831              &result,
11832              &error_msg,
11833              TRUE, /* out warnings */
11834              FALSE, /* not strict */
11835              TRUE, /* Output warnings
11836                 for non-
11837                 portables */
11838              UTF);
11839        if (! valid) {
11840         RExC_parse = p; /* going to die anyway; point
11841             to exact spot of failure */
11842         vFAIL(error_msg);
11843        }
11844        ender = result;
11845        if (PL_encoding && ender < 0x100) {
11846         goto recode_encoding;
11847        }
11848        if (ender > 0xff) {
11849         REQUIRE_UTF8;
11850        }
11851        break;
11852       }
11853      case 'x':
11854       {
11855        UV result = UV_MAX; /* initialize to erroneous
11856             value */
11857        const char* error_msg;
11858
11859        bool valid = grok_bslash_x(&p,
11860              &result,
11861              &error_msg,
11862              TRUE, /* out warnings */
11863              FALSE, /* not strict */
11864              TRUE, /* Output warnings
11865                 for non-
11866                 portables */
11867              UTF);
11868        if (! valid) {
11869         RExC_parse = p; /* going to die anyway; point
11870             to exact spot of failure */
11871         vFAIL(error_msg);
11872        }
11873        ender = result;
11874
11875        if (PL_encoding && ender < 0x100) {
11876         goto recode_encoding;
11877        }
11878        if (ender > 0xff) {
11879         REQUIRE_UTF8;
11880        }
11881        break;
11882       }
11883      case 'c':
11884       p++;
11885       ender = grok_bslash_c(*p++, SIZE_ONLY);
11886       break;
11887      case '8': case '9': /* must be a backreference */
11888       --p;
11889       goto loopdone;
11890      case '1': case '2': case '3':case '4':
11891      case '5': case '6': case '7':
11892       /* When we parse backslash escapes there is ambiguity
11893       * between backreferences and octal escapes. Any escape
11894       * from \1 - \9 is a backreference, any multi-digit
11895       * escape which does not start with 0 and which when
11896       * evaluated as decimal could refer to an already
11897       * parsed capture buffer is a backslash. Anything else
11898       * is octal.
11899       *
11900       * Note this implies that \118 could be interpreted as
11901       * 118 OR as "\11" . "8" depending on whether there
11902       * were 118 capture buffers defined already in the
11903       * pattern.  */
11904
11905       /* NOTE, RExC_npar is 1 more than the actual number of
11906       * parens we have seen so far, hence the < RExC_npar below. */
11907
11908       if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
11909       {  /* Not to be treated as an octal constant, go
11910         find backref */
11911        --p;
11912        goto loopdone;
11913       }
11914      case '0':
11915       {
11916        I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11917        STRLEN numlen = 3;
11918        ender = grok_oct(p, &numlen, &flags, NULL);
11919        if (ender > 0xff) {
11920         REQUIRE_UTF8;
11921        }
11922        p += numlen;
11923        if (SIZE_ONLY   /* like \08, \178 */
11924         && numlen < 3
11925         && p < RExC_end
11926         && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11927        {
11928         reg_warn_non_literal_string(
11929           p + 1,
11930           form_short_octal_warning(p, numlen));
11931        }
11932       }
11933       if (PL_encoding && ender < 0x100)
11934        goto recode_encoding;
11935       break;
11936      recode_encoding:
11937       if (! RExC_override_recoding) {
11938        SV* enc = PL_encoding;
11939        ender = reg_recode((const char)(U8)ender, &enc);
11940        if (!enc && SIZE_ONLY)
11941         ckWARNreg(p, "Invalid escape in the specified encoding");
11942        REQUIRE_UTF8;
11943       }
11944       break;
11945      case '\0':
11946       if (p >= RExC_end)
11947        FAIL("Trailing \\");
11948       /* FALL THROUGH */
11949      default:
11950       if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11951        /* Include any { following the alpha to emphasize
11952        * that it could be part of an escape at some point
11953        * in the future */
11954        int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11955        ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11956       }
11957       goto normal_default;
11958      } /* End of switch on '\' */
11959      break;
11960     default:    /* A literal character */
11961
11962      if (! SIZE_ONLY
11963       && RExC_flags & RXf_PMf_EXTENDED
11964       && ckWARN_d(WARN_DEPRECATED)
11965       && is_PATWS_non_low_safe(p, RExC_end, UTF))
11966      {
11967       vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11968         "Escape literal pattern white space under /x");
11969      }
11970
11971     normal_default:
11972      if (UTF8_IS_START(*p) && UTF) {
11973       STRLEN numlen;
11974       ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11975            &numlen, UTF8_ALLOW_DEFAULT);
11976       p += numlen;
11977      }
11978      else
11979       ender = (U8) *p++;
11980      break;
11981     } /* End of switch on the literal */
11982
11983     /* Here, have looked at the literal character and <ender>
11984     * contains its ordinal, <p> points to the character after it
11985     */
11986
11987     if ( RExC_flags & RXf_PMf_EXTENDED)
11988      p = regwhite( pRExC_state, p );
11989
11990     /* If the next thing is a quantifier, it applies to this
11991     * character only, which means that this character has to be in
11992     * its own node and can't just be appended to the string in an
11993     * existing node, so if there are already other characters in
11994     * the node, close the node with just them, and set up to do
11995     * this character again next time through, when it will be the
11996     * only thing in its new node */
11997     if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11998     {
11999      p = oldp;
12000      goto loopdone;
12001     }
12002
12003     if (! FOLD   /* The simple case, just append the literal */
12004      || (LOC  /* Also don't fold for tricky chars under /l */
12005       && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12006     {
12007      if (UTF) {
12008       const STRLEN unilen = reguni(pRExC_state, ender, s);
12009       if (unilen > 0) {
12010       s   += unilen;
12011       len += unilen;
12012       }
12013
12014       /* The loop increments <len> each time, as all but this
12015       * path (and one other) through it add a single byte to
12016       * the EXACTish node.  But this one has changed len to
12017       * be the correct final value, so subtract one to
12018       * cancel out the increment that follows */
12019       len--;
12020      }
12021      else {
12022       REGC((char)ender, s++);
12023      }
12024
12025      /* Can get here if folding only if is one of the /l
12026      * characters whose fold depends on the locale.  The
12027      * occurrence of any of these indicate that we can't
12028      * simplify things */
12029      if (FOLD) {
12030       maybe_exact = FALSE;
12031       maybe_exactfu = FALSE;
12032      }
12033     }
12034     else             /* FOLD */
12035      if (! ( UTF
12036       /* See comments for join_exact() as to why we fold this
12037       * non-UTF at compile time */
12038       || (node_type == EXACTFU
12039        && ender == LATIN_SMALL_LETTER_SHARP_S)))
12040     {
12041      /* Here, are folding and are not UTF-8 encoded; therefore
12042      * the character must be in the range 0-255, and is not /l
12043      * (Not /l because we already handled these under /l in
12044      * is_PROBLEMATIC_LOCALE_FOLD_cp */
12045      if (IS_IN_SOME_FOLD_L1(ender)) {
12046       maybe_exact = FALSE;
12047
12048       /* See if the character's fold differs between /d and
12049       * /u.  This includes the multi-char fold SHARP S to
12050       * 'ss' */
12051       if (maybe_exactfu
12052        && (PL_fold[ender] != PL_fold_latin1[ender]
12053         || ender == LATIN_SMALL_LETTER_SHARP_S
12054         || (len > 0
12055         && isARG2_lower_or_UPPER_ARG1('s', ender)
12056         && isARG2_lower_or_UPPER_ARG1('s',
12057                 *(s-1)))))
12058       {
12059        maybe_exactfu = FALSE;
12060       }
12061      }
12062
12063      /* Even when folding, we store just the input character, as
12064      * we have an array that finds its fold quickly */
12065      *(s++) = (char) ender;
12066     }
12067     else {  /* FOLD and UTF */
12068      /* Unlike the non-fold case, we do actually have to
12069      * calculate the results here in pass 1.  This is for two
12070      * reasons, the folded length may be longer than the
12071      * unfolded, and we have to calculate how many EXACTish
12072      * nodes it will take; and we may run out of room in a node
12073      * in the middle of a potential multi-char fold, and have
12074      * to back off accordingly.  (Hence we can't use REGC for
12075      * the simple case just below.) */
12076
12077      UV folded;
12078      if (isASCII(ender)) {
12079       folded = toFOLD(ender);
12080       *(s)++ = (U8) folded;
12081      }
12082      else {
12083       STRLEN foldlen;
12084
12085       folded = _to_uni_fold_flags(
12086          ender,
12087          (U8 *) s,
12088          &foldlen,
12089          FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12090               ? FOLD_FLAGS_NOMIX_ASCII
12091               : 0));
12092       s += foldlen;
12093
12094       /* The loop increments <len> each time, as all but this
12095       * path (and one other) through it add a single byte to
12096       * the EXACTish node.  But this one has changed len to
12097       * be the correct final value, so subtract one to
12098       * cancel out the increment that follows */
12099       len += foldlen - 1;
12100      }
12101      /* If this node only contains non-folding code points so
12102      * far, see if this new one is also non-folding */
12103      if (maybe_exact) {
12104       if (folded != ender) {
12105        maybe_exact = FALSE;
12106       }
12107       else {
12108        /* Here the fold is the original; we have to check
12109        * further to see if anything folds to it */
12110        if (_invlist_contains_cp(PL_utf8_foldable,
12111               ender))
12112        {
12113         maybe_exact = FALSE;
12114        }
12115       }
12116      }
12117      ender = folded;
12118     }
12119
12120     if (next_is_quantifier) {
12121
12122      /* Here, the next input is a quantifier, and to get here,
12123      * the current character is the only one in the node.
12124      * Also, here <len> doesn't include the final byte for this
12125      * character */
12126      len++;
12127      goto loopdone;
12128     }
12129
12130    } /* End of loop through literal characters */
12131
12132    /* Here we have either exhausted the input or ran out of room in
12133    * the node.  (If we encountered a character that can't be in the
12134    * node, transfer is made directly to <loopdone>, and so we
12135    * wouldn't have fallen off the end of the loop.)  In the latter
12136    * case, we artificially have to split the node into two, because
12137    * we just don't have enough space to hold everything.  This
12138    * creates a problem if the final character participates in a
12139    * multi-character fold in the non-final position, as a match that
12140    * should have occurred won't, due to the way nodes are matched,
12141    * and our artificial boundary.  So back off until we find a non-
12142    * problematic character -- one that isn't at the beginning or
12143    * middle of such a fold.  (Either it doesn't participate in any
12144    * folds, or appears only in the final position of all the folds it
12145    * does participate in.)  A better solution with far fewer false
12146    * positives, and that would fill the nodes more completely, would
12147    * be to actually have available all the multi-character folds to
12148    * test against, and to back-off only far enough to be sure that
12149    * this node isn't ending with a partial one.  <upper_parse> is set
12150    * further below (if we need to reparse the node) to include just
12151    * up through that final non-problematic character that this code
12152    * identifies, so when it is set to less than the full node, we can
12153    * skip the rest of this */
12154    if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12155
12156     const STRLEN full_len = len;
12157
12158     assert(len >= MAX_NODE_STRING_SIZE);
12159
12160     /* Here, <s> points to the final byte of the final character.
12161     * Look backwards through the string until find a non-
12162     * problematic character */
12163
12164     if (! UTF) {
12165
12166      /* This has no multi-char folds to non-UTF characters */
12167      if (ASCII_FOLD_RESTRICTED) {
12168       goto loopdone;
12169      }
12170
12171      while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12172      len = s - s0 + 1;
12173     }
12174     else {
12175      if (!  PL_NonL1NonFinalFold) {
12176       PL_NonL1NonFinalFold = _new_invlist_C_array(
12177           NonL1_Perl_Non_Final_Folds_invlist);
12178      }
12179
12180      /* Point to the first byte of the final character */
12181      s = (char *) utf8_hop((U8 *) s, -1);
12182
12183      while (s >= s0) {   /* Search backwards until find
12184           non-problematic char */
12185       if (UTF8_IS_INVARIANT(*s)) {
12186
12187        /* There are no ascii characters that participate
12188        * in multi-char folds under /aa.  In EBCDIC, the
12189        * non-ascii invariants are all control characters,
12190        * so don't ever participate in any folds. */
12191        if (ASCII_FOLD_RESTRICTED
12192         || ! IS_NON_FINAL_FOLD(*s))
12193        {
12194         break;
12195        }
12196       }
12197       else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12198        if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12199                 *s, *(s+1))))
12200        {
12201         break;
12202        }
12203       }
12204       else if (! _invlist_contains_cp(
12205           PL_NonL1NonFinalFold,
12206           valid_utf8_to_uvchr((U8 *) s, NULL)))
12207       {
12208        break;
12209       }
12210
12211       /* Here, the current character is problematic in that
12212       * it does occur in the non-final position of some
12213       * fold, so try the character before it, but have to
12214       * special case the very first byte in the string, so
12215       * we don't read outside the string */
12216       s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12217      } /* End of loop backwards through the string */
12218
12219      /* If there were only problematic characters in the string,
12220      * <s> will point to before s0, in which case the length
12221      * should be 0, otherwise include the length of the
12222      * non-problematic character just found */
12223      len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12224     }
12225
12226     /* Here, have found the final character, if any, that is
12227     * non-problematic as far as ending the node without splitting
12228     * it across a potential multi-char fold.  <len> contains the
12229     * number of bytes in the node up-to and including that
12230     * character, or is 0 if there is no such character, meaning
12231     * the whole node contains only problematic characters.  In
12232     * this case, give up and just take the node as-is.  We can't
12233     * do any better */
12234     if (len == 0) {
12235      len = full_len;
12236
12237      /* If the node ends in an 's' we make sure it stays EXACTF,
12238      * as if it turns into an EXACTFU, it could later get
12239      * joined with another 's' that would then wrongly match
12240      * the sharp s */
12241      if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
12242      {
12243       maybe_exactfu = FALSE;
12244      }
12245     } else {
12246
12247      /* Here, the node does contain some characters that aren't
12248      * problematic.  If one such is the final character in the
12249      * node, we are done */
12250      if (len == full_len) {
12251       goto loopdone;
12252      }
12253      else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12254
12255       /* If the final character is problematic, but the
12256       * penultimate is not, back-off that last character to
12257       * later start a new node with it */
12258       p = oldp;
12259       goto loopdone;
12260      }
12261
12262      /* Here, the final non-problematic character is earlier
12263      * in the input than the penultimate character.  What we do
12264      * is reparse from the beginning, going up only as far as
12265      * this final ok one, thus guaranteeing that the node ends
12266      * in an acceptable character.  The reason we reparse is
12267      * that we know how far in the character is, but we don't
12268      * know how to correlate its position with the input parse.
12269      * An alternate implementation would be to build that
12270      * correlation as we go along during the original parse,
12271      * but that would entail extra work for every node, whereas
12272      * this code gets executed only when the string is too
12273      * large for the node, and the final two characters are
12274      * problematic, an infrequent occurrence.  Yet another
12275      * possible strategy would be to save the tail of the
12276      * string, and the next time regatom is called, initialize
12277      * with that.  The problem with this is that unless you
12278      * back off one more character, you won't be guaranteed
12279      * regatom will get called again, unless regbranch,
12280      * regpiece ... are also changed.  If you do back off that
12281      * extra character, so that there is input guaranteed to
12282      * force calling regatom, you can't handle the case where
12283      * just the first character in the node is acceptable.  I
12284      * (khw) decided to try this method which doesn't have that
12285      * pitfall; if performance issues are found, we can do a
12286      * combination of the current approach plus that one */
12287      upper_parse = len;
12288      len = 0;
12289      s = s0;
12290      goto reparse;
12291     }
12292    }   /* End of verifying node ends with an appropriate char */
12293
12294   loopdone:   /* Jumped to when encounters something that shouldn't be in
12295      the node */
12296
12297    /* I (khw) don't know if you can get here with zero length, but the
12298    * old code handled this situation by creating a zero-length EXACT
12299    * node.  Might as well be NOTHING instead */
12300    if (len == 0) {
12301     OP(ret) = NOTHING;
12302    }
12303    else {
12304     if (FOLD) {
12305      /* If 'maybe_exact' is still set here, means there are no
12306      * code points in the node that participate in folds;
12307      * similarly for 'maybe_exactfu' and code points that match
12308      * differently depending on UTF8ness of the target string
12309      * (for /u), or depending on locale for /l */
12310      if (maybe_exact) {
12311       OP(ret) = EXACT;
12312      }
12313      else if (maybe_exactfu) {
12314       OP(ret) = EXACTFU;
12315      }
12316     }
12317     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12318           FALSE /* Don't look to see if could
12319              be turned into an EXACT
12320              node, as we have already
12321              computed that */
12322           );
12323    }
12324
12325    RExC_parse = p - 1;
12326    Set_Node_Cur_Length(ret, parse_start);
12327    nextchar(pRExC_state);
12328    {
12329     /* len is STRLEN which is unsigned, need to copy to signed */
12330     IV iv = len;
12331     if (iv < 0)
12332      vFAIL("Internal disaster");
12333    }
12334
12335   } /* End of label 'defchar:' */
12336   break;
12337  } /* End of giant switch on input character */
12338
12339  return(ret);
12340 }
12341
12342 STATIC char *
12343 S_regwhite( RExC_state_t *pRExC_state, char *p )
12344 {
12345  const char *e = RExC_end;
12346
12347  PERL_ARGS_ASSERT_REGWHITE;
12348
12349  while (p < e) {
12350   if (isSPACE(*p))
12351    ++p;
12352   else if (*p == '#') {
12353    bool ended = 0;
12354    do {
12355     if (*p++ == '\n') {
12356      ended = 1;
12357      break;
12358     }
12359    } while (p < e);
12360    if (!ended)
12361     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12362   }
12363   else
12364    break;
12365  }
12366  return p;
12367 }
12368
12369 STATIC char *
12370 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12371 {
12372  /* Returns the next non-pattern-white space, non-comment character (the
12373  * latter only if 'recognize_comment is true) in the string p, which is
12374  * ended by RExC_end.  If there is no line break ending a comment,
12375  * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */
12376  const char *e = RExC_end;
12377
12378  PERL_ARGS_ASSERT_REGPATWS;
12379
12380  while (p < e) {
12381   STRLEN len;
12382   if ((len = is_PATWS_safe(p, e, UTF))) {
12383    p += len;
12384   }
12385   else if (recognize_comment && *p == '#') {
12386    bool ended = 0;
12387    do {
12388     p++;
12389     if (is_LNBREAK_safe(p, e, UTF)) {
12390      ended = 1;
12391      break;
12392     }
12393    } while (p < e);
12394    if (!ended)
12395     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12396   }
12397   else
12398    break;
12399  }
12400  return p;
12401 }
12402
12403 STATIC void
12404 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12405 {
12406  /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12407  * sets up the bitmap and any flags, removing those code points from the
12408  * inversion list, setting it to NULL should it become completely empty */
12409
12410  PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12411  assert(PL_regkind[OP(node)] == ANYOF);
12412
12413  ANYOF_BITMAP_ZERO(node);
12414  if (*invlist_ptr) {
12415
12416   /* This gets set if we actually need to modify things */
12417   bool change_invlist = FALSE;
12418
12419   UV start, end;
12420
12421   /* Start looking through *invlist_ptr */
12422   invlist_iterinit(*invlist_ptr);
12423   while (invlist_iternext(*invlist_ptr, &start, &end)) {
12424    UV high;
12425    int i;
12426
12427    if (end == UV_MAX && start <= 256) {
12428     ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12429    }
12430    else if (end >= 256) {
12431     ANYOF_FLAGS(node) |= ANYOF_UTF8;
12432    }
12433
12434    /* Quit if are above what we should change */
12435    if (start > 255) {
12436     break;
12437    }
12438
12439    change_invlist = TRUE;
12440
12441    /* Set all the bits in the range, up to the max that we are doing */
12442    high = (end < 255) ? end : 255;
12443    for (i = start; i <= (int) high; i++) {
12444     if (! ANYOF_BITMAP_TEST(node, i)) {
12445      ANYOF_BITMAP_SET(node, i);
12446     }
12447    }
12448   }
12449   invlist_iterfinish(*invlist_ptr);
12450
12451   /* Done with loop; remove any code points that are in the bitmap from
12452   * *invlist_ptr; similarly for code points above latin1 if we have a
12453   * flag to match all of them anyways */
12454   if (change_invlist) {
12455    _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12456   }
12457   if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12458    _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12459   }
12460
12461   /* If have completely emptied it, remove it completely */
12462   if (_invlist_len(*invlist_ptr) == 0) {
12463    SvREFCNT_dec_NN(*invlist_ptr);
12464    *invlist_ptr = NULL;
12465   }
12466  }
12467 }
12468
12469 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12470    Character classes ([:foo:]) can also be negated ([:^foo:]).
12471    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12472    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12473    but trigger failures because they are currently unimplemented. */
12474
12475 #define POSIXCC_DONE(c)   ((c) == ':')
12476 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12477 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12478
12479 PERL_STATIC_INLINE I32
12480 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12481 {
12482  dVAR;
12483  I32 namedclass = OOB_NAMEDCLASS;
12484
12485  PERL_ARGS_ASSERT_REGPPOSIXCC;
12486
12487  if (value == '[' && RExC_parse + 1 < RExC_end &&
12488   /* I smell either [: or [= or [. -- POSIX has been here, right? */
12489   POSIXCC(UCHARAT(RExC_parse)))
12490  {
12491   const char c = UCHARAT(RExC_parse);
12492   char* const s = RExC_parse++;
12493
12494   while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12495    RExC_parse++;
12496   if (RExC_parse == RExC_end) {
12497    if (strict) {
12498
12499     /* Try to give a better location for the error (than the end of
12500     * the string) by looking for the matching ']' */
12501     RExC_parse = s;
12502     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12503      RExC_parse++;
12504     }
12505     vFAIL2("Unmatched '%c' in POSIX class", c);
12506    }
12507    /* Grandfather lone [:, [=, [. */
12508    RExC_parse = s;
12509   }
12510   else {
12511    const char* const t = RExC_parse++; /* skip over the c */
12512    assert(*t == c);
12513
12514    if (UCHARAT(RExC_parse) == ']') {
12515     const char *posixcc = s + 1;
12516     RExC_parse++; /* skip over the ending ] */
12517
12518     if (*s == ':') {
12519      const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12520      const I32 skip = t - posixcc;
12521
12522      /* Initially switch on the length of the name.  */
12523      switch (skip) {
12524      case 4:
12525       if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12526               this is the Perl \w
12527               */
12528        namedclass = ANYOF_WORDCHAR;
12529       break;
12530      case 5:
12531       /* Names all of length 5.  */
12532       /* alnum alpha ascii blank cntrl digit graph lower
12533       print punct space upper  */
12534       /* Offset 4 gives the best switch position.  */
12535       switch (posixcc[4]) {
12536       case 'a':
12537        if (memEQ(posixcc, "alph", 4)) /* alpha */
12538         namedclass = ANYOF_ALPHA;
12539        break;
12540       case 'e':
12541        if (memEQ(posixcc, "spac", 4)) /* space */
12542         namedclass = ANYOF_PSXSPC;
12543        break;
12544       case 'h':
12545        if (memEQ(posixcc, "grap", 4)) /* graph */
12546         namedclass = ANYOF_GRAPH;
12547        break;
12548       case 'i':
12549        if (memEQ(posixcc, "asci", 4)) /* ascii */
12550         namedclass = ANYOF_ASCII;
12551        break;
12552       case 'k':
12553        if (memEQ(posixcc, "blan", 4)) /* blank */
12554         namedclass = ANYOF_BLANK;
12555        break;
12556       case 'l':
12557        if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12558         namedclass = ANYOF_CNTRL;
12559        break;
12560       case 'm':
12561        if (memEQ(posixcc, "alnu", 4)) /* alnum */
12562         namedclass = ANYOF_ALPHANUMERIC;
12563        break;
12564       case 'r':
12565        if (memEQ(posixcc, "lowe", 4)) /* lower */
12566         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12567        else if (memEQ(posixcc, "uppe", 4)) /* upper */
12568         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12569        break;
12570       case 't':
12571        if (memEQ(posixcc, "digi", 4)) /* digit */
12572         namedclass = ANYOF_DIGIT;
12573        else if (memEQ(posixcc, "prin", 4)) /* print */
12574         namedclass = ANYOF_PRINT;
12575        else if (memEQ(posixcc, "punc", 4)) /* punct */
12576         namedclass = ANYOF_PUNCT;
12577        break;
12578       }
12579       break;
12580      case 6:
12581       if (memEQ(posixcc, "xdigit", 6))
12582        namedclass = ANYOF_XDIGIT;
12583       break;
12584      }
12585
12586      if (namedclass == OOB_NAMEDCLASS)
12587       vFAIL2utf8f(
12588        "POSIX class [:%"UTF8f":] unknown",
12589        UTF8fARG(UTF, t - s - 1, s + 1));
12590
12591      /* The #defines are structured so each complement is +1 to
12592      * the normal one */
12593      if (complement) {
12594       namedclass++;
12595      }
12596      assert (posixcc[skip] == ':');
12597      assert (posixcc[skip+1] == ']');
12598     } else if (!SIZE_ONLY) {
12599      /* [[=foo=]] and [[.foo.]] are still future. */
12600
12601      /* adjust RExC_parse so the warning shows after
12602      the class closes */
12603      while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12604       RExC_parse++;
12605      vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12606     }
12607    } else {
12608     /* Maternal grandfather:
12609     * "[:" ending in ":" but not in ":]" */
12610     if (strict) {
12611      vFAIL("Unmatched '[' in POSIX class");
12612     }
12613
12614     /* Grandfather lone [:, [=, [. */
12615     RExC_parse = s;
12616    }
12617   }
12618  }
12619
12620  return namedclass;
12621 }
12622
12623 STATIC bool
12624 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
12625 {
12626  /* This applies some heuristics at the current parse position (which should
12627  * be at a '[') to see if what follows might be intended to be a [:posix:]
12628  * class.  It returns true if it really is a posix class, of course, but it
12629  * also can return true if it thinks that what was intended was a posix
12630  * class that didn't quite make it.
12631  *
12632  * It will return true for
12633  *      [:alphanumerics:
12634  *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
12635  *                         ')' indicating the end of the (?[
12636  *      [:any garbage including %^&$ punctuation:]
12637  *
12638  * This is designed to be called only from S_handle_regex_sets; it could be
12639  * easily adapted to be called from the spot at the beginning of regclass()
12640  * that checks to see in a normal bracketed class if the surrounding []
12641  * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
12642  * change long-standing behavior, so I (khw) didn't do that */
12643  char* p = RExC_parse + 1;
12644  char first_char = *p;
12645
12646  PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12647
12648  assert(*(p - 1) == '[');
12649
12650  if (! POSIXCC(first_char)) {
12651   return FALSE;
12652  }
12653
12654  p++;
12655  while (p < RExC_end && isWORDCHAR(*p)) p++;
12656
12657  if (p >= RExC_end) {
12658   return FALSE;
12659  }
12660
12661  if (p - RExC_parse > 2    /* Got at least 1 word character */
12662   && (*p == first_char
12663    || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12664  {
12665   return TRUE;
12666  }
12667
12668  p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12669
12670  return (p
12671    && p - RExC_parse > 2 /* [:] evaluates to colon;
12672          [::] is a bad posix class. */
12673    && first_char == *(p - 1));
12674 }
12675
12676 STATIC regnode *
12677 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12678      I32 *flagp, U32 depth,
12679      char * const oregcomp_parse)
12680 {
12681  /* Handle the (?[...]) construct to do set operations */
12682
12683  U8 curchar;
12684  UV start, end; /* End points of code point ranges */
12685  SV* result_string;
12686  char *save_end, *save_parse;
12687  SV* final;
12688  STRLEN len;
12689  regnode* node;
12690  AV* stack;
12691  const bool save_fold = FOLD;
12692
12693  GET_RE_DEBUG_FLAGS_DECL;
12694
12695  PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12696
12697  if (LOC) {
12698   vFAIL("(?[...]) not valid in locale");
12699  }
12700  RExC_uni_semantics = 1;
12701
12702  /* This will return only an ANYOF regnode, or (unlikely) something smaller
12703  * (such as EXACT).  Thus we can skip most everything if just sizing.  We
12704  * call regclass to handle '[]' so as to not have to reinvent its parsing
12705  * rules here (throwing away the size it computes each time).  And, we exit
12706  * upon an unescaped ']' that isn't one ending a regclass.  To do both
12707  * these things, we need to realize that something preceded by a backslash
12708  * is escaped, so we have to keep track of backslashes */
12709  if (SIZE_ONLY) {
12710   UV depth = 0; /* how many nested (?[...]) constructs */
12711
12712   Perl_ck_warner_d(aTHX_
12713    packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12714    "The regex_sets feature is experimental" REPORT_LOCATION,
12715     UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12716     UTF8fARG(UTF,
12717       RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12718       RExC_precomp + (RExC_parse - RExC_precomp)));
12719
12720   while (RExC_parse < RExC_end) {
12721    SV* current = NULL;
12722    RExC_parse = regpatws(pRExC_state, RExC_parse,
12723         TRUE); /* means recognize comments */
12724    switch (*RExC_parse) {
12725     case '?':
12726      if (RExC_parse[1] == '[') depth++, RExC_parse++;
12727      /* FALL THROUGH */
12728     default:
12729      break;
12730     case '\\':
12731      /* Skip the next byte (which could cause us to end up in
12732      * the middle of a UTF-8 character, but since none of those
12733      * are confusable with anything we currently handle in this
12734      * switch (invariants all), it's safe.  We'll just hit the
12735      * default: case next time and keep on incrementing until
12736      * we find one of the invariants we do handle. */
12737      RExC_parse++;
12738      break;
12739     case '[':
12740     {
12741      /* If this looks like it is a [:posix:] class, leave the
12742      * parse pointer at the '[' to fool regclass() into
12743      * thinking it is part of a '[[:posix:]]'.  That function
12744      * will use strict checking to force a syntax error if it
12745      * doesn't work out to a legitimate class */
12746      bool is_posix_class
12747          = could_it_be_a_POSIX_class(pRExC_state);
12748      if (! is_posix_class) {
12749       RExC_parse++;
12750      }
12751
12752      /* regclass() can only return RESTART_UTF8 if multi-char
12753      folds are allowed.  */
12754      if (!regclass(pRExC_state, flagp,depth+1,
12755         is_posix_class, /* parse the whole char
12756              class only if not a
12757              posix class */
12758         FALSE, /* don't allow multi-char folds */
12759         TRUE, /* silence non-portable warnings. */
12760         &current))
12761       FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12762        (UV) *flagp);
12763
12764      /* function call leaves parse pointing to the ']', except
12765      * if we faked it */
12766      if (is_posix_class) {
12767       RExC_parse--;
12768      }
12769
12770      SvREFCNT_dec(current);   /* In case it returned something */
12771      break;
12772     }
12773
12774     case ']':
12775      if (depth--) break;
12776      RExC_parse++;
12777      if (RExC_parse < RExC_end
12778       && *RExC_parse == ')')
12779      {
12780       node = reganode(pRExC_state, ANYOF, 0);
12781       RExC_size += ANYOF_SKIP;
12782       nextchar(pRExC_state);
12783       Set_Node_Length(node,
12784         RExC_parse - oregcomp_parse + 1); /* MJD */
12785       return node;
12786      }
12787      goto no_close;
12788    }
12789    RExC_parse++;
12790   }
12791
12792   no_close:
12793   FAIL("Syntax error in (?[...])");
12794  }
12795
12796  /* Pass 2 only after this.  Everything in this construct is a
12797  * metacharacter.  Operands begin with either a '\' (for an escape
12798  * sequence), or a '[' for a bracketed character class.  Any other
12799  * character should be an operator, or parenthesis for grouping.  Both
12800  * types of operands are handled by calling regclass() to parse them.  It
12801  * is called with a parameter to indicate to return the computed inversion
12802  * list.  The parsing here is implemented via a stack.  Each entry on the
12803  * stack is a single character representing one of the operators, or the
12804  * '('; or else a pointer to an operand inversion list. */
12805
12806 #define IS_OPERAND(a)  (! SvIOK(a))
12807
12808  /* The stack starts empty.  It is a syntax error if the first thing parsed
12809  * is a binary operator; everything else is pushed on the stack.  When an
12810  * operand is parsed, the top of the stack is examined.  If it is a binary
12811  * operator, the item before it should be an operand, and both are replaced
12812  * by the result of doing that operation on the new operand and the one on
12813  * the stack.   Thus a sequence of binary operands is reduced to a single
12814  * one before the next one is parsed.
12815  *
12816  * A unary operator may immediately follow a binary in the input, for
12817  * example
12818  *      [a] + ! [b]
12819  * When an operand is parsed and the top of the stack is a unary operator,
12820  * the operation is performed, and then the stack is rechecked to see if
12821  * this new operand is part of a binary operation; if so, it is handled as
12822  * above.
12823  *
12824  * A '(' is simply pushed on the stack; it is valid only if the stack is
12825  * empty, or the top element of the stack is an operator or another '('
12826  * (for which the parenthesized expression will become an operand).  By the
12827  * time the corresponding ')' is parsed everything in between should have
12828  * been parsed and evaluated to a single operand (or else is a syntax
12829  * error), and is handled as a regular operand */
12830
12831  sv_2mortal((SV *)(stack = newAV()));
12832
12833  while (RExC_parse < RExC_end) {
12834   I32 top_index = av_tindex(stack);
12835   SV** top_ptr;
12836   SV* current = NULL;
12837
12838   /* Skip white space */
12839   RExC_parse = regpatws(pRExC_state, RExC_parse,
12840         TRUE); /* means recognize comments */
12841   if (RExC_parse >= RExC_end) {
12842    Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12843   }
12844   if ((curchar = UCHARAT(RExC_parse)) == ']') {
12845    break;
12846   }
12847
12848   switch (curchar) {
12849
12850    case '?':
12851     if (av_tindex(stack) >= 0   /* This makes sure that we can
12852            safely subtract 1 from
12853            RExC_parse in the next clause.
12854            If we have something on the
12855            stack, we have parsed something
12856            */
12857      && UCHARAT(RExC_parse - 1) == '('
12858      && RExC_parse < RExC_end)
12859     {
12860      /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12861      * This happens when we have some thing like
12862      *
12863      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12864      *   ...
12865      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
12866      *
12867      * Here we would be handling the interpolated
12868      * '$thai_or_lao'.  We handle this by a recursive call to
12869      * ourselves which returns the inversion list the
12870      * interpolated expression evaluates to.  We use the flags
12871      * from the interpolated pattern. */
12872      U32 save_flags = RExC_flags;
12873      const char * const save_parse = ++RExC_parse;
12874
12875      parse_lparen_question_flags(pRExC_state);
12876
12877      if (RExC_parse == save_parse  /* Makes sure there was at
12878              least one flag (or this
12879              embedding wasn't compiled)
12880             */
12881       || RExC_parse >= RExC_end - 4
12882       || UCHARAT(RExC_parse) != ':'
12883       || UCHARAT(++RExC_parse) != '('
12884       || UCHARAT(++RExC_parse) != '?'
12885       || UCHARAT(++RExC_parse) != '[')
12886      {
12887
12888       /* In combination with the above, this moves the
12889       * pointer to the point just after the first erroneous
12890       * character (or if there are no flags, to where they
12891       * should have been) */
12892       if (RExC_parse >= RExC_end - 4) {
12893        RExC_parse = RExC_end;
12894       }
12895       else if (RExC_parse != save_parse) {
12896        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12897       }
12898       vFAIL("Expecting '(?flags:(?[...'");
12899      }
12900      RExC_parse++;
12901      (void) handle_regex_sets(pRExC_state, &current, flagp,
12902              depth+1, oregcomp_parse);
12903
12904      /* Here, 'current' contains the embedded expression's
12905      * inversion list, and RExC_parse points to the trailing
12906      * ']'; the next character should be the ')' which will be
12907      * paired with the '(' that has been put on the stack, so
12908      * the whole embedded expression reduces to '(operand)' */
12909      RExC_parse++;
12910
12911      RExC_flags = save_flags;
12912      goto handle_operand;
12913     }
12914     /* FALL THROUGH */
12915
12916    default:
12917     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12918     vFAIL("Unexpected character");
12919
12920    case '\\':
12921     /* regclass() can only return RESTART_UTF8 if multi-char
12922     folds are allowed.  */
12923     if (!regclass(pRExC_state, flagp,depth+1,
12924        TRUE, /* means parse just the next thing */
12925        FALSE, /* don't allow multi-char folds */
12926        FALSE, /* don't silence non-portable warnings.  */
12927        &current))
12928      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12929       (UV) *flagp);
12930     /* regclass() will return with parsing just the \ sequence,
12931     * leaving the parse pointer at the next thing to parse */
12932     RExC_parse--;
12933     goto handle_operand;
12934
12935    case '[':   /* Is a bracketed character class */
12936    {
12937     bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12938
12939     if (! is_posix_class) {
12940      RExC_parse++;
12941     }
12942
12943     /* regclass() can only return RESTART_UTF8 if multi-char
12944     folds are allowed.  */
12945     if(!regclass(pRExC_state, flagp,depth+1,
12946        is_posix_class, /* parse the whole char class
12947             only if not a posix class */
12948        FALSE, /* don't allow multi-char folds */
12949        FALSE, /* don't silence non-portable warnings.  */
12950        &current))
12951      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12952       (UV) *flagp);
12953     /* function call leaves parse pointing to the ']', except if we
12954     * faked it */
12955     if (is_posix_class) {
12956      RExC_parse--;
12957     }
12958
12959     goto handle_operand;
12960    }
12961
12962    case '&':
12963    case '|':
12964    case '+':
12965    case '-':
12966    case '^':
12967     if (top_index < 0
12968      || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12969      || ! IS_OPERAND(*top_ptr))
12970     {
12971      RExC_parse++;
12972      vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12973     }
12974     av_push(stack, newSVuv(curchar));
12975     break;
12976
12977    case '!':
12978     av_push(stack, newSVuv(curchar));
12979     break;
12980
12981    case '(':
12982     if (top_index >= 0) {
12983      top_ptr = av_fetch(stack, top_index, FALSE);
12984      assert(top_ptr);
12985      if (IS_OPERAND(*top_ptr)) {
12986       RExC_parse++;
12987       vFAIL("Unexpected '(' with no preceding operator");
12988      }
12989     }
12990     av_push(stack, newSVuv(curchar));
12991     break;
12992
12993    case ')':
12994    {
12995     SV* lparen;
12996     if (top_index < 1
12997      || ! (current = av_pop(stack))
12998      || ! IS_OPERAND(current)
12999      || ! (lparen = av_pop(stack))
13000      || IS_OPERAND(lparen)
13001      || SvUV(lparen) != '(')
13002     {
13003      SvREFCNT_dec(current);
13004      RExC_parse++;
13005      vFAIL("Unexpected ')'");
13006     }
13007     top_index -= 2;
13008     SvREFCNT_dec_NN(lparen);
13009
13010     /* FALL THROUGH */
13011    }
13012
13013    handle_operand:
13014
13015     /* Here, we have an operand to process, in 'current' */
13016
13017     if (top_index < 0) {    /* Just push if stack is empty */
13018      av_push(stack, current);
13019     }
13020     else {
13021      SV* top = av_pop(stack);
13022      SV *prev = NULL;
13023      char current_operator;
13024
13025      if (IS_OPERAND(top)) {
13026       SvREFCNT_dec_NN(top);
13027       SvREFCNT_dec_NN(current);
13028       vFAIL("Operand with no preceding operator");
13029      }
13030      current_operator = (char) SvUV(top);
13031      switch (current_operator) {
13032       case '(':   /* Push the '(' back on followed by the new
13033          operand */
13034        av_push(stack, top);
13035        av_push(stack, current);
13036        SvREFCNT_inc(top);  /* Counters the '_dec' done
13037             just after the 'break', so
13038             it doesn't get wrongly freed
13039             */
13040        break;
13041
13042       case '!':
13043        _invlist_invert(current);
13044
13045        /* Unlike binary operators, the top of the stack,
13046        * now that this unary one has been popped off, may
13047        * legally be an operator, and we now have operand
13048        * for it. */
13049        top_index--;
13050        SvREFCNT_dec_NN(top);
13051        goto handle_operand;
13052
13053       case '&':
13054        prev = av_pop(stack);
13055        _invlist_intersection(prev,
13056             current,
13057             &current);
13058        av_push(stack, current);
13059        break;
13060
13061       case '|':
13062       case '+':
13063        prev = av_pop(stack);
13064        _invlist_union(prev, current, &current);
13065        av_push(stack, current);
13066        break;
13067
13068       case '-':
13069        prev = av_pop(stack);;
13070        _invlist_subtract(prev, current, &current);
13071        av_push(stack, current);
13072        break;
13073
13074       case '^':   /* The union minus the intersection */
13075       {
13076        SV* i = NULL;
13077        SV* u = NULL;
13078        SV* element;
13079
13080        prev = av_pop(stack);
13081        _invlist_union(prev, current, &u);
13082        _invlist_intersection(prev, current, &i);
13083        /* _invlist_subtract will overwrite current
13084         without freeing what it already contains */
13085        element = current;
13086        _invlist_subtract(u, i, &current);
13087        av_push(stack, current);
13088        SvREFCNT_dec_NN(i);
13089        SvREFCNT_dec_NN(u);
13090        SvREFCNT_dec_NN(element);
13091        break;
13092       }
13093
13094       default:
13095        Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13096     }
13097     SvREFCNT_dec_NN(top);
13098     SvREFCNT_dec(prev);
13099    }
13100   }
13101
13102   RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13103  }
13104
13105  if (av_tindex(stack) < 0   /* Was empty */
13106   || ((final = av_pop(stack)) == NULL)
13107   || ! IS_OPERAND(final)
13108   || av_tindex(stack) >= 0)  /* More left on stack */
13109  {
13110   vFAIL("Incomplete expression within '(?[ ])'");
13111  }
13112
13113  /* Here, 'final' is the resultant inversion list from evaluating the
13114  * expression.  Return it if so requested */
13115  if (return_invlist) {
13116   *return_invlist = final;
13117   return END;
13118  }
13119
13120  /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13121  * expecting a string of ranges and individual code points */
13122  invlist_iterinit(final);
13123  result_string = newSVpvs("");
13124  while (invlist_iternext(final, &start, &end)) {
13125   if (start == end) {
13126    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13127   }
13128   else {
13129    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13130              start,          end);
13131   }
13132  }
13133
13134  save_parse = RExC_parse;
13135  RExC_parse = SvPV(result_string, len);
13136  save_end = RExC_end;
13137  RExC_end = RExC_parse + len;
13138
13139  /* We turn off folding around the call, as the class we have constructed
13140  * already has all folding taken into consideration, and we don't want
13141  * regclass() to add to that */
13142  RExC_flags &= ~RXf_PMf_FOLD;
13143  /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13144  */
13145  node = regclass(pRExC_state, flagp,depth+1,
13146      FALSE, /* means parse the whole char class */
13147      FALSE, /* don't allow multi-char folds */
13148      TRUE, /* silence non-portable warnings.  The above may very
13149        well have generated non-portable code points, but
13150        they're valid on this machine */
13151      NULL);
13152  if (!node)
13153   FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13154      PTR2UV(flagp));
13155  if (save_fold) {
13156   RExC_flags |= RXf_PMf_FOLD;
13157  }
13158  RExC_parse = save_parse + 1;
13159  RExC_end = save_end;
13160  SvREFCNT_dec_NN(final);
13161  SvREFCNT_dec_NN(result_string);
13162
13163  nextchar(pRExC_state);
13164  Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13165  return node;
13166 }
13167 #undef IS_OPERAND
13168
13169 /* The names of properties whose definitions are not known at compile time are
13170  * stored in this SV, after a constant heading.  So if the length has been
13171  * changed since initialization, then there is a run-time definition. */
13172 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13173           (SvCUR(listsv) != initial_listsv_len)
13174
13175 STATIC regnode *
13176 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13177     const bool stop_at_1,  /* Just parse the next thing, don't
13178           look for a full character class */
13179     bool allow_multi_folds,
13180     const bool silence_non_portable,   /* Don't output warnings
13181              about too large
13182              characters */
13183     SV** ret_invlist)  /* Return an inversion list, not a node */
13184 {
13185  /* parse a bracketed class specification.  Most of these will produce an
13186  * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13187  * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13188  * under /i with multi-character folds: it will be rewritten following the
13189  * paradigm of this example, where the <multi-fold>s are characters which
13190  * fold to multiple character sequences:
13191  *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13192  * gets effectively rewritten as:
13193  *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13194  * reg() gets called (recursively) on the rewritten version, and this
13195  * function will return what it constructs.  (Actually the <multi-fold>s
13196  * aren't physically removed from the [abcdefghi], it's just that they are
13197  * ignored in the recursion by means of a flag:
13198  * <RExC_in_multi_char_class>.)
13199  *
13200  * ANYOF nodes contain a bit map for the first 256 characters, with the
13201  * corresponding bit set if that character is in the list.  For characters
13202  * above 255, a range list or swash is used.  There are extra bits for \w,
13203  * etc. in locale ANYOFs, as what these match is not determinable at
13204  * compile time
13205  *
13206  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13207  * to be restarted.  This can only happen if ret_invlist is non-NULL.
13208  */
13209
13210  dVAR;
13211  UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13212  IV range = 0;
13213  UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13214  regnode *ret;
13215  STRLEN numlen;
13216  IV namedclass = OOB_NAMEDCLASS;
13217  char *rangebegin = NULL;
13218  bool need_class = 0;
13219  SV *listsv = NULL;
13220  STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13221          than just initialized.  */
13222  SV* properties = NULL;    /* Code points that match \p{} \P{} */
13223  SV* posixes = NULL;     /* Code points that match classes like [:word:],
13224        extended beyond the Latin1 range.  These have to
13225        be kept separate from other code points for much
13226        of this function because their handling  is
13227        different under /i, and for most classes under
13228        /d as well */
13229  SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13230        separate for a while from the non-complemented
13231        versions because of complications with /d
13232        matching */
13233  UV element_count = 0;   /* Number of distinct elements in the class.
13234        Optimizations may be possible if this is tiny */
13235  AV * multi_char_matches = NULL; /* Code points that fold to more than one
13236          character; used under /i */
13237  UV n;
13238  char * stop_ptr = RExC_end;    /* where to stop parsing */
13239  const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13240             space? */
13241  const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13242
13243  /* Unicode properties are stored in a swash; this holds the current one
13244  * being parsed.  If this swash is the only above-latin1 component of the
13245  * character class, an optimization is to pass it directly on to the
13246  * execution engine.  Otherwise, it is set to NULL to indicate that there
13247  * are other things in the class that have to be dealt with at execution
13248  * time */
13249  SV* swash = NULL;  /* Code points that match \p{} \P{} */
13250
13251  /* Set if a component of this character class is user-defined; just passed
13252  * on to the engine */
13253  bool has_user_defined_property = FALSE;
13254
13255  /* inversion list of code points this node matches only when the target
13256  * string is in UTF-8.  (Because is under /d) */
13257  SV* depends_list = NULL;
13258
13259  /* Inversion list of code points this node matches regardless of things
13260  * like locale, folding, utf8ness of the target string */
13261  SV* cp_list = NULL;
13262
13263  /* Like cp_list, but code points on this list need to be checked for things
13264  * that fold to/from them under /i */
13265  SV* cp_foldable_list = NULL;
13266
13267  /* Like cp_list, but code points on this list are valid only when the
13268  * runtime locale is UTF-8 */
13269  SV* only_utf8_locale_list = NULL;
13270
13271 #ifdef EBCDIC
13272  /* In a range, counts how many 0-2 of the ends of it came from literals,
13273  * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13274  UV literal_endpoint = 0;
13275 #endif
13276  bool invert = FALSE;    /* Is this class to be complemented */
13277
13278  bool warn_super = ALWAYS_WARN_SUPER;
13279
13280  regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13281   case we need to change the emitted regop to an EXACT. */
13282  const char * orig_parse = RExC_parse;
13283  const SSize_t orig_size = RExC_size;
13284  bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13285  GET_RE_DEBUG_FLAGS_DECL;
13286
13287  PERL_ARGS_ASSERT_REGCLASS;
13288 #ifndef DEBUGGING
13289  PERL_UNUSED_ARG(depth);
13290 #endif
13291
13292  DEBUG_PARSE("clas");
13293
13294  /* Assume we are going to generate an ANYOF node. */
13295  ret = reganode(pRExC_state, ANYOF, 0);
13296
13297  if (SIZE_ONLY) {
13298   RExC_size += ANYOF_SKIP;
13299   listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13300  }
13301  else {
13302   ANYOF_FLAGS(ret) = 0;
13303
13304   RExC_emit += ANYOF_SKIP;
13305   listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13306   initial_listsv_len = SvCUR(listsv);
13307   SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13308  }
13309
13310  if (skip_white) {
13311   RExC_parse = regpatws(pRExC_state, RExC_parse,
13312        FALSE /* means don't recognize comments */);
13313  }
13314
13315  if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
13316   RExC_parse++;
13317   invert = TRUE;
13318   allow_multi_folds = FALSE;
13319   RExC_naughty++;
13320   if (skip_white) {
13321    RExC_parse = regpatws(pRExC_state, RExC_parse,
13322         FALSE /* means don't recognize comments */);
13323   }
13324  }
13325
13326  /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13327  if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13328   const char *s = RExC_parse;
13329   const char  c = *s++;
13330
13331   while (isWORDCHAR(*s))
13332    s++;
13333   if (*s && c == *s && s[1] == ']') {
13334    SAVEFREESV(RExC_rx_sv);
13335    ckWARN3reg(s+2,
13336      "POSIX syntax [%c %c] belongs inside character classes",
13337      c, c);
13338    (void)ReREFCNT_inc(RExC_rx_sv);
13339   }
13340  }
13341
13342  /* If the caller wants us to just parse a single element, accomplish this
13343  * by faking the loop ending condition */
13344  if (stop_at_1 && RExC_end > RExC_parse) {
13345   stop_ptr = RExC_parse + 1;
13346  }
13347
13348  /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13349  if (UCHARAT(RExC_parse) == ']')
13350   goto charclassloop;
13351
13352 parseit:
13353  while (1) {
13354   if  (RExC_parse >= stop_ptr) {
13355    break;
13356   }
13357
13358   if (skip_white) {
13359    RExC_parse = regpatws(pRExC_state, RExC_parse,
13360         FALSE /* means don't recognize comments */);
13361   }
13362
13363   if  (UCHARAT(RExC_parse) == ']') {
13364    break;
13365   }
13366
13367  charclassloop:
13368
13369   namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13370   save_value = value;
13371   save_prevvalue = prevvalue;
13372
13373   if (!range) {
13374    rangebegin = RExC_parse;
13375    element_count++;
13376   }
13377   if (UTF) {
13378    value = utf8n_to_uvchr((U8*)RExC_parse,
13379         RExC_end - RExC_parse,
13380         &numlen, UTF8_ALLOW_DEFAULT);
13381    RExC_parse += numlen;
13382   }
13383   else
13384    value = UCHARAT(RExC_parse++);
13385
13386   if (value == '['
13387    && RExC_parse < RExC_end
13388    && POSIXCC(UCHARAT(RExC_parse)))
13389   {
13390    namedclass = regpposixcc(pRExC_state, value, strict);
13391   }
13392   else if (value == '\\') {
13393    if (UTF) {
13394     value = utf8n_to_uvchr((U8*)RExC_parse,
13395         RExC_end - RExC_parse,
13396         &numlen, UTF8_ALLOW_DEFAULT);
13397     RExC_parse += numlen;
13398    }
13399    else
13400     value = UCHARAT(RExC_parse++);
13401
13402    /* Some compilers cannot handle switching on 64-bit integer
13403    * values, therefore value cannot be an UV.  Yes, this will
13404    * be a problem later if we want switch on Unicode.
13405    * A similar issue a little bit later when switching on
13406    * namedclass. --jhi */
13407
13408    /* If the \ is escaping white space when white space is being
13409    * skipped, it means that that white space is wanted literally, and
13410    * is already in 'value'.  Otherwise, need to translate the escape
13411    * into what it signifies. */
13412    if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13413
13414    case 'w': namedclass = ANYOF_WORDCHAR; break;
13415    case 'W': namedclass = ANYOF_NWORDCHAR; break;
13416    case 's': namedclass = ANYOF_SPACE; break;
13417    case 'S': namedclass = ANYOF_NSPACE; break;
13418    case 'd': namedclass = ANYOF_DIGIT; break;
13419    case 'D': namedclass = ANYOF_NDIGIT; break;
13420    case 'v': namedclass = ANYOF_VERTWS; break;
13421    case 'V': namedclass = ANYOF_NVERTWS; break;
13422    case 'h': namedclass = ANYOF_HORIZWS; break;
13423    case 'H': namedclass = ANYOF_NHORIZWS; break;
13424    case 'N':  /* Handle \N{NAME} in class */
13425     {
13426      /* We only pay attention to the first char of
13427      multichar strings being returned. I kinda wonder
13428      if this makes sense as it does change the behaviour
13429      from earlier versions, OTOH that behaviour was broken
13430      as well. */
13431      if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13432          TRUE, /* => charclass */
13433          strict))
13434      {
13435       if (*flagp & RESTART_UTF8)
13436        FAIL("panic: grok_bslash_N set RESTART_UTF8");
13437       goto parseit;
13438      }
13439     }
13440     break;
13441    case 'p':
13442    case 'P':
13443     {
13444     char *e;
13445
13446     /* We will handle any undefined properties ourselves */
13447     U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13448          /* And we actually would prefer to get
13449           * the straight inversion list of the
13450           * swash, since we will be accessing it
13451           * anyway, to save a little time */
13452          |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13453
13454     if (RExC_parse >= RExC_end)
13455      vFAIL2("Empty \\%c{}", (U8)value);
13456     if (*RExC_parse == '{') {
13457      const U8 c = (U8)value;
13458      e = strchr(RExC_parse++, '}');
13459      if (!e)
13460       vFAIL2("Missing right brace on \\%c{}", c);
13461      while (isSPACE(UCHARAT(RExC_parse)))
13462       RExC_parse++;
13463      if (e == RExC_parse)
13464       vFAIL2("Empty \\%c{}", c);
13465      n = e - RExC_parse;
13466      while (isSPACE(UCHARAT(RExC_parse + n - 1)))
13467       n--;
13468     }
13469     else {
13470      e = RExC_parse;
13471      n = 1;
13472     }
13473     if (!SIZE_ONLY) {
13474      SV* invlist;
13475      char* formatted;
13476      char* name;
13477
13478      if (UCHARAT(RExC_parse) == '^') {
13479       RExC_parse++;
13480       n--;
13481       /* toggle.  (The rhs xor gets the single bit that
13482       * differs between P and p; the other xor inverts just
13483       * that bit) */
13484       value ^= 'P' ^ 'p';
13485
13486       while (isSPACE(UCHARAT(RExC_parse))) {
13487        RExC_parse++;
13488        n--;
13489       }
13490      }
13491      /* Try to get the definition of the property into
13492      * <invlist>.  If /i is in effect, the effective property
13493      * will have its name be <__NAME_i>.  The design is
13494      * discussed in commit
13495      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13496      formatted = Perl_form(aTHX_
13497           "%s%.*s%s\n",
13498           (FOLD) ? "__" : "",
13499           (int)n,
13500           RExC_parse,
13501           (FOLD) ? "_i" : ""
13502         );
13503      name = savepvn(formatted, strlen(formatted));
13504
13505      /* Look up the property name, and get its swash and
13506      * inversion list, if the property is found  */
13507      if (swash) {
13508       SvREFCNT_dec_NN(swash);
13509      }
13510      swash = _core_swash_init("utf8", name, &PL_sv_undef,
13511            1, /* binary */
13512            0, /* not tr/// */
13513            NULL, /* No inversion list */
13514            &swash_init_flags
13515            );
13516      if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13517       if (swash) {
13518        SvREFCNT_dec_NN(swash);
13519        swash = NULL;
13520       }
13521
13522       /* Here didn't find it.  It could be a user-defined
13523       * property that will be available at run-time.  If we
13524       * accept only compile-time properties, is an error;
13525       * otherwise add it to the list for run-time look up */
13526       if (ret_invlist) {
13527        RExC_parse = e + 1;
13528        vFAIL2utf8f(
13529         "Property '%"UTF8f"' is unknown",
13530         UTF8fARG(UTF, n, name));
13531       }
13532       Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13533           (value == 'p' ? '+' : '!'),
13534           UTF8fARG(UTF, n, name));
13535       has_user_defined_property = TRUE;
13536
13537       /* We don't know yet, so have to assume that the
13538       * property could match something in the Latin1 range,
13539       * hence something that isn't utf8.  Note that this
13540       * would cause things in <depends_list> to match
13541       * inappropriately, except that any \p{}, including
13542       * this one forces Unicode semantics, which means there
13543       * is no <depends_list> */
13544       ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13545      }
13546      else {
13547
13548       /* Here, did get the swash and its inversion list.  If
13549       * the swash is from a user-defined property, then this
13550       * whole character class should be regarded as such */
13551       if (swash_init_flags
13552        & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13553       {
13554        has_user_defined_property = TRUE;
13555       }
13556       else if
13557        /* We warn on matching an above-Unicode code point
13558        * if the match would return true, except don't
13559        * warn for \p{All}, which has exactly one element
13560        * = 0 */
13561        (_invlist_contains_cp(invlist, 0x110000)
13562         && (! (_invlist_len(invlist) == 1
13563          && *invlist_array(invlist) == 0)))
13564       {
13565        warn_super = TRUE;
13566       }
13567
13568
13569       /* Invert if asking for the complement */
13570       if (value == 'P') {
13571        _invlist_union_complement_2nd(properties,
13572               invlist,
13573               &properties);
13574
13575        /* The swash can't be used as-is, because we've
13576        * inverted things; delay removing it to here after
13577        * have copied its invlist above */
13578        SvREFCNT_dec_NN(swash);
13579        swash = NULL;
13580       }
13581       else {
13582        _invlist_union(properties, invlist, &properties);
13583       }
13584      }
13585      Safefree(name);
13586     }
13587     RExC_parse = e + 1;
13588     namedclass = ANYOF_UNIPROP;  /* no official name, but it's
13589             named */
13590
13591     /* \p means they want Unicode semantics */
13592     RExC_uni_semantics = 1;
13593     }
13594     break;
13595    case 'n': value = '\n';   break;
13596    case 'r': value = '\r';   break;
13597    case 't': value = '\t';   break;
13598    case 'f': value = '\f';   break;
13599    case 'b': value = '\b';   break;
13600    case 'e': value = ASCII_TO_NATIVE('\033');break;
13601    case 'a': value = '\a';                   break;
13602    case 'o':
13603     RExC_parse--; /* function expects to be pointed at the 'o' */
13604     {
13605      const char* error_msg;
13606      bool valid = grok_bslash_o(&RExC_parse,
13607            &value,
13608            &error_msg,
13609            SIZE_ONLY,   /* warnings in pass
13610                1 only */
13611            strict,
13612            silence_non_portable,
13613            UTF);
13614      if (! valid) {
13615       vFAIL(error_msg);
13616      }
13617     }
13618     if (PL_encoding && value < 0x100) {
13619      goto recode_encoding;
13620     }
13621     break;
13622    case 'x':
13623     RExC_parse--; /* function expects to be pointed at the 'x' */
13624     {
13625      const char* error_msg;
13626      bool valid = grok_bslash_x(&RExC_parse,
13627            &value,
13628            &error_msg,
13629            TRUE, /* Output warnings */
13630            strict,
13631            silence_non_portable,
13632            UTF);
13633      if (! valid) {
13634       vFAIL(error_msg);
13635      }
13636     }
13637     if (PL_encoding && value < 0x100)
13638      goto recode_encoding;
13639     break;
13640    case 'c':
13641     value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
13642     break;
13643    case '0': case '1': case '2': case '3': case '4':
13644    case '5': case '6': case '7':
13645     {
13646      /* Take 1-3 octal digits */
13647      I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13648      numlen = (strict) ? 4 : 3;
13649      value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13650      RExC_parse += numlen;
13651      if (numlen != 3) {
13652       if (strict) {
13653        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13654        vFAIL("Need exactly 3 octal digits");
13655       }
13656       else if (! SIZE_ONLY /* like \08, \178 */
13657         && numlen < 3
13658         && RExC_parse < RExC_end
13659         && isDIGIT(*RExC_parse)
13660         && ckWARN(WARN_REGEXP))
13661       {
13662        SAVEFREESV(RExC_rx_sv);
13663        reg_warn_non_literal_string(
13664         RExC_parse + 1,
13665         form_short_octal_warning(RExC_parse, numlen));
13666        (void)ReREFCNT_inc(RExC_rx_sv);
13667       }
13668      }
13669      if (PL_encoding && value < 0x100)
13670       goto recode_encoding;
13671      break;
13672     }
13673    recode_encoding:
13674     if (! RExC_override_recoding) {
13675      SV* enc = PL_encoding;
13676      value = reg_recode((const char)(U8)value, &enc);
13677      if (!enc) {
13678       if (strict) {
13679        vFAIL("Invalid escape in the specified encoding");
13680       }
13681       else if (SIZE_ONLY) {
13682        ckWARNreg(RExC_parse,
13683         "Invalid escape in the specified encoding");
13684       }
13685      }
13686      break;
13687     }
13688    default:
13689     /* Allow \_ to not give an error */
13690     if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13691      if (strict) {
13692       vFAIL2("Unrecognized escape \\%c in character class",
13693        (int)value);
13694      }
13695      else {
13696       SAVEFREESV(RExC_rx_sv);
13697       ckWARN2reg(RExC_parse,
13698        "Unrecognized escape \\%c in character class passed through",
13699        (int)value);
13700       (void)ReREFCNT_inc(RExC_rx_sv);
13701      }
13702     }
13703     break;
13704    }   /* End of switch on char following backslash */
13705   } /* end of handling backslash escape sequences */
13706 #ifdef EBCDIC
13707   else
13708    literal_endpoint++;
13709 #endif
13710
13711   /* Here, we have the current token in 'value' */
13712
13713   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13714    U8 classnum;
13715
13716    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
13717    * literal, as is the character that began the false range, i.e.
13718    * the 'a' in the examples */
13719    if (range) {
13720     if (!SIZE_ONLY) {
13721      const int w = (RExC_parse >= rangebegin)
13722         ? RExC_parse - rangebegin
13723         : 0;
13724      if (strict) {
13725       vFAIL2utf8f(
13726        "False [] range \"%"UTF8f"\"",
13727        UTF8fARG(UTF, w, rangebegin));
13728      }
13729      else {
13730       SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13731       ckWARN2reg(RExC_parse,
13732        "False [] range \"%"UTF8f"\"",
13733        UTF8fARG(UTF, w, rangebegin));
13734       (void)ReREFCNT_inc(RExC_rx_sv);
13735       cp_list = add_cp_to_invlist(cp_list, '-');
13736       cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
13737                prevvalue);
13738      }
13739     }
13740
13741     range = 0; /* this was not a true range */
13742     element_count += 2; /* So counts for three values */
13743    }
13744
13745    classnum = namedclass_to_classnum(namedclass);
13746
13747    if (LOC && namedclass < ANYOF_POSIXL_MAX
13748 #ifndef HAS_ISASCII
13749     && classnum != _CC_ASCII
13750 #endif
13751    ) {
13752     /* What the Posix classes (like \w, [:space:]) match in locale
13753     * isn't knowable under locale until actual match time.  Room
13754     * must be reserved (one time per outer bracketed class) to
13755     * store such classes.  The space will contain a bit for each
13756     * named class that is to be matched against.  This isn't
13757     * needed for \p{} and pseudo-classes, as they are not affected
13758     * by locale, and hence are dealt with separately */
13759     if (! need_class) {
13760      need_class = 1;
13761      if (SIZE_ONLY) {
13762       RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13763      }
13764      else {
13765       RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13766      }
13767      ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13768      ANYOF_POSIXL_ZERO(ret);
13769     }
13770
13771     /* See if it already matches the complement of this POSIX
13772     * class */
13773     if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13774      && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13775                ? -1
13776                : 1)))
13777     {
13778      posixl_matches_all = TRUE;
13779      break;  /* No need to continue.  Since it matches both
13780        e.g., \w and \W, it matches everything, and the
13781        bracketed class can be optimized into qr/./s */
13782     }
13783
13784     /* Add this class to those that should be checked at runtime */
13785     ANYOF_POSIXL_SET(ret, namedclass);
13786
13787     /* The above-Latin1 characters are not subject to locale rules.
13788     * Just add them, in the second pass, to the
13789     * unconditionally-matched list */
13790     if (! SIZE_ONLY) {
13791      SV* scratch_list = NULL;
13792
13793      /* Get the list of the above-Latin1 code points this
13794      * matches */
13795      _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
13796           PL_XPosix_ptrs[classnum],
13797
13798           /* Odd numbers are complements, like
13799           * NDIGIT, NASCII, ... */
13800           namedclass % 2 != 0,
13801           &scratch_list);
13802      /* Checking if 'cp_list' is NULL first saves an extra
13803      * clone.  Its reference count will be decremented at the
13804      * next union, etc, or if this is the only instance, at the
13805      * end of the routine */
13806      if (! cp_list) {
13807       cp_list = scratch_list;
13808      }
13809      else {
13810       _invlist_union(cp_list, scratch_list, &cp_list);
13811       SvREFCNT_dec_NN(scratch_list);
13812      }
13813      continue;   /* Go get next character */
13814     }
13815    }
13816    else if (! SIZE_ONLY) {
13817
13818     /* Here, not in pass1 (in that pass we skip calculating the
13819     * contents of this class), and is /l, or is a POSIX class for
13820     * which /l doesn't matter (or is a Unicode property, which is
13821     * skipped here). */
13822     if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
13823      if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13824
13825       /* Here, should be \h, \H, \v, or \V.  None of /d, /i
13826       * nor /l make a difference in what these match,
13827       * therefore we just add what they match to cp_list. */
13828       if (classnum != _CC_VERTSPACE) {
13829        assert(   namedclass == ANYOF_HORIZWS
13830         || namedclass == ANYOF_NHORIZWS);
13831
13832        /* It turns out that \h is just a synonym for
13833        * XPosixBlank */
13834        classnum = _CC_BLANK;
13835       }
13836
13837       _invlist_union_maybe_complement_2nd(
13838         cp_list,
13839         PL_XPosix_ptrs[classnum],
13840         namedclass % 2 != 0,    /* Complement if odd
13841               (NHORIZWS, NVERTWS)
13842               */
13843         &cp_list);
13844      }
13845     }
13846     else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
13847       complement and use nposixes */
13848      SV** posixes_ptr = namedclass % 2 == 0
13849          ? &posixes
13850          : &nposixes;
13851      SV** source_ptr = &PL_XPosix_ptrs[classnum];
13852      _invlist_union_maybe_complement_2nd(
13853              *posixes_ptr,
13854              *source_ptr,
13855              namedclass % 2 != 0,
13856              posixes_ptr);
13857     }
13858     continue;   /* Go get next character */
13859    }
13860   } /* end of namedclass \blah */
13861
13862   /* Here, we have a single value.  If 'range' is set, it is the ending
13863   * of a range--check its validity.  Later, we will handle each
13864   * individual code point in the range.  If 'range' isn't set, this
13865   * could be the beginning of a range, so check for that by looking
13866   * ahead to see if the next real character to be processed is the range
13867   * indicator--the minus sign */
13868
13869   if (skip_white) {
13870    RExC_parse = regpatws(pRExC_state, RExC_parse,
13871         FALSE /* means don't recognize comments */);
13872   }
13873
13874   if (range) {
13875    if (prevvalue > value) /* b-a */ {
13876     const int w = RExC_parse - rangebegin;
13877     vFAIL2utf8f(
13878      "Invalid [] range \"%"UTF8f"\"",
13879      UTF8fARG(UTF, w, rangebegin));
13880     range = 0; /* not a valid range */
13881    }
13882   }
13883   else {
13884    prevvalue = value; /* save the beginning of the potential range */
13885    if (! stop_at_1     /* Can't be a range if parsing just one thing */
13886     && *RExC_parse == '-')
13887    {
13888     char* next_char_ptr = RExC_parse + 1;
13889     if (skip_white) {   /* Get the next real char after the '-' */
13890      next_char_ptr = regpatws(pRExC_state,
13891            RExC_parse + 1,
13892            FALSE); /* means don't recognize
13893               comments */
13894     }
13895
13896     /* If the '-' is at the end of the class (just before the ']',
13897     * it is a literal minus; otherwise it is a range */
13898     if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13899      RExC_parse = next_char_ptr;
13900
13901      /* a bad range like \w-, [:word:]- ? */
13902      if (namedclass > OOB_NAMEDCLASS) {
13903       if (strict || ckWARN(WARN_REGEXP)) {
13904        const int w =
13905         RExC_parse >= rangebegin ?
13906         RExC_parse - rangebegin : 0;
13907        if (strict) {
13908         vFAIL4("False [] range \"%*.*s\"",
13909          w, w, rangebegin);
13910        }
13911        else {
13912         vWARN4(RExC_parse,
13913          "False [] range \"%*.*s\"",
13914          w, w, rangebegin);
13915        }
13916       }
13917       if (!SIZE_ONLY) {
13918        cp_list = add_cp_to_invlist(cp_list, '-');
13919       }
13920       element_count++;
13921      } else
13922       range = 1; /* yeah, it's a range! */
13923      continue; /* but do it the next time */
13924     }
13925    }
13926   }
13927
13928   /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13929   * if not */
13930
13931   /* non-Latin1 code point implies unicode semantics.  Must be set in
13932   * pass1 so is there for the whole of pass 2 */
13933   if (value > 255) {
13934    RExC_uni_semantics = 1;
13935   }
13936
13937   /* Ready to process either the single value, or the completed range.
13938   * For single-valued non-inverted ranges, we consider the possibility
13939   * of multi-char folds.  (We made a conscious decision to not do this
13940   * for the other cases because it can often lead to non-intuitive
13941   * results.  For example, you have the peculiar case that:
13942   *  "s s" =~ /^[^\xDF]+$/i => Y
13943   *  "ss"  =~ /^[^\xDF]+$/i => N
13944   *
13945   * See [perl #89750] */
13946   if (FOLD && allow_multi_folds && value == prevvalue) {
13947    if (value == LATIN_SMALL_LETTER_SHARP_S
13948     || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13949               value)))
13950    {
13951     /* Here <value> is indeed a multi-char fold.  Get what it is */
13952
13953     U8 foldbuf[UTF8_MAXBYTES_CASE];
13954     STRLEN foldlen;
13955
13956     UV folded = _to_uni_fold_flags(
13957         value,
13958         foldbuf,
13959         &foldlen,
13960         FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
13961             ? FOLD_FLAGS_NOMIX_ASCII
13962             : 0)
13963         );
13964
13965     /* Here, <folded> should be the first character of the
13966     * multi-char fold of <value>, with <foldbuf> containing the
13967     * whole thing.  But, if this fold is not allowed (because of
13968     * the flags), <fold> will be the same as <value>, and should
13969     * be processed like any other character, so skip the special
13970     * handling */
13971     if (folded != value) {
13972
13973      /* Skip if we are recursed, currently parsing the class
13974      * again.  Otherwise add this character to the list of
13975      * multi-char folds. */
13976      if (! RExC_in_multi_char_class) {
13977       AV** this_array_ptr;
13978       AV* this_array;
13979       STRLEN cp_count = utf8_length(foldbuf,
13980              foldbuf + foldlen);
13981       SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13982
13983       Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13984
13985
13986       if (! multi_char_matches) {
13987        multi_char_matches = newAV();
13988       }
13989
13990       /* <multi_char_matches> is actually an array of arrays.
13991       * There will be one or two top-level elements: [2],
13992       * and/or [3].  The [2] element is an array, each
13993       * element thereof is a character which folds to TWO
13994       * characters; [3] is for folds to THREE characters.
13995       * (Unicode guarantees a maximum of 3 characters in any
13996       * fold.)  When we rewrite the character class below,
13997       * we will do so such that the longest folds are
13998       * written first, so that it prefers the longest
13999       * matching strings first.  This is done even if it
14000       * turns out that any quantifier is non-greedy, out of
14001       * programmer laziness.  Tom Christiansen has agreed
14002       * that this is ok.  This makes the test for the
14003       * ligature 'ffi' come before the test for 'ff' */
14004       if (av_exists(multi_char_matches, cp_count)) {
14005        this_array_ptr = (AV**) av_fetch(multi_char_matches,
14006                cp_count, FALSE);
14007        this_array = *this_array_ptr;
14008       }
14009       else {
14010        this_array = newAV();
14011        av_store(multi_char_matches, cp_count,
14012          (SV*) this_array);
14013       }
14014       av_push(this_array, multi_fold);
14015      }
14016
14017      /* This element should not be processed further in this
14018      * class */
14019      element_count--;
14020      value = save_value;
14021      prevvalue = save_prevvalue;
14022      continue;
14023     }
14024    }
14025   }
14026
14027   /* Deal with this element of the class */
14028   if (! SIZE_ONLY) {
14029 #ifndef EBCDIC
14030    cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14031              prevvalue, value);
14032 #else
14033    SV* this_range = _new_invlist(1);
14034    _append_range_to_invlist(this_range, prevvalue, value);
14035
14036    /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14037    * If this range was specified using something like 'i-j', we want
14038    * to include only the 'i' and the 'j', and not anything in
14039    * between, so exclude non-ASCII, non-alphabetics from it.
14040    * However, if the range was specified with something like
14041    * [\x89-\x91] or [\x89-j], all code points within it should be
14042    * included.  literal_endpoint==2 means both ends of the range used
14043    * a literal character, not \x{foo} */
14044    if (literal_endpoint == 2
14045     && ((prevvalue >= 'a' && value <= 'z')
14046      || (prevvalue >= 'A' && value <= 'Z')))
14047    {
14048     _invlist_intersection(this_range, PL_ASCII,
14049          &this_range);
14050
14051     /* Since this above only contains ascii, the intersection of it
14052     * with anything will still yield only ascii */
14053     _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14054          &this_range);
14055    }
14056    _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14057    literal_endpoint = 0;
14058 #endif
14059   }
14060
14061   range = 0; /* this range (if it was one) is done now */
14062  } /* End of loop through all the text within the brackets */
14063
14064  /* If anything in the class expands to more than one character, we have to
14065  * deal with them by building up a substitute parse string, and recursively
14066  * calling reg() on it, instead of proceeding */
14067  if (multi_char_matches) {
14068   SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14069   I32 cp_count;
14070   STRLEN len;
14071   char *save_end = RExC_end;
14072   char *save_parse = RExC_parse;
14073   bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14074          a "|" */
14075   I32 reg_flags;
14076
14077   assert(! invert);
14078 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14079   because too confusing */
14080   if (invert) {
14081    sv_catpv(substitute_parse, "(?:");
14082   }
14083 #endif
14084
14085   /* Look at the longest folds first */
14086   for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14087
14088    if (av_exists(multi_char_matches, cp_count)) {
14089     AV** this_array_ptr;
14090     SV* this_sequence;
14091
14092     this_array_ptr = (AV**) av_fetch(multi_char_matches,
14093             cp_count, FALSE);
14094     while ((this_sequence = av_pop(*this_array_ptr)) !=
14095                 &PL_sv_undef)
14096     {
14097      if (! first_time) {
14098       sv_catpv(substitute_parse, "|");
14099      }
14100      first_time = FALSE;
14101
14102      sv_catpv(substitute_parse, SvPVX(this_sequence));
14103     }
14104    }
14105   }
14106
14107   /* If the character class contains anything else besides these
14108   * multi-character folds, have to include it in recursive parsing */
14109   if (element_count) {
14110    sv_catpv(substitute_parse, "|[");
14111    sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14112    sv_catpv(substitute_parse, "]");
14113   }
14114
14115   sv_catpv(substitute_parse, ")");
14116 #if 0
14117   if (invert) {
14118    /* This is a way to get the parse to skip forward a whole named
14119    * sequence instead of matching the 2nd character when it fails the
14120    * first */
14121    sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14122   }
14123 #endif
14124
14125   RExC_parse = SvPV(substitute_parse, len);
14126   RExC_end = RExC_parse + len;
14127   RExC_in_multi_char_class = 1;
14128   RExC_emit = (regnode *)orig_emit;
14129
14130   ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14131
14132   *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14133
14134   RExC_parse = save_parse;
14135   RExC_end = save_end;
14136   RExC_in_multi_char_class = 0;
14137   SvREFCNT_dec_NN(multi_char_matches);
14138   return ret;
14139  }
14140
14141  /* Here, we've gone through the entire class and dealt with multi-char
14142  * folds.  We are now in a position that we can do some checks to see if we
14143  * can optimize this ANYOF node into a simpler one, even in Pass 1.
14144  * Currently we only do two checks:
14145  * 1) is in the unlikely event that the user has specified both, eg. \w and
14146  *    \W under /l, then the class matches everything.  (This optimization
14147  *    is done only to make the optimizer code run later work.)
14148  * 2) if the character class contains only a single element (including a
14149  *    single range), we see if there is an equivalent node for it.
14150  * Other checks are possible */
14151  if (! ret_invlist   /* Can't optimize if returning the constructed
14152       inversion list */
14153   && (UNLIKELY(posixl_matches_all) || element_count == 1))
14154  {
14155   U8 op = END;
14156   U8 arg = 0;
14157
14158   if (UNLIKELY(posixl_matches_all)) {
14159    op = SANY;
14160   }
14161   else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14162             \w or [:digit:] or \p{foo}
14163             */
14164
14165    /* All named classes are mapped into POSIXish nodes, with its FLAG
14166    * argument giving which class it is */
14167    switch ((I32)namedclass) {
14168     case ANYOF_UNIPROP:
14169      break;
14170
14171     /* These don't depend on the charset modifiers.  They always
14172     * match under /u rules */
14173     case ANYOF_NHORIZWS:
14174     case ANYOF_HORIZWS:
14175      namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14176      /* FALLTHROUGH */
14177
14178     case ANYOF_NVERTWS:
14179     case ANYOF_VERTWS:
14180      op = POSIXU;
14181      goto join_posix;
14182
14183     /* The actual POSIXish node for all the rest depends on the
14184     * charset modifier.  The ones in the first set depend only on
14185     * ASCII or, if available on this platform, locale */
14186     case ANYOF_ASCII:
14187     case ANYOF_NASCII:
14188 #ifdef HAS_ISASCII
14189      op = (LOC) ? POSIXL : POSIXA;
14190 #else
14191      op = POSIXA;
14192 #endif
14193      goto join_posix;
14194
14195     case ANYOF_NCASED:
14196     case ANYOF_LOWER:
14197     case ANYOF_NLOWER:
14198     case ANYOF_UPPER:
14199     case ANYOF_NUPPER:
14200      /* under /a could be alpha */
14201      if (FOLD) {
14202       if (ASCII_RESTRICTED) {
14203        namedclass = ANYOF_ALPHA + (namedclass % 2);
14204       }
14205       else if (! LOC) {
14206        break;
14207       }
14208      }
14209      /* FALLTHROUGH */
14210
14211     /* The rest have more possibilities depending on the charset.
14212     * We take advantage of the enum ordering of the charset
14213     * modifiers to get the exact node type, */
14214     default:
14215      op = POSIXD + get_regex_charset(RExC_flags);
14216      if (op > POSIXA) { /* /aa is same as /a */
14217       op = POSIXA;
14218      }
14219
14220     join_posix:
14221      /* The odd numbered ones are the complements of the
14222      * next-lower even number one */
14223      if (namedclass % 2 == 1) {
14224       invert = ! invert;
14225       namedclass--;
14226      }
14227      arg = namedclass_to_classnum(namedclass);
14228      break;
14229    }
14230   }
14231   else if (value == prevvalue) {
14232
14233    /* Here, the class consists of just a single code point */
14234
14235    if (invert) {
14236     if (! LOC && value == '\n') {
14237      op = REG_ANY; /* Optimize [^\n] */
14238      *flagp |= HASWIDTH|SIMPLE;
14239      RExC_naughty++;
14240     }
14241    }
14242    else if (value < 256 || UTF) {
14243
14244     /* Optimize a single value into an EXACTish node, but not if it
14245     * would require converting the pattern to UTF-8. */
14246     op = compute_EXACTish(pRExC_state);
14247    }
14248   } /* Otherwise is a range */
14249   else if (! LOC) {   /* locale could vary these */
14250    if (prevvalue == '0') {
14251     if (value == '9') {
14252      arg = _CC_DIGIT;
14253      op = POSIXA;
14254     }
14255    }
14256   }
14257
14258   /* Here, we have changed <op> away from its initial value iff we found
14259   * an optimization */
14260   if (op != END) {
14261
14262    /* Throw away this ANYOF regnode, and emit the calculated one,
14263    * which should correspond to the beginning, not current, state of
14264    * the parse */
14265    const char * cur_parse = RExC_parse;
14266    RExC_parse = (char *)orig_parse;
14267    if ( SIZE_ONLY) {
14268     if (! LOC) {
14269
14270      /* To get locale nodes to not use the full ANYOF size would
14271      * require moving the code above that writes the portions
14272      * of it that aren't in other nodes to after this point.
14273      * e.g.  ANYOF_POSIXL_SET */
14274      RExC_size = orig_size;
14275     }
14276    }
14277    else {
14278     RExC_emit = (regnode *)orig_emit;
14279     if (PL_regkind[op] == POSIXD) {
14280      if (op == POSIXL) {
14281       RExC_contains_locale = 1;
14282      }
14283      if (invert) {
14284       op += NPOSIXD - POSIXD;
14285      }
14286     }
14287    }
14288
14289    ret = reg_node(pRExC_state, op);
14290
14291    if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14292     if (! SIZE_ONLY) {
14293      FLAGS(ret) = arg;
14294     }
14295     *flagp |= HASWIDTH|SIMPLE;
14296    }
14297    else if (PL_regkind[op] == EXACT) {
14298     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14299           TRUE /* downgradable to EXACT */
14300           );
14301    }
14302
14303    RExC_parse = (char *) cur_parse;
14304
14305    SvREFCNT_dec(posixes);
14306    SvREFCNT_dec(nposixes);
14307    SvREFCNT_dec(cp_list);
14308    SvREFCNT_dec(cp_foldable_list);
14309    return ret;
14310   }
14311  }
14312
14313  if (SIZE_ONLY)
14314   return ret;
14315  /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14316
14317  /* If folding, we calculate all characters that could fold to or from the
14318  * ones already on the list */
14319  if (cp_foldable_list) {
14320   if (FOLD) {
14321    UV start, end; /* End points of code point ranges */
14322
14323    SV* fold_intersection = NULL;
14324    SV** use_list;
14325
14326    /* Our calculated list will be for Unicode rules.  For locale
14327    * matching, we have to keep a separate list that is consulted at
14328    * runtime only when the locale indicates Unicode rules.  For
14329    * non-locale, we just use to the general list */
14330    if (LOC) {
14331     use_list = &only_utf8_locale_list;
14332    }
14333    else {
14334     use_list = &cp_list;
14335    }
14336
14337    /* Only the characters in this class that participate in folds need
14338    * be checked.  Get the intersection of this class and all the
14339    * possible characters that are foldable.  This can quickly narrow
14340    * down a large class */
14341    _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14342         &fold_intersection);
14343
14344    /* The folds for all the Latin1 characters are hard-coded into this
14345    * program, but we have to go out to disk to get the others. */
14346    if (invlist_highest(cp_foldable_list) >= 256) {
14347
14348     /* This is a hash that for a particular fold gives all
14349     * characters that are involved in it */
14350     if (! PL_utf8_foldclosures) {
14351
14352      /* If the folds haven't been read in, call a fold function
14353      * to force that */
14354      if (! PL_utf8_tofold) {
14355       U8 dummy[UTF8_MAXBYTES_CASE+1];
14356
14357       /* This string is just a short named one above \xff */
14358       to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
14359       assert(PL_utf8_tofold); /* Verify that worked */
14360      }
14361      PL_utf8_foldclosures
14362          = _swash_inversion_hash(PL_utf8_tofold);
14363     }
14364    }
14365
14366    /* Now look at the foldable characters in this class individually */
14367    invlist_iterinit(fold_intersection);
14368    while (invlist_iternext(fold_intersection, &start, &end)) {
14369     UV j;
14370
14371     /* Look at every character in the range */
14372     for (j = start; j <= end; j++) {
14373      U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14374      STRLEN foldlen;
14375      SV** listp;
14376
14377      if (j < 256) {
14378
14379       /* We have the latin1 folding rules hard-coded here so
14380       * that an innocent-looking character class, like
14381       * /[ks]/i won't have to go out to disk to find the
14382       * possible matches.  XXX It would be better to
14383       * generate these via regen, in case a new version of
14384       * the Unicode standard adds new mappings, though that
14385       * is not really likely, and may be caught by the
14386       * default: case of the switch below. */
14387
14388       if (IS_IN_SOME_FOLD_L1(j)) {
14389
14390        /* ASCII is always matched; non-ASCII is matched
14391        * only under Unicode rules (which could happen
14392        * under /l if the locale is a UTF-8 one */
14393        if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14394         *use_list = add_cp_to_invlist(*use_list,
14395                PL_fold_latin1[j]);
14396        }
14397        else {
14398         depends_list =
14399         add_cp_to_invlist(depends_list,
14400             PL_fold_latin1[j]);
14401        }
14402       }
14403
14404       if (HAS_NONLATIN1_FOLD_CLOSURE(j)
14405        && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14406       {
14407        /* Certain Latin1 characters have matches outside
14408        * Latin1.  To get here, <j> is one of those
14409        * characters.   None of these matches is valid for
14410        * ASCII characters under /aa, which is why the 'if'
14411        * just above excludes those.  These matches only
14412        * happen when the target string is utf8.  The code
14413        * below adds the single fold closures for <j> to the
14414        * inversion list. */
14415
14416        switch (j) {
14417         case 'k':
14418         case 'K':
14419         *use_list =
14420          add_cp_to_invlist(*use_list, KELVIN_SIGN);
14421          break;
14422         case 's':
14423         case 'S':
14424         *use_list = add_cp_to_invlist(*use_list,
14425              LATIN_SMALL_LETTER_LONG_S);
14426          break;
14427         case MICRO_SIGN:
14428         *use_list = add_cp_to_invlist(*use_list,
14429              GREEK_CAPITAL_LETTER_MU);
14430         *use_list = add_cp_to_invlist(*use_list,
14431               GREEK_SMALL_LETTER_MU);
14432          break;
14433         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14434         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14435         *use_list =
14436         add_cp_to_invlist(*use_list, ANGSTROM_SIGN);
14437          break;
14438         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14439         *use_list = add_cp_to_invlist(*use_list,
14440           LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14441          break;
14442         case LATIN_SMALL_LETTER_SHARP_S:
14443         *use_list = add_cp_to_invlist(*use_list,
14444             LATIN_CAPITAL_LETTER_SHARP_S);
14445          break;
14446         case 'F': case 'f':
14447         case 'I': case 'i':
14448         case 'L': case 'l':
14449         case 'T': case 't':
14450         case 'A': case 'a':
14451         case 'H': case 'h':
14452         case 'J': case 'j':
14453         case 'N': case 'n':
14454         case 'W': case 'w':
14455         case 'Y': case 'y':
14456          /* These all are targets of multi-character
14457          * folds from code points that require UTF8
14458          * to express, so they can't match unless
14459          * the target string is in UTF-8, so no
14460          * action here is necessary, as regexec.c
14461          * properly handles the general case for
14462          * UTF-8 matching and multi-char folds */
14463          break;
14464         default:
14465          /* Use deprecated warning to increase the
14466          * chances of this being output */
14467          ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
14468          break;
14469        }
14470       }
14471       continue;
14472      }
14473
14474      /* Here is an above Latin1 character.  We don't have the
14475      * rules hard-coded for it.  First, get its fold.  This is
14476      * the simple fold, as the multi-character folds have been
14477      * handled earlier and separated out */
14478      _to_uni_fold_flags(j, foldbuf, &foldlen,
14479               (ASCII_FOLD_RESTRICTED)
14480               ? FOLD_FLAGS_NOMIX_ASCII
14481               : 0);
14482
14483      /* Single character fold of above Latin1.  Add everything in
14484      * its fold closure to the list that this node should match.
14485      * The fold closures data structure is a hash with the keys
14486      * being the UTF-8 of every character that is folded to, like
14487      * 'k', and the values each an array of all code points that
14488      * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14489      * Multi-character folds are not included */
14490      if ((listp = hv_fetch(PL_utf8_foldclosures,
14491           (char *) foldbuf, foldlen, FALSE)))
14492      {
14493       AV* list = (AV*) *listp;
14494       IV k;
14495       for (k = 0; k <= av_tindex(list); k++) {
14496        SV** c_p = av_fetch(list, k, FALSE);
14497        UV c;
14498        if (c_p == NULL) {
14499         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
14500        }
14501        c = SvUV(*c_p);
14502
14503        /* /aa doesn't allow folds between ASCII and non- */
14504        if ((ASCII_FOLD_RESTRICTED
14505         && (isASCII(c) != isASCII(j))))
14506        {
14507         continue;
14508        }
14509
14510        /* Folds under /l which cross the 255/256 boundary
14511        * are added to a separate list.  (These are valid
14512        * only when the locale is UTF-8.) */
14513        if (c < 256 && LOC) {
14514         *use_list = add_cp_to_invlist(*use_list, c);
14515         continue;
14516        }
14517
14518        if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14519        {
14520         cp_list = add_cp_to_invlist(cp_list, c);
14521        }
14522        else {
14523         /* Similarly folds involving non-ascii Latin1
14524         * characters under /d are added to their list */
14525         depends_list = add_cp_to_invlist(depends_list,
14526                 c);
14527        }
14528       }
14529      }
14530     }
14531    }
14532    SvREFCNT_dec_NN(fold_intersection);
14533   }
14534
14535   /* Now that we have finished adding all the folds, there is no reason
14536   * to keep the foldable list separate */
14537   _invlist_union(cp_list, cp_foldable_list, &cp_list);
14538   SvREFCNT_dec_NN(cp_foldable_list);
14539  }
14540
14541  /* And combine the result (if any) with any inversion list from posix
14542  * classes.  The lists are kept separate up to now because we don't want to
14543  * fold the classes (folding of those is automatically handled by the swash
14544  * fetching code) */
14545  if (posixes || nposixes) {
14546   if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14547    /* Under /a and /aa, nothing above ASCII matches these */
14548    _invlist_intersection(posixes,
14549         PL_XPosix_ptrs[_CC_ASCII],
14550         &posixes);
14551   }
14552   if (nposixes) {
14553    if (DEPENDS_SEMANTICS) {
14554     /* Under /d, everything in the upper half of the Latin1 range
14555     * matches these complements */
14556     ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
14557    }
14558    else if (AT_LEAST_ASCII_RESTRICTED) {
14559     /* Under /a and /aa, everything above ASCII matches these
14560     * complements */
14561     _invlist_union_complement_2nd(nposixes,
14562            PL_XPosix_ptrs[_CC_ASCII],
14563            &nposixes);
14564    }
14565    if (posixes) {
14566     _invlist_union(posixes, nposixes, &posixes);
14567     SvREFCNT_dec_NN(nposixes);
14568    }
14569    else {
14570     posixes = nposixes;
14571    }
14572   }
14573   if (! DEPENDS_SEMANTICS) {
14574    if (cp_list) {
14575     _invlist_union(cp_list, posixes, &cp_list);
14576     SvREFCNT_dec_NN(posixes);
14577    }
14578    else {
14579     cp_list = posixes;
14580    }
14581   }
14582   else {
14583    /* Under /d, we put into a separate list the Latin1 things that
14584    * match only when the target string is utf8 */
14585    SV* nonascii_but_latin1_properties = NULL;
14586    _invlist_intersection(posixes, PL_UpperLatin1,
14587         &nonascii_but_latin1_properties);
14588    _invlist_subtract(posixes, nonascii_but_latin1_properties,
14589        &posixes);
14590    if (cp_list) {
14591     _invlist_union(cp_list, posixes, &cp_list);
14592     SvREFCNT_dec_NN(posixes);
14593    }
14594    else {
14595     cp_list = posixes;
14596    }
14597
14598    if (depends_list) {
14599     _invlist_union(depends_list, nonascii_but_latin1_properties,
14600        &depends_list);
14601     SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14602    }
14603    else {
14604     depends_list = nonascii_but_latin1_properties;
14605    }
14606   }
14607  }
14608
14609  /* And combine the result (if any) with any inversion list from properties.
14610  * The lists are kept separate up to now so that we can distinguish the two
14611  * in regards to matching above-Unicode.  A run-time warning is generated
14612  * if a Unicode property is matched against a non-Unicode code point. But,
14613  * we allow user-defined properties to match anything, without any warning,
14614  * and we also suppress the warning if there is a portion of the character
14615  * class that isn't a Unicode property, and which matches above Unicode, \W
14616  * or [\x{110000}] for example.
14617  * (Note that in this case, unlike the Posix one above, there is no
14618  * <depends_list>, because having a Unicode property forces Unicode
14619  * semantics */
14620  if (properties) {
14621   if (cp_list) {
14622
14623    /* If it matters to the final outcome, see if a non-property
14624    * component of the class matches above Unicode.  If so, the
14625    * warning gets suppressed.  This is true even if just a single
14626    * such code point is specified, as though not strictly correct if
14627    * another such code point is matched against, the fact that they
14628    * are using above-Unicode code points indicates they should know
14629    * the issues involved */
14630    if (warn_super) {
14631     warn_super = ! (invert
14632        ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14633    }
14634
14635    _invlist_union(properties, cp_list, &cp_list);
14636    SvREFCNT_dec_NN(properties);
14637   }
14638   else {
14639    cp_list = properties;
14640   }
14641
14642   if (warn_super) {
14643    ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14644   }
14645  }
14646
14647  /* Here, we have calculated what code points should be in the character
14648  * class.
14649  *
14650  * Now we can see about various optimizations.  Fold calculation (which we
14651  * did above) needs to take place before inversion.  Otherwise /[^k]/i
14652  * would invert to include K, which under /i would match k, which it
14653  * shouldn't.  Therefore we can't invert folded locale now, as it won't be
14654  * folded until runtime */
14655
14656  /* If we didn't do folding, it's because some information isn't available
14657  * until runtime; set the run-time fold flag for these.  (We don't have to
14658  * worry about properties folding, as that is taken care of by the swash
14659  * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
14660  * locales, or the class matches at least one 0-255 range code point */
14661  if (LOC && FOLD) {
14662   if (only_utf8_locale_list) {
14663    ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14664   }
14665   else if (cp_list) { /* Look to see if there a 0-255 code point is in
14666        the list */
14667    UV start, end;
14668    invlist_iterinit(cp_list);
14669    if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14670     ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14671    }
14672    invlist_iterfinish(cp_list);
14673   }
14674  }
14675
14676  /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14677  * at compile time.  Besides not inverting folded locale now, we can't
14678  * invert if there are things such as \w, which aren't known until runtime
14679  * */
14680  if (cp_list
14681   && invert
14682   && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14683   && ! depends_list
14684   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14685  {
14686   _invlist_invert(cp_list);
14687
14688   /* Any swash can't be used as-is, because we've inverted things */
14689   if (swash) {
14690    SvREFCNT_dec_NN(swash);
14691    swash = NULL;
14692   }
14693
14694   /* Clear the invert flag since have just done it here */
14695   invert = FALSE;
14696  }
14697
14698  if (ret_invlist) {
14699   *ret_invlist = cp_list;
14700   SvREFCNT_dec(swash);
14701
14702   /* Discard the generated node */
14703   if (SIZE_ONLY) {
14704    RExC_size = orig_size;
14705   }
14706   else {
14707    RExC_emit = orig_emit;
14708   }
14709   return orig_emit;
14710  }
14711
14712  /* Some character classes are equivalent to other nodes.  Such nodes take
14713  * up less room and generally fewer operations to execute than ANYOF nodes.
14714  * Above, we checked for and optimized into some such equivalents for
14715  * certain common classes that are easy to test.  Getting to this point in
14716  * the code means that the class didn't get optimized there.  Since this
14717  * code is only executed in Pass 2, it is too late to save space--it has
14718  * been allocated in Pass 1, and currently isn't given back.  But turning
14719  * things into an EXACTish node can allow the optimizer to join it to any
14720  * adjacent such nodes.  And if the class is equivalent to things like /./,
14721  * expensive run-time swashes can be avoided.  Now that we have more
14722  * complete information, we can find things necessarily missed by the
14723  * earlier code.  I (khw) am not sure how much to look for here.  It would
14724  * be easy, but perhaps too slow, to check any candidates against all the
14725  * node types they could possibly match using _invlistEQ(). */
14726
14727  if (cp_list
14728   && ! invert
14729   && ! depends_list
14730   && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14731   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14732
14733   /* We don't optimize if we are supposed to make sure all non-Unicode
14734    * code points raise a warning, as only ANYOF nodes have this check.
14735    * */
14736   && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14737  {
14738   UV start, end;
14739   U8 op = END;  /* The optimzation node-type */
14740   const char * cur_parse= RExC_parse;
14741
14742   invlist_iterinit(cp_list);
14743   if (! invlist_iternext(cp_list, &start, &end)) {
14744
14745    /* Here, the list is empty.  This happens, for example, when a
14746    * Unicode property is the only thing in the character class, and
14747    * it doesn't match anything.  (perluniprops.pod notes such
14748    * properties) */
14749    op = OPFAIL;
14750    *flagp |= HASWIDTH|SIMPLE;
14751   }
14752   else if (start == end) {    /* The range is a single code point */
14753    if (! invlist_iternext(cp_list, &start, &end)
14754
14755      /* Don't do this optimization if it would require changing
14756      * the pattern to UTF-8 */
14757     && (start < 256 || UTF))
14758    {
14759     /* Here, the list contains a single code point.  Can optimize
14760     * into an EXACTish node */
14761
14762     value = start;
14763
14764     if (! FOLD) {
14765      op = EXACT;
14766     }
14767     else if (LOC) {
14768
14769      /* A locale node under folding with one code point can be
14770      * an EXACTFL, as its fold won't be calculated until
14771      * runtime */
14772      op = EXACTFL;
14773     }
14774     else {
14775
14776      /* Here, we are generally folding, but there is only one
14777      * code point to match.  If we have to, we use an EXACT
14778      * node, but it would be better for joining with adjacent
14779      * nodes in the optimization pass if we used the same
14780      * EXACTFish node that any such are likely to be.  We can
14781      * do this iff the code point doesn't participate in any
14782      * folds.  For example, an EXACTF of a colon is the same as
14783      * an EXACT one, since nothing folds to or from a colon. */
14784      if (value < 256) {
14785       if (IS_IN_SOME_FOLD_L1(value)) {
14786        op = EXACT;
14787       }
14788      }
14789      else {
14790       if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14791        op = EXACT;
14792       }
14793      }
14794
14795      /* If we haven't found the node type, above, it means we
14796      * can use the prevailing one */
14797      if (op == END) {
14798       op = compute_EXACTish(pRExC_state);
14799      }
14800     }
14801    }
14802   }
14803   else if (start == 0) {
14804    if (end == UV_MAX) {
14805     op = SANY;
14806     *flagp |= HASWIDTH|SIMPLE;
14807     RExC_naughty++;
14808    }
14809    else if (end == '\n' - 1
14810      && invlist_iternext(cp_list, &start, &end)
14811      && start == '\n' + 1 && end == UV_MAX)
14812    {
14813     op = REG_ANY;
14814     *flagp |= HASWIDTH|SIMPLE;
14815     RExC_naughty++;
14816    }
14817   }
14818   invlist_iterfinish(cp_list);
14819
14820   if (op != END) {
14821    RExC_parse = (char *)orig_parse;
14822    RExC_emit = (regnode *)orig_emit;
14823
14824    ret = reg_node(pRExC_state, op);
14825
14826    RExC_parse = (char *)cur_parse;
14827
14828    if (PL_regkind[op] == EXACT) {
14829     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14830           TRUE /* downgradable to EXACT */
14831           );
14832    }
14833
14834    SvREFCNT_dec_NN(cp_list);
14835    return ret;
14836   }
14837  }
14838
14839  /* Here, <cp_list> contains all the code points we can determine at
14840  * compile time that match under all conditions.  Go through it, and
14841  * for things that belong in the bitmap, put them there, and delete from
14842  * <cp_list>.  While we are at it, see if everything above 255 is in the
14843  * list, and if so, set a flag to speed up execution */
14844
14845  populate_ANYOF_from_invlist(ret, &cp_list);
14846
14847  if (invert) {
14848   ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14849  }
14850
14851  /* Here, the bitmap has been populated with all the Latin1 code points that
14852  * always match.  Can now add to the overall list those that match only
14853  * when the target string is UTF-8 (<depends_list>). */
14854  if (depends_list) {
14855   if (cp_list) {
14856    _invlist_union(cp_list, depends_list, &cp_list);
14857    SvREFCNT_dec_NN(depends_list);
14858   }
14859   else {
14860    cp_list = depends_list;
14861   }
14862   ANYOF_FLAGS(ret) |= ANYOF_UTF8;
14863  }
14864
14865  /* If there is a swash and more than one element, we can't use the swash in
14866  * the optimization below. */
14867  if (swash && element_count > 1) {
14868   SvREFCNT_dec_NN(swash);
14869   swash = NULL;
14870  }
14871
14872  set_ANYOF_arg(pRExC_state, ret, cp_list,
14873     (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14874     ? listsv : NULL,
14875     only_utf8_locale_list,
14876     swash, has_user_defined_property);
14877
14878  *flagp |= HASWIDTH|SIMPLE;
14879
14880  if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
14881   RExC_contains_locale = 1;
14882  }
14883
14884  return ret;
14885 }
14886
14887 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14888
14889 STATIC void
14890 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14891     regnode* const node,
14892     SV* const cp_list,
14893     SV* const runtime_defns,
14894     SV* const only_utf8_locale_list,
14895     SV* const swash,
14896     const bool has_user_defined_property)
14897 {
14898  /* Sets the arg field of an ANYOF-type node 'node', using information about
14899  * the node passed-in.  If there is nothing outside the node's bitmap, the
14900  * arg is set to ANYOF_NONBITMAP_EMPTY.  Otherwise, it sets the argument to
14901  * the count returned by add_data(), having allocated and stored an array,
14902  * av, that that count references, as follows:
14903  *  av[0] stores the character class description in its textual form.
14904  *        This is used later (regexec.c:Perl_regclass_swash()) to
14905  *        initialize the appropriate swash, and is also useful for dumping
14906  *        the regnode.  This is set to &PL_sv_undef if the textual
14907  *        description is not needed at run-time (as happens if the other
14908  *        elements completely define the class)
14909  *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14910  *        computed from av[0].  But if no further computation need be done,
14911  *        the swash is stored here now (and av[0] is &PL_sv_undef).
14912  *  av[2] stores the inversion list of code points that match only if the
14913  *        current locale is UTF-8
14914  *  av[3] stores the cp_list inversion list for use in addition or instead
14915  *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14916  *        (Otherwise everything needed is already in av[0] and av[1])
14917  *  av[4] is set if any component of the class is from a user-defined
14918  *        property; used only if av[3] exists */
14919
14920  UV n;
14921
14922  PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14923
14924  if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
14925   assert(! (ANYOF_FLAGS(node)
14926      & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
14927   ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14928  }
14929  else {
14930   AV * const av = newAV();
14931   SV *rv;
14932
14933   assert(ANYOF_FLAGS(node)
14934      & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
14935
14936   av_store(av, 0, (runtime_defns)
14937       ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14938   if (swash) {
14939    av_store(av, 1, swash);
14940    SvREFCNT_dec_NN(cp_list);
14941   }
14942   else {
14943    av_store(av, 1, &PL_sv_undef);
14944    if (cp_list) {
14945     av_store(av, 3, cp_list);
14946     av_store(av, 4, newSVuv(has_user_defined_property));
14947    }
14948   }
14949
14950   if (only_utf8_locale_list) {
14951    av_store(av, 2, only_utf8_locale_list);
14952   }
14953   else {
14954    av_store(av, 2, &PL_sv_undef);
14955   }
14956
14957   rv = newRV_noinc(MUTABLE_SV(av));
14958   n = add_data(pRExC_state, STR_WITH_LEN("s"));
14959   RExC_rxi->data->data[n] = (void*)rv;
14960   ARG_SET(node, n);
14961  }
14962 }
14963
14964
14965 /* reg_skipcomment()
14966
14967    Absorbs an /x style # comments from the input stream.
14968    Returns true if there is more text remaining in the stream.
14969    Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment
14970    terminates the pattern without including a newline.
14971
14972    Note its the callers responsibility to ensure that we are
14973    actually in /x mode
14974
14975 */
14976
14977 STATIC bool
14978 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14979 {
14980  bool ended = 0;
14981
14982  PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14983
14984  while (RExC_parse < RExC_end)
14985   if (*RExC_parse++ == '\n') {
14986    ended = 1;
14987    break;
14988   }
14989  if (!ended) {
14990   /* we ran off the end of the pattern without ending
14991   the comment, so we have to add an \n when wrapping */
14992   RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
14993   return 0;
14994  } else
14995   return 1;
14996 }
14997
14998 /* nextchar()
14999
15000    Advances the parse position, and optionally absorbs
15001    "whitespace" from the inputstream.
15002
15003    Without /x "whitespace" means (?#...) style comments only,
15004    with /x this means (?#...) and # comments and whitespace proper.
15005
15006    Returns the RExC_parse point from BEFORE the scan occurs.
15007
15008    This is the /x friendly way of saying RExC_parse++.
15009 */
15010
15011 STATIC char*
15012 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15013 {
15014  char* const retval = RExC_parse++;
15015
15016  PERL_ARGS_ASSERT_NEXTCHAR;
15017
15018  for (;;) {
15019   if (RExC_end - RExC_parse >= 3
15020    && *RExC_parse == '('
15021    && RExC_parse[1] == '?'
15022    && RExC_parse[2] == '#')
15023   {
15024    while (*RExC_parse != ')') {
15025     if (RExC_parse == RExC_end)
15026      FAIL("Sequence (?#... not terminated");
15027     RExC_parse++;
15028    }
15029    RExC_parse++;
15030    continue;
15031   }
15032   if (RExC_flags & RXf_PMf_EXTENDED) {
15033    if (isSPACE(*RExC_parse)) {
15034     RExC_parse++;
15035     continue;
15036    }
15037    else if (*RExC_parse == '#') {
15038     if ( reg_skipcomment( pRExC_state ) )
15039      continue;
15040    }
15041   }
15042   return retval;
15043  }
15044 }
15045
15046 /*
15047 - reg_node - emit a node
15048 */
15049 STATIC regnode *   /* Location. */
15050 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15051 {
15052  dVAR;
15053  regnode *ptr;
15054  regnode * const ret = RExC_emit;
15055  GET_RE_DEBUG_FLAGS_DECL;
15056
15057  PERL_ARGS_ASSERT_REG_NODE;
15058
15059  if (SIZE_ONLY) {
15060   SIZE_ALIGN(RExC_size);
15061   RExC_size += 1;
15062   return(ret);
15063  }
15064  if (RExC_emit >= RExC_emit_bound)
15065   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15066     op, RExC_emit, RExC_emit_bound);
15067
15068  NODE_ALIGN_FILL(ret);
15069  ptr = ret;
15070  FILL_ADVANCE_NODE(ptr, op);
15071  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
15072 #ifdef RE_TRACK_PATTERN_OFFSETS
15073  if (RExC_offsets) {         /* MJD */
15074   MJD_OFFSET_DEBUG(
15075    ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15076    "reg_node", __LINE__,
15077    PL_reg_name[op],
15078    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15079     ? "Overwriting end of array!\n" : "OK",
15080    (UV)(RExC_emit - RExC_emit_start),
15081    (UV)(RExC_parse - RExC_start),
15082    (UV)RExC_offsets[0]));
15083   Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15084  }
15085 #endif
15086  RExC_emit = ptr;
15087  return(ret);
15088 }
15089
15090 /*
15091 - reganode - emit a node with an argument
15092 */
15093 STATIC regnode *   /* Location. */
15094 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15095 {
15096  dVAR;
15097  regnode *ptr;
15098  regnode * const ret = RExC_emit;
15099  GET_RE_DEBUG_FLAGS_DECL;
15100
15101  PERL_ARGS_ASSERT_REGANODE;
15102
15103  if (SIZE_ONLY) {
15104   SIZE_ALIGN(RExC_size);
15105   RExC_size += 2;
15106   /*
15107   We can't do this:
15108
15109   assert(2==regarglen[op]+1);
15110
15111   Anything larger than this has to allocate the extra amount.
15112   If we changed this to be:
15113
15114   RExC_size += (1 + regarglen[op]);
15115
15116   then it wouldn't matter. Its not clear what side effect
15117   might come from that so its not done so far.
15118   -- dmq
15119   */
15120   return(ret);
15121  }
15122  if (RExC_emit >= RExC_emit_bound)
15123   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15124     op, RExC_emit, RExC_emit_bound);
15125
15126  NODE_ALIGN_FILL(ret);
15127  ptr = ret;
15128  FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15129  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
15130 #ifdef RE_TRACK_PATTERN_OFFSETS
15131  if (RExC_offsets) {         /* MJD */
15132   MJD_OFFSET_DEBUG(
15133    ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15134    "reganode",
15135    __LINE__,
15136    PL_reg_name[op],
15137    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15138    "Overwriting end of array!\n" : "OK",
15139    (UV)(RExC_emit - RExC_emit_start),
15140    (UV)(RExC_parse - RExC_start),
15141    (UV)RExC_offsets[0]));
15142   Set_Cur_Node_Offset;
15143  }
15144 #endif
15145  RExC_emit = ptr;
15146  return(ret);
15147 }
15148
15149 /*
15150 - reguni - emit (if appropriate) a Unicode character
15151 */
15152 PERL_STATIC_INLINE STRLEN
15153 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15154 {
15155  dVAR;
15156
15157  PERL_ARGS_ASSERT_REGUNI;
15158
15159  return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15160 }
15161
15162 /*
15163 - reginsert - insert an operator in front of already-emitted operand
15164 *
15165 * Means relocating the operand.
15166 */
15167 STATIC void
15168 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15169 {
15170  dVAR;
15171  regnode *src;
15172  regnode *dst;
15173  regnode *place;
15174  const int offset = regarglen[(U8)op];
15175  const int size = NODE_STEP_REGNODE + offset;
15176  GET_RE_DEBUG_FLAGS_DECL;
15177
15178  PERL_ARGS_ASSERT_REGINSERT;
15179  PERL_UNUSED_ARG(depth);
15180 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15181  DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15182  if (SIZE_ONLY) {
15183   RExC_size += size;
15184   return;
15185  }
15186
15187  src = RExC_emit;
15188  RExC_emit += size;
15189  dst = RExC_emit;
15190  if (RExC_open_parens) {
15191   int paren;
15192   /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15193   for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15194    if ( RExC_open_parens[paren] >= opnd ) {
15195     /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15196     RExC_open_parens[paren] += size;
15197    } else {
15198     /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15199    }
15200    if ( RExC_close_parens[paren] >= opnd ) {
15201     /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15202     RExC_close_parens[paren] += size;
15203    } else {
15204     /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15205    }
15206   }
15207  }
15208
15209  while (src > opnd) {
15210   StructCopy(--src, --dst, regnode);
15211 #ifdef RE_TRACK_PATTERN_OFFSETS
15212   if (RExC_offsets) {     /* MJD 20010112 */
15213    MJD_OFFSET_DEBUG(
15214     ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15215     "reg_insert",
15216     __LINE__,
15217     PL_reg_name[op],
15218     (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15219      ? "Overwriting end of array!\n" : "OK",
15220     (UV)(src - RExC_emit_start),
15221     (UV)(dst - RExC_emit_start),
15222     (UV)RExC_offsets[0]));
15223    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15224    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15225   }
15226 #endif
15227  }
15228
15229
15230  place = opnd;  /* Op node, where operand used to be. */
15231 #ifdef RE_TRACK_PATTERN_OFFSETS
15232  if (RExC_offsets) {         /* MJD */
15233   MJD_OFFSET_DEBUG(
15234    ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15235    "reginsert",
15236    __LINE__,
15237    PL_reg_name[op],
15238    (UV)(place - RExC_emit_start) > RExC_offsets[0]
15239    ? "Overwriting end of array!\n" : "OK",
15240    (UV)(place - RExC_emit_start),
15241    (UV)(RExC_parse - RExC_start),
15242    (UV)RExC_offsets[0]));
15243   Set_Node_Offset(place, RExC_parse);
15244   Set_Node_Length(place, 1);
15245  }
15246 #endif
15247  src = NEXTOPER(place);
15248  FILL_ADVANCE_NODE(place, op);
15249  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
15250  Zero(src, offset, regnode);
15251 }
15252
15253 /*
15254 - regtail - set the next-pointer at the end of a node chain of p to val.
15255 - SEE ALSO: regtail_study
15256 */
15257 /* TODO: All three parms should be const */
15258 STATIC void
15259 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15260     const regnode *val,U32 depth)
15261 {
15262  dVAR;
15263  regnode *scan;
15264  GET_RE_DEBUG_FLAGS_DECL;
15265
15266  PERL_ARGS_ASSERT_REGTAIL;
15267 #ifndef DEBUGGING
15268  PERL_UNUSED_ARG(depth);
15269 #endif
15270
15271  if (SIZE_ONLY)
15272   return;
15273
15274  /* Find last node. */
15275  scan = p;
15276  for (;;) {
15277   regnode * const temp = regnext(scan);
15278   DEBUG_PARSE_r({
15279    SV * const mysv=sv_newmortal();
15280    DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15281    regprop(RExC_rx, mysv, scan, NULL);
15282    PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15283     SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15284      (temp == NULL ? "->" : ""),
15285      (temp == NULL ? PL_reg_name[OP(val)] : "")
15286    );
15287   });
15288   if (temp == NULL)
15289    break;
15290   scan = temp;
15291  }
15292
15293  if (reg_off_by_arg[OP(scan)]) {
15294   ARG_SET(scan, val - scan);
15295  }
15296  else {
15297   NEXT_OFF(scan) = val - scan;
15298  }
15299 }
15300
15301 #ifdef DEBUGGING
15302 /*
15303 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15304 - Look for optimizable sequences at the same time.
15305 - currently only looks for EXACT chains.
15306
15307 This is experimental code. The idea is to use this routine to perform
15308 in place optimizations on branches and groups as they are constructed,
15309 with the long term intention of removing optimization from study_chunk so
15310 that it is purely analytical.
15311
15312 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15313 to control which is which.
15314
15315 */
15316 /* TODO: All four parms should be const */
15317
15318 STATIC U8
15319 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15320      const regnode *val,U32 depth)
15321 {
15322  dVAR;
15323  regnode *scan;
15324  U8 exact = PSEUDO;
15325 #ifdef EXPERIMENTAL_INPLACESCAN
15326  I32 min = 0;
15327 #endif
15328  GET_RE_DEBUG_FLAGS_DECL;
15329
15330  PERL_ARGS_ASSERT_REGTAIL_STUDY;
15331
15332
15333  if (SIZE_ONLY)
15334   return exact;
15335
15336  /* Find last node. */
15337
15338  scan = p;
15339  for (;;) {
15340   regnode * const temp = regnext(scan);
15341 #ifdef EXPERIMENTAL_INPLACESCAN
15342   if (PL_regkind[OP(scan)] == EXACT) {
15343    bool unfolded_multi_char; /* Unexamined in this routine */
15344    if (join_exact(pRExC_state, scan, &min,
15345       &unfolded_multi_char, 1, val, depth+1))
15346     return EXACT;
15347   }
15348 #endif
15349   if ( exact ) {
15350    switch (OP(scan)) {
15351     case EXACT:
15352     case EXACTF:
15353     case EXACTFA_NO_TRIE:
15354     case EXACTFA:
15355     case EXACTFU:
15356     case EXACTFU_SS:
15357     case EXACTFL:
15358       if( exact == PSEUDO )
15359        exact= OP(scan);
15360       else if ( exact != OP(scan) )
15361        exact= 0;
15362     case NOTHING:
15363      break;
15364     default:
15365      exact= 0;
15366    }
15367   }
15368   DEBUG_PARSE_r({
15369    SV * const mysv=sv_newmortal();
15370    DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15371    regprop(RExC_rx, mysv, scan, NULL);
15372    PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15373     SvPV_nolen_const(mysv),
15374     REG_NODE_NUM(scan),
15375     PL_reg_name[exact]);
15376   });
15377   if (temp == NULL)
15378    break;
15379   scan = temp;
15380  }
15381  DEBUG_PARSE_r({
15382   SV * const mysv_val=sv_newmortal();
15383   DEBUG_PARSE_MSG("");
15384   regprop(RExC_rx, mysv_val, val, NULL);
15385   PerlIO_printf(Perl_debug_log,
15386      "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15387      SvPV_nolen_const(mysv_val),
15388      (IV)REG_NODE_NUM(val),
15389      (IV)(val - scan)
15390   );
15391  });
15392  if (reg_off_by_arg[OP(scan)]) {
15393   ARG_SET(scan, val - scan);
15394  }
15395  else {
15396   NEXT_OFF(scan) = val - scan;
15397  }
15398
15399  return exact;
15400 }
15401 #endif
15402
15403 /*
15404  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15405  */
15406 #ifdef DEBUGGING
15407
15408 static void
15409 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15410 {
15411  int bit;
15412  int set=0;
15413
15414  ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15415
15416  for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15417   if (flags & (1<<bit)) {
15418    if (!set++ && lead)
15419     PerlIO_printf(Perl_debug_log, "%s",lead);
15420    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15421   }
15422  }
15423  if (lead)  {
15424   if (set)
15425    PerlIO_printf(Perl_debug_log, "\n");
15426   else
15427    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15428  }
15429 }
15430
15431 static void
15432 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15433 {
15434  int bit;
15435  int set=0;
15436  regex_charset cs;
15437
15438  ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15439
15440  for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15441   if (flags & (1<<bit)) {
15442    if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
15443     continue;
15444    }
15445    if (!set++ && lead)
15446     PerlIO_printf(Perl_debug_log, "%s",lead);
15447    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15448   }
15449  }
15450  if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15451    if (!set++ && lead) {
15452     PerlIO_printf(Perl_debug_log, "%s",lead);
15453    }
15454    switch (cs) {
15455     case REGEX_UNICODE_CHARSET:
15456      PerlIO_printf(Perl_debug_log, "UNICODE");
15457      break;
15458     case REGEX_LOCALE_CHARSET:
15459      PerlIO_printf(Perl_debug_log, "LOCALE");
15460      break;
15461     case REGEX_ASCII_RESTRICTED_CHARSET:
15462      PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15463      break;
15464     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15465      PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15466      break;
15467     default:
15468      PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15469      break;
15470    }
15471  }
15472  if (lead)  {
15473   if (set)
15474    PerlIO_printf(Perl_debug_log, "\n");
15475   else
15476    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15477  }
15478 }
15479 #endif
15480
15481 void
15482 Perl_regdump(pTHX_ const regexp *r)
15483 {
15484 #ifdef DEBUGGING
15485  dVAR;
15486  SV * const sv = sv_newmortal();
15487  SV *dsv= sv_newmortal();
15488  RXi_GET_DECL(r,ri);
15489  GET_RE_DEBUG_FLAGS_DECL;
15490
15491  PERL_ARGS_ASSERT_REGDUMP;
15492
15493  (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15494
15495  /* Header fields of interest. */
15496  if (r->anchored_substr) {
15497   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15498    RE_SV_DUMPLEN(r->anchored_substr), 30);
15499   PerlIO_printf(Perl_debug_log,
15500      "anchored %s%s at %"IVdf" ",
15501      s, RE_SV_TAIL(r->anchored_substr),
15502      (IV)r->anchored_offset);
15503  } else if (r->anchored_utf8) {
15504   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15505    RE_SV_DUMPLEN(r->anchored_utf8), 30);
15506   PerlIO_printf(Perl_debug_log,
15507      "anchored utf8 %s%s at %"IVdf" ",
15508      s, RE_SV_TAIL(r->anchored_utf8),
15509      (IV)r->anchored_offset);
15510  }
15511  if (r->float_substr) {
15512   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15513    RE_SV_DUMPLEN(r->float_substr), 30);
15514   PerlIO_printf(Perl_debug_log,
15515      "floating %s%s at %"IVdf"..%"UVuf" ",
15516      s, RE_SV_TAIL(r->float_substr),
15517      (IV)r->float_min_offset, (UV)r->float_max_offset);
15518  } else if (r->float_utf8) {
15519   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15520    RE_SV_DUMPLEN(r->float_utf8), 30);
15521   PerlIO_printf(Perl_debug_log,
15522      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15523      s, RE_SV_TAIL(r->float_utf8),
15524      (IV)r->float_min_offset, (UV)r->float_max_offset);
15525  }
15526  if (r->check_substr || r->check_utf8)
15527   PerlIO_printf(Perl_debug_log,
15528      (const char *)
15529      (r->check_substr == r->float_substr
15530      && r->check_utf8 == r->float_utf8
15531      ? "(checking floating" : "(checking anchored"));
15532  if (r->intflags & PREGf_NOSCAN)
15533   PerlIO_printf(Perl_debug_log, " noscan");
15534  if (r->extflags & RXf_CHECK_ALL)
15535   PerlIO_printf(Perl_debug_log, " isall");
15536  if (r->check_substr || r->check_utf8)
15537   PerlIO_printf(Perl_debug_log, ") ");
15538
15539  if (ri->regstclass) {
15540   regprop(r, sv, ri->regstclass, NULL);
15541   PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15542  }
15543  if (r->intflags & PREGf_ANCH) {
15544   PerlIO_printf(Perl_debug_log, "anchored");
15545   if (r->intflags & PREGf_ANCH_BOL)
15546    PerlIO_printf(Perl_debug_log, "(BOL)");
15547   if (r->intflags & PREGf_ANCH_MBOL)
15548    PerlIO_printf(Perl_debug_log, "(MBOL)");
15549   if (r->intflags & PREGf_ANCH_SBOL)
15550    PerlIO_printf(Perl_debug_log, "(SBOL)");
15551   if (r->intflags & PREGf_ANCH_GPOS)
15552    PerlIO_printf(Perl_debug_log, "(GPOS)");
15553   PerlIO_putc(Perl_debug_log, ' ');
15554  }
15555  if (r->intflags & PREGf_GPOS_SEEN)
15556   PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15557  if (r->intflags & PREGf_SKIP)
15558   PerlIO_printf(Perl_debug_log, "plus ");
15559  if (r->intflags & PREGf_IMPLICIT)
15560   PerlIO_printf(Perl_debug_log, "implicit ");
15561  PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15562  if (r->extflags & RXf_EVAL_SEEN)
15563   PerlIO_printf(Perl_debug_log, "with eval ");
15564  PerlIO_printf(Perl_debug_log, "\n");
15565  DEBUG_FLAGS_r({
15566   regdump_extflags("r->extflags: ",r->extflags);
15567   regdump_intflags("r->intflags: ",r->intflags);
15568  });
15569 #else
15570  PERL_ARGS_ASSERT_REGDUMP;
15571  PERL_UNUSED_CONTEXT;
15572  PERL_UNUSED_ARG(r);
15573 #endif /* DEBUGGING */
15574 }
15575
15576 /*
15577 - regprop - printable representation of opcode, with run time support
15578 */
15579
15580 void
15581 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
15582 {
15583 #ifdef DEBUGGING
15584  dVAR;
15585  int k;
15586
15587  /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15588  static const char * const anyofs[] = {
15589 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15590  || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
15591  || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
15592  || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
15593  || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
15594  || _CC_VERTSPACE != 16
15595   #error Need to adjust order of anyofs[]
15596 #endif
15597   "\\w",
15598   "\\W",
15599   "\\d",
15600   "\\D",
15601   "[:alpha:]",
15602   "[:^alpha:]",
15603   "[:lower:]",
15604   "[:^lower:]",
15605   "[:upper:]",
15606   "[:^upper:]",
15607   "[:punct:]",
15608   "[:^punct:]",
15609   "[:print:]",
15610   "[:^print:]",
15611   "[:alnum:]",
15612   "[:^alnum:]",
15613   "[:graph:]",
15614   "[:^graph:]",
15615   "[:cased:]",
15616   "[:^cased:]",
15617   "\\s",
15618   "\\S",
15619   "[:blank:]",
15620   "[:^blank:]",
15621   "[:xdigit:]",
15622   "[:^xdigit:]",
15623   "[:space:]",
15624   "[:^space:]",
15625   "[:cntrl:]",
15626   "[:^cntrl:]",
15627   "[:ascii:]",
15628   "[:^ascii:]",
15629   "\\v",
15630   "\\V"
15631  };
15632  RXi_GET_DECL(prog,progi);
15633  GET_RE_DEBUG_FLAGS_DECL;
15634
15635  PERL_ARGS_ASSERT_REGPROP;
15636
15637  sv_setpvs(sv, "");
15638
15639  if (OP(o) > REGNODE_MAX)  /* regnode.type is unsigned */
15640   /* It would be nice to FAIL() here, but this may be called from
15641   regexec.c, and it would be hard to supply pRExC_state. */
15642   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15643            (int)OP(o), (int)REGNODE_MAX);
15644  sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15645
15646  k = PL_regkind[OP(o)];
15647
15648  if (k == EXACT) {
15649   sv_catpvs(sv, " ");
15650   /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15651   * is a crude hack but it may be the best for now since
15652   * we have no flag "this EXACTish node was UTF-8"
15653   * --jhi */
15654   pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15655     PERL_PV_ESCAPE_UNI_DETECT |
15656     PERL_PV_ESCAPE_NONASCII   |
15657     PERL_PV_PRETTY_ELLIPSES   |
15658     PERL_PV_PRETTY_LTGT       |
15659     PERL_PV_PRETTY_NOCLEAR
15660     );
15661  } else if (k == TRIE) {
15662   /* print the details of the trie in dumpuntil instead, as
15663   * progi->data isn't available here */
15664   const char op = OP(o);
15665   const U32 n = ARG(o);
15666   const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15667    (reg_ac_data *)progi->data->data[n] :
15668    NULL;
15669   const reg_trie_data * const trie
15670    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15671
15672   Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15673   DEBUG_TRIE_COMPILE_r(
15674   Perl_sv_catpvf(aTHX_ sv,
15675    "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15676    (UV)trie->startstate,
15677    (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15678    (UV)trie->wordcount,
15679    (UV)trie->minlen,
15680    (UV)trie->maxlen,
15681    (UV)TRIE_CHARCOUNT(trie),
15682    (UV)trie->uniquecharcount
15683   );
15684   );
15685   if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15686    sv_catpvs(sv, "[");
15687    (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15688             ? ANYOF_BITMAP(o)
15689             : TRIE_BITMAP(trie));
15690    sv_catpvs(sv, "]");
15691   }
15692
15693  } else if (k == CURLY) {
15694   if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15695    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15696   Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15697  }
15698  else if (k == WHILEM && o->flags)   /* Ordinal/of */
15699   Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15700  else if (k == REF || k == OPEN || k == CLOSE
15701    || k == GROUPP || OP(o)==ACCEPT)
15702  {
15703   Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
15704   if ( RXp_PAREN_NAMES(prog) ) {
15705    if ( k != REF || (OP(o) < NREF)) {
15706     AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15707     SV **name= av_fetch(list, ARG(o), 0 );
15708     if (name)
15709      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15710    }
15711    else {
15712     AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15713     SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15714     I32 *nums=(I32*)SvPVX(sv_dat);
15715     SV **name= av_fetch(list, nums[0], 0 );
15716     I32 n;
15717     if (name) {
15718      for ( n=0; n<SvIVX(sv_dat); n++ ) {
15719       Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15720          (n ? "," : ""), (IV)nums[n]);
15721      }
15722      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15723     }
15724    }
15725   }
15726   if ( k == REF && reginfo) {
15727    U32 n = ARG(o);  /* which paren pair */
15728    I32 ln = prog->offs[n].start;
15729    if (prog->lastparen < n || ln == -1)
15730     Perl_sv_catpvf(aTHX_ sv, ": FAIL");
15731    else if (ln == prog->offs[n].end)
15732     Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
15733    else {
15734     const char *s = reginfo->strbeg + ln;
15735     Perl_sv_catpvf(aTHX_ sv, ": ");
15736     Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
15737      PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
15738    }
15739   }
15740  } else if (k == GOSUB)
15741   /* Paren and offset */
15742   Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
15743  else if (k == VERB) {
15744   if (!o->flags)
15745    Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15746       SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15747  } else if (k == LOGICAL)
15748   /* 2: embedded, otherwise 1 */
15749   Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
15750  else if (k == ANYOF) {
15751   const U8 flags = ANYOF_FLAGS(o);
15752   int do_sep = 0;
15753
15754
15755   if (flags & ANYOF_LOCALE_FLAGS)
15756    sv_catpvs(sv, "{loc}");
15757   if (flags & ANYOF_LOC_FOLD)
15758    sv_catpvs(sv, "{i}");
15759   Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15760   if (flags & ANYOF_INVERT)
15761    sv_catpvs(sv, "^");
15762
15763   /* output what the standard cp 0-255 bitmap matches */
15764   do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15765
15766   /* output any special charclass tests (used entirely under use
15767   * locale) * */
15768   if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15769    int i;
15770    for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15771     if (ANYOF_POSIXL_TEST(o,i)) {
15772      sv_catpv(sv, anyofs[i]);
15773      do_sep = 1;
15774     }
15775    }
15776   }
15777
15778   if ((flags & (ANYOF_ABOVE_LATIN1_ALL
15779      |ANYOF_UTF8
15780      |ANYOF_NONBITMAP_NON_UTF8
15781      |ANYOF_LOC_FOLD)))
15782   {
15783    if (do_sep) {
15784     Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15785     if (flags & ANYOF_INVERT)
15786      /*make sure the invert info is in each */
15787      sv_catpvs(sv, "^");
15788    }
15789
15790    if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
15791     sv_catpvs(sv, "{non-utf8-latin1-all}");
15792    }
15793
15794    /* output information about the unicode matching */
15795    if (flags & ANYOF_ABOVE_LATIN1_ALL)
15796     sv_catpvs(sv, "{unicode_all}");
15797    else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
15798     SV *lv; /* Set if there is something outside the bit map. */
15799     bool byte_output = FALSE;   /* If something in the bitmap has
15800            been output */
15801     SV *only_utf8_locale;
15802
15803     /* Get the stuff that wasn't in the bitmap */
15804     (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
15805              &lv, &only_utf8_locale);
15806     if (lv && lv != &PL_sv_undef) {
15807      char *s = savesvpv(lv);
15808      char * const origs = s;
15809
15810      while (*s && *s != '\n')
15811       s++;
15812
15813      if (*s == '\n') {
15814       const char * const t = ++s;
15815
15816       if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15817        sv_catpvs(sv, "{outside bitmap}");
15818       }
15819       else {
15820        sv_catpvs(sv, "{utf8}");
15821       }
15822
15823       if (byte_output) {
15824        sv_catpvs(sv, " ");
15825       }
15826
15827       while (*s) {
15828        if (*s == '\n') {
15829
15830         /* Truncate very long output */
15831         if (s - origs > 256) {
15832          Perl_sv_catpvf(aTHX_ sv,
15833             "%.*s...",
15834             (int) (s - origs - 1),
15835             t);
15836          goto out_dump;
15837         }
15838         *s = ' ';
15839        }
15840        else if (*s == '\t') {
15841         *s = '-';
15842        }
15843        s++;
15844       }
15845       if (s[-1] == ' ')
15846        s[-1] = 0;
15847
15848       sv_catpv(sv, t);
15849      }
15850
15851     out_dump:
15852
15853      Safefree(origs);
15854      SvREFCNT_dec_NN(lv);
15855     }
15856
15857     if ((flags & ANYOF_LOC_FOLD)
15858      && only_utf8_locale
15859      && only_utf8_locale != &PL_sv_undef)
15860     {
15861      UV start, end;
15862      int max_entries = 256;
15863
15864      sv_catpvs(sv, "{utf8 locale}");
15865      invlist_iterinit(only_utf8_locale);
15866      while (invlist_iternext(only_utf8_locale,
15867            &start, &end)) {
15868       put_range(sv, start, end);
15869       max_entries --;
15870       if (max_entries < 0) {
15871        sv_catpvs(sv, "...");
15872        break;
15873       }
15874      }
15875      invlist_iterfinish(only_utf8_locale);
15876     }
15877    }
15878   }
15879
15880   Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15881  }
15882  else if (k == POSIXD || k == NPOSIXD) {
15883   U8 index = FLAGS(o) * 2;
15884   if (index < C_ARRAY_LENGTH(anyofs)) {
15885    if (*anyofs[index] != '[')  {
15886     sv_catpv(sv, "[");
15887    }
15888    sv_catpv(sv, anyofs[index]);
15889    if (*anyofs[index] != '[')  {
15890     sv_catpv(sv, "]");
15891    }
15892   }
15893   else {
15894    Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15895   }
15896  }
15897  else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15898   Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15899 #else
15900  PERL_UNUSED_CONTEXT;
15901  PERL_UNUSED_ARG(sv);
15902  PERL_UNUSED_ARG(o);
15903  PERL_UNUSED_ARG(prog);
15904  PERL_UNUSED_ARG(reginfo);
15905 #endif /* DEBUGGING */
15906 }
15907
15908
15909
15910 SV *
15911 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15912 {    /* Assume that RE_INTUIT is set */
15913  dVAR;
15914  struct regexp *const prog = ReANY(r);
15915  GET_RE_DEBUG_FLAGS_DECL;
15916
15917  PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15918  PERL_UNUSED_CONTEXT;
15919
15920  DEBUG_COMPILE_r(
15921   {
15922    const char * const s = SvPV_nolen_const(prog->check_substr
15923      ? prog->check_substr : prog->check_utf8);
15924
15925    if (!PL_colorset) reginitcolors();
15926    PerlIO_printf(Perl_debug_log,
15927      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15928      PL_colors[4],
15929      prog->check_substr ? "" : "utf8 ",
15930      PL_colors[5],PL_colors[0],
15931      s,
15932      PL_colors[1],
15933      (strlen(s) > 60 ? "..." : ""));
15934   } );
15935
15936  return prog->check_substr ? prog->check_substr : prog->check_utf8;
15937 }
15938
15939 /*
15940    pregfree()
15941
15942    handles refcounting and freeing the perl core regexp structure. When
15943    it is necessary to actually free the structure the first thing it
15944    does is call the 'free' method of the regexp_engine associated to
15945    the regexp, allowing the handling of the void *pprivate; member
15946    first. (This routine is not overridable by extensions, which is why
15947    the extensions free is called first.)
15948
15949    See regdupe and regdupe_internal if you change anything here.
15950 */
15951 #ifndef PERL_IN_XSUB_RE
15952 void
15953 Perl_pregfree(pTHX_ REGEXP *r)
15954 {
15955  SvREFCNT_dec(r);
15956 }
15957
15958 void
15959 Perl_pregfree2(pTHX_ REGEXP *rx)
15960 {
15961  dVAR;
15962  struct regexp *const r = ReANY(rx);
15963  GET_RE_DEBUG_FLAGS_DECL;
15964
15965  PERL_ARGS_ASSERT_PREGFREE2;
15966
15967  if (r->mother_re) {
15968   ReREFCNT_dec(r->mother_re);
15969  } else {
15970   CALLREGFREE_PVT(rx); /* free the private data */
15971   SvREFCNT_dec(RXp_PAREN_NAMES(r));
15972   Safefree(r->xpv_len_u.xpvlenu_pv);
15973  }
15974  if (r->substrs) {
15975   SvREFCNT_dec(r->anchored_substr);
15976   SvREFCNT_dec(r->anchored_utf8);
15977   SvREFCNT_dec(r->float_substr);
15978   SvREFCNT_dec(r->float_utf8);
15979   Safefree(r->substrs);
15980  }
15981  RX_MATCH_COPY_FREE(rx);
15982 #ifdef PERL_ANY_COW
15983  SvREFCNT_dec(r->saved_copy);
15984 #endif
15985  Safefree(r->offs);
15986  SvREFCNT_dec(r->qr_anoncv);
15987  rx->sv_u.svu_rx = 0;
15988 }
15989
15990 /*  reg_temp_copy()
15991
15992  This is a hacky workaround to the structural issue of match results
15993  being stored in the regexp structure which is in turn stored in
15994  PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15995  could be PL_curpm in multiple contexts, and could require multiple
15996  result sets being associated with the pattern simultaneously, such
15997  as when doing a recursive match with (??{$qr})
15998
15999  The solution is to make a lightweight copy of the regexp structure
16000  when a qr// is returned from the code executed by (??{$qr}) this
16001  lightweight copy doesn't actually own any of its data except for
16002  the starp/end and the actual regexp structure itself.
16003
16004 */
16005
16006
16007 REGEXP *
16008 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16009 {
16010  struct regexp *ret;
16011  struct regexp *const r = ReANY(rx);
16012  const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16013
16014  PERL_ARGS_ASSERT_REG_TEMP_COPY;
16015
16016  if (!ret_x)
16017   ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16018  else {
16019   SvOK_off((SV *)ret_x);
16020   if (islv) {
16021    /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16022    to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
16023    made both spots point to the same regexp body.) */
16024    REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16025    assert(!SvPVX(ret_x));
16026    ret_x->sv_u.svu_rx = temp->sv_any;
16027    temp->sv_any = NULL;
16028    SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16029    SvREFCNT_dec_NN(temp);
16030    /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16031    ing below will not set it. */
16032    SvCUR_set(ret_x, SvCUR(rx));
16033   }
16034  }
16035  /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16036  sv_force_normal(sv) is called.  */
16037  SvFAKE_on(ret_x);
16038  ret = ReANY(ret_x);
16039
16040  SvFLAGS(ret_x) |= SvUTF8(rx);
16041  /* We share the same string buffer as the original regexp, on which we
16042  hold a reference count, incremented when mother_re is set below.
16043  The string pointer is copied here, being part of the regexp struct.
16044  */
16045  memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16046   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16047  if (r->offs) {
16048   const I32 npar = r->nparens+1;
16049   Newx(ret->offs, npar, regexp_paren_pair);
16050   Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16051  }
16052  if (r->substrs) {
16053   Newx(ret->substrs, 1, struct reg_substr_data);
16054   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16055
16056   SvREFCNT_inc_void(ret->anchored_substr);
16057   SvREFCNT_inc_void(ret->anchored_utf8);
16058   SvREFCNT_inc_void(ret->float_substr);
16059   SvREFCNT_inc_void(ret->float_utf8);
16060
16061   /* check_substr and check_utf8, if non-NULL, point to either their
16062   anchored or float namesakes, and don't hold a second reference.  */
16063  }
16064  RX_MATCH_COPIED_off(ret_x);
16065 #ifdef PERL_ANY_COW
16066  ret->saved_copy = NULL;
16067 #endif
16068  ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16069  SvREFCNT_inc_void(ret->qr_anoncv);
16070
16071  return ret_x;
16072 }
16073 #endif
16074
16075 /* regfree_internal()
16076
16077    Free the private data in a regexp. This is overloadable by
16078    extensions. Perl takes care of the regexp structure in pregfree(),
16079    this covers the *pprivate pointer which technically perl doesn't
16080    know about, however of course we have to handle the
16081    regexp_internal structure when no extension is in use.
16082
16083    Note this is called before freeing anything in the regexp
16084    structure.
16085  */
16086
16087 void
16088 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16089 {
16090  dVAR;
16091  struct regexp *const r = ReANY(rx);
16092  RXi_GET_DECL(r,ri);
16093  GET_RE_DEBUG_FLAGS_DECL;
16094
16095  PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16096
16097  DEBUG_COMPILE_r({
16098   if (!PL_colorset)
16099    reginitcolors();
16100   {
16101    SV *dsv= sv_newmortal();
16102    RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16103     dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16104    PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16105     PL_colors[4],PL_colors[5],s);
16106   }
16107  });
16108 #ifdef RE_TRACK_PATTERN_OFFSETS
16109  if (ri->u.offsets)
16110   Safefree(ri->u.offsets);             /* 20010421 MJD */
16111 #endif
16112  if (ri->code_blocks) {
16113   int n;
16114   for (n = 0; n < ri->num_code_blocks; n++)
16115    SvREFCNT_dec(ri->code_blocks[n].src_regex);
16116   Safefree(ri->code_blocks);
16117  }
16118
16119  if (ri->data) {
16120   int n = ri->data->count;
16121
16122   while (--n >= 0) {
16123   /* If you add a ->what type here, update the comment in regcomp.h */
16124    switch (ri->data->what[n]) {
16125    case 'a':
16126    case 'r':
16127    case 's':
16128    case 'S':
16129    case 'u':
16130     SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16131     break;
16132    case 'f':
16133     Safefree(ri->data->data[n]);
16134     break;
16135    case 'l':
16136    case 'L':
16137     break;
16138    case 'T':
16139     { /* Aho Corasick add-on structure for a trie node.
16140      Used in stclass optimization only */
16141      U32 refcount;
16142      reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16143      OP_REFCNT_LOCK;
16144      refcount = --aho->refcount;
16145      OP_REFCNT_UNLOCK;
16146      if ( !refcount ) {
16147       PerlMemShared_free(aho->states);
16148       PerlMemShared_free(aho->fail);
16149       /* do this last!!!! */
16150       PerlMemShared_free(ri->data->data[n]);
16151       PerlMemShared_free(ri->regstclass);
16152      }
16153     }
16154     break;
16155    case 't':
16156     {
16157      /* trie structure. */
16158      U32 refcount;
16159      reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16160      OP_REFCNT_LOCK;
16161      refcount = --trie->refcount;
16162      OP_REFCNT_UNLOCK;
16163      if ( !refcount ) {
16164       PerlMemShared_free(trie->charmap);
16165       PerlMemShared_free(trie->states);
16166       PerlMemShared_free(trie->trans);
16167       if (trie->bitmap)
16168        PerlMemShared_free(trie->bitmap);
16169       if (trie->jump)
16170        PerlMemShared_free(trie->jump);
16171       PerlMemShared_free(trie->wordinfo);
16172       /* do this last!!!! */
16173       PerlMemShared_free(ri->data->data[n]);
16174      }
16175     }
16176     break;
16177    default:
16178     Perl_croak(aTHX_ "panic: regfree data code '%c'",
16179              ri->data->what[n]);
16180    }
16181   }
16182   Safefree(ri->data->what);
16183   Safefree(ri->data);
16184  }
16185
16186  Safefree(ri);
16187 }
16188
16189 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16190 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16191 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
16192
16193 /*
16194    re_dup - duplicate a regexp.
16195
16196    This routine is expected to clone a given regexp structure. It is only
16197    compiled under USE_ITHREADS.
16198
16199    After all of the core data stored in struct regexp is duplicated
16200    the regexp_engine.dupe method is used to copy any private data
16201    stored in the *pprivate pointer. This allows extensions to handle
16202    any duplication it needs to do.
16203
16204    See pregfree() and regfree_internal() if you change anything here.
16205 */
16206 #if defined(USE_ITHREADS)
16207 #ifndef PERL_IN_XSUB_RE
16208 void
16209 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16210 {
16211  dVAR;
16212  I32 npar;
16213  const struct regexp *r = ReANY(sstr);
16214  struct regexp *ret = ReANY(dstr);
16215
16216  PERL_ARGS_ASSERT_RE_DUP_GUTS;
16217
16218  npar = r->nparens+1;
16219  Newx(ret->offs, npar, regexp_paren_pair);
16220  Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16221
16222  if (ret->substrs) {
16223   /* Do it this way to avoid reading from *r after the StructCopy().
16224   That way, if any of the sv_dup_inc()s dislodge *r from the L1
16225   cache, it doesn't matter.  */
16226   const bool anchored = r->check_substr
16227    ? r->check_substr == r->anchored_substr
16228    : r->check_utf8 == r->anchored_utf8;
16229   Newx(ret->substrs, 1, struct reg_substr_data);
16230   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16231
16232   ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16233   ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16234   ret->float_substr = sv_dup_inc(ret->float_substr, param);
16235   ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16236
16237   /* check_substr and check_utf8, if non-NULL, point to either their
16238   anchored or float namesakes, and don't hold a second reference.  */
16239
16240   if (ret->check_substr) {
16241    if (anchored) {
16242     assert(r->check_utf8 == r->anchored_utf8);
16243     ret->check_substr = ret->anchored_substr;
16244     ret->check_utf8 = ret->anchored_utf8;
16245    } else {
16246     assert(r->check_substr == r->float_substr);
16247     assert(r->check_utf8 == r->float_utf8);
16248     ret->check_substr = ret->float_substr;
16249     ret->check_utf8 = ret->float_utf8;
16250    }
16251   } else if (ret->check_utf8) {
16252    if (anchored) {
16253     ret->check_utf8 = ret->anchored_utf8;
16254    } else {
16255     ret->check_utf8 = ret->float_utf8;
16256    }
16257   }
16258  }
16259
16260  RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16261  ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16262
16263  if (ret->pprivate)
16264   RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16265
16266  if (RX_MATCH_COPIED(dstr))
16267   ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
16268  else
16269   ret->subbeg = NULL;
16270 #ifdef PERL_ANY_COW
16271  ret->saved_copy = NULL;
16272 #endif
16273
16274  /* Whether mother_re be set or no, we need to copy the string.  We
16275  cannot refrain from copying it when the storage points directly to
16276  our mother regexp, because that's
16277    1: a buffer in a different thread
16278    2: something we no longer hold a reference on
16279    so we need to copy it locally.  */
16280  RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16281  ret->mother_re   = NULL;
16282 }
16283 #endif /* PERL_IN_XSUB_RE */
16284
16285 /*
16286    regdupe_internal()
16287
16288    This is the internal complement to regdupe() which is used to copy
16289    the structure pointed to by the *pprivate pointer in the regexp.
16290    This is the core version of the extension overridable cloning hook.
16291    The regexp structure being duplicated will be copied by perl prior
16292    to this and will be provided as the regexp *r argument, however
16293    with the /old/ structures pprivate pointer value. Thus this routine
16294    may override any copying normally done by perl.
16295
16296    It returns a pointer to the new regexp_internal structure.
16297 */
16298
16299 void *
16300 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16301 {
16302  dVAR;
16303  struct regexp *const r = ReANY(rx);
16304  regexp_internal *reti;
16305  int len;
16306  RXi_GET_DECL(r,ri);
16307
16308  PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16309
16310  len = ProgLen(ri);
16311
16312  Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16313   char, regexp_internal);
16314  Copy(ri->program, reti->program, len+1, regnode);
16315
16316  reti->num_code_blocks = ri->num_code_blocks;
16317  if (ri->code_blocks) {
16318   int n;
16319   Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16320     struct reg_code_block);
16321   Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16322     struct reg_code_block);
16323   for (n = 0; n < ri->num_code_blocks; n++)
16324    reti->code_blocks[n].src_regex = (REGEXP*)
16325      sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16326  }
16327  else
16328   reti->code_blocks = NULL;
16329
16330  reti->regstclass = NULL;
16331
16332  if (ri->data) {
16333   struct reg_data *d;
16334   const int count = ri->data->count;
16335   int i;
16336
16337   Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16338     char, struct reg_data);
16339   Newx(d->what, count, U8);
16340
16341   d->count = count;
16342   for (i = 0; i < count; i++) {
16343    d->what[i] = ri->data->what[i];
16344    switch (d->what[i]) {
16345     /* see also regcomp.h and regfree_internal() */
16346    case 'a': /* actually an AV, but the dup function is identical.  */
16347    case 'r':
16348    case 's':
16349    case 'S':
16350    case 'u': /* actually an HV, but the dup function is identical.  */
16351     d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16352     break;
16353    case 'f':
16354     /* This is cheating. */
16355     Newx(d->data[i], 1, regnode_ssc);
16356     StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16357     reti->regstclass = (regnode*)d->data[i];
16358     break;
16359    case 'T':
16360     /* Trie stclasses are readonly and can thus be shared
16361     * without duplication. We free the stclass in pregfree
16362     * when the corresponding reg_ac_data struct is freed.
16363     */
16364     reti->regstclass= ri->regstclass;
16365     /* Fall through */
16366    case 't':
16367     OP_REFCNT_LOCK;
16368     ((reg_trie_data*)ri->data->data[i])->refcount++;
16369     OP_REFCNT_UNLOCK;
16370     /* Fall through */
16371    case 'l':
16372    case 'L':
16373     d->data[i] = ri->data->data[i];
16374     break;
16375    default:
16376     Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16377               ri->data->what[i]);
16378    }
16379   }
16380
16381   reti->data = d;
16382  }
16383  else
16384   reti->data = NULL;
16385
16386  reti->name_list_idx = ri->name_list_idx;
16387
16388 #ifdef RE_TRACK_PATTERN_OFFSETS
16389  if (ri->u.offsets) {
16390   Newx(reti->u.offsets, 2*len+1, U32);
16391   Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16392  }
16393 #else
16394  SetProgLen(reti,len);
16395 #endif
16396
16397  return (void*)reti;
16398 }
16399
16400 #endif    /* USE_ITHREADS */
16401
16402 #ifndef PERL_IN_XSUB_RE
16403
16404 /*
16405  - regnext - dig the "next" pointer out of a node
16406  */
16407 regnode *
16408 Perl_regnext(pTHX_ regnode *p)
16409 {
16410  dVAR;
16411  I32 offset;
16412
16413  if (!p)
16414   return(NULL);
16415
16416  if (OP(p) > REGNODE_MAX) {  /* regnode.type is unsigned */
16417   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16418             (int)OP(p), (int)REGNODE_MAX);
16419  }
16420
16421  offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16422  if (offset == 0)
16423   return(NULL);
16424
16425  return(p+offset);
16426 }
16427 #endif
16428
16429 STATIC void
16430 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16431 {
16432  va_list args;
16433  STRLEN l1 = strlen(pat1);
16434  STRLEN l2 = strlen(pat2);
16435  char buf[512];
16436  SV *msv;
16437  const char *message;
16438
16439  PERL_ARGS_ASSERT_RE_CROAK2;
16440
16441  if (l1 > 510)
16442   l1 = 510;
16443  if (l1 + l2 > 510)
16444   l2 = 510 - l1;
16445  Copy(pat1, buf, l1 , char);
16446  Copy(pat2, buf + l1, l2 , char);
16447  buf[l1 + l2] = '\n';
16448  buf[l1 + l2 + 1] = '\0';
16449  va_start(args, pat2);
16450  msv = vmess(buf, &args);
16451  va_end(args);
16452  message = SvPV_const(msv,l1);
16453  if (l1 > 512)
16454   l1 = 512;
16455  Copy(message, buf, l1 , char);
16456  /* l1-1 to avoid \n */
16457  Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16458 }
16459
16460 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
16461
16462 #ifndef PERL_IN_XSUB_RE
16463 void
16464 Perl_save_re_context(pTHX)
16465 {
16466  dVAR;
16467
16468  /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16469  if (PL_curpm) {
16470   const REGEXP * const rx = PM_GETRE(PL_curpm);
16471   if (rx) {
16472    U32 i;
16473    for (i = 1; i <= RX_NPARENS(rx); i++) {
16474     char digits[TYPE_CHARS(long)];
16475     const STRLEN len = my_snprintf(digits, sizeof(digits),
16476            "%lu", (long)i);
16477     GV *const *const gvp
16478      = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16479
16480     if (gvp) {
16481      GV * const gv = *gvp;
16482      if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16483       save_scalar(gv);
16484     }
16485    }
16486   }
16487  }
16488 }
16489 #endif
16490
16491 #ifdef DEBUGGING
16492
16493 STATIC void
16494 S_put_byte(pTHX_ SV *sv, int c)
16495 {
16496  PERL_ARGS_ASSERT_PUT_BYTE;
16497
16498  if (!isPRINT(c)) {
16499   switch (c) {
16500    case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16501    case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16502    case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16503    case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16504    case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16505
16506    default:
16507     Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16508     break;
16509   }
16510  }
16511  else {
16512   const char string = c;
16513   if (c == '-' || c == ']' || c == '\\' || c == '^')
16514    sv_catpvs(sv, "\\");
16515   sv_catpvn(sv, &string, 1);
16516  }
16517 }
16518
16519 STATIC void
16520 S_put_range(pTHX_ SV *sv, UV start, UV end)
16521 {
16522
16523  /* Appends to 'sv' a displayable version of the range of code points from
16524  * 'start' to 'end' */
16525
16526  assert(start <= end);
16527
16528  PERL_ARGS_ASSERT_PUT_RANGE;
16529
16530  if (end - start < 3) {  /* Individual chars in short ranges */
16531   for (; start <= end; start++)
16532    put_byte(sv, start);
16533  }
16534  else if (   end > 255
16535    || ! isALPHANUMERIC(start)
16536    || ! isALPHANUMERIC(end)
16537    || isDIGIT(start) != isDIGIT(end)
16538    || isUPPER(start) != isUPPER(end)
16539    || isLOWER(start) != isLOWER(end)
16540
16541     /* This final test should get optimized out except on EBCDIC
16542     * platforms, where it causes ranges that cross discontinuities
16543     * like i/j to be shown as hex instead of the misleading,
16544     * e.g. H-K (since that range includes more than H, I, J, K).
16545     * */
16546    || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start))
16547  {
16548   Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
16549      start,
16550      (end < 256) ? end : 255);
16551  }
16552  else { /* Here, the ends of the range are both digits, or both uppercase,
16553    or both lowercase; and there's no discontinuity in the range
16554    (which could happen on EBCDIC platforms) */
16555   put_byte(sv, start);
16556   sv_catpvs(sv, "-");
16557   put_byte(sv, end);
16558  }
16559 }
16560
16561 STATIC bool
16562 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16563 {
16564  /* Appends to 'sv' a displayable version of the innards of the bracketed
16565  * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
16566  * output anything */
16567
16568  int i;
16569  bool has_output_anything = FALSE;
16570
16571  PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16572
16573  for (i = 0; i < 256; i++) {
16574   if (BITMAP_TEST((U8 *) bitmap,i)) {
16575
16576    /* The character at index i should be output.  Find the next
16577    * character that should NOT be output */
16578    int j;
16579    for (j = i + 1; j < 256; j++) {
16580     if (! BITMAP_TEST((U8 *) bitmap, j)) {
16581      break;
16582     }
16583    }
16584
16585    /* Everything between them is a single range that should be output
16586    * */
16587    put_range(sv, i, j - 1);
16588    has_output_anything = TRUE;
16589    i = j;
16590   }
16591  }
16592
16593  return has_output_anything;
16594 }
16595
16596 #define CLEAR_OPTSTART \
16597  if (optstart) STMT_START {                                               \
16598   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
16599        " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16600   optstart=NULL;                                                       \
16601  } STMT_END
16602
16603 #define DUMPUNTIL(b,e)                                                       \
16604      CLEAR_OPTSTART;                                          \
16605      node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16606
16607 STATIC const regnode *
16608 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16609    const regnode *last, const regnode *plast,
16610    SV* sv, I32 indent, U32 depth)
16611 {
16612  dVAR;
16613  U8 op = PSEUDO; /* Arbitrary non-END op. */
16614  const regnode *next;
16615  const regnode *optstart= NULL;
16616
16617  RXi_GET_DECL(r,ri);
16618  GET_RE_DEBUG_FLAGS_DECL;
16619
16620  PERL_ARGS_ASSERT_DUMPUNTIL;
16621
16622 #ifdef DEBUG_DUMPUNTIL
16623  PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16624   last ? last-start : 0,plast ? plast-start : 0);
16625 #endif
16626
16627  if (plast && plast < last)
16628   last= plast;
16629
16630  while (PL_regkind[op] != END && (!last || node < last)) {
16631   /* While that wasn't END last time... */
16632   NODE_ALIGN(node);
16633   op = OP(node);
16634   if (op == CLOSE || op == WHILEM)
16635    indent--;
16636   next = regnext((regnode *)node);
16637
16638   /* Where, what. */
16639   if (OP(node) == OPTIMIZED) {
16640    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16641     optstart = node;
16642    else
16643     goto after_print;
16644   } else
16645    CLEAR_OPTSTART;
16646
16647   regprop(r, sv, node, NULL);
16648   PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16649      (int)(2*indent + 1), "", SvPVX_const(sv));
16650
16651   if (OP(node) != OPTIMIZED) {
16652    if (next == NULL)  /* Next ptr. */
16653     PerlIO_printf(Perl_debug_log, " (0)");
16654    else if (PL_regkind[(U8)op] == BRANCH
16655      && PL_regkind[OP(next)] != BRANCH )
16656     PerlIO_printf(Perl_debug_log, " (FAIL)");
16657    else
16658     PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16659    (void)PerlIO_putc(Perl_debug_log, '\n');
16660   }
16661
16662  after_print:
16663   if (PL_regkind[(U8)op] == BRANCHJ) {
16664    assert(next);
16665    {
16666     const regnode *nnode = (OP(next) == LONGJMP
16667          ? regnext((regnode *)next)
16668          : next);
16669     if (last && nnode > last)
16670      nnode = last;
16671     DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16672    }
16673   }
16674   else if (PL_regkind[(U8)op] == BRANCH) {
16675    assert(next);
16676    DUMPUNTIL(NEXTOPER(node), next);
16677   }
16678   else if ( PL_regkind[(U8)op]  == TRIE ) {
16679    const regnode *this_trie = node;
16680    const char op = OP(node);
16681    const U32 n = ARG(node);
16682    const reg_ac_data * const ac = op>=AHOCORASICK ?
16683    (reg_ac_data *)ri->data->data[n] :
16684    NULL;
16685    const reg_trie_data * const trie =
16686     (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16687 #ifdef DEBUGGING
16688    AV *const trie_words
16689       = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16690 #endif
16691    const regnode *nextbranch= NULL;
16692    I32 word_idx;
16693    sv_setpvs(sv, "");
16694    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16695     SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16696
16697     PerlIO_printf(Perl_debug_log, "%*s%s ",
16698     (int)(2*(indent+3)), "",
16699      elem_ptr
16700      ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
16701         SvCUR(*elem_ptr), 60,
16702         PL_colors[0], PL_colors[1],
16703         (SvUTF8(*elem_ptr)
16704         ? PERL_PV_ESCAPE_UNI
16705         : 0)
16706         | PERL_PV_PRETTY_ELLIPSES
16707         | PERL_PV_PRETTY_LTGT
16708        )
16709      : "???"
16710     );
16711     if (trie->jump) {
16712      U16 dist= trie->jump[word_idx+1];
16713      PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16714        (UV)((dist ? this_trie + dist : next) - start));
16715      if (dist) {
16716       if (!nextbranch)
16717        nextbranch= this_trie + trie->jump[0];
16718       DUMPUNTIL(this_trie + dist, nextbranch);
16719      }
16720      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16721       nextbranch= regnext((regnode *)nextbranch);
16722     } else {
16723      PerlIO_printf(Perl_debug_log, "\n");
16724     }
16725    }
16726    if (last && next > last)
16727     node= last;
16728    else
16729     node= next;
16730   }
16731   else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
16732    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16733      NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16734   }
16735   else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16736    assert(next);
16737    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16738   }
16739   else if ( op == PLUS || op == STAR) {
16740    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16741   }
16742   else if (PL_regkind[(U8)op] == ANYOF) {
16743    /* arglen 1 + class block */
16744    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16745       ? ANYOF_POSIXL_SKIP
16746       : ANYOF_SKIP);
16747    node = NEXTOPER(node);
16748   }
16749   else if (PL_regkind[(U8)op] == EXACT) {
16750    /* Literal string, where present. */
16751    node += NODE_SZ_STR(node) - 1;
16752    node = NEXTOPER(node);
16753   }
16754   else {
16755    node = NEXTOPER(node);
16756    node += regarglen[(U8)op];
16757   }
16758   if (op == CURLYX || op == OPEN)
16759    indent++;
16760  }
16761  CLEAR_OPTSTART;
16762 #ifdef DEBUG_DUMPUNTIL
16763  PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16764 #endif
16765  return node;
16766 }
16767
16768 #endif /* DEBUGGING */
16769
16770 /*
16771  * Local variables:
16772  * c-indentation-style: bsd
16773  * c-basic-offset: 4
16774  * indent-tabs-mode: nil
16775  * End:
16776  *
16777  * ex: set ts=8 sts=4 sw=4 et:
16778  */