src/5020000/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  * Copyright (c) 1986 by University of Toronto.
  42  * Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  * Permission is granted to anyone to use this software for any
  45  * purpose on any computer system, and to redistribute it freely,
  46  * subject to the following restrictions:
  47  *
  48  * 1. The author is not responsible for the consequences of use of
  49  *  this software, no matter how awful, even if they arise
  50  *  from defects in it.
  51  *
  52  * 2. The origin of this software must not be misrepresented, either
  53  *  by explicit claim or by omission.
  54  *
  55  * 3. Altered versions must be plainly marked as such, and must not
  56  *  be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #undef PERL_IN_XSUB_RE
  76 #define PERL_IN_XSUB_RE 1
  77 #include "perl.h"
  78 #undef PERL_IN_XSUB_RE
  79
  80 #ifndef PERL_IN_XSUB_RE
  81 #include "re_defs.h"
  82 #endif
  83
  84 #define REG_COMP_C
  85 #ifdef PERL_IN_XSUB_RE
  86 #  include "re_comp.h"
  87 EXTERN_C const struct regexp_engine my_reg_engine;
  88 #else
  89 #  include "regcomp.h"
  90 #endif
  91
  92 #include "dquote_static.c"
  93 #include "charclass_invlists.h"
  94 #include "inline_invlist.c"
  95 #include "unicode_constants.h"
  96
  97 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  98  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  99 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 100 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 101
 102 #ifndef STATIC
 103 #define STATIC static
 104 #endif
 105
 106
 107 struct RExC_state_t {
 108  U32  flags;   /* RXf_* are we folding, multilining? */
 109  U32  pm_flags;  /* PMf_* stuff from the calling PMOP */
 110  char *precomp;  /* uncompiled string. */
 111  REGEXP *rx_sv;   /* The SV that is the regexp. */
 112  regexp *rx;                    /* perl core regexp structure */
 113  regexp_internal *rxi;           /* internal data for regexp object
 114           pprivate field */
 115  char *start;   /* Start of input for compile */
 116  char *end;   /* End of input for compile */
 117  char *parse;   /* Input-scan pointer. */
 118  SSize_t whilem_seen;  /* number of WHILEM in this expr */
 119  regnode *emit_start;  /* Start of emitted-code area */
 120  regnode *emit_bound;  /* First regnode outside of the
 121           allocated space */
 122  regnode *emit;   /* Code-emit pointer; if = &emit_dummy,
 123           implies compiling, so don't emit */
 124  regnode_ssc emit_dummy;  /* placeholder for emit to point to;
 125           large enough for the largest
 126           non-EXACTish node, so can use it as
 127           scratch in pass1 */
 128  I32  naughty;  /* How bad is this pattern? */
 129  I32  sawback;  /* Did we see \1, ...? */
 130  U32  seen;
 131  SSize_t size;   /* Code size. */
 132  I32                npar;            /* Capture buffer count, (OPEN) plus
 133           one. ("par" 0 is the whole
 134           pattern)*/
 135  I32  nestroot;  /* root parens we are in - used by
 136           accept */
 137  I32  extralen;
 138  I32  seen_zerolen;
 139  regnode **open_parens;  /* pointers to open parens */
 140  regnode **close_parens;  /* pointers to close parens */
 141  regnode *opend;   /* END node in program */
 142  I32  utf8;  /* whether the pattern is utf8 or not */
 143  I32  orig_utf8; /* whether the pattern was originally in utf8 */
 144         /* XXX use this for future optimisation of case
 145         * where pattern must be upgraded to utf8. */
 146  I32  uni_semantics; /* If a d charset modifier should use unicode
 147         rules, even if the pattern is not in
 148         utf8 */
 149  HV  *paren_names;  /* Paren names */
 150
 151  regnode **recurse;  /* Recurse regops */
 152  I32  recurse_count;  /* Number of recurse regops */
 153  U8          *study_chunk_recursed;  /* bitmap of which parens we have moved
 154           through */
 155  U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 156  I32  in_lookbehind;
 157  I32  contains_locale;
 158  I32  contains_i;
 159  I32  override_recoding;
 160  I32  in_multi_char_class;
 161  struct reg_code_block *code_blocks; /* positions of literal (?{})
 162            within pattern */
 163  int  num_code_blocks; /* size of code_blocks[] */
 164  int  code_index;  /* next code_blocks[] slot */
 165  SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 166 #ifdef ADD_TO_REGEXEC
 167  char  *starttry;  /* -Dr: where regtry was called. */
 168 #define RExC_starttry (pRExC_state->starttry)
 169 #endif
 170  SV  *runtime_code_qr; /* qr with the runtime code blocks */
 171 #ifdef DEBUGGING
 172  const char  *lastparse;
 173  I32         lastnum;
 174  AV          *paren_name_list;       /* idx -> name */
 175 #define RExC_lastparse (pRExC_state->lastparse)
 176 #define RExC_lastnum (pRExC_state->lastnum)
 177 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 178 #endif
 179 };
 180
 181 #define RExC_flags (pRExC_state->flags)
 182 #define RExC_pm_flags (pRExC_state->pm_flags)
 183 #define RExC_precomp (pRExC_state->precomp)
 184 #define RExC_rx_sv (pRExC_state->rx_sv)
 185 #define RExC_rx  (pRExC_state->rx)
 186 #define RExC_rxi (pRExC_state->rxi)
 187 #define RExC_start (pRExC_state->start)
 188 #define RExC_end (pRExC_state->end)
 189 #define RExC_parse (pRExC_state->parse)
 190 #define RExC_whilem_seen (pRExC_state->whilem_seen)
 191 #ifdef RE_TRACK_PATTERN_OFFSETS
 192 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
 193               others */
 194 #endif
 195 #define RExC_emit (pRExC_state->emit)
 196 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 197 #define RExC_emit_start (pRExC_state->emit_start)
 198 #define RExC_emit_bound (pRExC_state->emit_bound)
 199 #define RExC_naughty (pRExC_state->naughty)
 200 #define RExC_sawback (pRExC_state->sawback)
 201 #define RExC_seen (pRExC_state->seen)
 202 #define RExC_size (pRExC_state->size)
 203 #define RExC_maxlen        (pRExC_state->maxlen)
 204 #define RExC_npar (pRExC_state->npar)
 205 #define RExC_nestroot   (pRExC_state->nestroot)
 206 #define RExC_extralen (pRExC_state->extralen)
 207 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
 208 #define RExC_utf8 (pRExC_state->utf8)
 209 #define RExC_uni_semantics (pRExC_state->uni_semantics)
 210 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
 211 #define RExC_open_parens (pRExC_state->open_parens)
 212 #define RExC_close_parens (pRExC_state->close_parens)
 213 #define RExC_opend (pRExC_state->opend)
 214 #define RExC_paren_names (pRExC_state->paren_names)
 215 #define RExC_recurse (pRExC_state->recurse)
 216 #define RExC_recurse_count (pRExC_state->recurse_count)
 217 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 218 #define RExC_study_chunk_recursed_bytes  \
 219         (pRExC_state->study_chunk_recursed_bytes)
 220 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
 221 #define RExC_contains_locale (pRExC_state->contains_locale)
 222 #define RExC_contains_i (pRExC_state->contains_i)
 223 #define RExC_override_recoding (pRExC_state->override_recoding)
 224 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 225
 226
 227 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
 228 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 229   ((*s) == '{' && regcurly(s, FALSE)))
 230
 231 /*
 232  * Flags to be passed up and down.
 233  */
 234 #define WORST  0 /* Worst case. */
 235 #define HASWIDTH 0x01 /* Known to match non-null strings. */
 236
 237 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 238  * character.  (There needs to be a case: in the switch statement in regexec.c
 239  * for any node marked SIMPLE.)  Note that this is not the same thing as
 240  * REGNODE_SIMPLE */
 241 #define SIMPLE  0x02
 242 #define SPSTART  0x04 /* Starts with * or + */
 243 #define POSTPONED 0x08    /* (?1),(?&name), (??{...}) or similar */
 244 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
 245 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 246
 247 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 248
 249 /* whether trie related optimizations are enabled */
 250 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 251 #define TRIE_STUDY_OPT
 252 #define FULL_TRIE_STUDY
 253 #define TRIE_STCLASS
 254 #endif
 255
 256
 257
 258 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 259 #define PBITVAL(paren) (1 << ((paren) & 7))
 260 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 261 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 262 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 263
 264 #define REQUIRE_UTF8 STMT_START {                                       \
 265          if (!UTF) {                           \
 266           *flagp = RESTART_UTF8;            \
 267           return NULL;                      \
 268          }                                     \
 269       } STMT_END
 270
 271 /* This converts the named class defined in regcomp.h to its equivalent class
 272  * number defined in handy.h. */
 273 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 274 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 275
 276 #define _invlist_union_complement_2nd(a, b, output) \
 277       _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 278 #define _invlist_intersection_complement_2nd(a, b, output) \
 279     _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 280
 281 /* About scan_data_t.
 282
 283   During optimisation we recurse through the regexp program performing
 284   various inplace (keyhole style) optimisations. In addition study_chunk
 285   and scan_commit populate this data structure with information about
 286   what strings MUST appear in the pattern. We look for the longest
 287   string that must appear at a fixed location, and we look for the
 288   longest string that may appear at a floating location. So for instance
 289   in the pattern:
 290
 291  /FOO[xX]A.*B[xX]BAR/
 292
 293   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 294   strings (because they follow a .* construct). study_chunk will identify
 295   both FOO and BAR as being the longest fixed and floating strings respectively.
 296
 297   The strings can be composites, for instance
 298
 299  /(f)(o)(o)/
 300
 301   will result in a composite fixed substring 'foo'.
 302
 303   For each string some basic information is maintained:
 304
 305   - offset or min_offset
 306  This is the position the string must appear at, or not before.
 307  It also implicitly (when combined with minlenp) tells us how many
 308  characters must match before the string we are searching for.
 309  Likewise when combined with minlenp and the length of the string it
 310  tells us how many characters must appear after the string we have
 311  found.
 312
 313   - max_offset
 314  Only used for floating strings. This is the rightmost point that
 315  the string can appear at. If set to SSize_t_MAX it indicates that the
 316  string can occur infinitely far to the right.
 317
 318   - minlenp
 319  A pointer to the minimum number of characters of the pattern that the
 320  string was found inside. This is important as in the case of positive
 321  lookahead or positive lookbehind we can have multiple patterns
 322  involved. Consider
 323
 324  /(?=FOO).*F/
 325
 326  The minimum length of the pattern overall is 3, the minimum length
 327  of the lookahead part is 3, but the minimum length of the part that
 328  will actually match is 1. So 'FOO's minimum length is 3, but the
 329  minimum length for the F is 1. This is important as the minimum length
 330  is used to determine offsets in front of and behind the string being
 331  looked for.  Since strings can be composites this is the length of the
 332  pattern at the time it was committed with a scan_commit. Note that
 333  the length is calculated by study_chunk, so that the minimum lengths
 334  are not known until the full pattern has been compiled, thus the
 335  pointer to the value.
 336
 337   - lookbehind
 338
 339  In the case of lookbehind the string being searched for can be
 340  offset past the start point of the final matching string.
 341  If this value was just blithely removed from the min_offset it would
 342  invalidate some of the calculations for how many chars must match
 343  before or after (as they are derived from min_offset and minlen and
 344  the length of the string being searched for).
 345  When the final pattern is compiled and the data is moved from the
 346  scan_data_t structure into the regexp structure the information
 347  about lookbehind is factored in, with the information that would
 348  have been lost precalculated in the end_shift field for the
 349  associated string.
 350
 351   The fields pos_min and pos_delta are used to store the minimum offset
 352   and the delta to the maximum offset at the current point in the pattern.
 353
 354 */
 355
 356 typedef struct scan_data_t {
 357  /*I32 len_min;      unused */
 358  /*I32 len_delta;    unused */
 359  SSize_t pos_min;
 360  SSize_t pos_delta;
 361  SV *last_found;
 362  SSize_t last_end;     /* min value, <0 unless valid. */
 363  SSize_t last_start_min;
 364  SSize_t last_start_max;
 365  SV **longest;     /* Either &l_fixed, or &l_float. */
 366  SV *longest_fixed;      /* longest fixed string found in pattern */
 367  SSize_t offset_fixed;   /* offset where it starts */
 368  SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 369  I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 370  SV *longest_float;      /* longest floating string found in pattern */
 371  SSize_t offset_float_min; /* earliest point in string it can appear */
 372  SSize_t offset_float_max; /* latest point in string it can appear */
 373  SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 374  SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 375  I32 flags;
 376  I32 whilem_c;
 377  SSize_t *last_closep;
 378  regnode_ssc *start_class;
 379 } scan_data_t;
 380
 381 /* The below is perhaps overboard, but this allows us to save a test at the
 382  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 383  * and 'a' differ by a single bit; the same with the upper and lower case of
 384  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 385  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 386  * then inverts it to form a mask, with just a single 0, in the bit position
 387  * where the upper- and lowercase differ.  XXX There are about 40 other
 388  * instances in the Perl core where this micro-optimization could be used.
 389  * Should decide if maintenance cost is worse, before changing those
 390  *
 391  * Returns a boolean as to whether or not 'v' is either a lowercase or
 392  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 393  * compile-time constant, the generated code is better than some optimizing
 394  * compilers figure out, amounting to a mask and test.  The results are
 395  * meaningless if 'c' is not one of [A-Za-z] */
 396 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 397        (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 398
 399 /*
 400  * Forward declarations for pregcomp()'s friends.
 401  */
 402
 403 static const scan_data_t zero_scan_data =
 404   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 405
 406 #define SF_BEFORE_EOL  (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 407 #define SF_BEFORE_SEOL  0x0001
 408 #define SF_BEFORE_MEOL  0x0002
 409 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 410 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 411
 412 #define SF_FIX_SHIFT_EOL (+2)
 413 #define SF_FL_SHIFT_EOL  (+4)
 414
 415 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 416 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 417
 418 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 419 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 420 #define SF_IS_INF  0x0040
 421 #define SF_HAS_PAR  0x0080
 422 #define SF_IN_PAR  0x0100
 423 #define SF_HAS_EVAL  0x0200
 424 #define SCF_DO_SUBSTR  0x0400
 425 #define SCF_DO_STCLASS_AND 0x0800
 426 #define SCF_DO_STCLASS_OR 0x1000
 427 #define SCF_DO_STCLASS  (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 428 #define SCF_WHILEM_VISITED_POS 0x2000
 429
 430 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 431 #define SCF_SEEN_ACCEPT         0x8000
 432 #define SCF_TRIE_DOING_RESTUDY 0x10000
 433
 434 #define UTF cBOOL(RExC_utf8)
 435
 436 /* The enums for all these are ordered so things work out correctly */
 437 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 438 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 439              == REGEX_DEPENDS_CHARSET)
 440 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 441 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 442              >= REGEX_UNICODE_CHARSET)
 443 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 444            == REGEX_ASCII_RESTRICTED_CHARSET)
 445 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 446            >= REGEX_ASCII_RESTRICTED_CHARSET)
 447 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 448           == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 449
 450 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 451
 452 /* For programs that want to be strictly Unicode compatible by dying if any
 453  * attempt is made to match a non-Unicode code point against a Unicode
 454  * property.  */
 455 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 456
 457 #define OOB_NAMEDCLASS  -1
 458
 459 /* There is no code point that is out-of-bounds, so this is problematic.  But
 460  * its only current use is to initialize a variable that is always set before
 461  * looked at. */
 462 #define OOB_UNICODE  0xDEADBEEF
 463
 464 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 465 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 466
 467
 468 /* length of regex to show in messages that don't mark a position within */
 469 #define RegexLengthToShowInErrorMessages 127
 470
 471 /*
 472  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 473  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 474  * op/pragma/warn/regcomp.
 475  */
 476 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 477 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 478
 479 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 480       " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 481
 482 #define REPORT_LOCATION_ARGS(offset)            \
 483     UTF8fARG(UTF, offset, RExC_precomp), \
 484     UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 485
 486 /*
 487  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 488  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 489  * "...".
 490  */
 491 #define _FAIL(code) STMT_START {     \
 492  const char *ellipses = "";      \
 493  IV len = RExC_end - RExC_precomp;     \
 494                   \
 495  if (!SIZE_ONLY)       \
 496   SAVEFREESV(RExC_rx_sv);      \
 497  if (len > RegexLengthToShowInErrorMessages) {   \
 498   /* chop 10 shorter than the max, to ensure meaning of "..." */ \
 499   len = RegexLengthToShowInErrorMessages - 10;   \
 500   ellipses = "...";      \
 501  }         \
 502  code;                                                               \
 503 } STMT_END
 504
 505 #define FAIL(msg) _FAIL(       \
 506  Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",     \
 507    msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 508
 509 #define FAIL2(msg,arg) _FAIL(       \
 510  Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",     \
 511    arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 512
 513 /*
 514  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 515  */
 516 #define Simple_vFAIL(m) STMT_START {     \
 517  const IV offset = RExC_parse - RExC_precomp;   \
 518  Perl_croak(aTHX_ "%s" REPORT_LOCATION,    \
 519    m, REPORT_LOCATION_ARGS(offset)); \
 520 } STMT_END
 521
 522 /*
 523  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 524  */
 525 #define vFAIL(m) STMT_START {    \
 526  if (!SIZE_ONLY)     \
 527   SAVEFREESV(RExC_rx_sv);    \
 528  Simple_vFAIL(m);     \
 529 } STMT_END
 530
 531 /*
 532  * Like Simple_vFAIL(), but accepts two arguments.
 533  */
 534 #define Simple_vFAIL2(m,a1) STMT_START {   \
 535  const IV offset = RExC_parse - RExC_precomp;   \
 536  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,   \
 537      REPORT_LOCATION_ARGS(offset)); \
 538 } STMT_END
 539
 540 /*
 541  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 542  */
 543 #define vFAIL2(m,a1) STMT_START {   \
 544  if (!SIZE_ONLY)     \
 545   SAVEFREESV(RExC_rx_sv);    \
 546  Simple_vFAIL2(m, a1);    \
 547 } STMT_END
 548
 549
 550 /*
 551  * Like Simple_vFAIL(), but accepts three arguments.
 552  */
 553 #define Simple_vFAIL3(m, a1, a2) STMT_START {   \
 554  const IV offset = RExC_parse - RExC_precomp;  \
 555  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
 556    REPORT_LOCATION_ARGS(offset)); \
 557 } STMT_END
 558
 559 /*
 560  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 561  */
 562 #define vFAIL3(m,a1,a2) STMT_START {   \
 563  if (!SIZE_ONLY)     \
 564   SAVEFREESV(RExC_rx_sv);    \
 565  Simple_vFAIL3(m, a1, a2);    \
 566 } STMT_END
 567
 568 /*
 569  * Like Simple_vFAIL(), but accepts four arguments.
 570  */
 571 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {  \
 572  const IV offset = RExC_parse - RExC_precomp;  \
 573  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,  \
 574    REPORT_LOCATION_ARGS(offset)); \
 575 } STMT_END
 576
 577 #define vFAIL4(m,a1,a2,a3) STMT_START {   \
 578  if (!SIZE_ONLY)     \
 579   SAVEFREESV(RExC_rx_sv);    \
 580  Simple_vFAIL4(m, a1, a2, a3);   \
 581 } STMT_END
 582
 583 /* A specialized version of vFAIL2 that works with UTF8f */
 584 #define vFAIL2utf8f(m, a1) STMT_START { \
 585  const IV offset = RExC_parse - RExC_precomp;   \
 586  if (!SIZE_ONLY)                                \
 587   SAVEFREESV(RExC_rx_sv);                    \
 588  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 589    REPORT_LOCATION_ARGS(offset));         \
 590 } STMT_END
 591
 592
 593 /* m is not necessarily a "literal string", in this macro */
 594 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 595  const IV offset = loc - RExC_precomp;                               \
 596  Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 597    m, REPORT_LOCATION_ARGS(offset));       \
 598 } STMT_END
 599
 600 #define ckWARNreg(loc,m) STMT_START {     \
 601  const IV offset = loc - RExC_precomp;    \
 602  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 603    REPORT_LOCATION_ARGS(offset));  \
 604 } STMT_END
 605
 606 #define vWARN_dep(loc, m) STMT_START {            \
 607  const IV offset = loc - RExC_precomp;    \
 608  Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
 609    REPORT_LOCATION_ARGS(offset));         \
 610 } STMT_END
 611
 612 #define ckWARNdep(loc,m) STMT_START {            \
 613  const IV offset = loc - RExC_precomp;    \
 614  Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                 \
 615    m REPORT_LOCATION,      \
 616    REPORT_LOCATION_ARGS(offset));  \
 617 } STMT_END
 618
 619 #define ckWARNregdep(loc,m) STMT_START {    \
 620  const IV offset = loc - RExC_precomp;    \
 621  Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 622    m REPORT_LOCATION,      \
 623    REPORT_LOCATION_ARGS(offset));  \
 624 } STMT_END
 625
 626 #define ckWARN2reg_d(loc,m, a1) STMT_START {    \
 627  const IV offset = loc - RExC_precomp;    \
 628  Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),   \
 629    m REPORT_LOCATION,      \
 630    a1, REPORT_LOCATION_ARGS(offset)); \
 631 } STMT_END
 632
 633 #define ckWARN2reg(loc, m, a1) STMT_START {    \
 634  const IV offset = loc - RExC_precomp;    \
 635  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 636    a1, REPORT_LOCATION_ARGS(offset)); \
 637 } STMT_END
 638
 639 #define vWARN3(loc, m, a1, a2) STMT_START {    \
 640  const IV offset = loc - RExC_precomp;    \
 641  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 642    a1, a2, REPORT_LOCATION_ARGS(offset)); \
 643 } STMT_END
 644
 645 #define ckWARN3reg(loc, m, a1, a2) STMT_START {    \
 646  const IV offset = loc - RExC_precomp;    \
 647  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 648    a1, a2, REPORT_LOCATION_ARGS(offset)); \
 649 } STMT_END
 650
 651 #define vWARN4(loc, m, a1, a2, a3) STMT_START {    \
 652  const IV offset = loc - RExC_precomp;    \
 653  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 654    a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 655 } STMT_END
 656
 657 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {   \
 658  const IV offset = loc - RExC_precomp;    \
 659  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 660    a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 661 } STMT_END
 662
 663 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {   \
 664  const IV offset = loc - RExC_precomp;    \
 665  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 666    a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 667 } STMT_END
 668
 669
 670 /* Allow for side effects in s */
 671 #define REGC(c,s) STMT_START {   \
 672  if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 673 } STMT_END
 674
 675 /* Macros for recording node offsets.   20001227 mjd@plover.com
 676  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 677  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 678  * Element 0 holds the number n.
 679  * Position is 1 indexed.
 680  */
 681 #ifndef RE_TRACK_PATTERN_OFFSETS
 682 #define Set_Node_Offset_To_R(node,byte)
 683 #define Set_Node_Offset(node,byte)
 684 #define Set_Cur_Node_Offset
 685 #define Set_Node_Length_To_R(node,len)
 686 #define Set_Node_Length(node,len)
 687 #define Set_Node_Cur_Length(node,start)
 688 #define Node_Offset(n)
 689 #define Node_Length(n)
 690 #define Set_Node_Offset_Length(node,offset,len)
 691 #define ProgLen(ri) ri->u.proglen
 692 #define SetProgLen(ri,x) ri->u.proglen = x
 693 #else
 694 #define ProgLen(ri) ri->u.offsets[0]
 695 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 696 #define Set_Node_Offset_To_R(node,byte) STMT_START {   \
 697  if (! SIZE_ONLY) {       \
 698   MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",  \
 699      __LINE__, (int)(node), (int)(byte)));  \
 700   if((node) < 0) {      \
 701    Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 702           (int)(node));                  \
 703   } else {       \
 704    RExC_offsets[2*(node)-1] = (byte);    \
 705   }        \
 706  }         \
 707 } STMT_END
 708
 709 #define Set_Node_Offset(node,byte) \
 710  Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 711 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 712
 713 #define Set_Node_Length_To_R(node,len) STMT_START {   \
 714  if (! SIZE_ONLY) {       \
 715   MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",  \
 716     __LINE__, (int)(node), (int)(len)));   \
 717   if((node) < 0) {      \
 718    Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 719           (int)(node));                  \
 720   } else {       \
 721    RExC_offsets[2*(node)] = (len);    \
 722   }        \
 723  }         \
 724 } STMT_END
 725
 726 #define Set_Node_Length(node,len) \
 727  Set_Node_Length_To_R((node)-RExC_emit_start, len)
 728 #define Set_Node_Cur_Length(node, start)                \
 729  Set_Node_Length(node, RExC_parse - start)
 730
 731 /* Get offsets and lengths */
 732 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 733 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 734
 735 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
 736  Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
 737  Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
 738 } STMT_END
 739 #endif
 740
 741 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 742 #define EXPERIMENTAL_INPLACESCAN
 743 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 744
 745 #define DEBUG_RExC_seen() \
 746   DEBUG_OPTIMISE_MORE_r({                                             \
 747    PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 748                    \
 749    if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 750     PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 751                    \
 752    if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 753     PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 754                    \
 755    if (RExC_seen & REG_GPOS_SEEN)                                  \
 756     PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 757                    \
 758    if (RExC_seen & REG_CANY_SEEN)                                  \
 759     PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 760                    \
 761    if (RExC_seen & REG_RECURSE_SEEN)                               \
 762     PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 763                    \
 764    if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 765     PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 766                    \
 767    if (RExC_seen & REG_VERBARG_SEEN)                               \
 768     PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 769                    \
 770    if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 771     PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 772                    \
 773    if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 774     PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 775                    \
 776    if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 777     PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 778                    \
 779    if (RExC_seen & REG_GOSTART_SEEN)                               \
 780     PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 781                    \
 782    if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 783     PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 784                    \
 785    PerlIO_printf(Perl_debug_log,"\n");                             \
 786   });
 787
 788 #define DEBUG_STUDYDATA(str,data,depth)                              \
 789 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 790  PerlIO_printf(Perl_debug_log,                                    \
 791   "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 792   " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 793   (int)(depth)*2, "",                                          \
 794   (IV)((data)->pos_min),                                       \
 795   (IV)((data)->pos_delta),                                     \
 796   (UV)((data)->flags),                                         \
 797   (IV)((data)->whilem_c),                                      \
 798   (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 799   is_inf ? "INF " : ""                                         \
 800  );                                                               \
 801  if ((data)->last_found)                                          \
 802   PerlIO_printf(Perl_debug_log,                                \
 803    "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 804    " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 805    SvPVX_const((data)->last_found),                         \
 806    (IV)((data)->last_end),                                  \
 807    (IV)((data)->last_start_min),                            \
 808    (IV)((data)->last_start_max),                            \
 809    ((data)->longest &&                                      \
 810    (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 811    SvPVX_const((data)->longest_fixed),                      \
 812    (IV)((data)->offset_fixed),                              \
 813    ((data)->longest &&                                      \
 814    (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 815    SvPVX_const((data)->longest_float),                      \
 816    (IV)((data)->offset_float_min),                          \
 817    (IV)((data)->offset_float_max)                           \
 818   );                                                           \
 819  PerlIO_printf(Perl_debug_log,"\n");                              \
 820 });
 821
 822 /* Mark that we cannot extend a found fixed substring at this point.
 823    Update the longest found anchored substring and the longest found
 824    floating substrings if needed. */
 825
 826 STATIC void
 827 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 828      SSize_t *minlenp, int is_inf)
 829 {
 830  const STRLEN l = CHR_SVLEN(data->last_found);
 831  const STRLEN old_l = CHR_SVLEN(*data->longest);
 832  GET_RE_DEBUG_FLAGS_DECL;
 833
 834  PERL_ARGS_ASSERT_SCAN_COMMIT;
 835
 836  if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 837   SvSetMagicSV(*data->longest, data->last_found);
 838   if (*data->longest == data->longest_fixed) {
 839    data->offset_fixed = l ? data->last_start_min : data->pos_min;
 840    if (data->flags & SF_BEFORE_EOL)
 841     data->flags
 842      |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 843    else
 844     data->flags &= ~SF_FIX_BEFORE_EOL;
 845    data->minlen_fixed=minlenp;
 846    data->lookbehind_fixed=0;
 847   }
 848   else { /* *data->longest == data->longest_float */
 849    data->offset_float_min = l ? data->last_start_min : data->pos_min;
 850    data->offset_float_max = (l
 851          ? data->last_start_max
 852          : (data->pos_delta == SSize_t_MAX
 853           ? SSize_t_MAX
 854           : data->pos_min + data->pos_delta));
 855    if (is_inf
 856     || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 857     data->offset_float_max = SSize_t_MAX;
 858    if (data->flags & SF_BEFORE_EOL)
 859     data->flags
 860      |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 861    else
 862     data->flags &= ~SF_FL_BEFORE_EOL;
 863    data->minlen_float=minlenp;
 864    data->lookbehind_float=0;
 865   }
 866  }
 867  SvCUR_set(data->last_found, 0);
 868  {
 869   SV * const sv = data->last_found;
 870   if (SvUTF8(sv) && SvMAGICAL(sv)) {
 871    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 872    if (mg)
 873     mg->mg_len = 0;
 874   }
 875  }
 876  data->last_end = -1;
 877  data->flags &= ~SF_BEFORE_EOL;
 878  DEBUG_STUDYDATA("commit: ",data,0);
 879 }
 880
 881 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 882  * list that describes which code points it matches */
 883
 884 STATIC void
 885 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 886 {
 887  /* Set the SSC 'ssc' to match an empty string or any code point */
 888
 889  PERL_ARGS_ASSERT_SSC_ANYTHING;
 890
 891  assert(is_ANYOF_SYNTHETIC(ssc));
 892
 893  ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 894  _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 895  ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING;    /* Plus match empty string */
 896 }
 897
 898 STATIC int
 899 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
 900 {
 901  /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 902  * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 903  * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 904  * in any way, so there's no point in using it */
 905
 906  UV start, end;
 907  bool ret;
 908
 909  PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 910
 911  assert(is_ANYOF_SYNTHETIC(ssc));
 912
 913  if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
 914   return FALSE;
 915  }
 916
 917  /* See if the list consists solely of the range 0 - Infinity */
 918  invlist_iterinit(ssc->invlist);
 919  ret = invlist_iternext(ssc->invlist, &start, &end)
 920   && start == 0
 921   && end == UV_MAX;
 922
 923  invlist_iterfinish(ssc->invlist);
 924
 925  if (ret) {
 926   return TRUE;
 927  }
 928
 929  /* If e.g., both \w and \W are set, matches everything */
 930  if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 931   int i;
 932   for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 933    if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 934     return TRUE;
 935    }
 936   }
 937  }
 938
 939  return FALSE;
 940 }
 941
 942 STATIC void
 943 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 944 {
 945  /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 946  * string, any code point, or any posix class under locale */
 947
 948  PERL_ARGS_ASSERT_SSC_INIT;
 949
 950  Zero(ssc, 1, regnode_ssc);
 951  set_ANYOF_SYNTHETIC(ssc);
 952  ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
 953  ssc_anything(ssc);
 954
 955  /* If any portion of the regex is to operate under locale rules,
 956  * initialization includes it.  The reason this isn't done for all regexes
 957  * is that the optimizer was written under the assumption that locale was
 958  * all-or-nothing.  Given the complexity and lack of documentation in the
 959  * optimizer, and that there are inadequate test cases for locale, many
 960  * parts of it may not work properly, it is safest to avoid locale unless
 961  * necessary. */
 962  if (RExC_contains_locale) {
 963   ANYOF_POSIXL_SETALL(ssc);
 964  }
 965  else {
 966   ANYOF_POSIXL_ZERO(ssc);
 967  }
 968 }
 969
 970 STATIC int
 971 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
 972        const regnode_ssc *ssc)
 973 {
 974  /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
 975  * to the list of code points matched, and locale posix classes; hence does
 976  * not check its flags) */
 977
 978  UV start, end;
 979  bool ret;
 980
 981  PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
 982
 983  assert(is_ANYOF_SYNTHETIC(ssc));
 984
 985  invlist_iterinit(ssc->invlist);
 986  ret = invlist_iternext(ssc->invlist, &start, &end)
 987   && start == 0
 988   && end == UV_MAX;
 989
 990  invlist_iterfinish(ssc->invlist);
 991
 992  if (! ret) {
 993   return FALSE;
 994  }
 995
 996  if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
 997   return FALSE;
 998  }
 999
1000  return TRUE;
1001 }
1002
1003 STATIC SV*
1004 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1005        const regnode_charclass* const node)
1006 {
1007  /* Returns a mortal inversion list defining which code points are matched
1008  * by 'node', which is of type ANYOF.  Handles complementing the result if
1009  * appropriate.  If some code points aren't knowable at this time, the
1010  * returned list must, and will, contain every code point that is a
1011  * possibility. */
1012
1013  SV* invlist = sv_2mortal(_new_invlist(0));
1014  SV* only_utf8_locale_invlist = NULL;
1015  unsigned int i;
1016  const U32 n = ARG(node);
1017  bool new_node_has_latin1 = FALSE;
1018
1019  PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1020
1021  /* Look at the data structure created by S_set_ANYOF_arg() */
1022  if (n != ANYOF_NONBITMAP_EMPTY) {
1023   SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1024   AV * const av = MUTABLE_AV(SvRV(rv));
1025   SV **const ary = AvARRAY(av);
1026   assert(RExC_rxi->data->what[n] == 's');
1027
1028   if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1029    invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1030   }
1031   else if (ary[0] && ary[0] != &PL_sv_undef) {
1032
1033    /* Here, no compile-time swash, and there are things that won't be
1034    * known until runtime -- we have to assume it could be anything */
1035    return _add_range_to_invlist(invlist, 0, UV_MAX);
1036   }
1037   else if (ary[3] && ary[3] != &PL_sv_undef) {
1038
1039    /* Here no compile-time swash, and no run-time only data.  Use the
1040    * node's inversion list */
1041    invlist = sv_2mortal(invlist_clone(ary[3]));
1042   }
1043
1044   /* Get the code points valid only under UTF-8 locales */
1045   if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1046    && ary[2] && ary[2] != &PL_sv_undef)
1047   {
1048    only_utf8_locale_invlist = ary[2];
1049   }
1050  }
1051
1052  /* An ANYOF node contains a bitmap for the first 256 code points, and an
1053  * inversion list for the others, but if there are code points that should
1054  * match only conditionally on the target string being UTF-8, those are
1055  * placed in the inversion list, and not the bitmap.  Since there are
1056  * circumstances under which they could match, they are included in the
1057  * SSC.  But if the ANYOF node is to be inverted, we have to exclude them
1058  * here, so that when we invert below, the end result actually does include
1059  * them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We have to do this here
1060  * before we add the unconditionally matched code points */
1061  if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1062   _invlist_intersection_complement_2nd(invlist,
1063            PL_UpperLatin1,
1064            &invlist);
1065  }
1066
1067  /* Add in the points from the bit map */
1068  for (i = 0; i < 256; i++) {
1069   if (ANYOF_BITMAP_TEST(node, i)) {
1070    invlist = add_cp_to_invlist(invlist, i);
1071    new_node_has_latin1 = TRUE;
1072   }
1073  }
1074
1075  /* If this can match all upper Latin1 code points, have to add them
1076  * as well */
1077  if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
1078   _invlist_union(invlist, PL_UpperLatin1, &invlist);
1079  }
1080
1081  /* Similarly for these */
1082  if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1083   invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1084  }
1085
1086  if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1087   _invlist_invert(invlist);
1088  }
1089  else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1090
1091   /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1092   * locale.  We can skip this if there are no 0-255 at all. */
1093   _invlist_union(invlist, PL_Latin1, &invlist);
1094  }
1095
1096  /* Similarly add the UTF-8 locale possible matches.  These have to be
1097  * deferred until after the non-UTF-8 locale ones are taken care of just
1098  * above, or it leads to wrong results under ANYOF_INVERT */
1099  if (only_utf8_locale_invlist) {
1100   _invlist_union_maybe_complement_2nd(invlist,
1101            only_utf8_locale_invlist,
1102            ANYOF_FLAGS(node) & ANYOF_INVERT,
1103            &invlist);
1104  }
1105
1106  return invlist;
1107 }
1108
1109 /* These two functions currently do the exact same thing */
1110 #define ssc_init_zero  ssc_init
1111
1112 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1113 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1114
1115 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1116  * should not be inverted.  'and_with->flags & ANYOF_POSIXL' should be 0 if
1117  * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1118
1119 STATIC void
1120 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1121     const regnode_charclass *and_with)
1122 {
1123  /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1124  * another SSC or a regular ANYOF class.  Can create false positives. */
1125
1126  SV* anded_cp_list;
1127  U8  anded_flags;
1128
1129  PERL_ARGS_ASSERT_SSC_AND;
1130
1131  assert(is_ANYOF_SYNTHETIC(ssc));
1132
1133  /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1134  * the code point inversion list and just the relevant flags */
1135  if (is_ANYOF_SYNTHETIC(and_with)) {
1136   anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1137   anded_flags = ANYOF_FLAGS(and_with);
1138
1139   /* XXX This is a kludge around what appears to be deficiencies in the
1140   * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1141   * there are paths through the optimizer where it doesn't get weeded
1142   * out when it should.  And if we don't make some extra provision for
1143   * it like the code just below, it doesn't get added when it should.
1144   * This solution is to add it only when AND'ing, which is here, and
1145   * only when what is being AND'ed is the pristine, original node
1146   * matching anything.  Thus it is like adding it to ssc_anything() but
1147   * only when the result is to be AND'ed.  Probably the same solution
1148   * could be adopted for the same problem we have with /l matching,
1149   * which is solved differently in S_ssc_init(), and that would lead to
1150   * fewer false positives than that solution has.  But if this solution
1151   * creates bugs, the consequences are only that a warning isn't raised
1152   * that should be; while the consequences for having /l bugs is
1153   * incorrect matches */
1154   if (ssc_is_anything((regnode_ssc *)and_with)) {
1155    anded_flags |= ANYOF_WARN_SUPER;
1156   }
1157  }
1158  else {
1159   anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1160   anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1161  }
1162
1163  ANYOF_FLAGS(ssc) &= anded_flags;
1164
1165  /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1166  * C2 is the list of code points in 'and-with'; P2, its posix classes.
1167  * 'and_with' may be inverted.  When not inverted, we have the situation of
1168  * computing:
1169  *  (C1 | P1) & (C2 | P2)
1170  *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1171  *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1172  *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1173  *                    <=  ((C1 & C2) | P1 | P2)
1174  * Alternatively, the last few steps could be:
1175  *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1176  *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1177  *                    <=  (C1 | C2 | (P1 & P2))
1178  * We favor the second approach if either P1 or P2 is non-empty.  This is
1179  * because these components are a barrier to doing optimizations, as what
1180  * they match cannot be known until the moment of matching as they are
1181  * dependent on the current locale, 'AND"ing them likely will reduce or
1182  * eliminate them.
1183  * But we can do better if we know that C1,P1 are in their initial state (a
1184  * frequent occurrence), each matching everything:
1185  *  (<everything>) & (C2 | P2) =  C2 | P2
1186  * Similarly, if C2,P2 are in their initial state (again a frequent
1187  * occurrence), the result is a no-op
1188  *  (C1 | P1) & (<everything>) =  C1 | P1
1189  *
1190  * Inverted, we have
1191  *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1192  *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1193  *                         <=  (C1 & ~C2) | (P1 & ~P2)
1194  * */
1195
1196  if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1197   && ! is_ANYOF_SYNTHETIC(and_with))
1198  {
1199   unsigned int i;
1200
1201   ssc_intersection(ssc,
1202       anded_cp_list,
1203       FALSE /* Has already been inverted */
1204       );
1205
1206   /* If either P1 or P2 is empty, the intersection will be also; can skip
1207   * the loop */
1208   if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1209    ANYOF_POSIXL_ZERO(ssc);
1210   }
1211   else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1212
1213    /* Note that the Posix class component P from 'and_with' actually
1214    * looks like:
1215    *      P = Pa | Pb | ... | Pn
1216    * where each component is one posix class, such as in [\w\s].
1217    * Thus
1218    *      ~P = ~(Pa | Pb | ... | Pn)
1219    *         = ~Pa & ~Pb & ... & ~Pn
1220    *        <= ~Pa | ~Pb | ... | ~Pn
1221    * The last is something we can easily calculate, but unfortunately
1222    * is likely to have many false positives.  We could do better
1223    * in some (but certainly not all) instances if two classes in
1224    * P have known relationships.  For example
1225    *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1226    * So
1227    *      :lower: & :print: = :lower:
1228    * And similarly for classes that must be disjoint.  For example,
1229    * since \s and \w can have no elements in common based on rules in
1230    * the POSIX standard,
1231    *      \w & ^\S = nothing
1232    * Unfortunately, some vendor locales do not meet the Posix
1233    * standard, in particular almost everything by Microsoft.
1234    * The loop below just changes e.g., \w into \W and vice versa */
1235
1236    regnode_charclass_posixl temp;
1237    int add = 1;    /* To calculate the index of the complement */
1238
1239    ANYOF_POSIXL_ZERO(&temp);
1240    for (i = 0; i < ANYOF_MAX; i++) {
1241     assert(i % 2 != 0
1242      || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1243      || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1244
1245     if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1246      ANYOF_POSIXL_SET(&temp, i + add);
1247     }
1248     add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1249    }
1250    ANYOF_POSIXL_AND(&temp, ssc);
1251
1252   } /* else ssc already has no posixes */
1253  } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1254   in its initial state */
1255  else if (! is_ANYOF_SYNTHETIC(and_with)
1256    || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1257  {
1258   /* But if 'ssc' is in its initial state, the result is just 'and_with';
1259   * copy it over 'ssc' */
1260   if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1261    if (is_ANYOF_SYNTHETIC(and_with)) {
1262     StructCopy(and_with, ssc, regnode_ssc);
1263    }
1264    else {
1265     ssc->invlist = anded_cp_list;
1266     ANYOF_POSIXL_ZERO(ssc);
1267     if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1268      ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1269     }
1270    }
1271   }
1272   else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1273     || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1274   {
1275    /* One or the other of P1, P2 is non-empty. */
1276    if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1277     ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1278    }
1279    ssc_union(ssc, anded_cp_list, FALSE);
1280   }
1281   else { /* P1 = P2 = empty */
1282    ssc_intersection(ssc, anded_cp_list, FALSE);
1283   }
1284  }
1285 }
1286
1287 STATIC void
1288 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1289    const regnode_charclass *or_with)
1290 {
1291  /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1292  * another SSC or a regular ANYOF class.  Can create false positives if
1293  * 'or_with' is to be inverted. */
1294
1295  SV* ored_cp_list;
1296  U8 ored_flags;
1297
1298  PERL_ARGS_ASSERT_SSC_OR;
1299
1300  assert(is_ANYOF_SYNTHETIC(ssc));
1301
1302  /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1303  * the code point inversion list and just the relevant flags */
1304  if (is_ANYOF_SYNTHETIC(or_with)) {
1305   ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1306   ored_flags = ANYOF_FLAGS(or_with);
1307  }
1308  else {
1309   ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1310   ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1311  }
1312
1313  ANYOF_FLAGS(ssc) |= ored_flags;
1314
1315  /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1316  * C2 is the list of code points in 'or-with'; P2, its posix classes.
1317  * 'or_with' may be inverted.  When not inverted, we have the simple
1318  * situation of computing:
1319  *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1320  * If P1|P2 yields a situation with both a class and its complement are
1321  * set, like having both \w and \W, this matches all code points, and we
1322  * can delete these from the P component of the ssc going forward.  XXX We
1323  * might be able to delete all the P components, but I (khw) am not certain
1324  * about this, and it is better to be safe.
1325  *
1326  * Inverted, we have
1327  *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1328  *                         <=  (C1 | P1) | ~C2
1329  *                         <=  (C1 | ~C2) | P1
1330  * (which results in actually simpler code than the non-inverted case)
1331  * */
1332
1333  if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1334   && ! is_ANYOF_SYNTHETIC(or_with))
1335  {
1336   /* We ignore P2, leaving P1 going forward */
1337  }   /* else  Not inverted */
1338  else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
1339   ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1340   if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1341    unsigned int i;
1342    for (i = 0; i < ANYOF_MAX; i += 2) {
1343     if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1344     {
1345      ssc_match_all_cp(ssc);
1346      ANYOF_POSIXL_CLEAR(ssc, i);
1347      ANYOF_POSIXL_CLEAR(ssc, i+1);
1348     }
1349    }
1350   }
1351  }
1352
1353  ssc_union(ssc,
1354    ored_cp_list,
1355    FALSE /* Already has been inverted */
1356    );
1357 }
1358
1359 PERL_STATIC_INLINE void
1360 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1361 {
1362  PERL_ARGS_ASSERT_SSC_UNION;
1363
1364  assert(is_ANYOF_SYNTHETIC(ssc));
1365
1366  _invlist_union_maybe_complement_2nd(ssc->invlist,
1367           invlist,
1368           invert2nd,
1369           &ssc->invlist);
1370 }
1371
1372 PERL_STATIC_INLINE void
1373 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1374       SV* const invlist,
1375       const bool invert2nd)
1376 {
1377  PERL_ARGS_ASSERT_SSC_INTERSECTION;
1378
1379  assert(is_ANYOF_SYNTHETIC(ssc));
1380
1381  _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1382            invlist,
1383            invert2nd,
1384            &ssc->invlist);
1385 }
1386
1387 PERL_STATIC_INLINE void
1388 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1389 {
1390  PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1391
1392  assert(is_ANYOF_SYNTHETIC(ssc));
1393
1394  ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1395 }
1396
1397 PERL_STATIC_INLINE void
1398 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1399 {
1400  /* AND just the single code point 'cp' into the SSC 'ssc' */
1401
1402  SV* cp_list = _new_invlist(2);
1403
1404  PERL_ARGS_ASSERT_SSC_CP_AND;
1405
1406  assert(is_ANYOF_SYNTHETIC(ssc));
1407
1408  cp_list = add_cp_to_invlist(cp_list, cp);
1409  ssc_intersection(ssc, cp_list,
1410      FALSE /* Not inverted */
1411      );
1412  SvREFCNT_dec_NN(cp_list);
1413 }
1414
1415 PERL_STATIC_INLINE void
1416 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1417 {
1418  /* Set the SSC 'ssc' to not match any locale things */
1419
1420  PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1421
1422  assert(is_ANYOF_SYNTHETIC(ssc));
1423
1424  ANYOF_POSIXL_ZERO(ssc);
1425  ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1426 }
1427
1428 STATIC void
1429 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1430 {
1431  /* The inversion list in the SSC is marked mortal; now we need a more
1432  * permanent copy, which is stored the same way that is done in a regular
1433  * ANYOF node, with the first 256 code points in a bit map */
1434
1435  SV* invlist = invlist_clone(ssc->invlist);
1436
1437  PERL_ARGS_ASSERT_SSC_FINALIZE;
1438
1439  assert(is_ANYOF_SYNTHETIC(ssc));
1440
1441  /* The code in this file assumes that all but these flags aren't relevant
1442  * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1443  * time we reach here */
1444  assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1445
1446  populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1447
1448  set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1449         NULL, NULL, NULL, FALSE);
1450
1451  /* Make sure is clone-safe */
1452  ssc->invlist = NULL;
1453
1454  if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1455   ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
1456  }
1457
1458  assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1459 }
1460
1461 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1462 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1463 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1464 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1465        ? (TRIE_LIST_CUR( idx ) - 1)           \
1466        : 0 )
1467
1468
1469 #ifdef DEBUGGING
1470 /*
1471    dump_trie(trie,widecharmap,revcharmap)
1472    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1473    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1474
1475    These routines dump out a trie in a somewhat readable format.
1476    The _interim_ variants are used for debugging the interim
1477    tables that are used to generate the final compressed
1478    representation which is what dump_trie expects.
1479
1480    Part of the reason for their existence is to provide a form
1481    of documentation as to how the different representations function.
1482
1483 */
1484
1485 /*
1486   Dumps the final compressed table form of the trie to Perl_debug_log.
1487   Used for debugging make_trie().
1488 */
1489
1490 STATIC void
1491 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1492    AV *revcharmap, U32 depth)
1493 {
1494  U32 state;
1495  SV *sv=sv_newmortal();
1496  int colwidth= widecharmap ? 6 : 4;
1497  U16 word;
1498  GET_RE_DEBUG_FLAGS_DECL;
1499
1500  PERL_ARGS_ASSERT_DUMP_TRIE;
1501
1502  PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1503   (int)depth * 2 + 2,"",
1504   "Match","Base","Ofs" );
1505
1506  for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1507   SV ** const tmp = av_fetch( revcharmap, state, 0);
1508   if ( tmp ) {
1509    PerlIO_printf( Perl_debug_log, "%*s",
1510     colwidth,
1511     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1512        PL_colors[0], PL_colors[1],
1513        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1514        PERL_PV_ESCAPE_FIRSTCHAR
1515     )
1516    );
1517   }
1518  }
1519  PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1520   (int)depth * 2 + 2,"");
1521
1522  for( state = 0 ; state < trie->uniquecharcount ; state++ )
1523   PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1524  PerlIO_printf( Perl_debug_log, "\n");
1525
1526  for( state = 1 ; state < trie->statecount ; state++ ) {
1527   const U32 base = trie->states[ state ].trans.base;
1528
1529   PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1530          (int)depth * 2 + 2,"", (UV)state);
1531
1532   if ( trie->states[ state ].wordnum ) {
1533    PerlIO_printf( Perl_debug_log, " W%4X",
1534           trie->states[ state ].wordnum );
1535   } else {
1536    PerlIO_printf( Perl_debug_log, "%6s", "" );
1537   }
1538
1539   PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1540
1541   if ( base ) {
1542    U32 ofs = 0;
1543
1544    while( ( base + ofs  < trie->uniquecharcount ) ||
1545     ( base + ofs - trie->uniquecharcount < trie->lasttrans
1546      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1547                  != state))
1548      ofs++;
1549
1550    PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1551
1552    for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1553     if ( ( base + ofs >= trie->uniquecharcount )
1554       && ( base + ofs - trie->uniquecharcount
1555               < trie->lasttrans )
1556       && trie->trans[ base + ofs
1557          - trie->uniquecharcount ].check == state )
1558     {
1559     PerlIO_printf( Perl_debug_log, "%*"UVXf,
1560      colwidth,
1561      (UV)trie->trans[ base + ofs
1562            - trie->uniquecharcount ].next );
1563     } else {
1564      PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1565     }
1566    }
1567
1568    PerlIO_printf( Perl_debug_log, "]");
1569
1570   }
1571   PerlIO_printf( Perl_debug_log, "\n" );
1572  }
1573  PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1574         (int)depth*2, "");
1575  for (word=1; word <= trie->wordcount; word++) {
1576   PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1577    (int)word, (int)(trie->wordinfo[word].prev),
1578    (int)(trie->wordinfo[word].len));
1579  }
1580  PerlIO_printf(Perl_debug_log, "\n" );
1581 }
1582 /*
1583   Dumps a fully constructed but uncompressed trie in list form.
1584   List tries normally only are used for construction when the number of
1585   possible chars (trie->uniquecharcount) is very high.
1586   Used for debugging make_trie().
1587 */
1588 STATIC void
1589 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1590       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1591       U32 depth)
1592 {
1593  U32 state;
1594  SV *sv=sv_newmortal();
1595  int colwidth= widecharmap ? 6 : 4;
1596  GET_RE_DEBUG_FLAGS_DECL;
1597
1598  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1599
1600  /* print out the table precompression.  */
1601  PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1602   (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1603   "------:-----+-----------------\n" );
1604
1605  for( state=1 ; state < next_alloc ; state ++ ) {
1606   U16 charid;
1607
1608   PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1609    (int)depth * 2 + 2,"", (UV)state  );
1610   if ( ! trie->states[ state ].wordnum ) {
1611    PerlIO_printf( Perl_debug_log, "%5s| ","");
1612   } else {
1613    PerlIO_printf( Perl_debug_log, "W%4x| ",
1614     trie->states[ state ].wordnum
1615    );
1616   }
1617   for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1618    SV ** const tmp = av_fetch( revcharmap,
1619           TRIE_LIST_ITEM(state,charid).forid, 0);
1620    if ( tmp ) {
1621     PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1622      colwidth,
1623      pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1624        colwidth,
1625        PL_colors[0], PL_colors[1],
1626        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1627        | PERL_PV_ESCAPE_FIRSTCHAR
1628      ) ,
1629      TRIE_LIST_ITEM(state,charid).forid,
1630      (UV)TRIE_LIST_ITEM(state,charid).newstate
1631     );
1632     if (!(charid % 10))
1633      PerlIO_printf(Perl_debug_log, "\n%*s| ",
1634       (int)((depth * 2) + 14), "");
1635    }
1636   }
1637   PerlIO_printf( Perl_debug_log, "\n");
1638  }
1639 }
1640
1641 /*
1642   Dumps a fully constructed but uncompressed trie in table form.
1643   This is the normal DFA style state transition table, with a few
1644   twists to facilitate compression later.
1645   Used for debugging make_trie().
1646 */
1647 STATIC void
1648 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1649       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1650       U32 depth)
1651 {
1652  U32 state;
1653  U16 charid;
1654  SV *sv=sv_newmortal();
1655  int colwidth= widecharmap ? 6 : 4;
1656  GET_RE_DEBUG_FLAGS_DECL;
1657
1658  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1659
1660  /*
1661  print out the table precompression so that we can do a visual check
1662  that they are identical.
1663  */
1664
1665  PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1666
1667  for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1668   SV ** const tmp = av_fetch( revcharmap, charid, 0);
1669   if ( tmp ) {
1670    PerlIO_printf( Perl_debug_log, "%*s",
1671     colwidth,
1672     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1673        PL_colors[0], PL_colors[1],
1674        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1675        PERL_PV_ESCAPE_FIRSTCHAR
1676     )
1677    );
1678   }
1679  }
1680
1681  PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1682
1683  for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1684   PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1685  }
1686
1687  PerlIO_printf( Perl_debug_log, "\n" );
1688
1689  for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1690
1691   PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1692    (int)depth * 2 + 2,"",
1693    (UV)TRIE_NODENUM( state ) );
1694
1695   for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1696    UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1697    if (v)
1698     PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1699    else
1700     PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1701   }
1702   if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1703    PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1704            (UV)trie->trans[ state ].check );
1705   } else {
1706    PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1707            (UV)trie->trans[ state ].check,
1708    trie->states[ TRIE_NODENUM( state ) ].wordnum );
1709   }
1710  }
1711 }
1712
1713 #endif
1714
1715
1716 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1717   startbranch: the first branch in the whole branch sequence
1718   first      : start branch of sequence of branch-exact nodes.
1719    May be the same as startbranch
1720   last       : Thing following the last branch.
1721    May be the same as tail.
1722   tail       : item following the branch sequence
1723   count      : words in the sequence
1724   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1725   depth      : indent depth
1726
1727 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1728
1729 A trie is an N'ary tree where the branches are determined by digital
1730 decomposition of the key. IE, at the root node you look up the 1st character and
1731 follow that branch repeat until you find the end of the branches. Nodes can be
1732 marked as "accepting" meaning they represent a complete word. Eg:
1733
1734   /he|she|his|hers/
1735
1736 would convert into the following structure. Numbers represent states, letters
1737 following numbers represent valid transitions on the letter from that state, if
1738 the number is in square brackets it represents an accepting state, otherwise it
1739 will be in parenthesis.
1740
1741  +-h->+-e->[3]-+-r->(8)-+-s->[9]
1742  |    |
1743  |   (2)
1744  |    |
1745  (1)   +-i->(6)-+-s->[7]
1746  |
1747  +-s->(3)-+-h->(4)-+-e->[5]
1748
1749  Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1750
1751 This shows that when matching against the string 'hers' we will begin at state 1
1752 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1753 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1754 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1755 single traverse. We store a mapping from accepting to state to which word was
1756 matched, and then when we have multiple possibilities we try to complete the
1757 rest of the regex in the order in which they occured in the alternation.
1758
1759 The only prior NFA like behaviour that would be changed by the TRIE support is
1760 the silent ignoring of duplicate alternations which are of the form:
1761
1762  / (DUPE|DUPE) X? (?{ ... }) Y /x
1763
1764 Thus EVAL blocks following a trie may be called a different number of times with
1765 and without the optimisation. With the optimisations dupes will be silently
1766 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1767 the following demonstrates:
1768
1769  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1770
1771 which prints out 'word' three times, but
1772
1773  'words'=~/(word|word|word)(?{ print $1 })S/
1774
1775 which doesnt print it out at all. This is due to other optimisations kicking in.
1776
1777 Example of what happens on a structural level:
1778
1779 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1780
1781    1: CURLYM[1] {1,32767}(18)
1782    5:   BRANCH(8)
1783    6:     EXACT <ac>(16)
1784    8:   BRANCH(11)
1785    9:     EXACT <ad>(16)
1786   11:   BRANCH(14)
1787   12:     EXACT <ab>(16)
1788   16:   SUCCEED(0)
1789   17:   NOTHING(18)
1790   18: END(0)
1791
1792 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1793 and should turn into:
1794
1795    1: CURLYM[1] {1,32767}(18)
1796    5:   TRIE(16)
1797   [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1798   <ac>
1799   <ad>
1800   <ab>
1801   16:   SUCCEED(0)
1802   17:   NOTHING(18)
1803   18: END(0)
1804
1805 Cases where tail != last would be like /(?foo|bar)baz/:
1806
1807    1: BRANCH(4)
1808    2:   EXACT <foo>(8)
1809    4: BRANCH(7)
1810    5:   EXACT <bar>(8)
1811    7: TAIL(8)
1812    8: EXACT <baz>(10)
1813   10: END(0)
1814
1815 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1816 and would end up looking like:
1817
1818  1: TRIE(8)
1819  [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1820   <foo>
1821   <bar>
1822    7: TAIL(8)
1823    8: EXACT <baz>(10)
1824   10: END(0)
1825
1826  d = uvchr_to_utf8_flags(d, uv, 0);
1827
1828 is the recommended Unicode-aware way of saying
1829
1830  *(d++) = uv;
1831 */
1832
1833 #define TRIE_STORE_REVCHAR(val)                                            \
1834  STMT_START {                                                           \
1835   if (UTF) {          \
1836    SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1837    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);    \
1838    unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1839    SvCUR_set(zlopp, kapow - flrbbbbb);       \
1840    SvPOK_on(zlopp);         \
1841    SvUTF8_on(zlopp);         \
1842    av_push(revcharmap, zlopp);        \
1843   } else {          \
1844    char ooooff = (char)val;                                           \
1845    av_push(revcharmap, newSVpvn(&ooooff, 1));      \
1846   }           \
1847   } STMT_END
1848
1849 /* This gets the next character from the input, folding it if not already
1850  * folded. */
1851 #define TRIE_READ_CHAR STMT_START {                                           \
1852  wordlen++;                                                                \
1853  if ( UTF ) {                                                              \
1854   /* if it is UTF then it is either already folded, or does not need    \
1855   * folding */                                                         \
1856   uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
1857  }                                                                         \
1858  else if (folder == PL_fold_latin1) {                                      \
1859   /* This folder implies Unicode rules, which in the range expressible  \
1860   *  by not UTF is the lower case, with the two exceptions, one of     \
1861   *  which should have been taken care of before calling this */       \
1862   assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
1863   uvc = toLOWER_L1(*uc);                                                \
1864   if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
1865   len = 1;                                                              \
1866  } else {                                                                  \
1867   /* raw data, will be folded later if needed */                        \
1868   uvc = (U32)*uc;                                                       \
1869   len = 1;                                                              \
1870  }                                                                         \
1871 } STMT_END
1872
1873
1874
1875 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1876  if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1877   U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1878   Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1879  }                                                           \
1880  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1881  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1882  TRIE_LIST_CUR( state )++;                                   \
1883 } STMT_END
1884
1885 #define TRIE_LIST_NEW(state) STMT_START {                       \
1886  Newxz( trie->states[ state ].trans.list,               \
1887   4, reg_trie_trans_le );                                 \
1888  TRIE_LIST_CUR( state ) = 1;                                \
1889  TRIE_LIST_LEN( state ) = 4;                                \
1890 } STMT_END
1891
1892 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1893  U16 dupe= trie->states[ state ].wordnum;                    \
1894  regnode * const noper_next = regnext( noper );              \
1895                 \
1896  DEBUG_r({                                                   \
1897   /* store the word for dumping */                        \
1898   SV* tmp;                                                \
1899   if (OP(noper) != NOTHING)                               \
1900    tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1901   else                                                    \
1902    tmp = newSVpvn_utf8( "", 0, UTF );   \
1903   av_push( trie_words, tmp );                             \
1904  });                                                         \
1905                 \
1906  curword++;                                                  \
1907  trie->wordinfo[curword].prev   = 0;                         \
1908  trie->wordinfo[curword].len    = wordlen;                   \
1909  trie->wordinfo[curword].accept = state;                     \
1910                 \
1911  if ( noper_next < tail ) {                                  \
1912   if (!trie->jump)                                        \
1913    trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1914             sizeof(U16) ); \
1915   trie->jump[curword] = (U16)(noper_next - convert);      \
1916   if (!jumper)                                            \
1917    jumper = noper_next;                                \
1918   if (!nextbranch)                                        \
1919    nextbranch= regnext(cur);                           \
1920  }                                                           \
1921                 \
1922  if ( dupe ) {                                               \
1923   /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1924   /* chain, so that when the bits of chain are later    */\
1925   /* linked together, the dups appear in the chain      */\
1926   trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1927   trie->wordinfo[dupe].prev = curword;                    \
1928  } else {                                                    \
1929   /* we haven't inserted this word yet.                */ \
1930   trie->states[ state ].wordnum = curword;                \
1931  }                                                           \
1932 } STMT_END
1933
1934
1935 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)  \
1936  ( ( base + charid >=  ucharcount     \
1937   && base + charid < ubound     \
1938   && state == trie->trans[ base - ucharcount + charid ].check \
1939   && trie->trans[ base - ucharcount + charid ].next )  \
1940   ? trie->trans[ base - ucharcount + charid ].next  \
1941   : ( state==1 ? special : 0 )     \
1942  )
1943
1944 #define MADE_TRIE       1
1945 #define MADE_JUMP_TRIE  2
1946 #define MADE_EXACT_TRIE 4
1947
1948 STATIC I32
1949 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1950     regnode *first, regnode *last, regnode *tail,
1951     U32 word_count, U32 flags, U32 depth)
1952 {
1953  dVAR;
1954  /* first pass, loop through and scan words */
1955  reg_trie_data *trie;
1956  HV *widecharmap = NULL;
1957  AV *revcharmap = newAV();
1958  regnode *cur;
1959  STRLEN len = 0;
1960  UV uvc = 0;
1961  U16 curword = 0;
1962  U32 next_alloc = 0;
1963  regnode *jumper = NULL;
1964  regnode *nextbranch = NULL;
1965  regnode *convert = NULL;
1966  U32 *prev_states; /* temp array mapping each state to previous one */
1967  /* we just use folder as a flag in utf8 */
1968  const U8 * folder = NULL;
1969
1970 #ifdef DEBUGGING
1971  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1972  AV *trie_words = NULL;
1973  /* along with revcharmap, this only used during construction but both are
1974  * useful during debugging so we store them in the struct when debugging.
1975  */
1976 #else
1977  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1978  STRLEN trie_charcount=0;
1979 #endif
1980  SV *re_trie_maxbuff;
1981  GET_RE_DEBUG_FLAGS_DECL;
1982
1983  PERL_ARGS_ASSERT_MAKE_TRIE;
1984 #ifndef DEBUGGING
1985  PERL_UNUSED_ARG(depth);
1986 #endif
1987
1988  switch (flags) {
1989   case EXACT: break;
1990   case EXACTFA:
1991   case EXACTFU_SS:
1992   case EXACTFU: folder = PL_fold_latin1; break;
1993   case EXACTF:  folder = PL_fold; break;
1994   default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1995  }
1996
1997  trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1998  trie->refcount = 1;
1999  trie->startstate = 1;
2000  trie->wordcount = word_count;
2001  RExC_rxi->data->data[ data_slot ] = (void*)trie;
2002  trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2003  if (flags == EXACT)
2004   trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2005  trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2006      trie->wordcount+1, sizeof(reg_trie_wordinfo));
2007
2008  DEBUG_r({
2009   trie_words = newAV();
2010  });
2011
2012  re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2013  if (!SvIOK(re_trie_maxbuff)) {
2014   sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2015  }
2016  DEBUG_TRIE_COMPILE_r({
2017   PerlIO_printf( Perl_debug_log,
2018   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2019   (int)depth * 2 + 2, "",
2020   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2021   REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2022  });
2023
2024    /* Find the node we are going to overwrite */
2025  if ( first == startbranch && OP( last ) != BRANCH ) {
2026   /* whole branch chain */
2027   convert = first;
2028  } else {
2029   /* branch sub-chain */
2030   convert = NEXTOPER( first );
2031  }
2032
2033  /*  -- First loop and Setup --
2034
2035  We first traverse the branches and scan each word to determine if it
2036  contains widechars, and how many unique chars there are, this is
2037  important as we have to build a table with at least as many columns as we
2038  have unique chars.
2039
2040  We use an array of integers to represent the character codes 0..255
2041  (trie->charmap) and we use a an HV* to store Unicode characters. We use
2042  the native representation of the character value as the key and IV's for
2043  the coded index.
2044
2045  *TODO* If we keep track of how many times each character is used we can
2046  remap the columns so that the table compression later on is more
2047  efficient in terms of memory by ensuring the most common value is in the
2048  middle and the least common are on the outside.  IMO this would be better
2049  than a most to least common mapping as theres a decent chance the most
2050  common letter will share a node with the least common, meaning the node
2051  will not be compressible. With a middle is most common approach the worst
2052  case is when we have the least common nodes twice.
2053
2054  */
2055
2056  for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2057   regnode *noper = NEXTOPER( cur );
2058   const U8 *uc = (U8*)STRING( noper );
2059   const U8 *e  = uc + STR_LEN( noper );
2060   int foldlen = 0;
2061   U32 wordlen      = 0;         /* required init */
2062   STRLEN minchars = 0;
2063   STRLEN maxchars = 0;
2064   bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2065            bitmap?*/
2066
2067   if (OP(noper) == NOTHING) {
2068    regnode *noper_next= regnext(noper);
2069    if (noper_next != tail && OP(noper_next) == flags) {
2070     noper = noper_next;
2071     uc= (U8*)STRING(noper);
2072     e= uc + STR_LEN(noper);
2073     trie->minlen= STR_LEN(noper);
2074    } else {
2075     trie->minlen= 0;
2076     continue;
2077    }
2078   }
2079
2080   if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2081    TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2082           regardless of encoding */
2083    if (OP( noper ) == EXACTFU_SS) {
2084     /* false positives are ok, so just set this */
2085     TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2086    }
2087   }
2088   for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2089           branch */
2090    TRIE_CHARCOUNT(trie)++;
2091    TRIE_READ_CHAR;
2092
2093    /* TRIE_READ_CHAR returns the current character, or its fold if /i
2094    * is in effect.  Under /i, this character can match itself, or
2095    * anything that folds to it.  If not under /i, it can match just
2096    * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2097    * all fold to k, and all are single characters.   But some folds
2098    * expand to more than one character, so for example LATIN SMALL
2099    * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2100    * the string beginning at 'uc' is 'ffi', it could be matched by
2101    * three characters, or just by the one ligature character. (It
2102    * could also be matched by two characters: LATIN SMALL LIGATURE FF
2103    * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2104    * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2105    * match.)  The trie needs to know the minimum and maximum number
2106    * of characters that could match so that it can use size alone to
2107    * quickly reject many match attempts.  The max is simple: it is
2108    * the number of folded characters in this branch (since a fold is
2109    * never shorter than what folds to it. */
2110
2111    maxchars++;
2112
2113    /* And the min is equal to the max if not under /i (indicated by
2114    * 'folder' being NULL), or there are no multi-character folds.  If
2115    * there is a multi-character fold, the min is incremented just
2116    * once, for the character that folds to the sequence.  Each
2117    * character in the sequence needs to be added to the list below of
2118    * characters in the trie, but we count only the first towards the
2119    * min number of characters needed.  This is done through the
2120    * variable 'foldlen', which is returned by the macros that look
2121    * for these sequences as the number of bytes the sequence
2122    * occupies.  Each time through the loop, we decrement 'foldlen' by
2123    * how many bytes the current char occupies.  Only when it reaches
2124    * 0 do we increment 'minchars' or look for another multi-character
2125    * sequence. */
2126    if (folder == NULL) {
2127     minchars++;
2128    }
2129    else if (foldlen > 0) {
2130     foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2131    }
2132    else {
2133     minchars++;
2134
2135     /* See if *uc is the beginning of a multi-character fold.  If
2136     * so, we decrement the length remaining to look at, to account
2137     * for the current character this iteration.  (We can use 'uc'
2138     * instead of the fold returned by TRIE_READ_CHAR because for
2139     * non-UTF, the latin1_safe macro is smart enough to account
2140     * for all the unfolded characters, and because for UTF, the
2141     * string will already have been folded earlier in the
2142     * compilation process */
2143     if (UTF) {
2144      if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2145       foldlen -= UTF8SKIP(uc);
2146      }
2147     }
2148     else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2149      foldlen--;
2150     }
2151    }
2152
2153    /* The current character (and any potential folds) should be added
2154    * to the possible matching characters for this position in this
2155    * branch */
2156    if ( uvc < 256 ) {
2157     if ( folder ) {
2158      U8 folded= folder[ (U8) uvc ];
2159      if ( !trie->charmap[ folded ] ) {
2160       trie->charmap[ folded ]=( ++trie->uniquecharcount );
2161       TRIE_STORE_REVCHAR( folded );
2162      }
2163     }
2164     if ( !trie->charmap[ uvc ] ) {
2165      trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2166      TRIE_STORE_REVCHAR( uvc );
2167     }
2168     if ( set_bit ) {
2169      /* store the codepoint in the bitmap, and its folded
2170      * equivalent. */
2171      TRIE_BITMAP_SET(trie, uvc);
2172
2173      /* store the folded codepoint */
2174      if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2175
2176      if ( !UTF ) {
2177       /* store first byte of utf8 representation of
2178       variant codepoints */
2179       if (! UVCHR_IS_INVARIANT(uvc)) {
2180        TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2181       }
2182      }
2183      set_bit = 0; /* We've done our bit :-) */
2184     }
2185    } else {
2186
2187     /* XXX We could come up with the list of code points that fold
2188     * to this using PL_utf8_foldclosures, except not for
2189     * multi-char folds, as there may be multiple combinations
2190     * there that could work, which needs to wait until runtime to
2191     * resolve (The comment about LIGATURE FFI above is such an
2192     * example */
2193
2194     SV** svpp;
2195     if ( !widecharmap )
2196      widecharmap = newHV();
2197
2198     svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2199
2200     if ( !svpp )
2201      Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2202
2203     if ( !SvTRUE( *svpp ) ) {
2204      sv_setiv( *svpp, ++trie->uniquecharcount );
2205      TRIE_STORE_REVCHAR(uvc);
2206     }
2207    }
2208   } /* end loop through characters in this branch of the trie */
2209
2210   /* We take the min and max for this branch and combine to find the min
2211   * and max for all branches processed so far */
2212   if( cur == first ) {
2213    trie->minlen = minchars;
2214    trie->maxlen = maxchars;
2215   } else if (minchars < trie->minlen) {
2216    trie->minlen = minchars;
2217   } else if (maxchars > trie->maxlen) {
2218    trie->maxlen = maxchars;
2219   }
2220  } /* end first pass */
2221  DEBUG_TRIE_COMPILE_r(
2222   PerlIO_printf( Perl_debug_log,
2223     "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2224     (int)depth * 2 + 2,"",
2225     ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2226     (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2227     (int)trie->minlen, (int)trie->maxlen )
2228  );
2229
2230  /*
2231   We now know what we are dealing with in terms of unique chars and
2232   string sizes so we can calculate how much memory a naive
2233   representation using a flat table  will take. If it's over a reasonable
2234   limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2235   conservative but potentially much slower representation using an array
2236   of lists.
2237
2238   At the end we convert both representations into the same compressed
2239   form that will be used in regexec.c for matching with. The latter
2240   is a form that cannot be used to construct with but has memory
2241   properties similar to the list form and access properties similar
2242   to the table form making it both suitable for fast searches and
2243   small enough that its feasable to store for the duration of a program.
2244
2245   See the comment in the code where the compressed table is produced
2246   inplace from the flat tabe representation for an explanation of how
2247   the compression works.
2248
2249  */
2250
2251
2252  Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2253  prev_states[1] = 0;
2254
2255  if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2256              > SvIV(re_trie_maxbuff) )
2257  {
2258   /*
2259    Second Pass -- Array Of Lists Representation
2260
2261    Each state will be represented by a list of charid:state records
2262    (reg_trie_trans_le) the first such element holds the CUR and LEN
2263    points of the allocated array. (See defines above).
2264
2265    We build the initial structure using the lists, and then convert
2266    it into the compressed table form which allows faster lookups
2267    (but cant be modified once converted).
2268   */
2269
2270   STRLEN transcount = 1;
2271
2272   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2273    "%*sCompiling trie using list compiler\n",
2274    (int)depth * 2 + 2, ""));
2275
2276   trie->states = (reg_trie_state *)
2277    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2278         sizeof(reg_trie_state) );
2279   TRIE_LIST_NEW(1);
2280   next_alloc = 2;
2281
2282   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2283
2284    regnode *noper   = NEXTOPER( cur );
2285    U8 *uc           = (U8*)STRING( noper );
2286    const U8 *e      = uc + STR_LEN( noper );
2287    U32 state        = 1;         /* required init */
2288    U16 charid       = 0;         /* sanity init */
2289    U32 wordlen      = 0;         /* required init */
2290
2291    if (OP(noper) == NOTHING) {
2292     regnode *noper_next= regnext(noper);
2293     if (noper_next != tail && OP(noper_next) == flags) {
2294      noper = noper_next;
2295      uc= (U8*)STRING(noper);
2296      e= uc + STR_LEN(noper);
2297     }
2298    }
2299
2300    if (OP(noper) != NOTHING) {
2301     for ( ; uc < e ; uc += len ) {
2302
2303      TRIE_READ_CHAR;
2304
2305      if ( uvc < 256 ) {
2306       charid = trie->charmap[ uvc ];
2307      } else {
2308       SV** const svpp = hv_fetch( widecharmap,
2309              (char*)&uvc,
2310              sizeof( UV ),
2311              0);
2312       if ( !svpp ) {
2313        charid = 0;
2314       } else {
2315        charid=(U16)SvIV( *svpp );
2316       }
2317      }
2318      /* charid is now 0 if we dont know the char read, or
2319      * nonzero if we do */
2320      if ( charid ) {
2321
2322       U16 check;
2323       U32 newstate = 0;
2324
2325       charid--;
2326       if ( !trie->states[ state ].trans.list ) {
2327        TRIE_LIST_NEW( state );
2328       }
2329       for ( check = 1;
2330        check <= TRIE_LIST_USED( state );
2331        check++ )
2332       {
2333        if ( TRIE_LIST_ITEM( state, check ).forid
2334                  == charid )
2335        {
2336         newstate = TRIE_LIST_ITEM( state, check ).newstate;
2337         break;
2338        }
2339       }
2340       if ( ! newstate ) {
2341        newstate = next_alloc++;
2342        prev_states[newstate] = state;
2343        TRIE_LIST_PUSH( state, charid, newstate );
2344        transcount++;
2345       }
2346       state = newstate;
2347      } else {
2348       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2349      }
2350     }
2351    }
2352    TRIE_HANDLE_WORD(state);
2353
2354   } /* end second pass */
2355
2356   /* next alloc is the NEXT state to be allocated */
2357   trie->statecount = next_alloc;
2358   trie->states = (reg_trie_state *)
2359    PerlMemShared_realloc( trie->states,
2360         next_alloc
2361         * sizeof(reg_trie_state) );
2362
2363   /* and now dump it out before we compress it */
2364   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2365               revcharmap, next_alloc,
2366               depth+1)
2367   );
2368
2369   trie->trans = (reg_trie_trans *)
2370    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2371   {
2372    U32 state;
2373    U32 tp = 0;
2374    U32 zp = 0;
2375
2376
2377    for( state=1 ; state < next_alloc ; state ++ ) {
2378     U32 base=0;
2379
2380     /*
2381     DEBUG_TRIE_COMPILE_MORE_r(
2382      PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2383     );
2384     */
2385
2386     if (trie->states[state].trans.list) {
2387      U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2388      U16 maxid=minid;
2389      U16 idx;
2390
2391      for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2392       const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2393       if ( forid < minid ) {
2394        minid=forid;
2395       } else if ( forid > maxid ) {
2396        maxid=forid;
2397       }
2398      }
2399      if ( transcount < tp + maxid - minid + 1) {
2400       transcount *= 2;
2401       trie->trans = (reg_trie_trans *)
2402        PerlMemShared_realloc( trie->trans,
2403              transcount
2404              * sizeof(reg_trie_trans) );
2405       Zero( trie->trans + (transcount / 2),
2406        transcount / 2,
2407        reg_trie_trans );
2408      }
2409      base = trie->uniquecharcount + tp - minid;
2410      if ( maxid == minid ) {
2411       U32 set = 0;
2412       for ( ; zp < tp ; zp++ ) {
2413        if ( ! trie->trans[ zp ].next ) {
2414         base = trie->uniquecharcount + zp - minid;
2415         trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2416                 1).newstate;
2417         trie->trans[ zp ].check = state;
2418         set = 1;
2419         break;
2420        }
2421       }
2422       if ( !set ) {
2423        trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2424                 1).newstate;
2425        trie->trans[ tp ].check = state;
2426        tp++;
2427        zp = tp;
2428       }
2429      } else {
2430       for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2431        const U32 tid = base
2432           - trie->uniquecharcount
2433           + TRIE_LIST_ITEM( state, idx ).forid;
2434        trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2435                 idx ).newstate;
2436        trie->trans[ tid ].check = state;
2437       }
2438       tp += ( maxid - minid + 1 );
2439      }
2440      Safefree(trie->states[ state ].trans.list);
2441     }
2442     /*
2443     DEBUG_TRIE_COMPILE_MORE_r(
2444      PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2445     );
2446     */
2447     trie->states[ state ].trans.base=base;
2448    }
2449    trie->lasttrans = tp + 1;
2450   }
2451  } else {
2452   /*
2453   Second Pass -- Flat Table Representation.
2454
2455   we dont use the 0 slot of either trans[] or states[] so we add 1 to
2456   each.  We know that we will need Charcount+1 trans at most to store
2457   the data (one row per char at worst case) So we preallocate both
2458   structures assuming worst case.
2459
2460   We then construct the trie using only the .next slots of the entry
2461   structs.
2462
2463   We use the .check field of the first entry of the node temporarily
2464   to make compression both faster and easier by keeping track of how
2465   many non zero fields are in the node.
2466
2467   Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2468   transition.
2469
2470   There are two terms at use here: state as a TRIE_NODEIDX() which is
2471   a number representing the first entry of the node, and state as a
2472   TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2473   and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2474   if there are 2 entrys per node. eg:
2475
2476    A B       A B
2477   1. 2 4    1. 3 7
2478   2. 0 3    3. 0 5
2479   3. 0 0    5. 0 0
2480   4. 0 0    7. 0 0
2481
2482   The table is internally in the right hand, idx form. However as we
2483   also have to deal with the states array which is indexed by nodenum
2484   we have to use TRIE_NODENUM() to convert.
2485
2486   */
2487   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2488    "%*sCompiling trie using table compiler\n",
2489    (int)depth * 2 + 2, ""));
2490
2491   trie->trans = (reg_trie_trans *)
2492    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2493         * trie->uniquecharcount + 1,
2494         sizeof(reg_trie_trans) );
2495   trie->states = (reg_trie_state *)
2496    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2497         sizeof(reg_trie_state) );
2498   next_alloc = trie->uniquecharcount + 1;
2499
2500
2501   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2502
2503    regnode *noper   = NEXTOPER( cur );
2504    const U8 *uc     = (U8*)STRING( noper );
2505    const U8 *e      = uc + STR_LEN( noper );
2506
2507    U32 state        = 1;         /* required init */
2508
2509    U16 charid       = 0;         /* sanity init */
2510    U32 accept_state = 0;         /* sanity init */
2511
2512    U32 wordlen      = 0;         /* required init */
2513
2514    if (OP(noper) == NOTHING) {
2515     regnode *noper_next= regnext(noper);
2516     if (noper_next != tail && OP(noper_next) == flags) {
2517      noper = noper_next;
2518      uc= (U8*)STRING(noper);
2519      e= uc + STR_LEN(noper);
2520     }
2521    }
2522
2523    if ( OP(noper) != NOTHING ) {
2524     for ( ; uc < e ; uc += len ) {
2525
2526      TRIE_READ_CHAR;
2527
2528      if ( uvc < 256 ) {
2529       charid = trie->charmap[ uvc ];
2530      } else {
2531       SV* const * const svpp = hv_fetch( widecharmap,
2532               (char*)&uvc,
2533               sizeof( UV ),
2534               0);
2535       charid = svpp ? (U16)SvIV(*svpp) : 0;
2536      }
2537      if ( charid ) {
2538       charid--;
2539       if ( !trie->trans[ state + charid ].next ) {
2540        trie->trans[ state + charid ].next = next_alloc;
2541        trie->trans[ state ].check++;
2542        prev_states[TRIE_NODENUM(next_alloc)]
2543          = TRIE_NODENUM(state);
2544        next_alloc += trie->uniquecharcount;
2545       }
2546       state = trie->trans[ state + charid ].next;
2547      } else {
2548       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2549      }
2550      /* charid is now 0 if we dont know the char read, or
2551      * nonzero if we do */
2552     }
2553    }
2554    accept_state = TRIE_NODENUM( state );
2555    TRIE_HANDLE_WORD(accept_state);
2556
2557   } /* end second pass */
2558
2559   /* and now dump it out before we compress it */
2560   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2561               revcharmap,
2562               next_alloc, depth+1));
2563
2564   {
2565   /*
2566   * Inplace compress the table.*
2567
2568   For sparse data sets the table constructed by the trie algorithm will
2569   be mostly 0/FAIL transitions or to put it another way mostly empty.
2570   (Note that leaf nodes will not contain any transitions.)
2571
2572   This algorithm compresses the tables by eliminating most such
2573   transitions, at the cost of a modest bit of extra work during lookup:
2574
2575   - Each states[] entry contains a .base field which indicates the
2576   index in the state[] array wheres its transition data is stored.
2577
2578   - If .base is 0 there are no valid transitions from that node.
2579
2580   - If .base is nonzero then charid is added to it to find an entry in
2581   the trans array.
2582
2583   -If trans[states[state].base+charid].check!=state then the
2584   transition is taken to be a 0/Fail transition. Thus if there are fail
2585   transitions at the front of the node then the .base offset will point
2586   somewhere inside the previous nodes data (or maybe even into a node
2587   even earlier), but the .check field determines if the transition is
2588   valid.
2589
2590   XXX - wrong maybe?
2591   The following process inplace converts the table to the compressed
2592   table: We first do not compress the root node 1,and mark all its
2593   .check pointers as 1 and set its .base pointer as 1 as well. This
2594   allows us to do a DFA construction from the compressed table later,
2595   and ensures that any .base pointers we calculate later are greater
2596   than 0.
2597
2598   - We set 'pos' to indicate the first entry of the second node.
2599
2600   - We then iterate over the columns of the node, finding the first and
2601   last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2602   and set the .check pointers accordingly, and advance pos
2603   appropriately and repreat for the next node. Note that when we copy
2604   the next pointers we have to convert them from the original
2605   NODEIDX form to NODENUM form as the former is not valid post
2606   compression.
2607
2608   - If a node has no transitions used we mark its base as 0 and do not
2609   advance the pos pointer.
2610
2611   - If a node only has one transition we use a second pointer into the
2612   structure to fill in allocated fail transitions from other states.
2613   This pointer is independent of the main pointer and scans forward
2614   looking for null transitions that are allocated to a state. When it
2615   finds one it writes the single transition into the "hole".  If the
2616   pointer doesnt find one the single transition is appended as normal.
2617
2618   - Once compressed we can Renew/realloc the structures to release the
2619   excess space.
2620
2621   See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2622   specifically Fig 3.47 and the associated pseudocode.
2623
2624   demq
2625   */
2626   const U32 laststate = TRIE_NODENUM( next_alloc );
2627   U32 state, charid;
2628   U32 pos = 0, zp=0;
2629   trie->statecount = laststate;
2630
2631   for ( state = 1 ; state < laststate ; state++ ) {
2632    U8 flag = 0;
2633    const U32 stateidx = TRIE_NODEIDX( state );
2634    const U32 o_used = trie->trans[ stateidx ].check;
2635    U32 used = trie->trans[ stateidx ].check;
2636    trie->trans[ stateidx ].check = 0;
2637
2638    for ( charid = 0;
2639     used && charid < trie->uniquecharcount;
2640     charid++ )
2641    {
2642     if ( flag || trie->trans[ stateidx + charid ].next ) {
2643      if ( trie->trans[ stateidx + charid ].next ) {
2644       if (o_used == 1) {
2645        for ( ; zp < pos ; zp++ ) {
2646         if ( ! trie->trans[ zp ].next ) {
2647          break;
2648         }
2649        }
2650        trie->states[ state ].trans.base
2651              = zp
2652              + trie->uniquecharcount
2653              - charid ;
2654        trie->trans[ zp ].next
2655         = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2656                + charid ].next );
2657        trie->trans[ zp ].check = state;
2658        if ( ++zp > pos ) pos = zp;
2659        break;
2660       }
2661       used--;
2662      }
2663      if ( !flag ) {
2664       flag = 1;
2665       trie->states[ state ].trans.base
2666          = pos + trie->uniquecharcount - charid ;
2667      }
2668      trie->trans[ pos ].next
2669       = SAFE_TRIE_NODENUM(
2670          trie->trans[ stateidx + charid ].next );
2671      trie->trans[ pos ].check = state;
2672      pos++;
2673     }
2674    }
2675   }
2676   trie->lasttrans = pos + 1;
2677   trie->states = (reg_trie_state *)
2678    PerlMemShared_realloc( trie->states, laststate
2679         * sizeof(reg_trie_state) );
2680   DEBUG_TRIE_COMPILE_MORE_r(
2681    PerlIO_printf( Perl_debug_log,
2682     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2683     (int)depth * 2 + 2,"",
2684     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2685      + 1 ),
2686     (IV)next_alloc,
2687     (IV)pos,
2688     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2689    );
2690
2691   } /* end table compress */
2692  }
2693  DEBUG_TRIE_COMPILE_MORE_r(
2694    PerlIO_printf(Perl_debug_log,
2695     "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2696     (int)depth * 2 + 2, "",
2697     (UV)trie->statecount,
2698     (UV)trie->lasttrans)
2699  );
2700  /* resize the trans array to remove unused space */
2701  trie->trans = (reg_trie_trans *)
2702   PerlMemShared_realloc( trie->trans, trie->lasttrans
2703        * sizeof(reg_trie_trans) );
2704
2705  {   /* Modify the program and insert the new TRIE node */
2706   U8 nodetype =(U8)(flags & 0xFF);
2707   char *str=NULL;
2708
2709 #ifdef DEBUGGING
2710   regnode *optimize = NULL;
2711 #ifdef RE_TRACK_PATTERN_OFFSETS
2712
2713   U32 mjd_offset = 0;
2714   U32 mjd_nodelen = 0;
2715 #endif /* RE_TRACK_PATTERN_OFFSETS */
2716 #endif /* DEBUGGING */
2717   /*
2718   This means we convert either the first branch or the first Exact,
2719   depending on whether the thing following (in 'last') is a branch
2720   or not and whther first is the startbranch (ie is it a sub part of
2721   the alternation or is it the whole thing.)
2722   Assuming its a sub part we convert the EXACT otherwise we convert
2723   the whole branch sequence, including the first.
2724   */
2725   /* Find the node we are going to overwrite */
2726   if ( first != startbranch || OP( last ) == BRANCH ) {
2727    /* branch sub-chain */
2728    NEXT_OFF( first ) = (U16)(last - first);
2729 #ifdef RE_TRACK_PATTERN_OFFSETS
2730    DEBUG_r({
2731     mjd_offset= Node_Offset((convert));
2732     mjd_nodelen= Node_Length((convert));
2733    });
2734 #endif
2735    /* whole branch chain */
2736   }
2737 #ifdef RE_TRACK_PATTERN_OFFSETS
2738   else {
2739    DEBUG_r({
2740     const  regnode *nop = NEXTOPER( convert );
2741     mjd_offset= Node_Offset((nop));
2742     mjd_nodelen= Node_Length((nop));
2743    });
2744   }
2745   DEBUG_OPTIMISE_r(
2746    PerlIO_printf(Perl_debug_log,
2747     "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2748     (int)depth * 2 + 2, "",
2749     (UV)mjd_offset, (UV)mjd_nodelen)
2750   );
2751 #endif
2752   /* But first we check to see if there is a common prefix we can
2753   split out as an EXACT and put in front of the TRIE node.  */
2754   trie->startstate= 1;
2755   if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2756    U32 state;
2757    for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2758     U32 ofs = 0;
2759     I32 idx = -1;
2760     U32 count = 0;
2761     const U32 base = trie->states[ state ].trans.base;
2762
2763     if ( trie->states[state].wordnum )
2764       count = 1;
2765
2766     for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2767      if ( ( base + ofs >= trie->uniquecharcount ) &&
2768       ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2769       trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2770      {
2771       if ( ++count > 1 ) {
2772        SV **tmp = av_fetch( revcharmap, ofs, 0);
2773        const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2774        if ( state == 1 ) break;
2775        if ( count == 2 ) {
2776         Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2777         DEBUG_OPTIMISE_r(
2778          PerlIO_printf(Perl_debug_log,
2779           "%*sNew Start State=%"UVuf" Class: [",
2780           (int)depth * 2 + 2, "",
2781           (UV)state));
2782         if (idx >= 0) {
2783          SV ** const tmp = av_fetch( revcharmap, idx, 0);
2784          const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2785
2786          TRIE_BITMAP_SET(trie,*ch);
2787          if ( folder )
2788           TRIE_BITMAP_SET(trie, folder[ *ch ]);
2789          DEBUG_OPTIMISE_r(
2790           PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2791          );
2792         }
2793        }
2794        TRIE_BITMAP_SET(trie,*ch);
2795        if ( folder )
2796         TRIE_BITMAP_SET(trie,folder[ *ch ]);
2797        DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2798       }
2799       idx = ofs;
2800      }
2801     }
2802     if ( count == 1 ) {
2803      SV **tmp = av_fetch( revcharmap, idx, 0);
2804      STRLEN len;
2805      char *ch = SvPV( *tmp, len );
2806      DEBUG_OPTIMISE_r({
2807       SV *sv=sv_newmortal();
2808       PerlIO_printf( Perl_debug_log,
2809        "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2810        (int)depth * 2 + 2, "",
2811        (UV)state, (UV)idx,
2812        pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2813         PL_colors[0], PL_colors[1],
2814         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2815         PERL_PV_ESCAPE_FIRSTCHAR
2816        )
2817       );
2818      });
2819      if ( state==1 ) {
2820       OP( convert ) = nodetype;
2821       str=STRING(convert);
2822       STR_LEN(convert)=0;
2823      }
2824      STR_LEN(convert) += len;
2825      while (len--)
2826       *str++ = *ch++;
2827     } else {
2828 #ifdef DEBUGGING
2829      if (state>1)
2830       DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2831 #endif
2832      break;
2833     }
2834    }
2835    trie->prefixlen = (state-1);
2836    if (str) {
2837     regnode *n = convert+NODE_SZ_STR(convert);
2838     NEXT_OFF(convert) = NODE_SZ_STR(convert);
2839     trie->startstate = state;
2840     trie->minlen -= (state - 1);
2841     trie->maxlen -= (state - 1);
2842 #ifdef DEBUGGING
2843    /* At least the UNICOS C compiler choked on this
2844     * being argument to DEBUG_r(), so let's just have
2845     * it right here. */
2846    if (
2847 #ifdef PERL_EXT_RE_BUILD
2848     1
2849 #else
2850     DEBUG_r_TEST
2851 #endif
2852     ) {
2853     regnode *fix = convert;
2854     U32 word = trie->wordcount;
2855     mjd_nodelen++;
2856     Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2857     while( ++fix < n ) {
2858      Set_Node_Offset_Length(fix, 0, 0);
2859     }
2860     while (word--) {
2861      SV ** const tmp = av_fetch( trie_words, word, 0 );
2862      if (tmp) {
2863       if ( STR_LEN(convert) <= SvCUR(*tmp) )
2864        sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2865       else
2866        sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2867      }
2868     }
2869    }
2870 #endif
2871     if (trie->maxlen) {
2872      convert = n;
2873     } else {
2874      NEXT_OFF(convert) = (U16)(tail - convert);
2875      DEBUG_r(optimize= n);
2876     }
2877    }
2878   }
2879   if (!jumper)
2880    jumper = last;
2881   if ( trie->maxlen ) {
2882    NEXT_OFF( convert ) = (U16)(tail - convert);
2883    ARG_SET( convert, data_slot );
2884    /* Store the offset to the first unabsorbed branch in
2885    jump[0], which is otherwise unused by the jump logic.
2886    We use this when dumping a trie and during optimisation. */
2887    if (trie->jump)
2888     trie->jump[0] = (U16)(nextbranch - convert);
2889
2890    /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2891    *   and there is a bitmap
2892    *   and the first "jump target" node we found leaves enough room
2893    * then convert the TRIE node into a TRIEC node, with the bitmap
2894    * embedded inline in the opcode - this is hypothetically faster.
2895    */
2896    if ( !trie->states[trie->startstate].wordnum
2897     && trie->bitmap
2898     && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2899    {
2900     OP( convert ) = TRIEC;
2901     Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2902     PerlMemShared_free(trie->bitmap);
2903     trie->bitmap= NULL;
2904    } else
2905     OP( convert ) = TRIE;
2906
2907    /* store the type in the flags */
2908    convert->flags = nodetype;
2909    DEBUG_r({
2910    optimize = convert
2911      + NODE_STEP_REGNODE
2912      + regarglen[ OP( convert ) ];
2913    });
2914    /* XXX We really should free up the resource in trie now,
2915     as we won't use them - (which resources?) dmq */
2916   }
2917   /* needed for dumping*/
2918   DEBUG_r(if (optimize) {
2919    regnode *opt = convert;
2920
2921    while ( ++opt < optimize) {
2922     Set_Node_Offset_Length(opt,0,0);
2923    }
2924    /*
2925     Try to clean up some of the debris left after the
2926     optimisation.
2927    */
2928    while( optimize < jumper ) {
2929     mjd_nodelen += Node_Length((optimize));
2930     OP( optimize ) = OPTIMIZED;
2931     Set_Node_Offset_Length(optimize,0,0);
2932     optimize++;
2933    }
2934    Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2935   });
2936  } /* end node insert */
2937  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2938
2939  /*  Finish populating the prev field of the wordinfo array.  Walk back
2940  *  from each accept state until we find another accept state, and if
2941  *  so, point the first word's .prev field at the second word. If the
2942  *  second already has a .prev field set, stop now. This will be the
2943  *  case either if we've already processed that word's accept state,
2944  *  or that state had multiple words, and the overspill words were
2945  *  already linked up earlier.
2946  */
2947  {
2948   U16 word;
2949   U32 state;
2950   U16 prev;
2951
2952   for (word=1; word <= trie->wordcount; word++) {
2953    prev = 0;
2954    if (trie->wordinfo[word].prev)
2955     continue;
2956    state = trie->wordinfo[word].accept;
2957    while (state) {
2958     state = prev_states[state];
2959     if (!state)
2960      break;
2961     prev = trie->states[state].wordnum;
2962     if (prev)
2963      break;
2964    }
2965    trie->wordinfo[word].prev = prev;
2966   }
2967   Safefree(prev_states);
2968  }
2969
2970
2971  /* and now dump out the compressed format */
2972  DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2973
2974  RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2975 #ifdef DEBUGGING
2976  RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2977  RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2978 #else
2979  SvREFCNT_dec_NN(revcharmap);
2980 #endif
2981  return trie->jump
2982   ? MADE_JUMP_TRIE
2983   : trie->startstate>1
2984    ? MADE_EXACT_TRIE
2985    : MADE_TRIE;
2986 }
2987
2988 STATIC void
2989 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2990 {
2991 /* The Trie is constructed and compressed now so we can build a fail array if
2992  * it's needed
2993
2994    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2995    3.32 in the
2996    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2997    Ullman 1985/88
2998    ISBN 0-201-10088-6
2999
3000    We find the fail state for each state in the trie, this state is the longest
3001    proper suffix of the current state's 'word' that is also a proper prefix of
3002    another word in our trie. State 1 represents the word '' and is thus the
3003    default fail state. This allows the DFA not to have to restart after its
3004    tried and failed a word at a given point, it simply continues as though it
3005    had been matching the other word in the first place.
3006    Consider
3007  'abcdgu'=~/abcdefg|cdgu/
3008    When we get to 'd' we are still matching the first word, we would encounter
3009    'g' which would fail, which would bring us to the state representing 'd' in
3010    the second word where we would try 'g' and succeed, proceeding to match
3011    'cdgu'.
3012  */
3013  /* add a fail transition */
3014  const U32 trie_offset = ARG(source);
3015  reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3016  U32 *q;
3017  const U32 ucharcount = trie->uniquecharcount;
3018  const U32 numstates = trie->statecount;
3019  const U32 ubound = trie->lasttrans + ucharcount;
3020  U32 q_read = 0;
3021  U32 q_write = 0;
3022  U32 charid;
3023  U32 base = trie->states[ 1 ].trans.base;
3024  U32 *fail;
3025  reg_ac_data *aho;
3026  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3027  GET_RE_DEBUG_FLAGS_DECL;
3028
3029  PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
3030 #ifndef DEBUGGING
3031  PERL_UNUSED_ARG(depth);
3032 #endif
3033
3034
3035  ARG_SET( stclass, data_slot );
3036  aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3037  RExC_rxi->data->data[ data_slot ] = (void*)aho;
3038  aho->trie=trie_offset;
3039  aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3040  Copy( trie->states, aho->states, numstates, reg_trie_state );
3041  Newxz( q, numstates, U32);
3042  aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3043  aho->refcount = 1;
3044  fail = aho->fail;
3045  /* initialize fail[0..1] to be 1 so that we always have
3046  a valid final fail state */
3047  fail[ 0 ] = fail[ 1 ] = 1;
3048
3049  for ( charid = 0; charid < ucharcount ; charid++ ) {
3050   const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3051   if ( newstate ) {
3052    q[ q_write ] = newstate;
3053    /* set to point at the root */
3054    fail[ q[ q_write++ ] ]=1;
3055   }
3056  }
3057  while ( q_read < q_write) {
3058   const U32 cur = q[ q_read++ % numstates ];
3059   base = trie->states[ cur ].trans.base;
3060
3061   for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3062    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3063    if (ch_state) {
3064     U32 fail_state = cur;
3065     U32 fail_base;
3066     do {
3067      fail_state = fail[ fail_state ];
3068      fail_base = aho->states[ fail_state ].trans.base;
3069     } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3070
3071     fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3072     fail[ ch_state ] = fail_state;
3073     if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3074     {
3075       aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3076     }
3077     q[ q_write++ % numstates] = ch_state;
3078    }
3079   }
3080  }
3081  /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3082  when we fail in state 1, this allows us to use the
3083  charclass scan to find a valid start char. This is based on the principle
3084  that theres a good chance the string being searched contains lots of stuff
3085  that cant be a start char.
3086  */
3087  fail[ 0 ] = fail[ 1 ] = 0;
3088  DEBUG_TRIE_COMPILE_r({
3089   PerlIO_printf(Perl_debug_log,
3090      "%*sStclass Failtable (%"UVuf" states): 0",
3091      (int)(depth * 2), "", (UV)numstates
3092   );
3093   for( q_read=1; q_read<numstates; q_read++ ) {
3094    PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3095   }
3096   PerlIO_printf(Perl_debug_log, "\n");
3097  });
3098  Safefree(q);
3099  /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3100 }
3101
3102
3103 #define DEBUG_PEEP(str,scan,depth) \
3104  DEBUG_OPTIMISE_r({if (scan){ \
3105  SV * const mysv=sv_newmortal(); \
3106  regnode *Next = regnext(scan); \
3107  regprop(RExC_rx, mysv, scan, NULL); \
3108  PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3109  (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3110  Next ? (REG_NODE_NUM(Next)) : 0 ); \
3111    }});
3112
3113
3114 /* The below joins as many adjacent EXACTish nodes as possible into a single
3115  * one.  The regop may be changed if the node(s) contain certain sequences that
3116  * require special handling.  The joining is only done if:
3117  * 1) there is room in the current conglomerated node to entirely contain the
3118  *    next one.
3119  * 2) they are the exact same node type
3120  *
3121  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3122  * these get optimized out
3123  *
3124  * If a node is to match under /i (folded), the number of characters it matches
3125  * can be different than its character length if it contains a multi-character
3126  * fold.  *min_subtract is set to the total delta number of characters of the
3127  * input nodes.
3128  *
3129  * And *unfolded_multi_char is set to indicate whether or not the node contains
3130  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3131  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3132  * SMALL LETTER SHARP S, as only if the target string being matched against
3133  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3134  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3135  * whose components are all above the Latin1 range are not run-time locale
3136  * dependent, and have already been folded by the time this function is
3137  * called.)
3138  *
3139  * This is as good a place as any to discuss the design of handling these
3140  * multi-character fold sequences.  It's been wrong in Perl for a very long
3141  * time.  There are three code points in Unicode whose multi-character folds
3142  * were long ago discovered to mess things up.  The previous designs for
3143  * dealing with these involved assigning a special node for them.  This
3144  * approach doesn't always work, as evidenced by this example:
3145  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3146  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3147  * would match just the \xDF, it won't be able to handle the case where a
3148  * successful match would have to cross the node's boundary.  The new approach
3149  * that hopefully generally solves the problem generates an EXACTFU_SS node
3150  * that is "sss" in this case.
3151  *
3152  * It turns out that there are problems with all multi-character folds, and not
3153  * just these three.  Now the code is general, for all such cases.  The
3154  * approach taken is:
3155  * 1)   This routine examines each EXACTFish node that could contain multi-
3156  *      character folded sequences.  Since a single character can fold into
3157  *      such a sequence, the minimum match length for this node is less than
3158  *      the number of characters in the node.  This routine returns in
3159  *      *min_subtract how many characters to subtract from the the actual
3160  *      length of the string to get a real minimum match length; it is 0 if
3161  *      there are no multi-char foldeds.  This delta is used by the caller to
3162  *      adjust the min length of the match, and the delta between min and max,
3163  *      so that the optimizer doesn't reject these possibilities based on size
3164  *      constraints.
3165  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3166  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3167  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3168  *      there is a possible fold length change.  That means that a regular
3169  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3170  *      with length changes, and so can be processed faster.  regexec.c takes
3171  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3172  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3173  *      known until runtime).  This saves effort in regex matching.  However,
3174  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3175  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3176  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3177  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3178  *      possibilities for the non-UTF8 patterns are quite simple, except for
3179  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3180  *      members of a fold-pair, and arrays are set up for all of them so that
3181  *      the other member of the pair can be found quickly.  Code elsewhere in
3182  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3183  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3184  *      described in the next item.
3185  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3186  *      validity of the fold won't be known until runtime, and so must remain
3187  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3188  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3189  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3190  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3191  *      The reason this is a problem is that the optimizer part of regexec.c
3192  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3193  *      that a character in the pattern corresponds to at most a single
3194  *      character in the target string.  (And I do mean character, and not byte
3195  *      here, unlike other parts of the documentation that have never been
3196  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3197  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3198  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3199  *      nodes, violate the assumption, and they are the only instances where it
3200  *      is violated.  I'm reluctant to try to change the assumption, as the
3201  *      code involved is impenetrable to me (khw), so instead the code here
3202  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3203  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3204  *      boolean indicating whether or not the node contains such a fold.  When
3205  *      it is true, the caller sets a flag that later causes the optimizer in
3206  *      this file to not set values for the floating and fixed string lengths,
3207  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3208  *      assumption.  Thus, there is no optimization based on string lengths for
3209  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3210  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3211  *      assumption is wrong only in these cases is that all other non-UTF-8
3212  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3213  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3214  *      EXACTF nodes because we don't know at compile time if it actually
3215  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3216  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3217  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3218  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3219  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3220  *      string would require the pattern to be forced into UTF-8, the overhead
3221  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3222  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3223  *      locale.)
3224  *
3225  *      Similarly, the code that generates tries doesn't currently handle
3226  *      not-already-folded multi-char folds, and it looks like a pain to change
3227  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3228  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3229  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3230  *      using /iaa matching will be doing so almost entirely with ASCII
3231  *      strings, so this should rarely be encountered in practice */
3232
3233 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3234  if (PL_regkind[OP(scan)] == EXACT) \
3235   join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3236
3237 STATIC U32
3238 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3239     UV *min_subtract, bool *unfolded_multi_char,
3240     U32 flags,regnode *val, U32 depth)
3241 {
3242  /* Merge several consecutive EXACTish nodes into one. */
3243  regnode *n = regnext(scan);
3244  U32 stringok = 1;
3245  regnode *next = scan + NODE_SZ_STR(scan);
3246  U32 merged = 0;
3247  U32 stopnow = 0;
3248 #ifdef DEBUGGING
3249  regnode *stop = scan;
3250  GET_RE_DEBUG_FLAGS_DECL;
3251 #else
3252  PERL_UNUSED_ARG(depth);
3253 #endif
3254
3255  PERL_ARGS_ASSERT_JOIN_EXACT;
3256 #ifndef EXPERIMENTAL_INPLACESCAN
3257  PERL_UNUSED_ARG(flags);
3258  PERL_UNUSED_ARG(val);
3259 #endif
3260  DEBUG_PEEP("join",scan,depth);
3261
3262  /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3263  * EXACT ones that are mergeable to the current one. */
3264  while (n
3265   && (PL_regkind[OP(n)] == NOTHING
3266    || (stringok && OP(n) == OP(scan)))
3267   && NEXT_OFF(n)
3268   && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3269  {
3270
3271   if (OP(n) == TAIL || n > next)
3272    stringok = 0;
3273   if (PL_regkind[OP(n)] == NOTHING) {
3274    DEBUG_PEEP("skip:",n,depth);
3275    NEXT_OFF(scan) += NEXT_OFF(n);
3276    next = n + NODE_STEP_REGNODE;
3277 #ifdef DEBUGGING
3278    if (stringok)
3279     stop = n;
3280 #endif
3281    n = regnext(n);
3282   }
3283   else if (stringok) {
3284    const unsigned int oldl = STR_LEN(scan);
3285    regnode * const nnext = regnext(n);
3286
3287    /* XXX I (khw) kind of doubt that this works on platforms (should
3288    * Perl ever run on one) where U8_MAX is above 255 because of lots
3289    * of other assumptions */
3290    /* Don't join if the sum can't fit into a single node */
3291    if (oldl + STR_LEN(n) > U8_MAX)
3292     break;
3293
3294    DEBUG_PEEP("merg",n,depth);
3295    merged++;
3296
3297    NEXT_OFF(scan) += NEXT_OFF(n);
3298    STR_LEN(scan) += STR_LEN(n);
3299    next = n + NODE_SZ_STR(n);
3300    /* Now we can overwrite *n : */
3301    Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3302 #ifdef DEBUGGING
3303    stop = next - 1;
3304 #endif
3305    n = nnext;
3306    if (stopnow) break;
3307   }
3308
3309 #ifdef EXPERIMENTAL_INPLACESCAN
3310   if (flags && !NEXT_OFF(n)) {
3311    DEBUG_PEEP("atch", val, depth);
3312    if (reg_off_by_arg[OP(n)]) {
3313     ARG_SET(n, val - n);
3314    }
3315    else {
3316     NEXT_OFF(n) = val - n;
3317    }
3318    stopnow = 1;
3319   }
3320 #endif
3321  }
3322
3323  *min_subtract = 0;
3324  *unfolded_multi_char = FALSE;
3325
3326  /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3327  * can now analyze for sequences of problematic code points.  (Prior to
3328  * this final joining, sequences could have been split over boundaries, and
3329  * hence missed).  The sequences only happen in folding, hence for any
3330  * non-EXACT EXACTish node */
3331  if (OP(scan) != EXACT) {
3332   U8* s0 = (U8*) STRING(scan);
3333   U8* s = s0;
3334   U8* s_end = s0 + STR_LEN(scan);
3335
3336   int total_count_delta = 0;  /* Total delta number of characters that
3337          multi-char folds expand to */
3338
3339   /* One pass is made over the node's string looking for all the
3340   * possibilities.  To avoid some tests in the loop, there are two main
3341   * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3342   * non-UTF-8 */
3343   if (UTF) {
3344    U8* folded = NULL;
3345
3346    if (OP(scan) == EXACTFL) {
3347     U8 *d;
3348
3349     /* An EXACTFL node would already have been changed to another
3350     * node type unless there is at least one character in it that
3351     * is problematic; likely a character whose fold definition
3352     * won't be known until runtime, and so has yet to be folded.
3353     * For all but the UTF-8 locale, folds are 1-1 in length, but
3354     * to handle the UTF-8 case, we need to create a temporary
3355     * folded copy using UTF-8 locale rules in order to analyze it.
3356     * This is because our macros that look to see if a sequence is
3357     * a multi-char fold assume everything is folded (otherwise the
3358     * tests in those macros would be too complicated and slow).
3359     * Note that here, the non-problematic folds will have already
3360     * been done, so we can just copy such characters.  We actually
3361     * don't completely fold the EXACTFL string.  We skip the
3362     * unfolded multi-char folds, as that would just create work
3363     * below to figure out the size they already are */
3364
3365     Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3366     d = folded;
3367     while (s < s_end) {
3368      STRLEN s_len = UTF8SKIP(s);
3369      if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3370       Copy(s, d, s_len, U8);
3371       d += s_len;
3372      }
3373      else if (is_FOLDS_TO_MULTI_utf8(s)) {
3374       *unfolded_multi_char = TRUE;
3375       Copy(s, d, s_len, U8);
3376       d += s_len;
3377      }
3378      else if (isASCII(*s)) {
3379       *(d++) = toFOLD(*s);
3380      }
3381      else {
3382       STRLEN len;
3383       _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3384       d += len;
3385      }
3386      s += s_len;
3387     }
3388
3389     /* Point the remainder of the routine to look at our temporary
3390     * folded copy */
3391     s = folded;
3392     s_end = d;
3393    } /* End of creating folded copy of EXACTFL string */
3394
3395    /* Examine the string for a multi-character fold sequence.  UTF-8
3396    * patterns have all characters pre-folded by the time this code is
3397    * executed */
3398    while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3399          length sequence we are looking for is 2 */
3400    {
3401     int count = 0;  /* How many characters in a multi-char fold */
3402     int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3403     if (! len) {    /* Not a multi-char fold: get next char */
3404      s += UTF8SKIP(s);
3405      continue;
3406     }
3407
3408     /* Nodes with 'ss' require special handling, except for
3409     * EXACTFA-ish for which there is no multi-char fold to this */
3410     if (len == 2 && *s == 's' && *(s+1) == 's'
3411      && OP(scan) != EXACTFA
3412      && OP(scan) != EXACTFA_NO_TRIE)
3413     {
3414      count = 2;
3415      if (OP(scan) != EXACTFL) {
3416       OP(scan) = EXACTFU_SS;
3417      }
3418      s += 2;
3419     }
3420     else { /* Here is a generic multi-char fold. */
3421      U8* multi_end  = s + len;
3422
3423      /* Count how many characters in it.  In the case of /aa, no
3424      * folds which contain ASCII code points are allowed, so
3425      * check for those, and skip if found. */
3426      if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3427       count = utf8_length(s, multi_end);
3428       s = multi_end;
3429      }
3430      else {
3431       while (s < multi_end) {
3432        if (isASCII(*s)) {
3433         s++;
3434         goto next_iteration;
3435        }
3436        else {
3437         s += UTF8SKIP(s);
3438        }
3439        count++;
3440       }
3441      }
3442     }
3443
3444     /* The delta is how long the sequence is minus 1 (1 is how long
3445     * the character that folds to the sequence is) */
3446     total_count_delta += count - 1;
3447    next_iteration: ;
3448    }
3449
3450    /* We created a temporary folded copy of the string in EXACTFL
3451    * nodes.  Therefore we need to be sure it doesn't go below zero,
3452    * as the real string could be shorter */
3453    if (OP(scan) == EXACTFL) {
3454     int total_chars = utf8_length((U8*) STRING(scan),
3455           (U8*) STRING(scan) + STR_LEN(scan));
3456     if (total_count_delta > total_chars) {
3457      total_count_delta = total_chars;
3458     }
3459    }
3460
3461    *min_subtract += total_count_delta;
3462    Safefree(folded);
3463   }
3464   else if (OP(scan) == EXACTFA) {
3465
3466    /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3467    * fold to the ASCII range (and there are no existing ones in the
3468    * upper latin1 range).  But, as outlined in the comments preceding
3469    * this function, we need to flag any occurrences of the sharp s.
3470    * This character forbids trie formation (because of added
3471    * complexity) */
3472    while (s < s_end) {
3473     if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3474      OP(scan) = EXACTFA_NO_TRIE;
3475      *unfolded_multi_char = TRUE;
3476      break;
3477     }
3478     s++;
3479     continue;
3480    }
3481   }
3482   else {
3483
3484    /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3485    * folds that are all Latin1.  As explained in the comments
3486    * preceding this function, we look also for the sharp s in EXACTF
3487    * and EXACTFL nodes; it can be in the final position.  Otherwise
3488    * we can stop looking 1 byte earlier because have to find at least
3489    * two characters for a multi-fold */
3490    const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3491        ? s_end
3492        : s_end -1;
3493
3494    while (s < upper) {
3495     int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3496     if (! len) {    /* Not a multi-char fold. */
3497      if (*s == LATIN_SMALL_LETTER_SHARP_S
3498       && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3499      {
3500       *unfolded_multi_char = TRUE;
3501      }
3502      s++;
3503      continue;
3504     }
3505
3506     if (len == 2
3507      && isARG2_lower_or_UPPER_ARG1('s', *s)
3508      && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3509     {
3510
3511      /* EXACTF nodes need to know that the minimum length
3512      * changed so that a sharp s in the string can match this
3513      * ss in the pattern, but they remain EXACTF nodes, as they
3514      * won't match this unless the target string is is UTF-8,
3515      * which we don't know until runtime.  EXACTFL nodes can't
3516      * transform into EXACTFU nodes */
3517      if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3518       OP(scan) = EXACTFU_SS;
3519      }
3520     }
3521
3522     *min_subtract += len - 1;
3523     s += len;
3524    }
3525   }
3526  }
3527
3528 #ifdef DEBUGGING
3529  /* Allow dumping but overwriting the collection of skipped
3530  * ops and/or strings with fake optimized ops */
3531  n = scan + NODE_SZ_STR(scan);
3532  while (n <= stop) {
3533   OP(n) = OPTIMIZED;
3534   FLAGS(n) = 0;
3535   NEXT_OFF(n) = 0;
3536   n++;
3537  }
3538 #endif
3539  DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3540  return stopnow;
3541 }
3542
3543 /* REx optimizer.  Converts nodes into quicker variants "in place".
3544    Finds fixed substrings.  */
3545
3546 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3547    to the position after last scanned or to NULL. */
3548
3549 #define INIT_AND_WITHP \
3550  assert(!and_withp); \
3551  Newx(and_withp,1, regnode_ssc); \
3552  SAVEFREEPV(and_withp)
3553
3554 /* this is a chain of data about sub patterns we are processing that
3555    need to be handled separately/specially in study_chunk. Its so
3556    we can simulate recursion without losing state.  */
3557 struct scan_frame;
3558 typedef struct scan_frame {
3559  regnode *last;  /* last node to process in this frame */
3560  regnode *next;  /* next node to process when last is reached */
3561  struct scan_frame *prev; /*previous frame*/
3562  U32 prev_recursed_depth;
3563  I32 stop; /* what stopparen do we use */
3564 } scan_frame;
3565
3566
3567 STATIC SSize_t
3568 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3569       SSize_t *minlenp, SSize_t *deltap,
3570       regnode *last,
3571       scan_data_t *data,
3572       I32 stopparen,
3573       U32 recursed_depth,
3574       regnode_ssc *and_withp,
3575       U32 flags, U32 depth)
3576       /* scanp: Start here (read-write). */
3577       /* deltap: Write maxlen-minlen here. */
3578       /* last: Stop before this one. */
3579       /* data: string data about the pattern */
3580       /* stopparen: treat close N as END */
3581       /* recursed: which subroutines have we recursed into */
3582       /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3583 {
3584  dVAR;
3585  /* There must be at least this number of characters to match */
3586  SSize_t min = 0;
3587  I32 pars = 0, code;
3588  regnode *scan = *scanp, *next;
3589  SSize_t delta = 0;
3590  int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3591  int is_inf_internal = 0;  /* The studied chunk is infinite */
3592  I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3593  scan_data_t data_fake;
3594  SV *re_trie_maxbuff = NULL;
3595  regnode *first_non_open = scan;
3596  SSize_t stopmin = SSize_t_MAX;
3597  scan_frame *frame = NULL;
3598  GET_RE_DEBUG_FLAGS_DECL;
3599
3600  PERL_ARGS_ASSERT_STUDY_CHUNK;
3601
3602 #ifdef DEBUGGING
3603  StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3604 #endif
3605  if ( depth == 0 ) {
3606   while (first_non_open && OP(first_non_open) == OPEN)
3607    first_non_open=regnext(first_non_open);
3608  }
3609
3610
3611   fake_study_recurse:
3612  while ( scan && OP(scan) != END && scan < last ){
3613   UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3614         node length to get a real minimum (because
3615         the folded version may be shorter) */
3616   bool unfolded_multi_char = FALSE;
3617   /* Peephole optimizer: */
3618   DEBUG_OPTIMISE_MORE_r(
3619   {
3620    PerlIO_printf(Perl_debug_log,
3621     "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3622     ((int) depth*2), "", (long)stopparen,
3623     (unsigned long)depth, (unsigned long)recursed_depth);
3624    if (recursed_depth) {
3625     U32 i;
3626     U32 j;
3627     for ( j = 0 ; j < recursed_depth ; j++ ) {
3628      PerlIO_printf(Perl_debug_log,"[");
3629      for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3630       PerlIO_printf(Perl_debug_log,"%d",
3631        PAREN_TEST(RExC_study_chunk_recursed +
3632          (j * RExC_study_chunk_recursed_bytes), i)
3633        ? 1 : 0
3634       );
3635      PerlIO_printf(Perl_debug_log,"]");
3636     }
3637    }
3638    PerlIO_printf(Perl_debug_log,"\n");
3639   }
3640   );
3641   DEBUG_STUDYDATA("Peep:", data, depth);
3642   DEBUG_PEEP("Peep", scan, depth);
3643
3644
3645   /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3646   * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3647   * by a different invocation of reg() -- Yves
3648   */
3649   JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3650
3651   /* Follow the next-chain of the current node and optimize
3652   away all the NOTHINGs from it.  */
3653   if (OP(scan) != CURLYX) {
3654    const int max = (reg_off_by_arg[OP(scan)]
3655      ? I32_MAX
3656      /* I32 may be smaller than U16 on CRAYs! */
3657      : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3658    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3659    int noff;
3660    regnode *n = scan;
3661
3662    /* Skip NOTHING and LONGJMP. */
3663    while ((n = regnext(n))
3664     && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3665      || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3666     && off + noff < max)
3667     off += noff;
3668    if (reg_off_by_arg[OP(scan)])
3669     ARG(scan) = off;
3670    else
3671     NEXT_OFF(scan) = off;
3672   }
3673
3674
3675
3676   /* The principal pseudo-switch.  Cannot be a switch, since we
3677   look into several different things.  */
3678   if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3679     || OP(scan) == IFTHEN) {
3680    next = regnext(scan);
3681    code = OP(scan);
3682    /* demq: the op(next)==code check is to see if we have
3683    * "branch-branch" AFAICT */
3684
3685    if (OP(next) == code || code == IFTHEN) {
3686     /* NOTE - There is similar code to this block below for
3687     * handling TRIE nodes on a re-study.  If you change stuff here
3688     * check there too. */
3689     SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3690     regnode_ssc accum;
3691     regnode * const startbranch=scan;
3692
3693     if (flags & SCF_DO_SUBSTR) {
3694      /* Cannot merge strings after this. */
3695      scan_commit(pRExC_state, data, minlenp, is_inf);
3696     }
3697
3698     if (flags & SCF_DO_STCLASS)
3699      ssc_init_zero(pRExC_state, &accum);
3700
3701     while (OP(scan) == code) {
3702      SSize_t deltanext, minnext, fake;
3703      I32 f = 0;
3704      regnode_ssc this_class;
3705
3706      num++;
3707      data_fake.flags = 0;
3708      if (data) {
3709       data_fake.whilem_c = data->whilem_c;
3710       data_fake.last_closep = data->last_closep;
3711      }
3712      else
3713       data_fake.last_closep = &fake;
3714
3715      data_fake.pos_delta = delta;
3716      next = regnext(scan);
3717      scan = NEXTOPER(scan);
3718      if (code != BRANCH)
3719       scan = NEXTOPER(scan);
3720      if (flags & SCF_DO_STCLASS) {
3721       ssc_init(pRExC_state, &this_class);
3722       data_fake.start_class = &this_class;
3723       f = SCF_DO_STCLASS_AND;
3724      }
3725      if (flags & SCF_WHILEM_VISITED_POS)
3726       f |= SCF_WHILEM_VISITED_POS;
3727
3728      /* we suppose the run is continuous, last=next...*/
3729      minnext = study_chunk(pRExC_state, &scan, minlenp,
3730          &deltanext, next, &data_fake, stopparen,
3731          recursed_depth, NULL, f,depth+1);
3732      if (min1 > minnext)
3733       min1 = minnext;
3734      if (deltanext == SSize_t_MAX) {
3735       is_inf = is_inf_internal = 1;
3736       max1 = SSize_t_MAX;
3737      } else if (max1 < minnext + deltanext)
3738       max1 = minnext + deltanext;
3739      scan = next;
3740      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3741       pars++;
3742      if (data_fake.flags & SCF_SEEN_ACCEPT) {
3743       if ( stopmin > minnext)
3744        stopmin = min + min1;
3745       flags &= ~SCF_DO_SUBSTR;
3746       if (data)
3747        data->flags |= SCF_SEEN_ACCEPT;
3748      }
3749      if (data) {
3750       if (data_fake.flags & SF_HAS_EVAL)
3751        data->flags |= SF_HAS_EVAL;
3752       data->whilem_c = data_fake.whilem_c;
3753      }
3754      if (flags & SCF_DO_STCLASS)
3755       ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3756     }
3757     if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3758      min1 = 0;
3759     if (flags & SCF_DO_SUBSTR) {
3760      data->pos_min += min1;
3761      if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3762       data->pos_delta = SSize_t_MAX;
3763      else
3764       data->pos_delta += max1 - min1;
3765      if (max1 != min1 || is_inf)
3766       data->longest = &(data->longest_float);
3767     }
3768     min += min1;
3769     if (delta == SSize_t_MAX
3770     || SSize_t_MAX - delta - (max1 - min1) < 0)
3771      delta = SSize_t_MAX;
3772     else
3773      delta += max1 - min1;
3774     if (flags & SCF_DO_STCLASS_OR) {
3775      ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3776      if (min1) {
3777       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3778       flags &= ~SCF_DO_STCLASS;
3779      }
3780     }
3781     else if (flags & SCF_DO_STCLASS_AND) {
3782      if (min1) {
3783       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3784       flags &= ~SCF_DO_STCLASS;
3785      }
3786      else {
3787       /* Switch to OR mode: cache the old value of
3788       * data->start_class */
3789       INIT_AND_WITHP;
3790       StructCopy(data->start_class, and_withp, regnode_ssc);
3791       flags &= ~SCF_DO_STCLASS_AND;
3792       StructCopy(&accum, data->start_class, regnode_ssc);
3793       flags |= SCF_DO_STCLASS_OR;
3794      }
3795     }
3796
3797     if (PERL_ENABLE_TRIE_OPTIMISATION &&
3798       OP( startbranch ) == BRANCH )
3799     {
3800     /* demq.
3801
3802     Assuming this was/is a branch we are dealing with: 'scan'
3803     now points at the item that follows the branch sequence,
3804     whatever it is. We now start at the beginning of the
3805     sequence and look for subsequences of
3806
3807     BRANCH->EXACT=>x1
3808     BRANCH->EXACT=>x2
3809     tail
3810
3811     which would be constructed from a pattern like
3812     /A|LIST|OF|WORDS/
3813
3814     If we can find such a subsequence we need to turn the first
3815     element into a trie and then add the subsequent branch exact
3816     strings to the trie.
3817
3818     We have two cases
3819
3820      1. patterns where the whole set of branches can be
3821       converted.
3822
3823      2. patterns where only a subset can be converted.
3824
3825     In case 1 we can replace the whole set with a single regop
3826     for the trie. In case 2 we need to keep the start and end
3827     branches so
3828
3829      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3830      becomes BRANCH TRIE; BRANCH X;
3831
3832     There is an additional case, that being where there is a
3833     common prefix, which gets split out into an EXACT like node
3834     preceding the TRIE node.
3835
3836     If x(1..n)==tail then we can do a simple trie, if not we make
3837     a "jump" trie, such that when we match the appropriate word
3838     we "jump" to the appropriate tail node. Essentially we turn
3839     a nested if into a case structure of sorts.
3840
3841     */
3842
3843      int made=0;
3844      if (!re_trie_maxbuff) {
3845       re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3846       if (!SvIOK(re_trie_maxbuff))
3847        sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3848      }
3849      if ( SvIV(re_trie_maxbuff)>=0  ) {
3850       regnode *cur;
3851       regnode *first = (regnode *)NULL;
3852       regnode *last = (regnode *)NULL;
3853       regnode *tail = scan;
3854       U8 trietype = 0;
3855       U32 count=0;
3856
3857 #ifdef DEBUGGING
3858       SV * const mysv = sv_newmortal();   /* for dumping */
3859 #endif
3860       /* var tail is used because there may be a TAIL
3861       regop in the way. Ie, the exacts will point to the
3862       thing following the TAIL, but the last branch will
3863       point at the TAIL. So we advance tail. If we
3864       have nested (?:) we may have to move through several
3865       tails.
3866       */
3867
3868       while ( OP( tail ) == TAIL ) {
3869        /* this is the TAIL generated by (?:) */
3870        tail = regnext( tail );
3871       }
3872
3873
3874       DEBUG_TRIE_COMPILE_r({
3875        regprop(RExC_rx, mysv, tail, NULL);
3876        PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3877        (int)depth * 2 + 2, "",
3878        "Looking for TRIE'able sequences. Tail node is: ",
3879        SvPV_nolen_const( mysv )
3880        );
3881       });
3882
3883       /*
3884
3885        Step through the branches
3886         cur represents each branch,
3887         noper is the first thing to be matched as part
3888          of that branch
3889         noper_next is the regnext() of that node.
3890
3891        We normally handle a case like this
3892        /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3893        support building with NOJUMPTRIE, which restricts
3894        the trie logic to structures like /FOO|BAR/.
3895
3896        If noper is a trieable nodetype then the branch is
3897        a possible optimization target. If we are building
3898        under NOJUMPTRIE then we require that noper_next is
3899        the same as scan (our current position in the regex
3900        program).
3901
3902        Once we have two or more consecutive such branches
3903        we can create a trie of the EXACT's contents and
3904        stitch it in place into the program.
3905
3906        If the sequence represents all of the branches in
3907        the alternation we replace the entire thing with a
3908        single TRIE node.
3909
3910        Otherwise when it is a subsequence we need to
3911        stitch it in place and replace only the relevant
3912        branches. This means the first branch has to remain
3913        as it is used by the alternation logic, and its
3914        next pointer, and needs to be repointed at the item
3915        on the branch chain following the last branch we
3916        have optimized away.
3917
3918        This could be either a BRANCH, in which case the
3919        subsequence is internal, or it could be the item
3920        following the branch sequence in which case the
3921        subsequence is at the end (which does not
3922        necessarily mean the first node is the start of the
3923        alternation).
3924
3925        TRIE_TYPE(X) is a define which maps the optype to a
3926        trietype.
3927
3928         optype          |  trietype
3929         ----------------+-----------
3930         NOTHING         | NOTHING
3931         EXACT           | EXACT
3932         EXACTFU         | EXACTFU
3933         EXACTFU_SS      | EXACTFU
3934         EXACTFA         | EXACTFA
3935
3936
3937       */
3938 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3939      ( EXACT == (X) )   ? EXACT :        \
3940      ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
3941      ( EXACTFA == (X) ) ? EXACTFA :        \
3942      0 )
3943
3944       /* dont use tail as the end marker for this traverse */
3945       for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3946        regnode * const noper = NEXTOPER( cur );
3947        U8 noper_type = OP( noper );
3948        U8 noper_trietype = TRIE_TYPE( noper_type );
3949 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3950        regnode * const noper_next = regnext( noper );
3951        U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3952        U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3953 #endif
3954
3955        DEBUG_TRIE_COMPILE_r({
3956         regprop(RExC_rx, mysv, cur, NULL);
3957         PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3958         (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3959
3960         regprop(RExC_rx, mysv, noper, NULL);
3961         PerlIO_printf( Perl_debug_log, " -> %s",
3962          SvPV_nolen_const(mysv));
3963
3964         if ( noper_next ) {
3965         regprop(RExC_rx, mysv, noper_next, NULL);
3966         PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3967          SvPV_nolen_const(mysv));
3968         }
3969         PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3970         REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3971         PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3972         );
3973        });
3974
3975        /* Is noper a trieable nodetype that can be merged
3976        * with the current trie (if there is one)? */
3977        if ( noper_trietype
3978         &&
3979         (
3980           ( noper_trietype == NOTHING)
3981           || ( trietype == NOTHING )
3982           || ( trietype == noper_trietype )
3983         )
3984 #ifdef NOJUMPTRIE
3985         && noper_next == tail
3986 #endif
3987         && count < U16_MAX)
3988        {
3989         /* Handle mergable triable node Either we are
3990         * the first node in a new trieable sequence,
3991         * in which case we do some bookkeeping,
3992         * otherwise we update the end pointer. */
3993         if ( !first ) {
3994          first = cur;
3995          if ( noper_trietype == NOTHING ) {
3996 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3997           regnode * const noper_next = regnext( noper );
3998           U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3999           U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4000 #endif
4001
4002           if ( noper_next_trietype ) {
4003            trietype = noper_next_trietype;
4004           } else if (noper_next_type)  {
4005            /* a NOTHING regop is 1 regop wide.
4006            * We need at least two for a trie
4007            * so we can't merge this in */
4008            first = NULL;
4009           }
4010          } else {
4011           trietype = noper_trietype;
4012          }
4013         } else {
4014          if ( trietype == NOTHING )
4015           trietype = noper_trietype;
4016          last = cur;
4017         }
4018         if (first)
4019          count++;
4020        } /* end handle mergable triable node */
4021        else {
4022         /* handle unmergable node -
4023         * noper may either be a triable node which can
4024         * not be tried together with the current trie,
4025         * or a non triable node */
4026         if ( last ) {
4027          /* If last is set and trietype is not
4028          * NOTHING then we have found at least two
4029          * triable branch sequences in a row of a
4030          * similar trietype so we can turn them
4031          * into a trie. If/when we allow NOTHING to
4032          * start a trie sequence this condition
4033          * will be required, and it isn't expensive
4034          * so we leave it in for now. */
4035          if ( trietype && trietype != NOTHING )
4036           make_trie( pRExC_state,
4037             startbranch, first, cur, tail,
4038             count, trietype, depth+1 );
4039          last = NULL; /* note: we clear/update
4040              first, trietype etc below,
4041              so we dont do it here */
4042         }
4043         if ( noper_trietype
4044 #ifdef NOJUMPTRIE
4045          && noper_next == tail
4046 #endif
4047         ){
4048          /* noper is triable, so we can start a new
4049          * trie sequence */
4050          count = 1;
4051          first = cur;
4052          trietype = noper_trietype;
4053         } else if (first) {
4054          /* if we already saw a first but the
4055          * current node is not triable then we have
4056          * to reset the first information. */
4057          count = 0;
4058          first = NULL;
4059          trietype = 0;
4060         }
4061        } /* end handle unmergable node */
4062       } /* loop over branches */
4063       DEBUG_TRIE_COMPILE_r({
4064        regprop(RExC_rx, mysv, cur, NULL);
4065        PerlIO_printf( Perl_debug_log,
4066        "%*s- %s (%d) <SCAN FINISHED>\n",
4067        (int)depth * 2 + 2,
4068        "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4069
4070       });
4071       if ( last && trietype ) {
4072        if ( trietype != NOTHING ) {
4073         /* the last branch of the sequence was part of
4074         * a trie, so we have to construct it here
4075         * outside of the loop */
4076         made= make_trie( pRExC_state, startbranch,
4077             first, scan, tail, count,
4078             trietype, depth+1 );
4079 #ifdef TRIE_STUDY_OPT
4080         if ( ((made == MADE_EXACT_TRIE &&
4081          startbranch == first)
4082          || ( first_non_open == first )) &&
4083          depth==0 ) {
4084          flags |= SCF_TRIE_RESTUDY;
4085          if ( startbranch == first
4086           && scan == tail )
4087          {
4088           RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4089          }
4090         }
4091 #endif
4092        } else {
4093         /* at this point we know whatever we have is a
4094         * NOTHING sequence/branch AND if 'startbranch'
4095         * is 'first' then we can turn the whole thing
4096         * into a NOTHING
4097         */
4098         if ( startbranch == first ) {
4099          regnode *opt;
4100          /* the entire thing is a NOTHING sequence,
4101          * something like this: (?:|) So we can
4102          * turn it into a plain NOTHING op. */
4103          DEBUG_TRIE_COMPILE_r({
4104           regprop(RExC_rx, mysv, cur, NULL);
4105           PerlIO_printf( Perl_debug_log,
4106           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4107           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4108
4109          });
4110          OP(startbranch)= NOTHING;
4111          NEXT_OFF(startbranch)= tail - startbranch;
4112          for ( opt= startbranch + 1; opt < tail ; opt++ )
4113           OP(opt)= OPTIMIZED;
4114         }
4115        }
4116       } /* end if ( last) */
4117      } /* TRIE_MAXBUF is non zero */
4118
4119     } /* do trie */
4120
4121    }
4122    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4123     scan = NEXTOPER(NEXTOPER(scan));
4124    } else   /* single branch is optimized. */
4125     scan = NEXTOPER(scan);
4126    continue;
4127   } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4128    scan_frame *newframe = NULL;
4129    I32 paren;
4130    regnode *start;
4131    regnode *end;
4132    U32 my_recursed_depth= recursed_depth;
4133
4134    if (OP(scan) != SUSPEND) {
4135     /* set the pointer */
4136     if (OP(scan) == GOSUB) {
4137      paren = ARG(scan);
4138      RExC_recurse[ARG2L(scan)] = scan;
4139      start = RExC_open_parens[paren-1];
4140      end   = RExC_close_parens[paren-1];
4141     } else {
4142      paren = 0;
4143      start = RExC_rxi->program + 1;
4144      end   = RExC_opend;
4145     }
4146     if (!recursed_depth
4147      ||
4148      !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4149     ) {
4150      if (!recursed_depth) {
4151       Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4152      } else {
4153       Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4154        RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4155        RExC_study_chunk_recursed_bytes, U8);
4156      }
4157      /* we havent recursed into this paren yet, so recurse into it */
4158      DEBUG_STUDYDATA("set:", data,depth);
4159      PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4160      my_recursed_depth= recursed_depth + 1;
4161      Newx(newframe,1,scan_frame);
4162     } else {
4163      DEBUG_STUDYDATA("inf:", data,depth);
4164      /* some form of infinite recursion, assume infinite length
4165      * */
4166      if (flags & SCF_DO_SUBSTR) {
4167       scan_commit(pRExC_state, data, minlenp, is_inf);
4168       data->longest = &(data->longest_float);
4169      }
4170      is_inf = is_inf_internal = 1;
4171      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4172       ssc_anything(data->start_class);
4173      flags &= ~SCF_DO_STCLASS;
4174     }
4175    } else {
4176     Newx(newframe,1,scan_frame);
4177     paren = stopparen;
4178     start = scan+2;
4179     end = regnext(scan);
4180    }
4181    if (newframe) {
4182     assert(start);
4183     assert(end);
4184     SAVEFREEPV(newframe);
4185     newframe->next = regnext(scan);
4186     newframe->last = last;
4187     newframe->stop = stopparen;
4188     newframe->prev = frame;
4189     newframe->prev_recursed_depth = recursed_depth;
4190
4191     DEBUG_STUDYDATA("frame-new:",data,depth);
4192     DEBUG_PEEP("fnew", scan, depth);
4193
4194     frame = newframe;
4195     scan =  start;
4196     stopparen = paren;
4197     last = end;
4198     depth = depth + 1;
4199     recursed_depth= my_recursed_depth;
4200
4201     continue;
4202    }
4203   }
4204   else if (OP(scan) == EXACT) {
4205    SSize_t l = STR_LEN(scan);
4206    UV uc;
4207    if (UTF) {
4208     const U8 * const s = (U8*)STRING(scan);
4209     uc = utf8_to_uvchr_buf(s, s + l, NULL);
4210     l = utf8_length(s, s + l);
4211    } else {
4212     uc = *((U8*)STRING(scan));
4213    }
4214    min += l;
4215    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4216     /* The code below prefers earlier match for fixed
4217     offset, later match for variable offset.  */
4218     if (data->last_end == -1) { /* Update the start info. */
4219      data->last_start_min = data->pos_min;
4220      data->last_start_max = is_inf
4221       ? SSize_t_MAX : data->pos_min + data->pos_delta;
4222     }
4223     sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4224     if (UTF)
4225      SvUTF8_on(data->last_found);
4226     {
4227      SV * const sv = data->last_found;
4228      MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4229       mg_find(sv, PERL_MAGIC_utf8) : NULL;
4230      if (mg && mg->mg_len >= 0)
4231       mg->mg_len += utf8_length((U8*)STRING(scan),
4232            (U8*)STRING(scan)+STR_LEN(scan));
4233     }
4234     data->last_end = data->pos_min + l;
4235     data->pos_min += l; /* As in the first entry. */
4236     data->flags &= ~SF_BEFORE_EOL;
4237    }
4238
4239    /* ANDing the code point leaves at most it, and not in locale, and
4240    * can't match null string */
4241    if (flags & SCF_DO_STCLASS_AND) {
4242     ssc_cp_and(data->start_class, uc);
4243     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4244     ssc_clear_locale(data->start_class);
4245    }
4246    else if (flags & SCF_DO_STCLASS_OR) {
4247     ssc_add_cp(data->start_class, uc);
4248     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4249
4250     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4251     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4252    }
4253    flags &= ~SCF_DO_STCLASS;
4254   }
4255   else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
4256    SSize_t l = STR_LEN(scan);
4257    UV uc = *((U8*)STRING(scan));
4258    SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4259              separate code points */
4260
4261    /* Search for fixed substrings supports EXACT only. */
4262    if (flags & SCF_DO_SUBSTR) {
4263     assert(data);
4264     scan_commit(pRExC_state, data, minlenp, is_inf);
4265    }
4266    if (UTF) {
4267     const U8 * const s = (U8 *)STRING(scan);
4268     uc = utf8_to_uvchr_buf(s, s + l, NULL);
4269     l = utf8_length(s, s + l);
4270    }
4271    if (unfolded_multi_char) {
4272     RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4273    }
4274    min += l - min_subtract;
4275    assert (min >= 0);
4276    delta += min_subtract;
4277    if (flags & SCF_DO_SUBSTR) {
4278     data->pos_min += l - min_subtract;
4279     if (data->pos_min < 0) {
4280      data->pos_min = 0;
4281     }
4282     data->pos_delta += min_subtract;
4283     if (min_subtract) {
4284      data->longest = &(data->longest_float);
4285     }
4286    }
4287    if (OP(scan) == EXACTFL) {
4288
4289     /* We don't know what the folds are; it could be anything. XXX
4290     * Actually, we only support UTF-8 encoding for code points
4291     * above Latin1, so we could know what those folds are. */
4292     EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4293              0,
4294              UV_MAX);
4295    }
4296    else {  /* Non-locale EXACTFish */
4297     EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4298     if (flags & SCF_DO_STCLASS_AND) {
4299      ssc_clear_locale(data->start_class);
4300     }
4301     if (uc < 256) { /* We know what the Latin1 folds are ... */
4302      if (IS_IN_SOME_FOLD_L1(uc)) {   /* For instance, we
4303              know if anything folds
4304              with this */
4305       EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4306               PL_fold_latin1[uc]);
4307       if (OP(scan) != EXACTFA) { /* The folds below aren't
4308              legal under /iaa */
4309        if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
4310         EXACTF_invlist
4311          = add_cp_to_invlist(EXACTF_invlist,
4312             LATIN_SMALL_LETTER_SHARP_S);
4313        }
4314        else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
4315         EXACTF_invlist
4316          = add_cp_to_invlist(EXACTF_invlist, 's');
4317         EXACTF_invlist
4318          = add_cp_to_invlist(EXACTF_invlist, 'S');
4319        }
4320       }
4321
4322       /* We also know if there are above-Latin1 code points
4323       * that fold to this (none legal for ASCII and /iaa) */
4324       if ((! isASCII(uc) || OP(scan) != EXACTFA)
4325        && HAS_NONLATIN1_FOLD_CLOSURE(uc))
4326       {
4327        /* XXX We could know exactly what does fold to this
4328        * if the reverse folds are loaded, as currently in
4329        * S_regclass() */
4330        _invlist_union(EXACTF_invlist,
4331           PL_AboveLatin1,
4332           &EXACTF_invlist);
4333       }
4334      }
4335     }
4336     else {  /* Non-locale, above Latin1.  XXX We don't currently
4337       know what participates in folds with this, so have
4338       to assume anything could */
4339
4340      /* XXX We could know exactly what does fold to this if the
4341      * reverse folds are loaded, as currently in S_regclass().
4342      * But we do know that under /iaa nothing in the ASCII
4343      * range can participate */
4344      if (OP(scan) == EXACTFA) {
4345       _invlist_union_complement_2nd(EXACTF_invlist,
4346              PL_XPosix_ptrs[_CC_ASCII],
4347              &EXACTF_invlist);
4348      }
4349      else {
4350       EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4351                0, UV_MAX);
4352      }
4353     }
4354    }
4355    if (flags & SCF_DO_STCLASS_AND) {
4356     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4357     ANYOF_POSIXL_ZERO(data->start_class);
4358     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4359    }
4360    else if (flags & SCF_DO_STCLASS_OR) {
4361     ssc_union(data->start_class, EXACTF_invlist, FALSE);
4362     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4363
4364     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4365     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4366    }
4367    flags &= ~SCF_DO_STCLASS;
4368    SvREFCNT_dec(EXACTF_invlist);
4369   }
4370   else if (REGNODE_VARIES(OP(scan))) {
4371    SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4372    I32 fl = 0, f = flags;
4373    regnode * const oscan = scan;
4374    regnode_ssc this_class;
4375    regnode_ssc *oclass = NULL;
4376    I32 next_is_eval = 0;
4377
4378    switch (PL_regkind[OP(scan)]) {
4379    case WHILEM:  /* End of (?:...)* . */
4380     scan = NEXTOPER(scan);
4381     goto finish;
4382    case PLUS:
4383     if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4384      next = NEXTOPER(scan);
4385      if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4386       mincount = 1;
4387       maxcount = REG_INFTY;
4388       next = regnext(scan);
4389       scan = NEXTOPER(scan);
4390       goto do_curly;
4391      }
4392     }
4393     if (flags & SCF_DO_SUBSTR)
4394      data->pos_min++;
4395     min++;
4396     /* Fall through. */
4397    case STAR:
4398     if (flags & SCF_DO_STCLASS) {
4399      mincount = 0;
4400      maxcount = REG_INFTY;
4401      next = regnext(scan);
4402      scan = NEXTOPER(scan);
4403      goto do_curly;
4404     }
4405     if (flags & SCF_DO_SUBSTR) {
4406      scan_commit(pRExC_state, data, minlenp, is_inf);
4407      /* Cannot extend fixed substrings */
4408      data->longest = &(data->longest_float);
4409     }
4410     is_inf = is_inf_internal = 1;
4411     scan = regnext(scan);
4412     goto optimize_curly_tail;
4413    case CURLY:
4414     if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4415      && (scan->flags == stopparen))
4416     {
4417      mincount = 1;
4418      maxcount = 1;
4419     } else {
4420      mincount = ARG1(scan);
4421      maxcount = ARG2(scan);
4422     }
4423     next = regnext(scan);
4424     if (OP(scan) == CURLYX) {
4425      I32 lp = (data ? *(data->last_closep) : 0);
4426      scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4427     }
4428     scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4429     next_is_eval = (OP(scan) == EVAL);
4430    do_curly:
4431     if (flags & SCF_DO_SUBSTR) {
4432      if (mincount == 0)
4433       scan_commit(pRExC_state, data, minlenp, is_inf);
4434      /* Cannot extend fixed substrings */
4435      pos_before = data->pos_min;
4436     }
4437     if (data) {
4438      fl = data->flags;
4439      data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4440      if (is_inf)
4441       data->flags |= SF_IS_INF;
4442     }
4443     if (flags & SCF_DO_STCLASS) {
4444      ssc_init(pRExC_state, &this_class);
4445      oclass = data->start_class;
4446      data->start_class = &this_class;
4447      f |= SCF_DO_STCLASS_AND;
4448      f &= ~SCF_DO_STCLASS_OR;
4449     }
4450     /* Exclude from super-linear cache processing any {n,m}
4451     regops for which the combination of input pos and regex
4452     pos is not enough information to determine if a match
4453     will be possible.
4454
4455     For example, in the regex /foo(bar\s*){4,8}baz/ with the
4456     regex pos at the \s*, the prospects for a match depend not
4457     only on the input position but also on how many (bar\s*)
4458     repeats into the {4,8} we are. */
4459    if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4460      f &= ~SCF_WHILEM_VISITED_POS;
4461
4462     /* This will finish on WHILEM, setting scan, or on NULL: */
4463     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4464         last, data, stopparen, recursed_depth, NULL,
4465         (mincount == 0
4466         ? (f & ~SCF_DO_SUBSTR)
4467         : f)
4468         ,depth+1);
4469
4470     if (flags & SCF_DO_STCLASS)
4471      data->start_class = oclass;
4472     if (mincount == 0 || minnext == 0) {
4473      if (flags & SCF_DO_STCLASS_OR) {
4474       ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4475      }
4476      else if (flags & SCF_DO_STCLASS_AND) {
4477       /* Switch to OR mode: cache the old value of
4478       * data->start_class */
4479       INIT_AND_WITHP;
4480       StructCopy(data->start_class, and_withp, regnode_ssc);
4481       flags &= ~SCF_DO_STCLASS_AND;
4482       StructCopy(&this_class, data->start_class, regnode_ssc);
4483       flags |= SCF_DO_STCLASS_OR;
4484       ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4485      }
4486     } else {  /* Non-zero len */
4487      if (flags & SCF_DO_STCLASS_OR) {
4488       ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4489       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4490      }
4491      else if (flags & SCF_DO_STCLASS_AND)
4492       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4493      flags &= ~SCF_DO_STCLASS;
4494     }
4495     if (!scan)   /* It was not CURLYX, but CURLY. */
4496      scan = next;
4497     if (!(flags & SCF_TRIE_DOING_RESTUDY)
4498      /* ? quantifier ok, except for (?{ ... }) */
4499      && (next_is_eval || !(mincount == 0 && maxcount == 1))
4500      && (minnext == 0) && (deltanext == 0)
4501      && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4502      && maxcount <= REG_INFTY/3) /* Complement check for big
4503             count */
4504     {
4505      /* Fatal warnings may leak the regexp without this: */
4506      SAVEFREESV(RExC_rx_sv);
4507      ckWARNreg(RExC_parse,
4508        "Quantifier unexpected on zero-length expression");
4509      (void)ReREFCNT_inc(RExC_rx_sv);
4510     }
4511
4512     min += minnext * mincount;
4513     is_inf_internal |= deltanext == SSize_t_MAX
4514       || (maxcount == REG_INFTY && minnext + deltanext > 0);
4515     is_inf |= is_inf_internal;
4516     if (is_inf) {
4517      delta = SSize_t_MAX;
4518     } else {
4519      delta += (minnext + deltanext) * maxcount
4520        - minnext * mincount;
4521     }
4522     /* Try powerful optimization CURLYX => CURLYN. */
4523     if (  OP(oscan) == CURLYX && data
4524      && data->flags & SF_IN_PAR
4525      && !(data->flags & SF_HAS_EVAL)
4526      && !deltanext && minnext == 1 ) {
4527      /* Try to optimize to CURLYN.  */
4528      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4529      regnode * const nxt1 = nxt;
4530 #ifdef DEBUGGING
4531      regnode *nxt2;
4532 #endif
4533
4534      /* Skip open. */
4535      nxt = regnext(nxt);
4536      if (!REGNODE_SIMPLE(OP(nxt))
4537       && !(PL_regkind[OP(nxt)] == EXACT
4538        && STR_LEN(nxt) == 1))
4539       goto nogo;
4540 #ifdef DEBUGGING
4541      nxt2 = nxt;
4542 #endif
4543      nxt = regnext(nxt);
4544      if (OP(nxt) != CLOSE)
4545       goto nogo;
4546      if (RExC_open_parens) {
4547       RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4548       RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4549      }
4550      /* Now we know that nxt2 is the only contents: */
4551      oscan->flags = (U8)ARG(nxt);
4552      OP(oscan) = CURLYN;
4553      OP(nxt1) = NOTHING; /* was OPEN. */
4554
4555 #ifdef DEBUGGING
4556      OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4557      NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4558      NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4559      OP(nxt) = OPTIMIZED; /* was CLOSE. */
4560      OP(nxt + 1) = OPTIMIZED; /* was count. */
4561      NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4562 #endif
4563     }
4564    nogo:
4565
4566     /* Try optimization CURLYX => CURLYM. */
4567     if (  OP(oscan) == CURLYX && data
4568      && !(data->flags & SF_HAS_PAR)
4569      && !(data->flags & SF_HAS_EVAL)
4570      && !deltanext /* atom is fixed width */
4571      && minnext != 0 /* CURLYM can't handle zero width */
4572
4573       /* Nor characters whose fold at run-time may be
4574       * multi-character */
4575      && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4576     ) {
4577      /* XXXX How to optimize if data == 0? */
4578      /* Optimize to a simpler form.  */
4579      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4580      regnode *nxt2;
4581
4582      OP(oscan) = CURLYM;
4583      while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4584        && (OP(nxt2) != WHILEM))
4585       nxt = nxt2;
4586      OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4587      /* Need to optimize away parenths. */
4588      if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4589       /* Set the parenth number.  */
4590       regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4591
4592       oscan->flags = (U8)ARG(nxt);
4593       if (RExC_open_parens) {
4594        RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4595        RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4596       }
4597       OP(nxt1) = OPTIMIZED; /* was OPEN. */
4598       OP(nxt) = OPTIMIZED; /* was CLOSE. */
4599
4600 #ifdef DEBUGGING
4601       OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4602       OP(nxt + 1) = OPTIMIZED; /* was count. */
4603       NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4604       NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4605 #endif
4606 #if 0
4607       while ( nxt1 && (OP(nxt1) != WHILEM)) {
4608        regnode *nnxt = regnext(nxt1);
4609        if (nnxt == nxt) {
4610         if (reg_off_by_arg[OP(nxt1)])
4611          ARG_SET(nxt1, nxt2 - nxt1);
4612         else if (nxt2 - nxt1 < U16_MAX)
4613          NEXT_OFF(nxt1) = nxt2 - nxt1;
4614         else
4615          OP(nxt) = NOTHING; /* Cannot beautify */
4616        }
4617        nxt1 = nnxt;
4618       }
4619 #endif
4620       /* Optimize again: */
4621       study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4622          NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4623      }
4624      else
4625       oscan->flags = 0;
4626     }
4627     else if ((OP(oscan) == CURLYX)
4628       && (flags & SCF_WHILEM_VISITED_POS)
4629       /* See the comment on a similar expression above.
4630        However, this time it's not a subexpression
4631        we care about, but the expression itself. */
4632       && (maxcount == REG_INFTY)
4633       && data && ++data->whilem_c < 16) {
4634      /* This stays as CURLYX, we can put the count/of pair. */
4635      /* Find WHILEM (as in regexec.c) */
4636      regnode *nxt = oscan + NEXT_OFF(oscan);
4637
4638      if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4639       nxt += ARG(nxt);
4640      PREVOPER(nxt)->flags = (U8)(data->whilem_c
4641       | (RExC_whilem_seen << 4)); /* On WHILEM */
4642     }
4643     if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4644      pars++;
4645     if (flags & SCF_DO_SUBSTR) {
4646      SV *last_str = NULL;
4647      STRLEN last_chrs = 0;
4648      int counted = mincount != 0;
4649
4650      if (data->last_end > 0 && mincount != 0) { /* Ends with a
4651                 string. */
4652       SSize_t b = pos_before >= data->last_start_min
4653        ? pos_before : data->last_start_min;
4654       STRLEN l;
4655       const char * const s = SvPV_const(data->last_found, l);
4656       SSize_t old = b - data->last_start_min;
4657
4658       if (UTF)
4659        old = utf8_hop((U8*)s, old) - (U8*)s;
4660       l -= old;
4661       /* Get the added string: */
4662       last_str = newSVpvn_utf8(s  + old, l, UTF);
4663       last_chrs = UTF ? utf8_length((U8*)(s + old),
4664            (U8*)(s + old + l)) : l;
4665       if (deltanext == 0 && pos_before == b) {
4666        /* What was added is a constant string */
4667        if (mincount > 1) {
4668
4669         SvGROW(last_str, (mincount * l) + 1);
4670         repeatcpy(SvPVX(last_str) + l,
4671           SvPVX_const(last_str), l,
4672           mincount - 1);
4673         SvCUR_set(last_str, SvCUR(last_str) * mincount);
4674         /* Add additional parts. */
4675         SvCUR_set(data->last_found,
4676           SvCUR(data->last_found) - l);
4677         sv_catsv(data->last_found, last_str);
4678         {
4679          SV * sv = data->last_found;
4680          MAGIC *mg =
4681           SvUTF8(sv) && SvMAGICAL(sv) ?
4682           mg_find(sv, PERL_MAGIC_utf8) : NULL;
4683          if (mg && mg->mg_len >= 0)
4684           mg->mg_len += last_chrs * (mincount-1);
4685         }
4686         last_chrs *= mincount;
4687         data->last_end += l * (mincount - 1);
4688        }
4689       } else {
4690        /* start offset must point into the last copy */
4691        data->last_start_min += minnext * (mincount - 1);
4692        data->last_start_max += is_inf ? SSize_t_MAX
4693         : (maxcount - 1) * (minnext + data->pos_delta);
4694       }
4695      }
4696      /* It is counted once already... */
4697      data->pos_min += minnext * (mincount - counted);
4698 #if 0
4699 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4700        " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4701        " maxcount=%"UVdf" mincount=%"UVdf"\n",
4702  (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4703  (UV)mincount);
4704 if (deltanext != SSize_t_MAX)
4705 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4706  (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4707   - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4708 #endif
4709      if (deltanext == SSize_t_MAX
4710       || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4711       data->pos_delta = SSize_t_MAX;
4712      else
4713       data->pos_delta += - counted * deltanext +
4714       (minnext + deltanext) * maxcount - minnext * mincount;
4715      if (mincount != maxcount) {
4716       /* Cannot extend fixed substrings found inside
4717        the group.  */
4718       scan_commit(pRExC_state, data, minlenp, is_inf);
4719       if (mincount && last_str) {
4720        SV * const sv = data->last_found;
4721        MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4722         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4723
4724        if (mg)
4725         mg->mg_len = -1;
4726        sv_setsv(sv, last_str);
4727        data->last_end = data->pos_min;
4728        data->last_start_min = data->pos_min - last_chrs;
4729        data->last_start_max = is_inf
4730         ? SSize_t_MAX
4731         : data->pos_min + data->pos_delta - last_chrs;
4732       }
4733       data->longest = &(data->longest_float);
4734      }
4735      SvREFCNT_dec(last_str);
4736     }
4737     if (data && (fl & SF_HAS_EVAL))
4738      data->flags |= SF_HAS_EVAL;
4739    optimize_curly_tail:
4740     if (OP(oscan) != CURLYX) {
4741      while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4742       && NEXT_OFF(next))
4743       NEXT_OFF(oscan) += NEXT_OFF(next);
4744     }
4745     continue;
4746
4747    default:
4748 #ifdef DEBUGGING
4749     Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4750                  OP(scan));
4751 #endif
4752    case REF:
4753    case CLUMP:
4754     if (flags & SCF_DO_SUBSTR) {
4755      /* Cannot expect anything... */
4756      scan_commit(pRExC_state, data, minlenp, is_inf);
4757      data->longest = &(data->longest_float);
4758     }
4759     is_inf = is_inf_internal = 1;
4760     if (flags & SCF_DO_STCLASS_OR) {
4761      if (OP(scan) == CLUMP) {
4762       /* Actually is any start char, but very few code points
4763       * aren't start characters */
4764       ssc_match_all_cp(data->start_class);
4765      }
4766      else {
4767       ssc_anything(data->start_class);
4768      }
4769     }
4770     flags &= ~SCF_DO_STCLASS;
4771     break;
4772    }
4773   }
4774   else if (OP(scan) == LNBREAK) {
4775    if (flags & SCF_DO_STCLASS) {
4776      if (flags & SCF_DO_STCLASS_AND) {
4777      ssc_intersection(data->start_class,
4778          PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4779      ssc_clear_locale(data->start_class);
4780      ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4781     }
4782     else if (flags & SCF_DO_STCLASS_OR) {
4783      ssc_union(data->start_class,
4784        PL_XPosix_ptrs[_CC_VERTSPACE],
4785        FALSE);
4786      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4787
4788      /* See commit msg for
4789      * 749e076fceedeb708a624933726e7989f2302f6a */
4790      ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4791     }
4792     flags &= ~SCF_DO_STCLASS;
4793    }
4794    min++;
4795    delta++;    /* Because of the 2 char string cr-lf */
4796    if (flags & SCF_DO_SUBSTR) {
4797     /* Cannot expect anything... */
4798     scan_commit(pRExC_state, data, minlenp, is_inf);
4799      data->pos_min += 1;
4800     data->pos_delta += 1;
4801     data->longest = &(data->longest_float);
4802     }
4803   }
4804   else if (REGNODE_SIMPLE(OP(scan))) {
4805
4806    if (flags & SCF_DO_SUBSTR) {
4807     scan_commit(pRExC_state, data, minlenp, is_inf);
4808     data->pos_min++;
4809    }
4810    min++;
4811    if (flags & SCF_DO_STCLASS) {
4812     bool invert = 0;
4813     SV* my_invlist = sv_2mortal(_new_invlist(0));
4814     U8 namedclass;
4815
4816     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4817     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4818
4819     /* Some of the logic below assumes that switching
4820     locale on will only add false positives. */
4821     switch (OP(scan)) {
4822
4823     default:
4824 #ifdef DEBUGGING
4825     Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4826                  OP(scan));
4827 #endif
4828     case CANY:
4829     case SANY:
4830      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4831       ssc_match_all_cp(data->start_class);
4832      break;
4833
4834     case REG_ANY:
4835      {
4836       SV* REG_ANY_invlist = _new_invlist(2);
4837       REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4838                '\n');
4839       if (flags & SCF_DO_STCLASS_OR) {
4840        ssc_union(data->start_class,
4841          REG_ANY_invlist,
4842          TRUE /* TRUE => invert, hence all but \n
4843            */
4844          );
4845       }
4846       else if (flags & SCF_DO_STCLASS_AND) {
4847        ssc_intersection(data->start_class,
4848            REG_ANY_invlist,
4849            TRUE  /* TRUE => invert */
4850            );
4851        ssc_clear_locale(data->start_class);
4852       }
4853       SvREFCNT_dec_NN(REG_ANY_invlist);
4854      }
4855      break;
4856
4857     case ANYOF:
4858      if (flags & SCF_DO_STCLASS_AND)
4859       ssc_and(pRExC_state, data->start_class,
4860         (regnode_charclass *) scan);
4861      else
4862       ssc_or(pRExC_state, data->start_class,
4863               (regnode_charclass *) scan);
4864      break;
4865
4866     case NPOSIXL:
4867      invert = 1;
4868      /* FALL THROUGH */
4869
4870     case POSIXL:
4871      namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4872      if (flags & SCF_DO_STCLASS_AND) {
4873       bool was_there = cBOOL(
4874           ANYOF_POSIXL_TEST(data->start_class,
4875                 namedclass));
4876       ANYOF_POSIXL_ZERO(data->start_class);
4877       if (was_there) {    /* Do an AND */
4878        ANYOF_POSIXL_SET(data->start_class, namedclass);
4879       }
4880       /* No individual code points can now match */
4881       data->start_class->invlist
4882             = sv_2mortal(_new_invlist(0));
4883      }
4884      else {
4885       int complement = namedclass + ((invert) ? -1 : 1);
4886
4887       assert(flags & SCF_DO_STCLASS_OR);
4888
4889       /* If the complement of this class was already there,
4890       * the result is that they match all code points,
4891       * (\d + \D == everything).  Remove the classes from
4892       * future consideration.  Locale is not relevant in
4893       * this case */
4894       if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4895        ssc_match_all_cp(data->start_class);
4896        ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4897        ANYOF_POSIXL_CLEAR(data->start_class, complement);
4898       }
4899       else {  /* The usual case; just add this class to the
4900         existing set */
4901        ANYOF_POSIXL_SET(data->start_class, namedclass);
4902       }
4903      }
4904      break;
4905
4906     case NPOSIXA:   /* For these, we always know the exact set of
4907         what's matched */
4908      invert = 1;
4909      /* FALL THROUGH */
4910     case POSIXA:
4911      if (FLAGS(scan) == _CC_ASCII) {
4912       my_invlist = PL_XPosix_ptrs[_CC_ASCII];
4913      }
4914      else {
4915       _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
4916            PL_XPosix_ptrs[_CC_ASCII],
4917            &my_invlist);
4918      }
4919      goto join_posix;
4920
4921     case NPOSIXD:
4922     case NPOSIXU:
4923      invert = 1;
4924      /* FALL THROUGH */
4925     case POSIXD:
4926     case POSIXU:
4927      my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
4928
4929      /* NPOSIXD matches all upper Latin1 code points unless the
4930      * target string being matched is UTF-8, which is
4931      * unknowable until match time.  Since we are going to
4932      * invert, we want to get rid of all of them so that the
4933      * inversion will match all */
4934      if (OP(scan) == NPOSIXD) {
4935       _invlist_subtract(my_invlist, PL_UpperLatin1,
4936           &my_invlist);
4937      }
4938
4939     join_posix:
4940
4941      if (flags & SCF_DO_STCLASS_AND) {
4942       ssc_intersection(data->start_class, my_invlist, invert);
4943       ssc_clear_locale(data->start_class);
4944      }
4945      else {
4946       assert(flags & SCF_DO_STCLASS_OR);
4947       ssc_union(data->start_class, my_invlist, invert);
4948      }
4949     }
4950     if (flags & SCF_DO_STCLASS_OR)
4951      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4952     flags &= ~SCF_DO_STCLASS;
4953    }
4954   }
4955   else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4956    data->flags |= (OP(scan) == MEOL
4957        ? SF_BEFORE_MEOL
4958        : SF_BEFORE_SEOL);
4959    scan_commit(pRExC_state, data, minlenp, is_inf);
4960
4961   }
4962   else if (  PL_regkind[OP(scan)] == BRANCHJ
4963     /* Lookbehind, or need to calculate parens/evals/stclass: */
4964     && (scan->flags || data || (flags & SCF_DO_STCLASS))
4965     && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4966    if ( OP(scan) == UNLESSM &&
4967     scan->flags == 0 &&
4968     OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4969     OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4970    ) {
4971     regnode *opt;
4972     regnode *upto= regnext(scan);
4973     DEBUG_PARSE_r({
4974      SV * const mysv_val=sv_newmortal();
4975      DEBUG_STUDYDATA("OPFAIL",data,depth);
4976
4977      /*DEBUG_PARSE_MSG("opfail");*/
4978      regprop(RExC_rx, mysv_val, upto, NULL);
4979      PerlIO_printf(Perl_debug_log,
4980       "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4981       SvPV_nolen_const(mysv_val),
4982       (IV)REG_NODE_NUM(upto),
4983       (IV)(upto - scan)
4984      );
4985     });
4986     OP(scan) = OPFAIL;
4987     NEXT_OFF(scan) = upto - scan;
4988     for (opt= scan + 1; opt < upto ; opt++)
4989      OP(opt) = OPTIMIZED;
4990     scan= upto;
4991     continue;
4992    }
4993    if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4994     || OP(scan) == UNLESSM )
4995    {
4996     /* Negative Lookahead/lookbehind
4997     In this case we can't do fixed string optimisation.
4998     */
4999
5000     SSize_t deltanext, minnext, fake = 0;
5001     regnode *nscan;
5002     regnode_ssc intrnl;
5003     int f = 0;
5004
5005     data_fake.flags = 0;
5006     if (data) {
5007      data_fake.whilem_c = data->whilem_c;
5008      data_fake.last_closep = data->last_closep;
5009     }
5010     else
5011      data_fake.last_closep = &fake;
5012     data_fake.pos_delta = delta;
5013     if ( flags & SCF_DO_STCLASS && !scan->flags
5014      && OP(scan) == IFMATCH ) { /* Lookahead */
5015      ssc_init(pRExC_state, &intrnl);
5016      data_fake.start_class = &intrnl;
5017      f |= SCF_DO_STCLASS_AND;
5018     }
5019     if (flags & SCF_WHILEM_VISITED_POS)
5020      f |= SCF_WHILEM_VISITED_POS;
5021     next = regnext(scan);
5022     nscan = NEXTOPER(NEXTOPER(scan));
5023     minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5024          last, &data_fake, stopparen,
5025          recursed_depth, NULL, f, depth+1);
5026     if (scan->flags) {
5027      if (deltanext) {
5028       FAIL("Variable length lookbehind not implemented");
5029      }
5030      else if (minnext > (I32)U8_MAX) {
5031       FAIL2("Lookbehind longer than %"UVuf" not implemented",
5032        (UV)U8_MAX);
5033      }
5034      scan->flags = (U8)minnext;
5035     }
5036     if (data) {
5037      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5038       pars++;
5039      if (data_fake.flags & SF_HAS_EVAL)
5040       data->flags |= SF_HAS_EVAL;
5041      data->whilem_c = data_fake.whilem_c;
5042     }
5043     if (f & SCF_DO_STCLASS_AND) {
5044      if (flags & SCF_DO_STCLASS_OR) {
5045       /* OR before, AND after: ideally we would recurse with
5046       * data_fake to get the AND applied by study of the
5047       * remainder of the pattern, and then derecurse;
5048       * *** HACK *** for now just treat as "no information".
5049       * See [perl #56690].
5050       */
5051       ssc_init(pRExC_state, data->start_class);
5052      }  else {
5053       /* AND before and after: combine and continue */
5054       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5055      }
5056     }
5057    }
5058 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5059    else {
5060     /* Positive Lookahead/lookbehind
5061     In this case we can do fixed string optimisation,
5062     but we must be careful about it. Note in the case of
5063     lookbehind the positions will be offset by the minimum
5064     length of the pattern, something we won't know about
5065     until after the recurse.
5066     */
5067     SSize_t deltanext, fake = 0;
5068     regnode *nscan;
5069     regnode_ssc intrnl;
5070     int f = 0;
5071     /* We use SAVEFREEPV so that when the full compile
5072      is finished perl will clean up the allocated
5073      minlens when it's all done. This way we don't
5074      have to worry about freeing them when we know
5075      they wont be used, which would be a pain.
5076     */
5077     SSize_t *minnextp;
5078     Newx( minnextp, 1, SSize_t );
5079     SAVEFREEPV(minnextp);
5080
5081     if (data) {
5082      StructCopy(data, &data_fake, scan_data_t);
5083      if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5084       f |= SCF_DO_SUBSTR;
5085       if (scan->flags)
5086        scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5087       data_fake.last_found=newSVsv(data->last_found);
5088      }
5089     }
5090     else
5091      data_fake.last_closep = &fake;
5092     data_fake.flags = 0;
5093     data_fake.pos_delta = delta;
5094     if (is_inf)
5095      data_fake.flags |= SF_IS_INF;
5096     if ( flags & SCF_DO_STCLASS && !scan->flags
5097      && OP(scan) == IFMATCH ) { /* Lookahead */
5098      ssc_init(pRExC_state, &intrnl);
5099      data_fake.start_class = &intrnl;
5100      f |= SCF_DO_STCLASS_AND;
5101     }
5102     if (flags & SCF_WHILEM_VISITED_POS)
5103      f |= SCF_WHILEM_VISITED_POS;
5104     next = regnext(scan);
5105     nscan = NEXTOPER(NEXTOPER(scan));
5106
5107     *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5108           &deltanext, last, &data_fake,
5109           stopparen, recursed_depth, NULL,
5110           f,depth+1);
5111     if (scan->flags) {
5112      if (deltanext) {
5113       FAIL("Variable length lookbehind not implemented");
5114      }
5115      else if (*minnextp > (I32)U8_MAX) {
5116       FAIL2("Lookbehind longer than %"UVuf" not implemented",
5117        (UV)U8_MAX);
5118      }
5119      scan->flags = (U8)*minnextp;
5120     }
5121
5122     *minnextp += min;
5123
5124     if (f & SCF_DO_STCLASS_AND) {
5125      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5126     }
5127     if (data) {
5128      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5129       pars++;
5130      if (data_fake.flags & SF_HAS_EVAL)
5131       data->flags |= SF_HAS_EVAL;
5132      data->whilem_c = data_fake.whilem_c;
5133      if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5134       if (RExC_rx->minlen<*minnextp)
5135        RExC_rx->minlen=*minnextp;
5136       scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5137       SvREFCNT_dec_NN(data_fake.last_found);
5138
5139       if ( data_fake.minlen_fixed != minlenp )
5140       {
5141        data->offset_fixed= data_fake.offset_fixed;
5142        data->minlen_fixed= data_fake.minlen_fixed;
5143        data->lookbehind_fixed+= scan->flags;
5144       }
5145       if ( data_fake.minlen_float != minlenp )
5146       {
5147        data->minlen_float= data_fake.minlen_float;
5148        data->offset_float_min=data_fake.offset_float_min;
5149        data->offset_float_max=data_fake.offset_float_max;
5150        data->lookbehind_float+= scan->flags;
5151       }
5152      }
5153     }
5154    }
5155 #endif
5156   }
5157   else if (OP(scan) == OPEN) {
5158    if (stopparen != (I32)ARG(scan))
5159     pars++;
5160   }
5161   else if (OP(scan) == CLOSE) {
5162    if (stopparen == (I32)ARG(scan)) {
5163     break;
5164    }
5165    if ((I32)ARG(scan) == is_par) {
5166     next = regnext(scan);
5167
5168     if ( next && (OP(next) != WHILEM) && next < last)
5169      is_par = 0;  /* Disable optimization */
5170    }
5171    if (data)
5172     *(data->last_closep) = ARG(scan);
5173   }
5174   else if (OP(scan) == EVAL) {
5175     if (data)
5176      data->flags |= SF_HAS_EVAL;
5177   }
5178   else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5179    if (flags & SCF_DO_SUBSTR) {
5180     scan_commit(pRExC_state, data, minlenp, is_inf);
5181     flags &= ~SCF_DO_SUBSTR;
5182    }
5183    if (data && OP(scan)==ACCEPT) {
5184     data->flags |= SCF_SEEN_ACCEPT;
5185     if (stopmin > min)
5186      stopmin = min;
5187    }
5188   }
5189   else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5190   {
5191     if (flags & SCF_DO_SUBSTR) {
5192      scan_commit(pRExC_state, data, minlenp, is_inf);
5193      data->longest = &(data->longest_float);
5194     }
5195     is_inf = is_inf_internal = 1;
5196     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5197      ssc_anything(data->start_class);
5198     flags &= ~SCF_DO_STCLASS;
5199   }
5200   else if (OP(scan) == GPOS) {
5201    if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5202     !(delta || is_inf || (data && data->pos_delta)))
5203    {
5204     if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5205      RExC_rx->intflags |= PREGf_ANCH_GPOS;
5206     if (RExC_rx->gofs < (STRLEN)min)
5207      RExC_rx->gofs = min;
5208    } else {
5209     RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5210     RExC_rx->gofs = 0;
5211    }
5212   }
5213 #ifdef TRIE_STUDY_OPT
5214 #ifdef FULL_TRIE_STUDY
5215   else if (PL_regkind[OP(scan)] == TRIE) {
5216    /* NOTE - There is similar code to this block above for handling
5217    BRANCH nodes on the initial study.  If you change stuff here
5218    check there too. */
5219    regnode *trie_node= scan;
5220    regnode *tail= regnext(scan);
5221    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5222    SSize_t max1 = 0, min1 = SSize_t_MAX;
5223    regnode_ssc accum;
5224
5225    if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5226     /* Cannot merge strings after this. */
5227     scan_commit(pRExC_state, data, minlenp, is_inf);
5228    }
5229    if (flags & SCF_DO_STCLASS)
5230     ssc_init_zero(pRExC_state, &accum);
5231
5232    if (!trie->jump) {
5233     min1= trie->minlen;
5234     max1= trie->maxlen;
5235    } else {
5236     const regnode *nextbranch= NULL;
5237     U32 word;
5238
5239     for ( word=1 ; word <= trie->wordcount ; word++)
5240     {
5241      SSize_t deltanext=0, minnext=0, f = 0, fake;
5242      regnode_ssc this_class;
5243
5244      data_fake.flags = 0;
5245      if (data) {
5246       data_fake.whilem_c = data->whilem_c;
5247       data_fake.last_closep = data->last_closep;
5248      }
5249      else
5250       data_fake.last_closep = &fake;
5251      data_fake.pos_delta = delta;
5252      if (flags & SCF_DO_STCLASS) {
5253       ssc_init(pRExC_state, &this_class);
5254       data_fake.start_class = &this_class;
5255       f = SCF_DO_STCLASS_AND;
5256      }
5257      if (flags & SCF_WHILEM_VISITED_POS)
5258       f |= SCF_WHILEM_VISITED_POS;
5259
5260      if (trie->jump[word]) {
5261       if (!nextbranch)
5262        nextbranch = trie_node + trie->jump[0];
5263       scan= trie_node + trie->jump[word];
5264       /* We go from the jump point to the branch that follows
5265       it. Note this means we need the vestigal unused
5266       branches even though they arent otherwise used. */
5267       minnext = study_chunk(pRExC_state, &scan, minlenp,
5268        &deltanext, (regnode *)nextbranch, &data_fake,
5269        stopparen, recursed_depth, NULL, f,depth+1);
5270      }
5271      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5272       nextbranch= regnext((regnode*)nextbranch);
5273
5274      if (min1 > (SSize_t)(minnext + trie->minlen))
5275       min1 = minnext + trie->minlen;
5276      if (deltanext == SSize_t_MAX) {
5277       is_inf = is_inf_internal = 1;
5278       max1 = SSize_t_MAX;
5279      } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5280       max1 = minnext + deltanext + trie->maxlen;
5281
5282      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5283       pars++;
5284      if (data_fake.flags & SCF_SEEN_ACCEPT) {
5285       if ( stopmin > min + min1)
5286        stopmin = min + min1;
5287       flags &= ~SCF_DO_SUBSTR;
5288       if (data)
5289        data->flags |= SCF_SEEN_ACCEPT;
5290      }
5291      if (data) {
5292       if (data_fake.flags & SF_HAS_EVAL)
5293        data->flags |= SF_HAS_EVAL;
5294       data->whilem_c = data_fake.whilem_c;
5295      }
5296      if (flags & SCF_DO_STCLASS)
5297       ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5298     }
5299    }
5300    if (flags & SCF_DO_SUBSTR) {
5301     data->pos_min += min1;
5302     data->pos_delta += max1 - min1;
5303     if (max1 != min1 || is_inf)
5304      data->longest = &(data->longest_float);
5305    }
5306    min += min1;
5307    delta += max1 - min1;
5308    if (flags & SCF_DO_STCLASS_OR) {
5309     ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5310     if (min1) {
5311      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5312      flags &= ~SCF_DO_STCLASS;
5313     }
5314    }
5315    else if (flags & SCF_DO_STCLASS_AND) {
5316     if (min1) {
5317      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5318      flags &= ~SCF_DO_STCLASS;
5319     }
5320     else {
5321      /* Switch to OR mode: cache the old value of
5322      * data->start_class */
5323      INIT_AND_WITHP;
5324      StructCopy(data->start_class, and_withp, regnode_ssc);
5325      flags &= ~SCF_DO_STCLASS_AND;
5326      StructCopy(&accum, data->start_class, regnode_ssc);
5327      flags |= SCF_DO_STCLASS_OR;
5328     }
5329    }
5330    scan= tail;
5331    continue;
5332   }
5333 #else
5334   else if (PL_regkind[OP(scan)] == TRIE) {
5335    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5336    U8*bang=NULL;
5337
5338    min += trie->minlen;
5339    delta += (trie->maxlen - trie->minlen);
5340    flags &= ~SCF_DO_STCLASS; /* xxx */
5341    if (flags & SCF_DO_SUBSTR) {
5342     /* Cannot expect anything... */
5343     scan_commit(pRExC_state, data, minlenp, is_inf);
5344      data->pos_min += trie->minlen;
5345      data->pos_delta += (trie->maxlen - trie->minlen);
5346     if (trie->maxlen != trie->minlen)
5347      data->longest = &(data->longest_float);
5348     }
5349     if (trie->jump) /* no more substrings -- for now /grr*/
5350    flags &= ~SCF_DO_SUBSTR;
5351   }
5352 #endif /* old or new */
5353 #endif /* TRIE_STUDY_OPT */
5354
5355   /* Else: zero-length, ignore. */
5356   scan = regnext(scan);
5357  }
5358  /* If we are exiting a recursion we can unset its recursed bit
5359  * and allow ourselves to enter it again - no danger of an
5360  * infinite loop there.
5361  if (stopparen > -1 && recursed) {
5362   DEBUG_STUDYDATA("unset:", data,depth);
5363   PAREN_UNSET( recursed, stopparen);
5364  }
5365  */
5366  if (frame) {
5367   DEBUG_STUDYDATA("frame-end:",data,depth);
5368   DEBUG_PEEP("fend", scan, depth);
5369   /* restore previous context */
5370   last = frame->last;
5371   scan = frame->next;
5372   stopparen = frame->stop;
5373   recursed_depth = frame->prev_recursed_depth;
5374   depth = depth - 1;
5375
5376   frame = frame->prev;
5377   goto fake_study_recurse;
5378  }
5379
5380   finish:
5381  assert(!frame);
5382  DEBUG_STUDYDATA("pre-fin:",data,depth);
5383
5384  *scanp = scan;
5385  *deltap = is_inf_internal ? SSize_t_MAX : delta;
5386
5387  if (flags & SCF_DO_SUBSTR && is_inf)
5388   data->pos_delta = SSize_t_MAX - data->pos_min;
5389  if (is_par > (I32)U8_MAX)
5390   is_par = 0;
5391  if (is_par && pars==1 && data) {
5392   data->flags |= SF_IN_PAR;
5393   data->flags &= ~SF_HAS_PAR;
5394  }
5395  else if (pars && data) {
5396   data->flags |= SF_HAS_PAR;
5397   data->flags &= ~SF_IN_PAR;
5398  }
5399  if (flags & SCF_DO_STCLASS_OR)
5400   ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5401  if (flags & SCF_TRIE_RESTUDY)
5402   data->flags |=  SCF_TRIE_RESTUDY;
5403
5404  DEBUG_STUDYDATA("post-fin:",data,depth);
5405
5406  {
5407   SSize_t final_minlen= min < stopmin ? min : stopmin;
5408
5409   if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5410    RExC_maxlen = final_minlen + delta;
5411   }
5412   return final_minlen;
5413  }
5414  /* not-reached */
5415 }
5416
5417 STATIC U32
5418 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5419 {
5420  U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5421
5422  PERL_ARGS_ASSERT_ADD_DATA;
5423
5424  Renewc(RExC_rxi->data,
5425   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5426   char, struct reg_data);
5427  if(count)
5428   Renew(RExC_rxi->data->what, count + n, U8);
5429  else
5430   Newx(RExC_rxi->data->what, n, U8);
5431  RExC_rxi->data->count = count + n;
5432  Copy(s, RExC_rxi->data->what + count, n, U8);
5433  return count;
5434 }
5435
5436 /*XXX: todo make this not included in a non debugging perl */
5437 #ifndef PERL_IN_XSUB_RE
5438 void
5439 Perl_reginitcolors(pTHX)
5440 {
5441  dVAR;
5442  const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5443  if (s) {
5444   char *t = savepv(s);
5445   int i = 0;
5446   PL_colors[0] = t;
5447   while (++i < 6) {
5448    t = strchr(t, '\t');
5449    if (t) {
5450     *t = '\0';
5451     PL_colors[i] = ++t;
5452    }
5453    else
5454     PL_colors[i] = t = (char *)"";
5455   }
5456  } else {
5457   int i = 0;
5458   while (i < 6)
5459    PL_colors[i++] = (char *)"";
5460  }
5461  PL_colorset = 1;
5462 }
5463 #endif
5464
5465
5466 #ifdef TRIE_STUDY_OPT
5467 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5468  STMT_START {                                            \
5469   if (                                                \
5470    (data.flags & SCF_TRIE_RESTUDY)               \
5471    && ! restudied++                              \
5472   ) {                                                 \
5473    dOsomething;                                    \
5474    goto reStudy;                                   \
5475   }                                                   \
5476  } STMT_END
5477 #else
5478 #define CHECK_RESTUDY_GOTO_butfirst
5479 #endif
5480
5481 /*
5482  * pregcomp - compile a regular expression into internal code
5483  *
5484  * Decides which engine's compiler to call based on the hint currently in
5485  * scope
5486  */
5487
5488 #ifndef PERL_IN_XSUB_RE
5489
5490 /* return the currently in-scope regex engine (or the default if none)  */
5491
5492 regexp_engine const *
5493 Perl_current_re_engine(pTHX)
5494 {
5495  dVAR;
5496
5497  if (IN_PERL_COMPILETIME) {
5498   HV * const table = GvHV(PL_hintgv);
5499   SV **ptr;
5500
5501   if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5502    return &reh_regexp_engine;
5503   ptr = hv_fetchs(table, "regcomp", FALSE);
5504   if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5505    return &reh_regexp_engine;
5506   return INT2PTR(regexp_engine*,SvIV(*ptr));
5507  }
5508  else {
5509   SV *ptr;
5510   if (!PL_curcop->cop_hints_hash)
5511    return &reh_regexp_engine;
5512   ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5513   if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5514    return &reh_regexp_engine;
5515   return INT2PTR(regexp_engine*,SvIV(ptr));
5516  }
5517 }
5518
5519
5520 REGEXP *
5521 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5522 {
5523  dVAR;
5524  regexp_engine const *eng = current_re_engine();
5525  GET_RE_DEBUG_FLAGS_DECL;
5526
5527  PERL_ARGS_ASSERT_PREGCOMP;
5528
5529  /* Dispatch a request to compile a regexp to correct regexp engine. */
5530  DEBUG_COMPILE_r({
5531   PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5532       PTR2UV(eng));
5533  });
5534  return CALLREGCOMP_ENG(eng, pattern, flags);
5535 }
5536 #endif
5537
5538 /* public(ish) entry point for the perl core's own regex compiling code.
5539  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5540  * pattern rather than a list of OPs, and uses the internal engine rather
5541  * than the current one */
5542
5543 REGEXP *
5544 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5545 {
5546  SV *pat = pattern; /* defeat constness! */
5547  PERL_ARGS_ASSERT_RE_COMPILE;
5548  return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5549 #ifdef PERL_IN_XSUB_RE
5550         &my_reg_engine,
5551 #else
5552         &reh_regexp_engine,
5553 #endif
5554         NULL, NULL, rx_flags, 0);
5555 }
5556
5557
5558 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5559  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5560  * point to the realloced string and length.
5561  *
5562  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5563  * stuff added */
5564
5565 static void
5566 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5567      char **pat_p, STRLEN *plen_p, int num_code_blocks)
5568 {
5569  U8 *const src = (U8*)*pat_p;
5570  U8 *dst;
5571  int n=0;
5572  STRLEN s = 0, d = 0;
5573  bool do_end = 0;
5574  GET_RE_DEBUG_FLAGS_DECL;
5575
5576  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5577   "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5578
5579  Newx(dst, *plen_p * 2 + 1, U8);
5580
5581  while (s < *plen_p) {
5582   if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5583    dst[d]   = src[s];
5584   else {
5585    dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5586    dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
5587   }
5588   if (n < num_code_blocks) {
5589    if (!do_end && pRExC_state->code_blocks[n].start == s) {
5590     pRExC_state->code_blocks[n].start = d;
5591     assert(dst[d] == '(');
5592     do_end = 1;
5593    }
5594    else if (do_end && pRExC_state->code_blocks[n].end == s) {
5595     pRExC_state->code_blocks[n].end = d;
5596     assert(dst[d] == ')');
5597     do_end = 0;
5598     n++;
5599    }
5600   }
5601   s++;
5602   d++;
5603  }
5604  dst[d] = '\0';
5605  *plen_p = d;
5606  *pat_p = (char*) dst;
5607  SAVEFREEPV(*pat_p);
5608  RExC_orig_utf8 = RExC_utf8 = 1;
5609 }
5610
5611
5612
5613 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5614  * while recording any code block indices, and handling overloading,
5615  * nested qr// objects etc.  If pat is null, it will allocate a new
5616  * string, or just return the first arg, if there's only one.
5617  *
5618  * Returns the malloced/updated pat.
5619  * patternp and pat_count is the array of SVs to be concatted;
5620  * oplist is the optional list of ops that generated the SVs;
5621  * recompile_p is a pointer to a boolean that will be set if
5622  *   the regex will need to be recompiled.
5623  * delim, if non-null is an SV that will be inserted between each element
5624  */
5625
5626 static SV*
5627 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5628     SV *pat, SV ** const patternp, int pat_count,
5629     OP *oplist, bool *recompile_p, SV *delim)
5630 {
5631  SV **svp;
5632  int n = 0;
5633  bool use_delim = FALSE;
5634  bool alloced = FALSE;
5635
5636  /* if we know we have at least two args, create an empty string,
5637  * then concatenate args to that. For no args, return an empty string */
5638  if (!pat && pat_count != 1) {
5639   pat = newSVpvn("", 0);
5640   SAVEFREESV(pat);
5641   alloced = TRUE;
5642  }
5643
5644  for (svp = patternp; svp < patternp + pat_count; svp++) {
5645   SV *sv;
5646   SV *rx  = NULL;
5647   STRLEN orig_patlen = 0;
5648   bool code = 0;
5649   SV *msv = use_delim ? delim : *svp;
5650   if (!msv) msv = &PL_sv_undef;
5651
5652   /* if we've got a delimiter, we go round the loop twice for each
5653   * svp slot (except the last), using the delimiter the second
5654   * time round */
5655   if (use_delim) {
5656    svp--;
5657    use_delim = FALSE;
5658   }
5659   else if (delim)
5660    use_delim = TRUE;
5661
5662   if (SvTYPE(msv) == SVt_PVAV) {
5663    /* we've encountered an interpolated array within
5664    * the pattern, e.g. /...@a..../. Expand the list of elements,
5665    * then recursively append elements.
5666    * The code in this block is based on S_pushav() */
5667
5668    AV *const av = (AV*)msv;
5669    const SSize_t maxarg = AvFILL(av) + 1;
5670    SV **array;
5671
5672    if (oplist) {
5673     assert(oplist->op_type == OP_PADAV
5674      || oplist->op_type == OP_RV2AV);
5675     oplist = oplist->op_sibling;;
5676    }
5677
5678    if (SvRMAGICAL(av)) {
5679     SSize_t i;
5680
5681     Newx(array, maxarg, SV*);
5682     SAVEFREEPV(array);
5683     for (i=0; i < maxarg; i++) {
5684      SV ** const svp = av_fetch(av, i, FALSE);
5685      array[i] = svp ? *svp : &PL_sv_undef;
5686     }
5687    }
5688    else
5689     array = AvARRAY(av);
5690
5691    pat = S_concat_pat(aTHX_ pRExC_state, pat,
5692         array, maxarg, NULL, recompile_p,
5693         /* $" */
5694         GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5695
5696    continue;
5697   }
5698
5699
5700   /* we make the assumption here that each op in the list of
5701   * op_siblings maps to one SV pushed onto the stack,
5702   * except for code blocks, with have both an OP_NULL and
5703   * and OP_CONST.
5704   * This allows us to match up the list of SVs against the
5705   * list of OPs to find the next code block.
5706   *
5707   * Note that       PUSHMARK PADSV PADSV ..
5708   * is optimised to
5709   *                 PADRANGE PADSV  PADSV  ..
5710   * so the alignment still works. */
5711
5712   if (oplist) {
5713    if (oplist->op_type == OP_NULL
5714     && (oplist->op_flags & OPf_SPECIAL))
5715    {
5716     assert(n < pRExC_state->num_code_blocks);
5717     pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5718     pRExC_state->code_blocks[n].block = oplist;
5719     pRExC_state->code_blocks[n].src_regex = NULL;
5720     n++;
5721     code = 1;
5722     oplist = oplist->op_sibling; /* skip CONST */
5723     assert(oplist);
5724    }
5725    oplist = oplist->op_sibling;;
5726   }
5727
5728   /* apply magic and QR overloading to arg */
5729
5730   SvGETMAGIC(msv);
5731   if (SvROK(msv) && SvAMAGIC(msv)) {
5732    SV *sv = AMG_CALLunary(msv, regexp_amg);
5733    if (sv) {
5734     if (SvROK(sv))
5735      sv = SvRV(sv);
5736     if (SvTYPE(sv) != SVt_REGEXP)
5737      Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5738     msv = sv;
5739    }
5740   }
5741
5742   /* try concatenation overload ... */
5743   if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5744     (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5745   {
5746    sv_setsv(pat, sv);
5747    /* overloading involved: all bets are off over literal
5748    * code. Pretend we haven't seen it */
5749    pRExC_state->num_code_blocks -= n;
5750    n = 0;
5751   }
5752   else  {
5753    /* ... or failing that, try "" overload */
5754    while (SvAMAGIC(msv)
5755      && (sv = AMG_CALLunary(msv, string_amg))
5756      && sv != msv
5757      &&  !(   SvROK(msv)
5758       && SvROK(sv)
5759       && SvRV(msv) == SvRV(sv))
5760    ) {
5761     msv = sv;
5762     SvGETMAGIC(msv);
5763    }
5764    if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5765     msv = SvRV(msv);
5766
5767    if (pat) {
5768     /* this is a partially unrolled
5769     *     sv_catsv_nomg(pat, msv);
5770     * that allows us to adjust code block indices if
5771     * needed */
5772     STRLEN dlen;
5773     char *dst = SvPV_force_nomg(pat, dlen);
5774     orig_patlen = dlen;
5775     if (SvUTF8(msv) && !SvUTF8(pat)) {
5776      S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5777      sv_setpvn(pat, dst, dlen);
5778      SvUTF8_on(pat);
5779     }
5780     sv_catsv_nomg(pat, msv);
5781     rx = msv;
5782    }
5783    else
5784     pat = msv;
5785
5786    if (code)
5787     pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5788   }
5789
5790   /* extract any code blocks within any embedded qr//'s */
5791   if (rx && SvTYPE(rx) == SVt_REGEXP
5792    && RX_ENGINE((REGEXP*)rx)->op_comp)
5793   {
5794
5795    RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5796    if (ri->num_code_blocks) {
5797     int i;
5798     /* the presence of an embedded qr// with code means
5799     * we should always recompile: the text of the
5800     * qr// may not have changed, but it may be a
5801     * different closure than last time */
5802     *recompile_p = 1;
5803     Renew(pRExC_state->code_blocks,
5804      pRExC_state->num_code_blocks + ri->num_code_blocks,
5805      struct reg_code_block);
5806     pRExC_state->num_code_blocks += ri->num_code_blocks;
5807
5808     for (i=0; i < ri->num_code_blocks; i++) {
5809      struct reg_code_block *src, *dst;
5810      STRLEN offset =  orig_patlen
5811       + ReANY((REGEXP *)rx)->pre_prefix;
5812      assert(n < pRExC_state->num_code_blocks);
5813      src = &ri->code_blocks[i];
5814      dst = &pRExC_state->code_blocks[n];
5815      dst->start     = src->start + offset;
5816      dst->end     = src->end   + offset;
5817      dst->block     = src->block;
5818      dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5819            src->src_regex
5820             ? src->src_regex
5821             : (REGEXP*)rx);
5822      n++;
5823     }
5824    }
5825   }
5826  }
5827  /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5828  if (alloced)
5829   SvSETMAGIC(pat);
5830
5831  return pat;
5832 }
5833
5834
5835
5836 /* see if there are any run-time code blocks in the pattern.
5837  * False positives are allowed */
5838
5839 static bool
5840 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5841      char *pat, STRLEN plen)
5842 {
5843  int n = 0;
5844  STRLEN s;
5845
5846  for (s = 0; s < plen; s++) {
5847   if (n < pRExC_state->num_code_blocks
5848    && s == pRExC_state->code_blocks[n].start)
5849   {
5850    s = pRExC_state->code_blocks[n].end;
5851    n++;
5852    continue;
5853   }
5854   /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5855   * positives here */
5856   if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5857    (pat[s+2] == '{'
5858     || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5859   )
5860    return 1;
5861  }
5862  return 0;
5863 }
5864
5865 /* Handle run-time code blocks. We will already have compiled any direct
5866  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5867  * copy of it, but with any literal code blocks blanked out and
5868  * appropriate chars escaped; then feed it into
5869  *
5870  *    eval "qr'modified_pattern'"
5871  *
5872  * For example,
5873  *
5874  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5875  *
5876  * becomes
5877  *
5878  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5879  *
5880  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5881  * and merge them with any code blocks of the original regexp.
5882  *
5883  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5884  * instead, just save the qr and return FALSE; this tells our caller that
5885  * the original pattern needs upgrading to utf8.
5886  */
5887
5888 static bool
5889 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5890  char *pat, STRLEN plen)
5891 {
5892  SV *qr;
5893
5894  GET_RE_DEBUG_FLAGS_DECL;
5895
5896  if (pRExC_state->runtime_code_qr) {
5897   /* this is the second time we've been called; this should
5898   * only happen if the main pattern got upgraded to utf8
5899   * during compilation; re-use the qr we compiled first time
5900   * round (which should be utf8 too)
5901   */
5902   qr = pRExC_state->runtime_code_qr;
5903   pRExC_state->runtime_code_qr = NULL;
5904   assert(RExC_utf8 && SvUTF8(qr));
5905  }
5906  else {
5907   int n = 0;
5908   STRLEN s;
5909   char *p, *newpat;
5910   int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5911   SV *sv, *qr_ref;
5912   dSP;
5913
5914   /* determine how many extra chars we need for ' and \ escaping */
5915   for (s = 0; s < plen; s++) {
5916    if (pat[s] == '\'' || pat[s] == '\\')
5917     newlen++;
5918   }
5919
5920   Newx(newpat, newlen, char);
5921   p = newpat;
5922   *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5923
5924   for (s = 0; s < plen; s++) {
5925    if (n < pRExC_state->num_code_blocks
5926     && s == pRExC_state->code_blocks[n].start)
5927    {
5928     /* blank out literal code block */
5929     assert(pat[s] == '(');
5930     while (s <= pRExC_state->code_blocks[n].end) {
5931      *p++ = '_';
5932      s++;
5933     }
5934     s--;
5935     n++;
5936     continue;
5937    }
5938    if (pat[s] == '\'' || pat[s] == '\\')
5939     *p++ = '\\';
5940    *p++ = pat[s];
5941   }
5942   *p++ = '\'';
5943   if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5944    *p++ = 'x';
5945   *p++ = '\0';
5946   DEBUG_COMPILE_r({
5947    PerlIO_printf(Perl_debug_log,
5948     "%sre-parsing pattern for runtime code:%s %s\n",
5949     PL_colors[4],PL_colors[5],newpat);
5950   });
5951
5952   sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5953   Safefree(newpat);
5954
5955   ENTER;
5956   SAVETMPS;
5957   save_re_context();
5958   PUSHSTACKi(PERLSI_REQUIRE);
5959   /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5960   * parsing qr''; normally only q'' does this. It also alters
5961   * hints handling */
5962   eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5963   SvREFCNT_dec_NN(sv);
5964   SPAGAIN;
5965   qr_ref = POPs;
5966   PUTBACK;
5967   {
5968    SV * const errsv = ERRSV;
5969    if (SvTRUE_NN(errsv))
5970    {
5971     Safefree(pRExC_state->code_blocks);
5972     /* use croak_sv ? */
5973     Perl_croak_nocontext("%"SVf, SVfARG(errsv));
5974    }
5975   }
5976   assert(SvROK(qr_ref));
5977   qr = SvRV(qr_ref);
5978   assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5979   /* the leaving below frees the tmp qr_ref.
5980   * Give qr a life of its own */
5981   SvREFCNT_inc(qr);
5982   POPSTACK;
5983   FREETMPS;
5984   LEAVE;
5985
5986  }
5987
5988  if (!RExC_utf8 && SvUTF8(qr)) {
5989   /* first time through; the pattern got upgraded; save the
5990   * qr for the next time through */
5991   assert(!pRExC_state->runtime_code_qr);
5992   pRExC_state->runtime_code_qr = qr;
5993   return 0;
5994  }
5995
5996
5997  /* extract any code blocks within the returned qr//  */
5998
5999
6000  /* merge the main (r1) and run-time (r2) code blocks into one */
6001  {
6002   RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6003   struct reg_code_block *new_block, *dst;
6004   RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6005   int i1 = 0, i2 = 0;
6006
6007   if (!r2->num_code_blocks) /* we guessed wrong */
6008   {
6009    SvREFCNT_dec_NN(qr);
6010    return 1;
6011   }
6012
6013   Newx(new_block,
6014    r1->num_code_blocks + r2->num_code_blocks,
6015    struct reg_code_block);
6016   dst = new_block;
6017
6018   while (    i1 < r1->num_code_blocks
6019     || i2 < r2->num_code_blocks)
6020   {
6021    struct reg_code_block *src;
6022    bool is_qr = 0;
6023
6024    if (i1 == r1->num_code_blocks) {
6025     src = &r2->code_blocks[i2++];
6026     is_qr = 1;
6027    }
6028    else if (i2 == r2->num_code_blocks)
6029     src = &r1->code_blocks[i1++];
6030    else if (  r1->code_blocks[i1].start
6031      < r2->code_blocks[i2].start)
6032    {
6033     src = &r1->code_blocks[i1++];
6034     assert(src->end < r2->code_blocks[i2].start);
6035    }
6036    else {
6037     assert(  r1->code_blocks[i1].start
6038      > r2->code_blocks[i2].start);
6039     src = &r2->code_blocks[i2++];
6040     is_qr = 1;
6041     assert(src->end < r1->code_blocks[i1].start);
6042    }
6043
6044    assert(pat[src->start] == '(');
6045    assert(pat[src->end]   == ')');
6046    dst->start     = src->start;
6047    dst->end     = src->end;
6048    dst->block     = src->block;
6049    dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6050          : src->src_regex;
6051    dst++;
6052   }
6053   r1->num_code_blocks += r2->num_code_blocks;
6054   Safefree(r1->code_blocks);
6055   r1->code_blocks = new_block;
6056  }
6057
6058  SvREFCNT_dec_NN(qr);
6059  return 1;
6060 }
6061
6062
6063 STATIC bool
6064 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6065      SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6066      SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6067      STRLEN longest_length, bool eol, bool meol)
6068 {
6069  /* This is the common code for setting up the floating and fixed length
6070  * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6071  * as to whether succeeded or not */
6072
6073  I32 t;
6074  SSize_t ml;
6075
6076  if (! (longest_length
6077   || (eol /* Can't have SEOL and MULTI */
6078    && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6079   )
6080    /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6081   || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6082  {
6083   return FALSE;
6084  }
6085
6086  /* copy the information about the longest from the reg_scan_data
6087   over to the program. */
6088  if (SvUTF8(sv_longest)) {
6089   *rx_utf8 = sv_longest;
6090   *rx_substr = NULL;
6091  } else {
6092   *rx_substr = sv_longest;
6093   *rx_utf8 = NULL;
6094  }
6095  /* end_shift is how many chars that must be matched that
6096   follow this item. We calculate it ahead of time as once the
6097   lookbehind offset is added in we lose the ability to correctly
6098   calculate it.*/
6099  ml = minlen ? *(minlen) : (SSize_t)longest_length;
6100  *rx_end_shift = ml - offset
6101   - longest_length + (SvTAIL(sv_longest) != 0)
6102   + lookbehind;
6103
6104  t = (eol/* Can't have SEOL and MULTI */
6105   && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6106  fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6107
6108  return TRUE;
6109 }
6110
6111 /*
6112  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6113  * regular expression into internal code.
6114  * The pattern may be passed either as:
6115  *    a list of SVs (patternp plus pat_count)
6116  *    a list of OPs (expr)
6117  * If both are passed, the SV list is used, but the OP list indicates
6118  * which SVs are actually pre-compiled code blocks
6119  *
6120  * The SVs in the list have magic and qr overloading applied to them (and
6121  * the list may be modified in-place with replacement SVs in the latter
6122  * case).
6123  *
6124  * If the pattern hasn't changed from old_re, then old_re will be
6125  * returned.
6126  *
6127  * eng is the current engine. If that engine has an op_comp method, then
6128  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6129  * do the initial concatenation of arguments and pass on to the external
6130  * engine.
6131  *
6132  * If is_bare_re is not null, set it to a boolean indicating whether the
6133  * arg list reduced (after overloading) to a single bare regex which has
6134  * been returned (i.e. /$qr/).
6135  *
6136  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6137  *
6138  * pm_flags contains the PMf_* flags, typically based on those from the
6139  * pm_flags field of the related PMOP. Currently we're only interested in
6140  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6141  *
6142  * We can't allocate space until we know how big the compiled form will be,
6143  * but we can't compile it (and thus know how big it is) until we've got a
6144  * place to put the code.  So we cheat:  we compile it twice, once with code
6145  * generation turned off and size counting turned on, and once "for real".
6146  * This also means that we don't allocate space until we are sure that the
6147  * thing really will compile successfully, and we never have to move the
6148  * code and thus invalidate pointers into it.  (Note that it has to be in
6149  * one piece because free() must be able to free it all.) [NB: not true in perl]
6150  *
6151  * Beware that the optimization-preparation code in here knows about some
6152  * of the structure of the compiled regexp.  [I'll say.]
6153  */
6154
6155 REGEXP *
6156 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6157      OP *expr, const regexp_engine* eng, REGEXP *old_re,
6158      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6159 {
6160  dVAR;
6161  REGEXP *rx;
6162  struct regexp *r;
6163  regexp_internal *ri;
6164  STRLEN plen;
6165  char *exp;
6166  regnode *scan;
6167  I32 flags;
6168  SSize_t minlen = 0;
6169  U32 rx_flags;
6170  SV *pat;
6171  SV *code_blocksv = NULL;
6172  SV** new_patternp = patternp;
6173
6174  /* these are all flags - maybe they should be turned
6175  * into a single int with different bit masks */
6176  I32 sawlookahead = 0;
6177  I32 sawplus = 0;
6178  I32 sawopen = 0;
6179  I32 sawminmod = 0;
6180
6181  regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6182  bool recompile = 0;
6183  bool runtime_code = 0;
6184  scan_data_t data;
6185  RExC_state_t RExC_state;
6186  RExC_state_t * const pRExC_state = &RExC_state;
6187 #ifdef TRIE_STUDY_OPT
6188  int restudied = 0;
6189  RExC_state_t copyRExC_state;
6190 #endif
6191  GET_RE_DEBUG_FLAGS_DECL;
6192
6193  PERL_ARGS_ASSERT_RE_OP_COMPILE;
6194
6195  DEBUG_r(if (!PL_colorset) reginitcolors());
6196
6197 #ifndef PERL_IN_XSUB_RE
6198  /* Initialize these here instead of as-needed, as is quick and avoids
6199  * having to test them each time otherwise */
6200  if (! PL_AboveLatin1) {
6201   PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6202   PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6203   PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6204   PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6205   PL_HasMultiCharFold =
6206      _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6207  }
6208 #endif
6209
6210  pRExC_state->code_blocks = NULL;
6211  pRExC_state->num_code_blocks = 0;
6212
6213  if (is_bare_re)
6214   *is_bare_re = FALSE;
6215
6216  if (expr && (expr->op_type == OP_LIST ||
6217     (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6218   /* allocate code_blocks if needed */
6219   OP *o;
6220   int ncode = 0;
6221
6222   for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6223    if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6224     ncode++; /* count of DO blocks */
6225   if (ncode) {
6226    pRExC_state->num_code_blocks = ncode;
6227    Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6228   }
6229  }
6230
6231  if (!pat_count) {
6232   /* compile-time pattern with just OP_CONSTs and DO blocks */
6233
6234   int n;
6235   OP *o;
6236
6237   /* find how many CONSTs there are */
6238   assert(expr);
6239   n = 0;
6240   if (expr->op_type == OP_CONST)
6241    n = 1;
6242   else
6243    for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6244     if (o->op_type == OP_CONST)
6245      n++;
6246    }
6247
6248   /* fake up an SV array */
6249
6250   assert(!new_patternp);
6251   Newx(new_patternp, n, SV*);
6252   SAVEFREEPV(new_patternp);
6253   pat_count = n;
6254
6255   n = 0;
6256   if (expr->op_type == OP_CONST)
6257    new_patternp[n] = cSVOPx_sv(expr);
6258   else
6259    for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6260     if (o->op_type == OP_CONST)
6261      new_patternp[n++] = cSVOPo_sv;
6262    }
6263
6264  }
6265
6266  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6267   "Assembling pattern from %d elements%s\n", pat_count,
6268    orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6269
6270  /* set expr to the first arg op */
6271
6272  if (pRExC_state->num_code_blocks
6273   && expr->op_type != OP_CONST)
6274  {
6275    expr = cLISTOPx(expr)->op_first;
6276    assert(   expr->op_type == OP_PUSHMARK
6277     || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6278     || expr->op_type == OP_PADRANGE);
6279    expr = expr->op_sibling;
6280  }
6281
6282  pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6283       expr, &recompile, NULL);
6284
6285  /* handle bare (possibly after overloading) regex: foo =~ $re */
6286  {
6287   SV *re = pat;
6288   if (SvROK(re))
6289    re = SvRV(re);
6290   if (SvTYPE(re) == SVt_REGEXP) {
6291    if (is_bare_re)
6292     *is_bare_re = TRUE;
6293    SvREFCNT_inc(re);
6294    Safefree(pRExC_state->code_blocks);
6295    DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6296     "Precompiled pattern%s\n",
6297      orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6298
6299    return (REGEXP*)re;
6300   }
6301  }
6302
6303  exp = SvPV_nomg(pat, plen);
6304
6305  if (!eng->op_comp) {
6306   if ((SvUTF8(pat) && IN_BYTES)
6307     || SvGMAGICAL(pat) || SvAMAGIC(pat))
6308   {
6309    /* make a temporary copy; either to convert to bytes,
6310    * or to avoid repeating get-magic / overloaded stringify */
6311    pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6312           (IN_BYTES ? 0 : SvUTF8(pat)));
6313   }
6314   Safefree(pRExC_state->code_blocks);
6315   return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6316  }
6317
6318  /* ignore the utf8ness if the pattern is 0 length */
6319  RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6320  RExC_uni_semantics = 0;
6321  RExC_contains_locale = 0;
6322  RExC_contains_i = 0;
6323  pRExC_state->runtime_code_qr = NULL;
6324
6325  DEBUG_COMPILE_r({
6326    SV *dsv= sv_newmortal();
6327    RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6328    PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6329       PL_colors[4],PL_colors[5],s);
6330   });
6331
6332   redo_first_pass:
6333  /* we jump here if we upgrade the pattern to utf8 and have to
6334  * recompile */
6335
6336  if ((pm_flags & PMf_USE_RE_EVAL)
6337     /* this second condition covers the non-regex literal case,
6338     * i.e.  $foo =~ '(?{})'. */
6339     || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6340  )
6341   runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6342
6343  /* return old regex if pattern hasn't changed */
6344  /* XXX: note in the below we have to check the flags as well as the
6345  * pattern.
6346  *
6347  * Things get a touch tricky as we have to compare the utf8 flag
6348  * independently from the compile flags.  */
6349
6350  if (   old_re
6351   && !recompile
6352   && !!RX_UTF8(old_re) == !!RExC_utf8
6353   && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6354   && RX_PRECOMP(old_re)
6355   && RX_PRELEN(old_re) == plen
6356   && memEQ(RX_PRECOMP(old_re), exp, plen)
6357   && !runtime_code /* with runtime code, always recompile */ )
6358  {
6359   Safefree(pRExC_state->code_blocks);
6360   return old_re;
6361  }
6362
6363  rx_flags = orig_rx_flags;
6364
6365  if (rx_flags & PMf_FOLD) {
6366   RExC_contains_i = 1;
6367  }
6368  if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6369
6370   /* Set to use unicode semantics if the pattern is in utf8 and has the
6371   * 'depends' charset specified, as it means unicode when utf8  */
6372   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6373  }
6374
6375  RExC_precomp = exp;
6376  RExC_flags = rx_flags;
6377  RExC_pm_flags = pm_flags;
6378
6379  if (runtime_code) {
6380   if (TAINTING_get && TAINT_get)
6381    Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6382
6383   if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6384    /* whoops, we have a non-utf8 pattern, whilst run-time code
6385    * got compiled as utf8. Try again with a utf8 pattern */
6386    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6387          pRExC_state->num_code_blocks);
6388    goto redo_first_pass;
6389   }
6390  }
6391  assert(!pRExC_state->runtime_code_qr);
6392
6393  RExC_sawback = 0;
6394
6395  RExC_seen = 0;
6396  RExC_maxlen = 0;
6397  RExC_in_lookbehind = 0;
6398  RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6399  RExC_extralen = 0;
6400  RExC_override_recoding = 0;
6401  RExC_in_multi_char_class = 0;
6402
6403  /* First pass: determine size, legality. */
6404  RExC_parse = exp;
6405  RExC_start = exp;
6406  RExC_end = exp + plen;
6407  RExC_naughty = 0;
6408  RExC_npar = 1;
6409  RExC_nestroot = 0;
6410  RExC_size = 0L;
6411  RExC_emit = (regnode *) &RExC_emit_dummy;
6412  RExC_whilem_seen = 0;
6413  RExC_open_parens = NULL;
6414  RExC_close_parens = NULL;
6415  RExC_opend = NULL;
6416  RExC_paren_names = NULL;
6417 #ifdef DEBUGGING
6418  RExC_paren_name_list = NULL;
6419 #endif
6420  RExC_recurse = NULL;
6421  RExC_study_chunk_recursed = NULL;
6422  RExC_study_chunk_recursed_bytes= 0;
6423  RExC_recurse_count = 0;
6424  pRExC_state->code_index = 0;
6425
6426 #if 0 /* REGC() is (currently) a NOP at the first pass.
6427  * Clever compilers notice this and complain. --jhi */
6428  REGC((U8)REG_MAGIC, (char*)RExC_emit);
6429 #endif
6430  DEBUG_PARSE_r(
6431   PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6432   RExC_lastnum=0;
6433   RExC_lastparse=NULL;
6434  );
6435  /* reg may croak on us, not giving us a chance to free
6436  pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6437  need it to survive as long as the regexp (qr/(?{})/).
6438  We must check that code_blocksv is not already set, because we may
6439  have jumped back to restart the sizing pass. */
6440  if (pRExC_state->code_blocks && !code_blocksv) {
6441   code_blocksv = newSV_type(SVt_PV);
6442   SAVEFREESV(code_blocksv);
6443   SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6444   SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6445  }
6446  if (reg(pRExC_state, 0, &flags,1) == NULL) {
6447   /* It's possible to write a regexp in ascii that represents Unicode
6448   codepoints outside of the byte range, such as via \x{100}. If we
6449   detect such a sequence we have to convert the entire pattern to utf8
6450   and then recompile, as our sizing calculation will have been based
6451   on 1 byte == 1 character, but we will need to use utf8 to encode
6452   at least some part of the pattern, and therefore must convert the whole
6453   thing.
6454   -- dmq */
6455   if (flags & RESTART_UTF8) {
6456    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6457          pRExC_state->num_code_blocks);
6458    goto redo_first_pass;
6459   }
6460   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6461  }
6462  if (code_blocksv)
6463   SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6464
6465  DEBUG_PARSE_r({
6466   PerlIO_printf(Perl_debug_log,
6467    "Required size %"IVdf" nodes\n"
6468    "Starting second pass (creation)\n",
6469    (IV)RExC_size);
6470   RExC_lastnum=0;
6471   RExC_lastparse=NULL;
6472  });
6473
6474  /* The first pass could have found things that force Unicode semantics */
6475  if ((RExC_utf8 || RExC_uni_semantics)
6476   && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6477  {
6478   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6479  }
6480
6481  /* Small enough for pointer-storage convention?
6482  If extralen==0, this means that we will not need long jumps. */
6483  if (RExC_size >= 0x10000L && RExC_extralen)
6484   RExC_size += RExC_extralen;
6485  else
6486   RExC_extralen = 0;
6487  if (RExC_whilem_seen > 15)
6488   RExC_whilem_seen = 15;
6489
6490  /* Allocate space and zero-initialize. Note, the two step process
6491  of zeroing when in debug mode, thus anything assigned has to
6492  happen after that */
6493  rx = (REGEXP*) newSV_type(SVt_REGEXP);
6494  r = ReANY(rx);
6495  Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6496   char, regexp_internal);
6497  if ( r == NULL || ri == NULL )
6498   FAIL("Regexp out of space");
6499 #ifdef DEBUGGING
6500  /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6501  Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6502   char);
6503 #else
6504  /* bulk initialize base fields with 0. */
6505  Zero(ri, sizeof(regexp_internal), char);
6506 #endif
6507
6508  /* non-zero initialization begins here */
6509  RXi_SET( r, ri );
6510  r->engine= eng;
6511  r->extflags = rx_flags;
6512  RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6513
6514  if (pm_flags & PMf_IS_QR) {
6515   ri->code_blocks = pRExC_state->code_blocks;
6516   ri->num_code_blocks = pRExC_state->num_code_blocks;
6517  }
6518  else
6519  {
6520   int n;
6521   for (n = 0; n < pRExC_state->num_code_blocks; n++)
6522    if (pRExC_state->code_blocks[n].src_regex)
6523     SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6524   SAVEFREEPV(pRExC_state->code_blocks);
6525  }
6526
6527  {
6528   bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6529   bool has_charset = (get_regex_charset(r->extflags)
6530              != REGEX_DEPENDS_CHARSET);
6531
6532   /* The caret is output if there are any defaults: if not all the STD
6533   * flags are set, or if no character set specifier is needed */
6534   bool has_default =
6535      (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6536      || ! has_charset);
6537   bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6538             == REG_RUN_ON_COMMENT_SEEN);
6539   U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6540        >> RXf_PMf_STD_PMMOD_SHIFT);
6541   const char *fptr = STD_PAT_MODS;        /*"msix"*/
6542   char *p;
6543   /* Allocate for the worst case, which is all the std flags are turned
6544   * on.  If more precision is desired, we could do a population count of
6545   * the flags set.  This could be done with a small lookup table, or by
6546   * shifting, masking and adding, or even, when available, assembly
6547   * language for a machine-language population count.
6548   * We never output a minus, as all those are defaults, so are
6549   * covered by the caret */
6550   const STRLEN wraplen = plen + has_p + has_runon
6551    + has_default       /* If needs a caret */
6552
6553     /* If needs a character set specifier */
6554    + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6555    + (sizeof(STD_PAT_MODS) - 1)
6556    + (sizeof("(?:)") - 1);
6557
6558   Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6559   r->xpv_len_u.xpvlenu_pv = p;
6560   if (RExC_utf8)
6561    SvFLAGS(rx) |= SVf_UTF8;
6562   *p++='('; *p++='?';
6563
6564   /* If a default, cover it using the caret */
6565   if (has_default) {
6566    *p++= DEFAULT_PAT_MOD;
6567   }
6568   if (has_charset) {
6569    STRLEN len;
6570    const char* const name = get_regex_charset_name(r->extflags, &len);
6571    Copy(name, p, len, char);
6572    p += len;
6573   }
6574   if (has_p)
6575    *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6576   {
6577    char ch;
6578    while((ch = *fptr++)) {
6579     if(reganch & 1)
6580      *p++ = ch;
6581     reganch >>= 1;
6582    }
6583   }
6584
6585   *p++ = ':';
6586   Copy(RExC_precomp, p, plen, char);
6587   assert ((RX_WRAPPED(rx) - p) < 16);
6588   r->pre_prefix = p - RX_WRAPPED(rx);
6589   p += plen;
6590   if (has_runon)
6591    *p++ = '\n';
6592   *p++ = ')';
6593   *p = 0;
6594   SvCUR_set(rx, p - RX_WRAPPED(rx));
6595  }
6596
6597  r->intflags = 0;
6598  r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6599
6600  /* setup various meta data about recursion, this all requires
6601  * RExC_npar to be correctly set, and a bit later on we clear it */
6602  if (RExC_seen & REG_RECURSE_SEEN) {
6603   Newxz(RExC_open_parens, RExC_npar,regnode *);
6604   SAVEFREEPV(RExC_open_parens);
6605   Newxz(RExC_close_parens,RExC_npar,regnode *);
6606   SAVEFREEPV(RExC_close_parens);
6607  }
6608  if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6609   /* Note, RExC_npar is 1 + the number of parens in a pattern.
6610   * So its 1 if there are no parens. */
6611   RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6612           ((RExC_npar & 0x07) != 0);
6613   Newx(RExC_study_chunk_recursed,
6614    RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6615   SAVEFREEPV(RExC_study_chunk_recursed);
6616  }
6617
6618  /* Useful during FAIL. */
6619 #ifdef RE_TRACK_PATTERN_OFFSETS
6620  Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6621  DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6622       "%s %"UVuf" bytes for offset annotations.\n",
6623       ri->u.offsets ? "Got" : "Couldn't get",
6624       (UV)((2*RExC_size+1) * sizeof(U32))));
6625 #endif
6626  SetProgLen(ri,RExC_size);
6627  RExC_rx_sv = rx;
6628  RExC_rx = r;
6629  RExC_rxi = ri;
6630  REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
6631
6632  /* Second pass: emit code. */
6633  RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6634  RExC_pm_flags = pm_flags;
6635  RExC_parse = exp;
6636  RExC_end = exp + plen;
6637  RExC_naughty = 0;
6638  RExC_npar = 1;
6639  RExC_emit_start = ri->program;
6640  RExC_emit = ri->program;
6641  RExC_emit_bound = ri->program + RExC_size + 1;
6642  pRExC_state->code_index = 0;
6643
6644  REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6645  if (reg(pRExC_state, 0, &flags,1) == NULL) {
6646   ReREFCNT_dec(rx);
6647   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6648  }
6649  /* XXXX To minimize changes to RE engine we always allocate
6650  3-units-long substrs field. */
6651  Newx(r->substrs, 1, struct reg_substr_data);
6652  if (RExC_recurse_count) {
6653   Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6654   SAVEFREEPV(RExC_recurse);
6655  }
6656
6657 reStudy:
6658  r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6659  Zero(r->substrs, 1, struct reg_substr_data);
6660  if (RExC_study_chunk_recursed)
6661   Zero(RExC_study_chunk_recursed,
6662    RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6663
6664 #ifdef TRIE_STUDY_OPT
6665  if (!restudied) {
6666   StructCopy(&zero_scan_data, &data, scan_data_t);
6667   copyRExC_state = RExC_state;
6668  } else {
6669   U32 seen=RExC_seen;
6670   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6671
6672   RExC_state = copyRExC_state;
6673   if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6674    RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6675   else
6676    RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6677   StructCopy(&zero_scan_data, &data, scan_data_t);
6678  }
6679 #else
6680  StructCopy(&zero_scan_data, &data, scan_data_t);
6681 #endif
6682
6683  /* Dig out information for optimizations. */
6684  r->extflags = RExC_flags; /* was pm_op */
6685  /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6686
6687  if (UTF)
6688   SvUTF8_on(rx); /* Unicode in it? */
6689  ri->regstclass = NULL;
6690  if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6691   r->intflags |= PREGf_NAUGHTY;
6692  scan = ri->program + 1;  /* First BRANCH. */
6693
6694  /* testing for BRANCH here tells us whether there is "must appear"
6695  data in the pattern. If there is then we can use it for optimisations */
6696  if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
6697             */
6698   SSize_t fake;
6699   STRLEN longest_float_length, longest_fixed_length;
6700   regnode_ssc ch_class; /* pointed to by data */
6701   int stclass_flag;
6702   SSize_t last_close = 0; /* pointed to by data */
6703   regnode *first= scan;
6704   regnode *first_next= regnext(first);
6705   /*
6706   * Skip introductions and multiplicators >= 1
6707   * so that we can extract the 'meat' of the pattern that must
6708   * match in the large if() sequence following.
6709   * NOTE that EXACT is NOT covered here, as it is normally
6710   * picked up by the optimiser separately.
6711   *
6712   * This is unfortunate as the optimiser isnt handling lookahead
6713   * properly currently.
6714   *
6715   */
6716   while ((OP(first) == OPEN && (sawopen = 1)) ||
6717    /* An OR of *one* alternative - should not happen now. */
6718    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6719    /* for now we can't handle lookbehind IFMATCH*/
6720    (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6721    (OP(first) == PLUS) ||
6722    (OP(first) == MINMOD) ||
6723    /* An {n,m} with n>0 */
6724    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6725    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6726   {
6727     /*
6728     * the only op that could be a regnode is PLUS, all the rest
6729     * will be regnode_1 or regnode_2.
6730     *
6731     * (yves doesn't think this is true)
6732     */
6733     if (OP(first) == PLUS)
6734      sawplus = 1;
6735     else {
6736      if (OP(first) == MINMOD)
6737       sawminmod = 1;
6738      first += regarglen[OP(first)];
6739     }
6740     first = NEXTOPER(first);
6741     first_next= regnext(first);
6742   }
6743
6744   /* Starting-point info. */
6745  again:
6746   DEBUG_PEEP("first:",first,0);
6747   /* Ignore EXACT as we deal with it later. */
6748   if (PL_regkind[OP(first)] == EXACT) {
6749    if (OP(first) == EXACT)
6750     NOOP; /* Empty, get anchored substr later. */
6751    else
6752     ri->regstclass = first;
6753   }
6754 #ifdef TRIE_STCLASS
6755   else if (PL_regkind[OP(first)] == TRIE &&
6756     ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6757   {
6758    regnode *trie_op;
6759    /* this can happen only on restudy */
6760    if ( OP(first) == TRIE ) {
6761     struct regnode_1 *trieop = (struct regnode_1 *)
6762      PerlMemShared_calloc(1, sizeof(struct regnode_1));
6763     StructCopy(first,trieop,struct regnode_1);
6764     trie_op=(regnode *)trieop;
6765    } else {
6766     struct regnode_charclass *trieop = (struct regnode_charclass *)
6767      PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6768     StructCopy(first,trieop,struct regnode_charclass);
6769     trie_op=(regnode *)trieop;
6770    }
6771    OP(trie_op)+=2;
6772    make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6773    ri->regstclass = trie_op;
6774   }
6775 #endif
6776   else if (REGNODE_SIMPLE(OP(first)))
6777    ri->regstclass = first;
6778   else if (PL_regkind[OP(first)] == BOUND ||
6779     PL_regkind[OP(first)] == NBOUND)
6780    ri->regstclass = first;
6781   else if (PL_regkind[OP(first)] == BOL) {
6782    r->intflags |= (OP(first) == MBOL
6783       ? PREGf_ANCH_MBOL
6784       : (OP(first) == SBOL
6785        ? PREGf_ANCH_SBOL
6786        : PREGf_ANCH_BOL));
6787    first = NEXTOPER(first);
6788    goto again;
6789   }
6790   else if (OP(first) == GPOS) {
6791    r->intflags |= PREGf_ANCH_GPOS;
6792    first = NEXTOPER(first);
6793    goto again;
6794   }
6795   else if ((!sawopen || !RExC_sawback) &&
6796    (OP(first) == STAR &&
6797    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6798    !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
6799   {
6800    /* turn .* into ^.* with an implied $*=1 */
6801    const int type =
6802     (OP(NEXTOPER(first)) == REG_ANY)
6803      ? PREGf_ANCH_MBOL
6804      : PREGf_ANCH_SBOL;
6805    r->intflags |= (type | PREGf_IMPLICIT);
6806    first = NEXTOPER(first);
6807    goto again;
6808   }
6809   if (sawplus && !sawminmod && !sawlookahead
6810    && (!sawopen || !RExC_sawback)
6811    && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6812    /* x+ must match at the 1st pos of run of x's */
6813    r->intflags |= PREGf_SKIP;
6814
6815   /* Scan is after the zeroth branch, first is atomic matcher. */
6816 #ifdef TRIE_STUDY_OPT
6817   DEBUG_PARSE_r(
6818    if (!restudied)
6819     PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6820        (IV)(first - scan + 1))
6821   );
6822 #else
6823   DEBUG_PARSE_r(
6824    PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6825     (IV)(first - scan + 1))
6826   );
6827 #endif
6828
6829
6830   /*
6831   * If there's something expensive in the r.e., find the
6832   * longest literal string that must appear and make it the
6833   * regmust.  Resolve ties in favor of later strings, since
6834   * the regstart check works with the beginning of the r.e.
6835   * and avoiding duplication strengthens checking.  Not a
6836   * strong reason, but sufficient in the absence of others.
6837   * [Now we resolve ties in favor of the earlier string if
6838   * it happens that c_offset_min has been invalidated, since the
6839   * earlier string may buy us something the later one won't.]
6840   */
6841
6842   data.longest_fixed = newSVpvs("");
6843   data.longest_float = newSVpvs("");
6844   data.last_found = newSVpvs("");
6845   data.longest = &(data.longest_fixed);
6846   ENTER_with_name("study_chunk");
6847   SAVEFREESV(data.longest_fixed);
6848   SAVEFREESV(data.longest_float);
6849   SAVEFREESV(data.last_found);
6850   first = scan;
6851   if (!ri->regstclass) {
6852    ssc_init(pRExC_state, &ch_class);
6853    data.start_class = &ch_class;
6854    stclass_flag = SCF_DO_STCLASS_AND;
6855   } else    /* XXXX Check for BOUND? */
6856    stclass_flag = 0;
6857   data.last_closep = &last_close;
6858
6859   DEBUG_RExC_seen();
6860   minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6861        scan + RExC_size, /* Up to end */
6862    &data, -1, 0, NULL,
6863    SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6864       | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6865    0);
6866
6867
6868   CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6869
6870
6871   if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6872    && data.last_start_min == 0 && data.last_end > 0
6873    && !RExC_seen_zerolen
6874    && !(RExC_seen & REG_VERBARG_SEEN)
6875    && !(RExC_seen & REG_GPOS_SEEN)
6876   ){
6877    r->extflags |= RXf_CHECK_ALL;
6878   }
6879   scan_commit(pRExC_state, &data,&minlen,0);
6880
6881   longest_float_length = CHR_SVLEN(data.longest_float);
6882
6883   if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6884     && data.offset_fixed == data.offset_float_min
6885     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6886    && S_setup_longest (aTHX_ pRExC_state,
6887          data.longest_float,
6888          &(r->float_utf8),
6889          &(r->float_substr),
6890          &(r->float_end_shift),
6891          data.lookbehind_float,
6892          data.offset_float_min,
6893          data.minlen_float,
6894          longest_float_length,
6895          cBOOL(data.flags & SF_FL_BEFORE_EOL),
6896          cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6897   {
6898    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6899    r->float_max_offset = data.offset_float_max;
6900    if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6901     r->float_max_offset -= data.lookbehind_float;
6902    SvREFCNT_inc_simple_void_NN(data.longest_float);
6903   }
6904   else {
6905    r->float_substr = r->float_utf8 = NULL;
6906    longest_float_length = 0;
6907   }
6908
6909   longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6910
6911   if (S_setup_longest (aTHX_ pRExC_state,
6912         data.longest_fixed,
6913         &(r->anchored_utf8),
6914         &(r->anchored_substr),
6915         &(r->anchored_end_shift),
6916         data.lookbehind_fixed,
6917         data.offset_fixed,
6918         data.minlen_fixed,
6919         longest_fixed_length,
6920         cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6921         cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6922   {
6923    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6924    SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6925   }
6926   else {
6927    r->anchored_substr = r->anchored_utf8 = NULL;
6928    longest_fixed_length = 0;
6929   }
6930   LEAVE_with_name("study_chunk");
6931
6932   if (ri->regstclass
6933    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6934    ri->regstclass = NULL;
6935
6936   if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6937    && stclass_flag
6938    && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6939    && !ssc_is_anything(data.start_class))
6940   {
6941    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6942
6943    ssc_finalize(pRExC_state, data.start_class);
6944
6945    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6946    StructCopy(data.start_class,
6947      (regnode_ssc*)RExC_rxi->data->data[n],
6948      regnode_ssc);
6949    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6950    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6951    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6952      regprop(r, sv, (regnode*)data.start_class, NULL);
6953      PerlIO_printf(Perl_debug_log,
6954          "synthetic stclass \"%s\".\n",
6955          SvPVX_const(sv));});
6956    data.start_class = NULL;
6957   }
6958
6959   /* A temporary algorithm prefers floated substr to fixed one to dig
6960   * more info. */
6961   if (longest_fixed_length > longest_float_length) {
6962    r->substrs->check_ix = 0;
6963    r->check_end_shift = r->anchored_end_shift;
6964    r->check_substr = r->anchored_substr;
6965    r->check_utf8 = r->anchored_utf8;
6966    r->check_offset_min = r->check_offset_max = r->anchored_offset;
6967    if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
6968     r->intflags |= PREGf_NOSCAN;
6969   }
6970   else {
6971    r->substrs->check_ix = 1;
6972    r->check_end_shift = r->float_end_shift;
6973    r->check_substr = r->float_substr;
6974    r->check_utf8 = r->float_utf8;
6975    r->check_offset_min = r->float_min_offset;
6976    r->check_offset_max = r->float_max_offset;
6977   }
6978   if ((r->check_substr || r->check_utf8) ) {
6979    r->extflags |= RXf_USE_INTUIT;
6980    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6981     r->extflags |= RXf_INTUIT_TAIL;
6982   }
6983   r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
6984
6985   /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6986   if ( (STRLEN)minlen < longest_float_length )
6987    minlen= longest_float_length;
6988   if ( (STRLEN)minlen < longest_fixed_length )
6989    minlen= longest_fixed_length;
6990   */
6991  }
6992  else {
6993   /* Several toplevels. Best we can is to set minlen. */
6994   SSize_t fake;
6995   regnode_ssc ch_class;
6996   SSize_t last_close = 0;
6997
6998   DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6999
7000   scan = ri->program + 1;
7001   ssc_init(pRExC_state, &ch_class);
7002   data.start_class = &ch_class;
7003   data.last_closep = &last_close;
7004
7005   DEBUG_RExC_seen();
7006   minlen = study_chunk(pRExC_state,
7007    &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7008    SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7009              ? SCF_TRIE_DOING_RESTUDY
7010              : 0),
7011    0);
7012
7013   CHECK_RESTUDY_GOTO_butfirst(NOOP);
7014
7015   r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7016     = r->float_substr = r->float_utf8 = NULL;
7017
7018   if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7019    && ! ssc_is_anything(data.start_class))
7020   {
7021    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7022
7023    ssc_finalize(pRExC_state, data.start_class);
7024
7025    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7026    StructCopy(data.start_class,
7027      (regnode_ssc*)RExC_rxi->data->data[n],
7028      regnode_ssc);
7029    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7030    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7031    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7032      regprop(r, sv, (regnode*)data.start_class, NULL);
7033      PerlIO_printf(Perl_debug_log,
7034          "synthetic stclass \"%s\".\n",
7035          SvPVX_const(sv));});
7036    data.start_class = NULL;
7037   }
7038  }
7039
7040  if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7041   r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7042   r->maxlen = REG_INFTY;
7043  }
7044  else {
7045   r->maxlen = RExC_maxlen;
7046  }
7047
7048  /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7049  the "real" pattern. */
7050  DEBUG_OPTIMISE_r({
7051   PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
7052      (IV)minlen, (IV)r->minlen, RExC_maxlen);
7053  });
7054  r->minlenret = minlen;
7055  if (r->minlen < minlen)
7056   r->minlen = minlen;
7057
7058  if (RExC_seen & REG_GPOS_SEEN)
7059   r->intflags |= PREGf_GPOS_SEEN;
7060  if (RExC_seen & REG_LOOKBEHIND_SEEN)
7061   r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7062             lookbehind */
7063  if (pRExC_state->num_code_blocks)
7064   r->extflags |= RXf_EVAL_SEEN;
7065  if (RExC_seen & REG_CANY_SEEN)
7066   r->intflags |= PREGf_CANY_SEEN;
7067  if (RExC_seen & REG_VERBARG_SEEN)
7068  {
7069   r->intflags |= PREGf_VERBARG_SEEN;
7070   r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7071  }
7072  if (RExC_seen & REG_CUTGROUP_SEEN)
7073   r->intflags |= PREGf_CUTGROUP_SEEN;
7074  if (pm_flags & PMf_USE_RE_EVAL)
7075   r->intflags |= PREGf_USE_RE_EVAL;
7076  if (RExC_paren_names)
7077   RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7078  else
7079   RXp_PAREN_NAMES(r) = NULL;
7080
7081  /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7082  * so it can be used in pp.c */
7083  if (r->intflags & PREGf_ANCH)
7084   r->extflags |= RXf_IS_ANCHORED;
7085
7086
7087  {
7088   /* this is used to identify "special" patterns that might result
7089   * in Perl NOT calling the regex engine and instead doing the match "itself",
7090   * particularly special cases in split//. By having the regex compiler
7091   * do this pattern matching at a regop level (instead of by inspecting the pattern)
7092   * we avoid weird issues with equivalent patterns resulting in different behavior,
7093   * AND we allow non Perl engines to get the same optimizations by the setting the
7094   * flags appropriately - Yves */
7095   regnode *first = ri->program + 1;
7096   U8 fop = OP(first);
7097   regnode *next = NEXTOPER(first);
7098   U8 nop = OP(next);
7099
7100   if (PL_regkind[fop] == NOTHING && nop == END)
7101    r->extflags |= RXf_NULL;
7102   else if (PL_regkind[fop] == BOL && nop == END)
7103    r->extflags |= RXf_START_ONLY;
7104   else if (fop == PLUS
7105     && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7106     && OP(regnext(first)) == END)
7107    r->extflags |= RXf_WHITE;
7108   else if ( r->extflags & RXf_SPLIT
7109     && fop == EXACT
7110     && STR_LEN(first) == 1
7111     && *(STRING(first)) == ' '
7112     && OP(regnext(first)) == END )
7113    r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7114
7115  }
7116
7117  if (RExC_contains_locale) {
7118   RXp_EXTFLAGS(r) |= RXf_TAINTED;
7119  }
7120
7121 #ifdef DEBUGGING
7122  if (RExC_paren_names) {
7123   ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7124   ri->data->data[ri->name_list_idx]
7125         = (void*)SvREFCNT_inc(RExC_paren_name_list);
7126  } else
7127 #endif
7128   ri->name_list_idx = 0;
7129
7130  if (RExC_recurse_count) {
7131   for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7132    const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7133    ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7134   }
7135  }
7136  Newxz(r->offs, RExC_npar, regexp_paren_pair);
7137  /* assume we don't need to swap parens around before we match */
7138
7139  DEBUG_DUMP_r({
7140   DEBUG_RExC_seen();
7141   PerlIO_printf(Perl_debug_log,"Final program:\n");
7142   regdump(r);
7143  });
7144 #ifdef RE_TRACK_PATTERN_OFFSETS
7145  DEBUG_OFFSETS_r(if (ri->u.offsets) {
7146   const STRLEN len = ri->u.offsets[0];
7147   STRLEN i;
7148   GET_RE_DEBUG_FLAGS_DECL;
7149   PerlIO_printf(Perl_debug_log,
7150      "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7151   for (i = 1; i <= len; i++) {
7152    if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7153     PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7154     (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7155    }
7156   PerlIO_printf(Perl_debug_log, "\n");
7157  });
7158 #endif
7159
7160 #ifdef USE_ITHREADS
7161  /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7162  * by setting the regexp SV to readonly-only instead. If the
7163  * pattern's been recompiled, the USEDness should remain. */
7164  if (old_re && SvREADONLY(old_re))
7165   SvREADONLY_on(rx);
7166 #endif
7167  return rx;
7168 }
7169
7170
7171 SV*
7172 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7173      const U32 flags)
7174 {
7175  PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7176
7177  PERL_UNUSED_ARG(value);
7178
7179  if (flags & RXapif_FETCH) {
7180   return reg_named_buff_fetch(rx, key, flags);
7181  } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7182   Perl_croak_no_modify();
7183   return NULL;
7184  } else if (flags & RXapif_EXISTS) {
7185   return reg_named_buff_exists(rx, key, flags)
7186    ? &PL_sv_yes
7187    : &PL_sv_no;
7188  } else if (flags & RXapif_REGNAMES) {
7189   return reg_named_buff_all(rx, flags);
7190  } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7191   return reg_named_buff_scalar(rx, flags);
7192  } else {
7193   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7194   return NULL;
7195  }
7196 }
7197
7198 SV*
7199 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7200       const U32 flags)
7201 {
7202  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7203  PERL_UNUSED_ARG(lastkey);
7204
7205  if (flags & RXapif_FIRSTKEY)
7206   return reg_named_buff_firstkey(rx, flags);
7207  else if (flags & RXapif_NEXTKEY)
7208   return reg_named_buff_nextkey(rx, flags);
7209  else {
7210   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7211            (int)flags);
7212   return NULL;
7213  }
7214 }
7215
7216 SV*
7217 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7218       const U32 flags)
7219 {
7220  AV *retarray = NULL;
7221  SV *ret;
7222  struct regexp *const rx = ReANY(r);
7223
7224  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7225
7226  if (flags & RXapif_ALL)
7227   retarray=newAV();
7228
7229  if (rx && RXp_PAREN_NAMES(rx)) {
7230   HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7231   if (he_str) {
7232    IV i;
7233    SV* sv_dat=HeVAL(he_str);
7234    I32 *nums=(I32*)SvPVX(sv_dat);
7235    for ( i=0; i<SvIVX(sv_dat); i++ ) {
7236     if ((I32)(rx->nparens) >= nums[i]
7237      && rx->offs[nums[i]].start != -1
7238      && rx->offs[nums[i]].end != -1)
7239     {
7240      ret = newSVpvs("");
7241      CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7242      if (!retarray)
7243       return ret;
7244     } else {
7245      if (retarray)
7246       ret = newSVsv(&PL_sv_undef);
7247     }
7248     if (retarray)
7249      av_push(retarray, ret);
7250    }
7251    if (retarray)
7252     return newRV_noinc(MUTABLE_SV(retarray));
7253   }
7254  }
7255  return NULL;
7256 }
7257
7258 bool
7259 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7260       const U32 flags)
7261 {
7262  struct regexp *const rx = ReANY(r);
7263
7264  PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7265
7266  if (rx && RXp_PAREN_NAMES(rx)) {
7267   if (flags & RXapif_ALL) {
7268    return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7269   } else {
7270    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7271    if (sv) {
7272     SvREFCNT_dec_NN(sv);
7273     return TRUE;
7274    } else {
7275     return FALSE;
7276    }
7277   }
7278  } else {
7279   return FALSE;
7280  }
7281 }
7282
7283 SV*
7284 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7285 {
7286  struct regexp *const rx = ReANY(r);
7287
7288  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7289
7290  if ( rx && RXp_PAREN_NAMES(rx) ) {
7291   (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7292
7293   return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7294  } else {
7295   return FALSE;
7296  }
7297 }
7298
7299 SV*
7300 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7301 {
7302  struct regexp *const rx = ReANY(r);
7303  GET_RE_DEBUG_FLAGS_DECL;
7304
7305  PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7306
7307  if (rx && RXp_PAREN_NAMES(rx)) {
7308   HV *hv = RXp_PAREN_NAMES(rx);
7309   HE *temphe;
7310   while ( (temphe = hv_iternext_flags(hv,0)) ) {
7311    IV i;
7312    IV parno = 0;
7313    SV* sv_dat = HeVAL(temphe);
7314    I32 *nums = (I32*)SvPVX(sv_dat);
7315    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7316     if ((I32)(rx->lastparen) >= nums[i] &&
7317      rx->offs[nums[i]].start != -1 &&
7318      rx->offs[nums[i]].end != -1)
7319     {
7320      parno = nums[i];
7321      break;
7322     }
7323    }
7324    if (parno || flags & RXapif_ALL) {
7325     return newSVhek(HeKEY_hek(temphe));
7326    }
7327   }
7328  }
7329  return NULL;
7330 }
7331
7332 SV*
7333 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7334 {
7335  SV *ret;
7336  AV *av;
7337  SSize_t length;
7338  struct regexp *const rx = ReANY(r);
7339
7340  PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7341
7342  if (rx && RXp_PAREN_NAMES(rx)) {
7343   if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7344    return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7345   } else if (flags & RXapif_ONE) {
7346    ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7347    av = MUTABLE_AV(SvRV(ret));
7348    length = av_tindex(av);
7349    SvREFCNT_dec_NN(ret);
7350    return newSViv(length + 1);
7351   } else {
7352    Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7353             (int)flags);
7354    return NULL;
7355   }
7356  }
7357  return &PL_sv_undef;
7358 }
7359
7360 SV*
7361 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7362 {
7363  struct regexp *const rx = ReANY(r);
7364  AV *av = newAV();
7365
7366  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7367
7368  if (rx && RXp_PAREN_NAMES(rx)) {
7369   HV *hv= RXp_PAREN_NAMES(rx);
7370   HE *temphe;
7371   (void)hv_iterinit(hv);
7372   while ( (temphe = hv_iternext_flags(hv,0)) ) {
7373    IV i;
7374    IV parno = 0;
7375    SV* sv_dat = HeVAL(temphe);
7376    I32 *nums = (I32*)SvPVX(sv_dat);
7377    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7378     if ((I32)(rx->lastparen) >= nums[i] &&
7379      rx->offs[nums[i]].start != -1 &&
7380      rx->offs[nums[i]].end != -1)
7381     {
7382      parno = nums[i];
7383      break;
7384     }
7385    }
7386    if (parno || flags & RXapif_ALL) {
7387     av_push(av, newSVhek(HeKEY_hek(temphe)));
7388    }
7389   }
7390  }
7391
7392  return newRV_noinc(MUTABLE_SV(av));
7393 }
7394
7395 void
7396 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7397        SV * const sv)
7398 {
7399  struct regexp *const rx = ReANY(r);
7400  char *s = NULL;
7401  SSize_t i = 0;
7402  SSize_t s1, t1;
7403  I32 n = paren;
7404
7405  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7406
7407  if (      n == RX_BUFF_IDX_CARET_PREMATCH
7408   || n == RX_BUFF_IDX_CARET_FULLMATCH
7409   || n == RX_BUFF_IDX_CARET_POSTMATCH
7410  )
7411  {
7412   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7413   if (!keepcopy) {
7414    /* on something like
7415    *    $r = qr/.../;
7416    *    /$qr/p;
7417    * the KEEPCOPY is set on the PMOP rather than the regex */
7418    if (PL_curpm && r == PM_GETRE(PL_curpm))
7419     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7420   }
7421   if (!keepcopy)
7422    goto ret_undef;
7423  }
7424
7425  if (!rx->subbeg)
7426   goto ret_undef;
7427
7428  if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7429   /* no need to distinguish between them any more */
7430   n = RX_BUFF_IDX_FULLMATCH;
7431
7432  if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7433   && rx->offs[0].start != -1)
7434  {
7435   /* $`, ${^PREMATCH} */
7436   i = rx->offs[0].start;
7437   s = rx->subbeg;
7438  }
7439  else
7440  if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7441   && rx->offs[0].end != -1)
7442  {
7443   /* $', ${^POSTMATCH} */
7444   s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7445   i = rx->sublen + rx->suboffset - rx->offs[0].end;
7446  }
7447  else
7448  if ( 0 <= n && n <= (I32)rx->nparens &&
7449   (s1 = rx->offs[n].start) != -1 &&
7450   (t1 = rx->offs[n].end) != -1)
7451  {
7452   /* $&, ${^MATCH},  $1 ... */
7453   i = t1 - s1;
7454   s = rx->subbeg + s1 - rx->suboffset;
7455  } else {
7456   goto ret_undef;
7457  }
7458
7459  assert(s >= rx->subbeg);
7460  assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7461  if (i >= 0) {
7462 #ifdef NO_TAINT_SUPPORT
7463   sv_setpvn(sv, s, i);
7464 #else
7465   const int oldtainted = TAINT_get;
7466   TAINT_NOT;
7467   sv_setpvn(sv, s, i);
7468   TAINT_set(oldtainted);
7469 #endif
7470   if ( (rx->intflags & PREGf_CANY_SEEN)
7471    ? (RXp_MATCH_UTF8(rx)
7472       && (!i || is_utf8_string((U8*)s, i)))
7473    : (RXp_MATCH_UTF8(rx)) )
7474   {
7475    SvUTF8_on(sv);
7476   }
7477   else
7478    SvUTF8_off(sv);
7479   if (TAINTING_get) {
7480    if (RXp_MATCH_TAINTED(rx)) {
7481     if (SvTYPE(sv) >= SVt_PVMG) {
7482      MAGIC* const mg = SvMAGIC(sv);
7483      MAGIC* mgt;
7484      TAINT;
7485      SvMAGIC_set(sv, mg->mg_moremagic);
7486      SvTAINT(sv);
7487      if ((mgt = SvMAGIC(sv))) {
7488       mg->mg_moremagic = mgt;
7489       SvMAGIC_set(sv, mg);
7490      }
7491     } else {
7492      TAINT;
7493      SvTAINT(sv);
7494     }
7495    } else
7496     SvTAINTED_off(sv);
7497   }
7498  } else {
7499  ret_undef:
7500   sv_setsv(sv,&PL_sv_undef);
7501   return;
7502  }
7503 }
7504
7505 void
7506 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7507               SV const * const value)
7508 {
7509  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7510
7511  PERL_UNUSED_ARG(rx);
7512  PERL_UNUSED_ARG(paren);
7513  PERL_UNUSED_ARG(value);
7514
7515  if (!PL_localizing)
7516   Perl_croak_no_modify();
7517 }
7518
7519 I32
7520 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7521        const I32 paren)
7522 {
7523  struct regexp *const rx = ReANY(r);
7524  I32 i;
7525  I32 s1, t1;
7526
7527  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7528
7529  if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7530   || paren == RX_BUFF_IDX_CARET_FULLMATCH
7531   || paren == RX_BUFF_IDX_CARET_POSTMATCH
7532  )
7533  {
7534   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7535   if (!keepcopy) {
7536    /* on something like
7537    *    $r = qr/.../;
7538    *    /$qr/p;
7539    * the KEEPCOPY is set on the PMOP rather than the regex */
7540    if (PL_curpm && r == PM_GETRE(PL_curpm))
7541     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7542   }
7543   if (!keepcopy)
7544    goto warn_undef;
7545  }
7546
7547  /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7548  switch (paren) {
7549  case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7550  case RX_BUFF_IDX_PREMATCH:       /* $` */
7551   if (rx->offs[0].start != -1) {
7552       i = rx->offs[0].start;
7553       if (i > 0) {
7554         s1 = 0;
7555         t1 = i;
7556         goto getlen;
7557       }
7558    }
7559   return 0;
7560
7561  case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7562  case RX_BUFF_IDX_POSTMATCH:       /* $' */
7563    if (rx->offs[0].end != -1) {
7564       i = rx->sublen - rx->offs[0].end;
7565       if (i > 0) {
7566         s1 = rx->offs[0].end;
7567         t1 = rx->sublen;
7568         goto getlen;
7569       }
7570    }
7571   return 0;
7572
7573  default: /* $& / ${^MATCH}, $1, $2, ... */
7574    if (paren <= (I32)rx->nparens &&
7575    (s1 = rx->offs[paren].start) != -1 &&
7576    (t1 = rx->offs[paren].end) != -1)
7577    {
7578    i = t1 - s1;
7579    goto getlen;
7580   } else {
7581   warn_undef:
7582    if (ckWARN(WARN_UNINITIALIZED))
7583     report_uninit((const SV *)sv);
7584    return 0;
7585   }
7586  }
7587   getlen:
7588  if (i > 0 && RXp_MATCH_UTF8(rx)) {
7589   const char * const s = rx->subbeg - rx->suboffset + s1;
7590   const U8 *ep;
7591   STRLEN el;
7592
7593   i = t1 - s1;
7594   if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7595       i = el;
7596  }
7597  return i;
7598 }
7599
7600 SV*
7601 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7602 {
7603  PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7604   PERL_UNUSED_ARG(rx);
7605   if (0)
7606    return NULL;
7607   else
7608    return newSVpvs("Regexp");
7609 }
7610
7611 /* Scans the name of a named buffer from the pattern.
7612  * If flags is REG_RSN_RETURN_NULL returns null.
7613  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7614  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7615  * to the parsed name as looked up in the RExC_paren_names hash.
7616  * If there is an error throws a vFAIL().. type exception.
7617  */
7618
7619 #define REG_RSN_RETURN_NULL    0
7620 #define REG_RSN_RETURN_NAME    1
7621 #define REG_RSN_RETURN_DATA    2
7622
7623 STATIC SV*
7624 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7625 {
7626  char *name_start = RExC_parse;
7627
7628  PERL_ARGS_ASSERT_REG_SCAN_NAME;
7629
7630  assert (RExC_parse <= RExC_end);
7631  if (RExC_parse == RExC_end) NOOP;
7632  else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7633   /* skip IDFIRST by using do...while */
7634   if (UTF)
7635    do {
7636     RExC_parse += UTF8SKIP(RExC_parse);
7637    } while (isWORDCHAR_utf8((U8*)RExC_parse));
7638   else
7639    do {
7640     RExC_parse++;
7641    } while (isWORDCHAR(*RExC_parse));
7642  } else {
7643   RExC_parse++; /* so the <- from the vFAIL is after the offending
7644       character */
7645   vFAIL("Group name must start with a non-digit word character");
7646  }
7647  if ( flags ) {
7648   SV* sv_name
7649    = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7650        SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7651   if ( flags == REG_RSN_RETURN_NAME)
7652    return sv_name;
7653   else if (flags==REG_RSN_RETURN_DATA) {
7654    HE *he_str = NULL;
7655    SV *sv_dat = NULL;
7656    if ( ! sv_name )      /* should not happen*/
7657     Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7658    if (RExC_paren_names)
7659     he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7660    if ( he_str )
7661     sv_dat = HeVAL(he_str);
7662    if ( ! sv_dat )
7663     vFAIL("Reference to nonexistent named group");
7664    return sv_dat;
7665   }
7666   else {
7667    Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7668      (unsigned long) flags);
7669   }
7670   assert(0); /* NOT REACHED */
7671  }
7672  return NULL;
7673 }
7674
7675 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7676  int rem=(int)(RExC_end - RExC_parse);                       \
7677  int cut;                                                    \
7678  int num;                                                    \
7679  int iscut=0;                                                \
7680  if (rem>10) {                                               \
7681   rem=10;                                                 \
7682   iscut=1;                                                \
7683  }                                                           \
7684  cut=10-rem;                                                 \
7685  if (RExC_lastparse!=RExC_parse)                             \
7686   PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
7687    rem, RExC_parse,                                    \
7688    cut + 4,                                            \
7689    iscut ? "..." : "<"                                 \
7690   );                                                      \
7691  else                                                        \
7692   PerlIO_printf(Perl_debug_log,"%16s","");                \
7693                 \
7694  if (SIZE_ONLY)                                              \
7695  num = RExC_size + 1;                                     \
7696  else                                                        \
7697  num=REG_NODE_NUM(RExC_emit);                             \
7698  if (RExC_lastnum!=num)                                      \
7699  PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7700  else                                                        \
7701  PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7702  PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7703   (int)((depth*2)), "",                                   \
7704   (funcname)                                              \
7705  );                                                          \
7706  RExC_lastnum=num;                                           \
7707  RExC_lastparse=RExC_parse;                                  \
7708 })
7709
7710
7711
7712 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7713  DEBUG_PARSE_MSG((funcname));                            \
7714  PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7715 })
7716 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7717  DEBUG_PARSE_MSG((funcname));                            \
7718  PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7719 })
7720
7721 /* This section of code defines the inversion list object and its methods.  The
7722  * interfaces are highly subject to change, so as much as possible is static to
7723  * this file.  An inversion list is here implemented as a malloc'd C UV array
7724  * as an SVt_INVLIST scalar.
7725  *
7726  * An inversion list for Unicode is an array of code points, sorted by ordinal
7727  * number.  The zeroth element is the first code point in the list.  The 1th
7728  * element is the first element beyond that not in the list.  In other words,
7729  * the first range is
7730  *  invlist[0]..(invlist[1]-1)
7731  * The other ranges follow.  Thus every element whose index is divisible by two
7732  * marks the beginning of a range that is in the list, and every element not
7733  * divisible by two marks the beginning of a range not in the list.  A single
7734  * element inversion list that contains the single code point N generally
7735  * consists of two elements
7736  *  invlist[0] == N
7737  *  invlist[1] == N+1
7738  * (The exception is when N is the highest representable value on the
7739  * machine, in which case the list containing just it would be a single
7740  * element, itself.  By extension, if the last range in the list extends to
7741  * infinity, then the first element of that range will be in the inversion list
7742  * at a position that is divisible by two, and is the final element in the
7743  * list.)
7744  * Taking the complement (inverting) an inversion list is quite simple, if the
7745  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7746  * This implementation reserves an element at the beginning of each inversion
7747  * list to always contain 0; there is an additional flag in the header which
7748  * indicates if the list begins at the 0, or is offset to begin at the next
7749  * element.
7750  *
7751  * More about inversion lists can be found in "Unicode Demystified"
7752  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7753  * More will be coming when functionality is added later.
7754  *
7755  * The inversion list data structure is currently implemented as an SV pointing
7756  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7757  * array of UV whose memory management is automatically handled by the existing
7758  * facilities for SV's.
7759  *
7760  * Some of the methods should always be private to the implementation, and some
7761  * should eventually be made public */
7762
7763 /* The header definitions are in F<inline_invlist.c> */
7764
7765 PERL_STATIC_INLINE UV*
7766 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7767 {
7768  /* Returns a pointer to the first element in the inversion list's array.
7769  * This is called upon initialization of an inversion list.  Where the
7770  * array begins depends on whether the list has the code point U+0000 in it
7771  * or not.  The other parameter tells it whether the code that follows this
7772  * call is about to put a 0 in the inversion list or not.  The first
7773  * element is either the element reserved for 0, if TRUE, or the element
7774  * after it, if FALSE */
7775
7776  bool* offset = get_invlist_offset_addr(invlist);
7777  UV* zero_addr = (UV *) SvPVX(invlist);
7778
7779  PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7780
7781  /* Must be empty */
7782  assert(! _invlist_len(invlist));
7783
7784  *zero_addr = 0;
7785
7786  /* 1^1 = 0; 1^0 = 1 */
7787  *offset = 1 ^ will_have_0;
7788  return zero_addr + *offset;
7789 }
7790
7791 PERL_STATIC_INLINE UV*
7792 S_invlist_array(pTHX_ SV* const invlist)
7793 {
7794  /* Returns the pointer to the inversion list's array.  Every time the
7795  * length changes, this needs to be called in case malloc or realloc moved
7796  * it */
7797
7798  PERL_ARGS_ASSERT_INVLIST_ARRAY;
7799
7800  /* Must not be empty.  If these fail, you probably didn't check for <len>
7801  * being non-zero before trying to get the array */
7802  assert(_invlist_len(invlist));
7803
7804  /* The very first element always contains zero, The array begins either
7805  * there, or if the inversion list is offset, at the element after it.
7806  * The offset header field determines which; it contains 0 or 1 to indicate
7807  * how much additionally to add */
7808  assert(0 == *(SvPVX(invlist)));
7809  return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7810 }
7811
7812 PERL_STATIC_INLINE void
7813 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7814 {
7815  /* Sets the current number of elements stored in the inversion list.
7816  * Updates SvCUR correspondingly */
7817
7818  PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7819
7820  assert(SvTYPE(invlist) == SVt_INVLIST);
7821
7822  SvCUR_set(invlist,
7823    (len == 0)
7824    ? 0
7825    : TO_INTERNAL_SIZE(len + offset));
7826  assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7827 }
7828
7829 PERL_STATIC_INLINE IV*
7830 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7831 {
7832  /* Return the address of the IV that is reserved to hold the cached index
7833  * */
7834
7835  PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7836
7837  assert(SvTYPE(invlist) == SVt_INVLIST);
7838
7839  return &(((XINVLIST*) SvANY(invlist))->prev_index);
7840 }
7841
7842 PERL_STATIC_INLINE IV
7843 S_invlist_previous_index(pTHX_ SV* const invlist)
7844 {
7845  /* Returns cached index of previous search */
7846
7847  PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7848
7849  return *get_invlist_previous_index_addr(invlist);
7850 }
7851
7852 PERL_STATIC_INLINE void
7853 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7854 {
7855  /* Caches <index> for later retrieval */
7856
7857  PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7858
7859  assert(index == 0 || index < (int) _invlist_len(invlist));
7860
7861  *get_invlist_previous_index_addr(invlist) = index;
7862 }
7863
7864 PERL_STATIC_INLINE UV
7865 S_invlist_max(pTHX_ SV* const invlist)
7866 {
7867  /* Returns the maximum number of elements storable in the inversion list's
7868  * array, without having to realloc() */
7869
7870  PERL_ARGS_ASSERT_INVLIST_MAX;
7871
7872  assert(SvTYPE(invlist) == SVt_INVLIST);
7873
7874  /* Assumes worst case, in which the 0 element is not counted in the
7875  * inversion list, so subtracts 1 for that */
7876  return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7877   ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7878   : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7879 }
7880
7881 #ifndef PERL_IN_XSUB_RE
7882 SV*
7883 Perl__new_invlist(pTHX_ IV initial_size)
7884 {
7885
7886  /* Return a pointer to a newly constructed inversion list, with enough
7887  * space to store 'initial_size' elements.  If that number is negative, a
7888  * system default is used instead */
7889
7890  SV* new_list;
7891
7892  if (initial_size < 0) {
7893   initial_size = 10;
7894  }
7895
7896  /* Allocate the initial space */
7897  new_list = newSV_type(SVt_INVLIST);
7898
7899  /* First 1 is in case the zero element isn't in the list; second 1 is for
7900  * trailing NUL */
7901  SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7902  invlist_set_len(new_list, 0, 0);
7903
7904  /* Force iterinit() to be used to get iteration to work */
7905  *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7906
7907  *get_invlist_previous_index_addr(new_list) = 0;
7908
7909  return new_list;
7910 }
7911
7912 SV*
7913 Perl__new_invlist_C_array(pTHX_ const UV* const list)
7914 {
7915  /* Return a pointer to a newly constructed inversion list, initialized to
7916  * point to <list>, which has to be in the exact correct inversion list
7917  * form, including internal fields.  Thus this is a dangerous routine that
7918  * should not be used in the wrong hands.  The passed in 'list' contains
7919  * several header fields at the beginning that are not part of the
7920  * inversion list body proper */
7921
7922  const STRLEN length = (STRLEN) list[0];
7923  const UV version_id =          list[1];
7924  const bool offset   =    cBOOL(list[2]);
7925 #define HEADER_LENGTH 3
7926  /* If any of the above changes in any way, you must change HEADER_LENGTH
7927  * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7928  *      perl -E 'say int(rand 2**31-1)'
7929  */
7930 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7931           data structure type, so that one being
7932           passed in can be validated to be an
7933           inversion list of the correct vintage.
7934          */
7935
7936  SV* invlist = newSV_type(SVt_INVLIST);
7937
7938  PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7939
7940  if (version_id != INVLIST_VERSION_ID) {
7941   Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7942  }
7943
7944  /* The generated array passed in includes header elements that aren't part
7945  * of the list proper, so start it just after them */
7946  SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7947
7948  SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7949        shouldn't touch it */
7950
7951  *(get_invlist_offset_addr(invlist)) = offset;
7952
7953  /* The 'length' passed to us is the physical number of elements in the
7954  * inversion list.  But if there is an offset the logical number is one
7955  * less than that */
7956  invlist_set_len(invlist, length  - offset, offset);
7957
7958  invlist_set_previous_index(invlist, 0);
7959
7960  /* Initialize the iteration pointer. */
7961  invlist_iterfinish(invlist);
7962
7963  SvREADONLY_on(invlist);
7964
7965  return invlist;
7966 }
7967 #endif /* ifndef PERL_IN_XSUB_RE */
7968
7969 STATIC void
7970 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7971 {
7972  /* Grow the maximum size of an inversion list */
7973
7974  PERL_ARGS_ASSERT_INVLIST_EXTEND;
7975
7976  assert(SvTYPE(invlist) == SVt_INVLIST);
7977
7978  /* Add one to account for the zero element at the beginning which may not
7979  * be counted by the calling parameters */
7980  SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7981 }
7982
7983 PERL_STATIC_INLINE void
7984 S_invlist_trim(pTHX_ SV* const invlist)
7985 {
7986  PERL_ARGS_ASSERT_INVLIST_TRIM;
7987
7988  assert(SvTYPE(invlist) == SVt_INVLIST);
7989
7990  /* Change the length of the inversion list to how many entries it currently
7991  * has */
7992  SvPV_shrink_to_cur((SV *) invlist);
7993 }
7994
7995 STATIC void
7996 S__append_range_to_invlist(pTHX_ SV* const invlist,
7997         const UV start, const UV end)
7998 {
7999    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8000  * the end of the inversion list.  The range must be above any existing
8001  * ones. */
8002
8003  UV* array;
8004  UV max = invlist_max(invlist);
8005  UV len = _invlist_len(invlist);
8006  bool offset;
8007
8008  PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8009
8010  if (len == 0) { /* Empty lists must be initialized */
8011   offset = start != 0;
8012   array = _invlist_array_init(invlist, ! offset);
8013  }
8014  else {
8015   /* Here, the existing list is non-empty. The current max entry in the
8016   * list is generally the first value not in the set, except when the
8017   * set extends to the end of permissible values, in which case it is
8018   * the first entry in that final set, and so this call is an attempt to
8019   * append out-of-order */
8020
8021   UV final_element = len - 1;
8022   array = invlist_array(invlist);
8023   if (array[final_element] > start
8024    || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8025   {
8026    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8027      array[final_element], start,
8028      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8029   }
8030
8031   /* Here, it is a legal append.  If the new range begins with the first
8032   * value not in the set, it is extending the set, so the new first
8033   * value not in the set is one greater than the newly extended range.
8034   * */
8035   offset = *get_invlist_offset_addr(invlist);
8036   if (array[final_element] == start) {
8037    if (end != UV_MAX) {
8038     array[final_element] = end + 1;
8039    }
8040    else {
8041     /* But if the end is the maximum representable on the machine,
8042     * just let the range that this would extend to have no end */
8043     invlist_set_len(invlist, len - 1, offset);
8044    }
8045    return;
8046   }
8047  }
8048
8049  /* Here the new range doesn't extend any existing set.  Add it */
8050
8051  len += 2; /* Includes an element each for the start and end of range */
8052
8053  /* If wll overflow the existing space, extend, which may cause the array to
8054  * be moved */
8055  if (max < len) {
8056   invlist_extend(invlist, len);
8057
8058   /* Have to set len here to avoid assert failure in invlist_array() */
8059   invlist_set_len(invlist, len, offset);
8060
8061   array = invlist_array(invlist);
8062  }
8063  else {
8064   invlist_set_len(invlist, len, offset);
8065  }
8066
8067  /* The next item on the list starts the range, the one after that is
8068  * one past the new range.  */
8069  array[len - 2] = start;
8070  if (end != UV_MAX) {
8071   array[len - 1] = end + 1;
8072  }
8073  else {
8074   /* But if the end is the maximum representable on the machine, just let
8075   * the range have no end */
8076   invlist_set_len(invlist, len - 1, offset);
8077  }
8078 }
8079
8080 #ifndef PERL_IN_XSUB_RE
8081
8082 IV
8083 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
8084 {
8085  /* Searches the inversion list for the entry that contains the input code
8086  * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8087  * return value is the index into the list's array of the range that
8088  * contains <cp> */
8089
8090  IV low = 0;
8091  IV mid;
8092  IV high = _invlist_len(invlist);
8093  const IV highest_element = high - 1;
8094  const UV* array;
8095
8096  PERL_ARGS_ASSERT__INVLIST_SEARCH;
8097
8098  /* If list is empty, return failure. */
8099  if (high == 0) {
8100   return -1;
8101  }
8102
8103  /* (We can't get the array unless we know the list is non-empty) */
8104  array = invlist_array(invlist);
8105
8106  mid = invlist_previous_index(invlist);
8107  assert(mid >=0 && mid <= highest_element);
8108
8109  /* <mid> contains the cache of the result of the previous call to this
8110  * function (0 the first time).  See if this call is for the same result,
8111  * or if it is for mid-1.  This is under the theory that calls to this
8112  * function will often be for related code points that are near each other.
8113  * And benchmarks show that caching gives better results.  We also test
8114  * here if the code point is within the bounds of the list.  These tests
8115  * replace others that would have had to be made anyway to make sure that
8116  * the array bounds were not exceeded, and these give us extra information
8117  * at the same time */
8118  if (cp >= array[mid]) {
8119   if (cp >= array[highest_element]) {
8120    return highest_element;
8121   }
8122
8123   /* Here, array[mid] <= cp < array[highest_element].  This means that
8124   * the final element is not the answer, so can exclude it; it also
8125   * means that <mid> is not the final element, so can refer to 'mid + 1'
8126   * safely */
8127   if (cp < array[mid + 1]) {
8128    return mid;
8129   }
8130   high--;
8131   low = mid + 1;
8132  }
8133  else { /* cp < aray[mid] */
8134   if (cp < array[0]) { /* Fail if outside the array */
8135    return -1;
8136   }
8137   high = mid;
8138   if (cp >= array[mid - 1]) {
8139    goto found_entry;
8140   }
8141  }
8142
8143  /* Binary search.  What we are looking for is <i> such that
8144  * array[i] <= cp < array[i+1]
8145  * The loop below converges on the i+1.  Note that there may not be an
8146  * (i+1)th element in the array, and things work nonetheless */
8147  while (low < high) {
8148   mid = (low + high) / 2;
8149   assert(mid <= highest_element);
8150   if (array[mid] <= cp) { /* cp >= array[mid] */
8151    low = mid + 1;
8152
8153    /* We could do this extra test to exit the loop early.
8154    if (cp < array[low]) {
8155     return mid;
8156    }
8157    */
8158   }
8159   else { /* cp < array[mid] */
8160    high = mid;
8161   }
8162  }
8163
8164   found_entry:
8165  high--;
8166  invlist_set_previous_index(invlist, high);
8167  return high;
8168 }
8169
8170 void
8171 Perl__invlist_populate_swatch(pTHX_ SV* const invlist,
8172          const UV start, const UV end, U8* swatch)
8173 {
8174  /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8175  * but is used when the swash has an inversion list.  This makes this much
8176  * faster, as it uses a binary search instead of a linear one.  This is
8177  * intimately tied to that function, and perhaps should be in utf8.c,
8178  * except it is intimately tied to inversion lists as well.  It assumes
8179  * that <swatch> is all 0's on input */
8180
8181  UV current = start;
8182  const IV len = _invlist_len(invlist);
8183  IV i;
8184  const UV * array;
8185
8186  PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8187
8188  if (len == 0) { /* Empty inversion list */
8189   return;
8190  }
8191
8192  array = invlist_array(invlist);
8193
8194  /* Find which element it is */
8195  i = _invlist_search(invlist, start);
8196
8197  /* We populate from <start> to <end> */
8198  while (current < end) {
8199   UV upper;
8200
8201   /* The inversion list gives the results for every possible code point
8202   * after the first one in the list.  Only those ranges whose index is
8203   * even are ones that the inversion list matches.  For the odd ones,
8204   * and if the initial code point is not in the list, we have to skip
8205   * forward to the next element */
8206   if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8207    i++;
8208    if (i >= len) { /* Finished if beyond the end of the array */
8209     return;
8210    }
8211    current = array[i];
8212    if (current >= end) {   /* Finished if beyond the end of what we
8213          are populating */
8214     if (LIKELY(end < UV_MAX)) {
8215      return;
8216     }
8217
8218     /* We get here when the upper bound is the maximum
8219     * representable on the machine, and we are looking for just
8220     * that code point.  Have to special case it */
8221     i = len;
8222     goto join_end_of_list;
8223    }
8224   }
8225   assert(current >= start);
8226
8227   /* The current range ends one below the next one, except don't go past
8228   * <end> */
8229   i++;
8230   upper = (i < len && array[i] < end) ? array[i] : end;
8231
8232   /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8233   * for each code point in it */
8234   for (; current < upper; current++) {
8235    const STRLEN offset = (STRLEN)(current - start);
8236    swatch[offset >> 3] |= 1 << (offset & 7);
8237   }
8238
8239  join_end_of_list:
8240
8241   /* Quit if at the end of the list */
8242   if (i >= len) {
8243
8244    /* But first, have to deal with the highest possible code point on
8245    * the platform.  The previous code assumes that <end> is one
8246    * beyond where we want to populate, but that is impossible at the
8247    * platform's infinity, so have to handle it specially */
8248    if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8249    {
8250     const STRLEN offset = (STRLEN)(end - start);
8251     swatch[offset >> 3] |= 1 << (offset & 7);
8252    }
8253    return;
8254   }
8255
8256   /* Advance to the next range, which will be for code points not in the
8257   * inversion list */
8258   current = array[i];
8259  }
8260
8261  return;
8262 }
8263
8264 void
8265 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8266           const bool complement_b, SV** output)
8267 {
8268  /* Take the union of two inversion lists and point <output> to it.  *output
8269  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8270  * the reference count to that list will be decremented if not already a
8271  * temporary (mortal); otherwise *output will be made correspondingly
8272  * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8273  * second list is returned.  If <complement_b> is TRUE, the union is taken
8274  * of the complement (inversion) of <b> instead of b itself.
8275  *
8276  * The basis for this comes from "Unicode Demystified" Chapter 13 by
8277  * Richard Gillam, published by Addison-Wesley, and explained at some
8278  * length there.  The preface says to incorporate its examples into your
8279  * code at your own risk.
8280  *
8281  * The algorithm is like a merge sort.
8282  *
8283  * XXX A potential performance improvement is to keep track as we go along
8284  * if only one of the inputs contributes to the result, meaning the other
8285  * is a subset of that one.  In that case, we can skip the final copy and
8286  * return the larger of the input lists, but then outside code might need
8287  * to keep track of whether to free the input list or not */
8288
8289  const UV* array_a;    /* a's array */
8290  const UV* array_b;
8291  UV len_a;     /* length of a's array */
8292  UV len_b;
8293
8294  SV* u;   /* the resulting union */
8295  UV* array_u;
8296  UV len_u;
8297
8298  UV i_a = 0;      /* current index into a's array */
8299  UV i_b = 0;
8300  UV i_u = 0;
8301
8302  /* running count, as explained in the algorithm source book; items are
8303  * stopped accumulating and are output when the count changes to/from 0.
8304  * The count is incremented when we start a range that's in the set, and
8305  * decremented when we start a range that's not in the set.  So its range
8306  * is 0 to 2.  Only when the count is zero is something not in the set.
8307  */
8308  UV count = 0;
8309
8310  PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8311  assert(a != b);
8312
8313  /* If either one is empty, the union is the other one */
8314  if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8315   bool make_temp = FALSE; /* Should we mortalize the result? */
8316
8317   if (*output == a) {
8318    if (a != NULL) {
8319     if (! (make_temp = cBOOL(SvTEMP(a)))) {
8320      SvREFCNT_dec_NN(a);
8321     }
8322    }
8323   }
8324   if (*output != b) {
8325    *output = invlist_clone(b);
8326    if (complement_b) {
8327     _invlist_invert(*output);
8328    }
8329   } /* else *output already = b; */
8330
8331   if (make_temp) {
8332    sv_2mortal(*output);
8333   }
8334   return;
8335  }
8336  else if ((len_b = _invlist_len(b)) == 0) {
8337   bool make_temp = FALSE;
8338   if (*output == b) {
8339    if (! (make_temp = cBOOL(SvTEMP(b)))) {
8340     SvREFCNT_dec_NN(b);
8341    }
8342   }
8343
8344   /* The complement of an empty list is a list that has everything in it,
8345   * so the union with <a> includes everything too */
8346   if (complement_b) {
8347    if (a == *output) {
8348     if (! (make_temp = cBOOL(SvTEMP(a)))) {
8349      SvREFCNT_dec_NN(a);
8350     }
8351    }
8352    *output = _new_invlist(1);
8353    _append_range_to_invlist(*output, 0, UV_MAX);
8354   }
8355   else if (*output != a) {
8356    *output = invlist_clone(a);
8357   }
8358   /* else *output already = a; */
8359
8360   if (make_temp) {
8361    sv_2mortal(*output);
8362   }
8363   return;
8364  }
8365
8366  /* Here both lists exist and are non-empty */
8367  array_a = invlist_array(a);
8368  array_b = invlist_array(b);
8369
8370  /* If are to take the union of 'a' with the complement of b, set it
8371  * up so are looking at b's complement. */
8372  if (complement_b) {
8373
8374   /* To complement, we invert: if the first element is 0, remove it.  To
8375   * do this, we just pretend the array starts one later */
8376   if (array_b[0] == 0) {
8377    array_b++;
8378    len_b--;
8379   }
8380   else {
8381
8382    /* But if the first element is not zero, we pretend the list starts
8383    * at the 0 that is always stored immediately before the array. */
8384    array_b--;
8385    len_b++;
8386   }
8387  }
8388
8389  /* Size the union for the worst case: that the sets are completely
8390  * disjoint */
8391  u = _new_invlist(len_a + len_b);
8392
8393  /* Will contain U+0000 if either component does */
8394  array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8395          || (len_b > 0 && array_b[0] == 0));
8396
8397  /* Go through each list item by item, stopping when exhausted one of
8398  * them */
8399  while (i_a < len_a && i_b < len_b) {
8400   UV cp;     /* The element to potentially add to the union's array */
8401   bool cp_in_set;   /* is it in the the input list's set or not */
8402
8403   /* We need to take one or the other of the two inputs for the union.
8404   * Since we are merging two sorted lists, we take the smaller of the
8405   * next items.  In case of a tie, we take the one that is in its set
8406   * first.  If we took one not in the set first, it would decrement the
8407   * count, possibly to 0 which would cause it to be output as ending the
8408   * range, and the next time through we would take the same number, and
8409   * output it again as beginning the next range.  By doing it the
8410   * opposite way, there is no possibility that the count will be
8411   * momentarily decremented to 0, and thus the two adjoining ranges will
8412   * be seamlessly merged.  (In a tie and both are in the set or both not
8413   * in the set, it doesn't matter which we take first.) */
8414   if (array_a[i_a] < array_b[i_b]
8415    || (array_a[i_a] == array_b[i_b]
8416     && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8417   {
8418    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8419    cp= array_a[i_a++];
8420   }
8421   else {
8422    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8423    cp = array_b[i_b++];
8424   }
8425
8426   /* Here, have chosen which of the two inputs to look at.  Only output
8427   * if the running count changes to/from 0, which marks the
8428   * beginning/end of a range in that's in the set */
8429   if (cp_in_set) {
8430    if (count == 0) {
8431     array_u[i_u++] = cp;
8432    }
8433    count++;
8434   }
8435   else {
8436    count--;
8437    if (count == 0) {
8438     array_u[i_u++] = cp;
8439    }
8440   }
8441  }
8442
8443  /* Here, we are finished going through at least one of the lists, which
8444  * means there is something remaining in at most one.  We check if the list
8445  * that hasn't been exhausted is positioned such that we are in the middle
8446  * of a range in its set or not.  (i_a and i_b point to the element beyond
8447  * the one we care about.) If in the set, we decrement 'count'; if 0, there
8448  * is potentially more to output.
8449  * There are four cases:
8450  * 1) Both weren't in their sets, count is 0, and remains 0.  What's left
8451  *    in the union is entirely from the non-exhausted set.
8452  * 2) Both were in their sets, count is 2.  Nothing further should
8453  *    be output, as everything that remains will be in the exhausted
8454  *    list's set, hence in the union; decrementing to 1 but not 0 insures
8455  *    that
8456  * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8457  *    Nothing further should be output because the union includes
8458  *    everything from the exhausted set.  Not decrementing ensures that.
8459  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8460  *    decrementing to 0 insures that we look at the remainder of the
8461  *    non-exhausted set */
8462  if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8463   || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8464  {
8465   count--;
8466  }
8467
8468  /* The final length is what we've output so far, plus what else is about to
8469  * be output.  (If 'count' is non-zero, then the input list we exhausted
8470  * has everything remaining up to the machine's limit in its set, and hence
8471  * in the union, so there will be no further output. */
8472  len_u = i_u;
8473  if (count == 0) {
8474   /* At most one of the subexpressions will be non-zero */
8475   len_u += (len_a - i_a) + (len_b - i_b);
8476  }
8477
8478  /* Set result to final length, which can change the pointer to array_u, so
8479  * re-find it */
8480  if (len_u != _invlist_len(u)) {
8481   invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8482   invlist_trim(u);
8483   array_u = invlist_array(u);
8484  }
8485
8486  /* When 'count' is 0, the list that was exhausted (if one was shorter than
8487  * the other) ended with everything above it not in its set.  That means
8488  * that the remaining part of the union is precisely the same as the
8489  * non-exhausted list, so can just copy it unchanged.  (If both list were
8490  * exhausted at the same time, then the operations below will be both 0.)
8491  */
8492  if (count == 0) {
8493   IV copy_count; /* At most one will have a non-zero copy count */
8494   if ((copy_count = len_a - i_a) > 0) {
8495    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8496   }
8497   else if ((copy_count = len_b - i_b) > 0) {
8498    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8499   }
8500  }
8501
8502  /*  We may be removing a reference to one of the inputs.  If so, the output
8503  *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8504  *  count decremented) */
8505  if (a == *output || b == *output) {
8506   assert(! invlist_is_iterating(*output));
8507   if ((SvTEMP(*output))) {
8508    sv_2mortal(u);
8509   }
8510   else {
8511    SvREFCNT_dec_NN(*output);
8512   }
8513  }
8514
8515  *output = u;
8516
8517  return;
8518 }
8519
8520 void
8521 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8522            const bool complement_b, SV** i)
8523 {
8524  /* Take the intersection of two inversion lists and point <i> to it.  *i
8525  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8526  * the reference count to that list will be decremented if not already a
8527  * temporary (mortal); otherwise *i will be made correspondingly mortal.
8528  * The first list, <a>, may be NULL, in which case an empty list is
8529  * returned.  If <complement_b> is TRUE, the result will be the
8530  * intersection of <a> and the complement (or inversion) of <b> instead of
8531  * <b> directly.
8532  *
8533  * The basis for this comes from "Unicode Demystified" Chapter 13 by
8534  * Richard Gillam, published by Addison-Wesley, and explained at some
8535  * length there.  The preface says to incorporate its examples into your
8536  * code at your own risk.  In fact, it had bugs
8537  *
8538  * The algorithm is like a merge sort, and is essentially the same as the
8539  * union above
8540  */
8541
8542  const UV* array_a;  /* a's array */
8543  const UV* array_b;
8544  UV len_a; /* length of a's array */
8545  UV len_b;
8546
8547  SV* r;       /* the resulting intersection */
8548  UV* array_r;
8549  UV len_r;
8550
8551  UV i_a = 0;      /* current index into a's array */
8552  UV i_b = 0;
8553  UV i_r = 0;
8554
8555  /* running count, as explained in the algorithm source book; items are
8556  * stopped accumulating and are output when the count changes to/from 2.
8557  * The count is incremented when we start a range that's in the set, and
8558  * decremented when we start a range that's not in the set.  So its range
8559  * is 0 to 2.  Only when the count is 2 is something in the intersection.
8560  */
8561  UV count = 0;
8562
8563  PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8564  assert(a != b);
8565
8566  /* Special case if either one is empty */
8567  len_a = (a == NULL) ? 0 : _invlist_len(a);
8568  if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8569   bool make_temp = FALSE;
8570
8571   if (len_a != 0 && complement_b) {
8572
8573    /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8574    * be empty.  Here, also we are using 'b's complement, which hence
8575    * must be every possible code point.  Thus the intersection is
8576    * simply 'a'. */
8577    if (*i != a) {
8578     if (*i == b) {
8579      if (! (make_temp = cBOOL(SvTEMP(b)))) {
8580       SvREFCNT_dec_NN(b);
8581      }
8582     }
8583
8584     *i = invlist_clone(a);
8585    }
8586    /* else *i is already 'a' */
8587
8588    if (make_temp) {
8589     sv_2mortal(*i);
8590    }
8591    return;
8592   }
8593
8594   /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8595   * intersection must be empty */
8596   if (*i == a) {
8597    if (! (make_temp = cBOOL(SvTEMP(a)))) {
8598     SvREFCNT_dec_NN(a);
8599    }
8600   }
8601   else if (*i == b) {
8602    if (! (make_temp = cBOOL(SvTEMP(b)))) {
8603     SvREFCNT_dec_NN(b);
8604    }
8605   }
8606   *i = _new_invlist(0);
8607   if (make_temp) {
8608    sv_2mortal(*i);
8609   }
8610
8611   return;
8612  }
8613
8614  /* Here both lists exist and are non-empty */
8615  array_a = invlist_array(a);
8616  array_b = invlist_array(b);
8617
8618  /* If are to take the intersection of 'a' with the complement of b, set it
8619  * up so are looking at b's complement. */
8620  if (complement_b) {
8621
8622   /* To complement, we invert: if the first element is 0, remove it.  To
8623   * do this, we just pretend the array starts one later */
8624   if (array_b[0] == 0) {
8625    array_b++;
8626    len_b--;
8627   }
8628   else {
8629
8630    /* But if the first element is not zero, we pretend the list starts
8631    * at the 0 that is always stored immediately before the array. */
8632    array_b--;
8633    len_b++;
8634   }
8635  }
8636
8637  /* Size the intersection for the worst case: that the intersection ends up
8638  * fragmenting everything to be completely disjoint */
8639  r= _new_invlist(len_a + len_b);
8640
8641  /* Will contain U+0000 iff both components do */
8642  array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8643          && len_b > 0 && array_b[0] == 0);
8644
8645  /* Go through each list item by item, stopping when exhausted one of
8646  * them */
8647  while (i_a < len_a && i_b < len_b) {
8648   UV cp;     /* The element to potentially add to the intersection's
8649      array */
8650   bool cp_in_set; /* Is it in the input list's set or not */
8651
8652   /* We need to take one or the other of the two inputs for the
8653   * intersection.  Since we are merging two sorted lists, we take the
8654   * smaller of the next items.  In case of a tie, we take the one that
8655   * is not in its set first (a difference from the union algorithm).  If
8656   * we took one in the set first, it would increment the count, possibly
8657   * to 2 which would cause it to be output as starting a range in the
8658   * intersection, and the next time through we would take that same
8659   * number, and output it again as ending the set.  By doing it the
8660   * opposite of this, there is no possibility that the count will be
8661   * momentarily incremented to 2.  (In a tie and both are in the set or
8662   * both not in the set, it doesn't matter which we take first.) */
8663   if (array_a[i_a] < array_b[i_b]
8664    || (array_a[i_a] == array_b[i_b]
8665     && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8666   {
8667    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8668    cp= array_a[i_a++];
8669   }
8670   else {
8671    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8672    cp= array_b[i_b++];
8673   }
8674
8675   /* Here, have chosen which of the two inputs to look at.  Only output
8676   * if the running count changes to/from 2, which marks the
8677   * beginning/end of a range that's in the intersection */
8678   if (cp_in_set) {
8679    count++;
8680    if (count == 2) {
8681     array_r[i_r++] = cp;
8682    }
8683   }
8684   else {
8685    if (count == 2) {
8686     array_r[i_r++] = cp;
8687    }
8688    count--;
8689   }
8690  }
8691
8692  /* Here, we are finished going through at least one of the lists, which
8693  * means there is something remaining in at most one.  We check if the list
8694  * that has been exhausted is positioned such that we are in the middle
8695  * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8696  * the ones we care about.)  There are four cases:
8697  * 1) Both weren't in their sets, count is 0, and remains 0.  There's
8698  *    nothing left in the intersection.
8699  * 2) Both were in their sets, count is 2 and perhaps is incremented to
8700  *    above 2.  What should be output is exactly that which is in the
8701  *    non-exhausted set, as everything it has is also in the intersection
8702  *    set, and everything it doesn't have can't be in the intersection
8703  * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8704  *    gets incremented to 2.  Like the previous case, the intersection is
8705  *    everything that remains in the non-exhausted set.
8706  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8707  *    remains 1.  And the intersection has nothing more. */
8708  if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8709   || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8710  {
8711   count++;
8712  }
8713
8714  /* The final length is what we've output so far plus what else is in the
8715  * intersection.  At most one of the subexpressions below will be non-zero
8716  * */
8717  len_r = i_r;
8718  if (count >= 2) {
8719   len_r += (len_a - i_a) + (len_b - i_b);
8720  }
8721
8722  /* Set result to final length, which can change the pointer to array_r, so
8723  * re-find it */
8724  if (len_r != _invlist_len(r)) {
8725   invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8726   invlist_trim(r);
8727   array_r = invlist_array(r);
8728  }
8729
8730  /* Finish outputting any remaining */
8731  if (count >= 2) { /* At most one will have a non-zero copy count */
8732   IV copy_count;
8733   if ((copy_count = len_a - i_a) > 0) {
8734    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8735   }
8736   else if ((copy_count = len_b - i_b) > 0) {
8737    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8738   }
8739  }
8740
8741  /*  We may be removing a reference to one of the inputs.  If so, the output
8742  *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8743  *  count decremented) */
8744  if (a == *i || b == *i) {
8745   assert(! invlist_is_iterating(*i));
8746   if (SvTEMP(*i)) {
8747    sv_2mortal(r);
8748   }
8749   else {
8750    SvREFCNT_dec_NN(*i);
8751   }
8752  }
8753
8754  *i = r;
8755
8756  return;
8757 }
8758
8759 SV*
8760 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8761 {
8762  /* Add the range from 'start' to 'end' inclusive to the inversion list's
8763  * set.  A pointer to the inversion list is returned.  This may actually be
8764  * a new list, in which case the passed in one has been destroyed.  The
8765  * passed in inversion list can be NULL, in which case a new one is created
8766  * with just the one range in it */
8767
8768  SV* range_invlist;
8769  UV len;
8770
8771  if (invlist == NULL) {
8772   invlist = _new_invlist(2);
8773   len = 0;
8774  }
8775  else {
8776   len = _invlist_len(invlist);
8777  }
8778
8779  /* If comes after the final entry actually in the list, can just append it
8780  * to the end, */
8781  if (len == 0
8782   || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8783    && start >= invlist_array(invlist)[len - 1]))
8784  {
8785   _append_range_to_invlist(invlist, start, end);
8786   return invlist;
8787  }
8788
8789  /* Here, can't just append things, create and return a new inversion list
8790  * which is the union of this range and the existing inversion list */
8791  range_invlist = _new_invlist(2);
8792  _append_range_to_invlist(range_invlist, start, end);
8793
8794  _invlist_union(invlist, range_invlist, &invlist);
8795
8796  /* The temporary can be freed */
8797  SvREFCNT_dec_NN(range_invlist);
8798
8799  return invlist;
8800 }
8801
8802 SV*
8803 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8804         UV** other_elements_ptr)
8805 {
8806  /* Create and return an inversion list whose contents are to be populated
8807  * by the caller.  The caller gives the number of elements (in 'size') and
8808  * the very first element ('element0').  This function will set
8809  * '*other_elements_ptr' to an array of UVs, where the remaining elements
8810  * are to be placed.
8811  *
8812  * Obviously there is some trust involved that the caller will properly
8813  * fill in the other elements of the array.
8814  *
8815  * (The first element needs to be passed in, as the underlying code does
8816  * things differently depending on whether it is zero or non-zero) */
8817
8818  SV* invlist = _new_invlist(size);
8819  bool offset;
8820
8821  PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8822
8823  _append_range_to_invlist(invlist, element0, element0);
8824  offset = *get_invlist_offset_addr(invlist);
8825
8826  invlist_set_len(invlist, size, offset);
8827  *other_elements_ptr = invlist_array(invlist) + 1;
8828  return invlist;
8829 }
8830
8831 #endif
8832
8833 PERL_STATIC_INLINE SV*
8834 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8835  return _add_range_to_invlist(invlist, cp, cp);
8836 }
8837
8838 #ifndef PERL_IN_XSUB_RE
8839 void
8840 Perl__invlist_invert(pTHX_ SV* const invlist)
8841 {
8842  /* Complement the input inversion list.  This adds a 0 if the list didn't
8843  * have a zero; removes it otherwise.  As described above, the data
8844  * structure is set up so that this is very efficient */
8845
8846  PERL_ARGS_ASSERT__INVLIST_INVERT;
8847
8848  assert(! invlist_is_iterating(invlist));
8849
8850  /* The inverse of matching nothing is matching everything */
8851  if (_invlist_len(invlist) == 0) {
8852   _append_range_to_invlist(invlist, 0, UV_MAX);
8853   return;
8854  }
8855
8856  *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8857 }
8858
8859 #endif
8860
8861 PERL_STATIC_INLINE SV*
8862 S_invlist_clone(pTHX_ SV* const invlist)
8863 {
8864
8865  /* Return a new inversion list that is a copy of the input one, which is
8866  * unchanged.  The new list will not be mortal even if the old one was. */
8867
8868  /* Need to allocate extra space to accommodate Perl's addition of a
8869  * trailing NUL to SvPV's, since it thinks they are always strings */
8870  SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8871  STRLEN physical_length = SvCUR(invlist);
8872  bool offset = *(get_invlist_offset_addr(invlist));
8873
8874  PERL_ARGS_ASSERT_INVLIST_CLONE;
8875
8876  *(get_invlist_offset_addr(new_invlist)) = offset;
8877  invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8878  Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8879
8880  return new_invlist;
8881 }
8882
8883 PERL_STATIC_INLINE STRLEN*
8884 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8885 {
8886  /* Return the address of the UV that contains the current iteration
8887  * position */
8888
8889  PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8890
8891  assert(SvTYPE(invlist) == SVt_INVLIST);
8892
8893  return &(((XINVLIST*) SvANY(invlist))->iterator);
8894 }
8895
8896 PERL_STATIC_INLINE void
8897 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8898 {
8899  PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8900
8901  *get_invlist_iter_addr(invlist) = 0;
8902 }
8903
8904 PERL_STATIC_INLINE void
8905 S_invlist_iterfinish(pTHX_ SV* invlist)
8906 {
8907  /* Terminate iterator for invlist.  This is to catch development errors.
8908  * Any iteration that is interrupted before completed should call this
8909  * function.  Functions that add code points anywhere else but to the end
8910  * of an inversion list assert that they are not in the middle of an
8911  * iteration.  If they were, the addition would make the iteration
8912  * problematical: if the iteration hadn't reached the place where things
8913  * were being added, it would be ok */
8914
8915  PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8916
8917  *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8918 }
8919
8920 STATIC bool
8921 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8922 {
8923  /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8924  * This call sets in <*start> and <*end>, the next range in <invlist>.
8925  * Returns <TRUE> if successful and the next call will return the next
8926  * range; <FALSE> if was already at the end of the list.  If the latter,
8927  * <*start> and <*end> are unchanged, and the next call to this function
8928  * will start over at the beginning of the list */
8929
8930  STRLEN* pos = get_invlist_iter_addr(invlist);
8931  UV len = _invlist_len(invlist);
8932  UV *array;
8933
8934  PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8935
8936  if (*pos >= len) {
8937   *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8938   return FALSE;
8939  }
8940
8941  array = invlist_array(invlist);
8942
8943  *start = array[(*pos)++];
8944
8945  if (*pos >= len) {
8946   *end = UV_MAX;
8947  }
8948  else {
8949   *end = array[(*pos)++] - 1;
8950  }
8951
8952  return TRUE;
8953 }
8954
8955 PERL_STATIC_INLINE bool
8956 S_invlist_is_iterating(pTHX_ SV* const invlist)
8957 {
8958  PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8959
8960  return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8961 }
8962
8963 PERL_STATIC_INLINE UV
8964 S_invlist_highest(pTHX_ SV* const invlist)
8965 {
8966  /* Returns the highest code point that matches an inversion list.  This API
8967  * has an ambiguity, as it returns 0 under either the highest is actually
8968  * 0, or if the list is empty.  If this distinction matters to you, check
8969  * for emptiness before calling this function */
8970
8971  UV len = _invlist_len(invlist);
8972  UV *array;
8973
8974  PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8975
8976  if (len == 0) {
8977   return 0;
8978  }
8979
8980  array = invlist_array(invlist);
8981
8982  /* The last element in the array in the inversion list always starts a
8983  * range that goes to infinity.  That range may be for code points that are
8984  * matched in the inversion list, or it may be for ones that aren't
8985  * matched.  In the latter case, the highest code point in the set is one
8986  * less than the beginning of this range; otherwise it is the final element
8987  * of this range: infinity */
8988  return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8989   ? UV_MAX
8990   : array[len - 1] - 1;
8991 }
8992
8993 #ifndef PERL_IN_XSUB_RE
8994 SV *
8995 Perl__invlist_contents(pTHX_ SV* const invlist)
8996 {
8997  /* Get the contents of an inversion list into a string SV so that they can
8998  * be printed out.  It uses the format traditionally done for debug tracing
8999  */
9000
9001  UV start, end;
9002  SV* output = newSVpvs("\n");
9003
9004  PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9005
9006  assert(! invlist_is_iterating(invlist));
9007
9008  invlist_iterinit(invlist);
9009  while (invlist_iternext(invlist, &start, &end)) {
9010   if (end == UV_MAX) {
9011    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9012   }
9013   else if (end != start) {
9014    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9015      start,       end);
9016   }
9017   else {
9018    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9019   }
9020  }
9021
9022  return output;
9023 }
9024 #endif
9025
9026 #ifndef PERL_IN_XSUB_RE
9027 void
9028 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9029       const char * const indent, SV* const invlist)
9030 {
9031  /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9032  * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9033  * the string 'indent'.  The output looks like this:
9034   [0] 0x000A .. 0x000D
9035   [2] 0x0085
9036   [4] 0x2028 .. 0x2029
9037   [6] 0x3104 .. INFINITY
9038  * This means that the first range of code points matched by the list are
9039  * 0xA through 0xD; the second range contains only the single code point
9040  * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9041  * are used to define each range (except if the final range extends to
9042  * infinity, only a single element is needed).  The array index of the
9043  * first element for the corresponding range is given in brackets. */
9044
9045  UV start, end;
9046  STRLEN count = 0;
9047
9048  PERL_ARGS_ASSERT__INVLIST_DUMP;
9049
9050  if (invlist_is_iterating(invlist)) {
9051   Perl_dump_indent(aTHX_ level, file,
9052    "%sCan't dump inversion list because is in middle of iterating\n",
9053    indent);
9054   return;
9055  }
9056
9057  invlist_iterinit(invlist);
9058  while (invlist_iternext(invlist, &start, &end)) {
9059   if (end == UV_MAX) {
9060    Perl_dump_indent(aTHX_ level, file,
9061          "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9062         indent, (UV)count, start);
9063   }
9064   else if (end != start) {
9065    Perl_dump_indent(aTHX_ level, file,
9066          "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9067         indent, (UV)count, start,         end);
9068   }
9069   else {
9070    Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9071            indent, (UV)count, start);
9072   }
9073   count += 2;
9074  }
9075 }
9076 #endif
9077
9078 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9079 bool
9080 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9081 {
9082  /* Return a boolean as to if the two passed in inversion lists are
9083  * identical.  The final argument, if TRUE, says to take the complement of
9084  * the second inversion list before doing the comparison */
9085
9086  const UV* array_a = invlist_array(a);
9087  const UV* array_b = invlist_array(b);
9088  UV len_a = _invlist_len(a);
9089  UV len_b = _invlist_len(b);
9090
9091  UV i = 0;      /* current index into the arrays */
9092  bool retval = TRUE;     /* Assume are identical until proven otherwise */
9093
9094  PERL_ARGS_ASSERT__INVLISTEQ;
9095
9096  /* If are to compare 'a' with the complement of b, set it
9097  * up so are looking at b's complement. */
9098  if (complement_b) {
9099
9100   /* The complement of nothing is everything, so <a> would have to have
9101   * just one element, starting at zero (ending at infinity) */
9102   if (len_b == 0) {
9103    return (len_a == 1 && array_a[0] == 0);
9104   }
9105   else if (array_b[0] == 0) {
9106
9107    /* Otherwise, to complement, we invert.  Here, the first element is
9108    * 0, just remove it.  To do this, we just pretend the array starts
9109    * one later */
9110
9111    array_b++;
9112    len_b--;
9113   }
9114   else {
9115
9116    /* But if the first element is not zero, we pretend the list starts
9117    * at the 0 that is always stored immediately before the array. */
9118    array_b--;
9119    len_b++;
9120   }
9121  }
9122
9123  /* Make sure that the lengths are the same, as well as the final element
9124  * before looping through the remainder.  (Thus we test the length, final,
9125  * and first elements right off the bat) */
9126  if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9127   retval = FALSE;
9128  }
9129  else for (i = 0; i < len_a - 1; i++) {
9130   if (array_a[i] != array_b[i]) {
9131    retval = FALSE;
9132    break;
9133   }
9134  }
9135
9136  return retval;
9137 }
9138 #endif
9139
9140 #undef HEADER_LENGTH
9141 #undef TO_INTERNAL_SIZE
9142 #undef FROM_INTERNAL_SIZE
9143 #undef INVLIST_VERSION_ID
9144
9145 /* End of inversion list object */
9146
9147 STATIC void
9148 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9149 {
9150  /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9151  * constructs, and updates RExC_flags with them.  On input, RExC_parse
9152  * should point to the first flag; it is updated on output to point to the
9153  * final ')' or ':'.  There needs to be at least one flag, or this will
9154  * abort */
9155
9156  /* for (?g), (?gc), and (?o) warnings; warning
9157  about (?c) will warn about (?g) -- japhy    */
9158
9159 #define WASTED_O  0x01
9160 #define WASTED_G  0x02
9161 #define WASTED_C  0x04
9162 #define WASTED_GC (WASTED_G|WASTED_C)
9163  I32 wastedflags = 0x00;
9164  U32 posflags = 0, negflags = 0;
9165  U32 *flagsp = &posflags;
9166  char has_charset_modifier = '\0';
9167  regex_charset cs;
9168  bool has_use_defaults = FALSE;
9169  const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9170
9171  PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9172
9173  /* '^' as an initial flag sets certain defaults */
9174  if (UCHARAT(RExC_parse) == '^') {
9175   RExC_parse++;
9176   has_use_defaults = TRUE;
9177   STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9178   set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9179           ? REGEX_UNICODE_CHARSET
9180           : REGEX_DEPENDS_CHARSET);
9181  }
9182
9183  cs = get_regex_charset(RExC_flags);
9184  if (cs == REGEX_DEPENDS_CHARSET
9185   && (RExC_utf8 || RExC_uni_semantics))
9186  {
9187   cs = REGEX_UNICODE_CHARSET;
9188  }
9189
9190  while (*RExC_parse) {
9191   /* && strchr("iogcmsx", *RExC_parse) */
9192   /* (?g), (?gc) and (?o) are useless here
9193   and must be globally applied -- japhy */
9194   switch (*RExC_parse) {
9195
9196    /* Code for the imsx flags */
9197    CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9198
9199    case LOCALE_PAT_MOD:
9200     if (has_charset_modifier) {
9201      goto excess_modifier;
9202     }
9203     else if (flagsp == &negflags) {
9204      goto neg_modifier;
9205     }
9206     cs = REGEX_LOCALE_CHARSET;
9207     has_charset_modifier = LOCALE_PAT_MOD;
9208     break;
9209    case UNICODE_PAT_MOD:
9210     if (has_charset_modifier) {
9211      goto excess_modifier;
9212     }
9213     else if (flagsp == &negflags) {
9214      goto neg_modifier;
9215     }
9216     cs = REGEX_UNICODE_CHARSET;
9217     has_charset_modifier = UNICODE_PAT_MOD;
9218     break;
9219    case ASCII_RESTRICT_PAT_MOD:
9220     if (flagsp == &negflags) {
9221      goto neg_modifier;
9222     }
9223     if (has_charset_modifier) {
9224      if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9225       goto excess_modifier;
9226      }
9227      /* Doubled modifier implies more restricted */
9228      cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9229     }
9230     else {
9231      cs = REGEX_ASCII_RESTRICTED_CHARSET;
9232     }
9233     has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9234     break;
9235    case DEPENDS_PAT_MOD:
9236     if (has_use_defaults) {
9237      goto fail_modifiers;
9238     }
9239     else if (flagsp == &negflags) {
9240      goto neg_modifier;
9241     }
9242     else if (has_charset_modifier) {
9243      goto excess_modifier;
9244     }
9245
9246     /* The dual charset means unicode semantics if the
9247     * pattern (or target, not known until runtime) are
9248     * utf8, or something in the pattern indicates unicode
9249     * semantics */
9250     cs = (RExC_utf8 || RExC_uni_semantics)
9251      ? REGEX_UNICODE_CHARSET
9252      : REGEX_DEPENDS_CHARSET;
9253     has_charset_modifier = DEPENDS_PAT_MOD;
9254     break;
9255    excess_modifier:
9256     RExC_parse++;
9257     if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9258      vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9259     }
9260     else if (has_charset_modifier == *(RExC_parse - 1)) {
9261      vFAIL2("Regexp modifier \"%c\" may not appear twice",
9262           *(RExC_parse - 1));
9263     }
9264     else {
9265      vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9266     }
9267     /*NOTREACHED*/
9268    neg_modifier:
9269     RExC_parse++;
9270     vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9271          *(RExC_parse - 1));
9272     /*NOTREACHED*/
9273    case ONCE_PAT_MOD: /* 'o' */
9274    case GLOBAL_PAT_MOD: /* 'g' */
9275     if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9276      const I32 wflagbit = *RExC_parse == 'o'
9277           ? WASTED_O
9278           : WASTED_G;
9279      if (! (wastedflags & wflagbit) ) {
9280       wastedflags |= wflagbit;
9281       /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9282       vWARN5(
9283        RExC_parse + 1,
9284        "Useless (%s%c) - %suse /%c modifier",
9285        flagsp == &negflags ? "?-" : "?",
9286        *RExC_parse,
9287        flagsp == &negflags ? "don't " : "",
9288        *RExC_parse
9289       );
9290      }
9291     }
9292     break;
9293
9294    case CONTINUE_PAT_MOD: /* 'c' */
9295     if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9296      if (! (wastedflags & WASTED_C) ) {
9297       wastedflags |= WASTED_GC;
9298       /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9299       vWARN3(
9300        RExC_parse + 1,
9301        "Useless (%sc) - %suse /gc modifier",
9302        flagsp == &negflags ? "?-" : "?",
9303        flagsp == &negflags ? "don't " : ""
9304       );
9305      }
9306     }
9307     break;
9308    case KEEPCOPY_PAT_MOD: /* 'p' */
9309     if (flagsp == &negflags) {
9310      if (SIZE_ONLY)
9311       ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9312     } else {
9313      *flagsp |= RXf_PMf_KEEPCOPY;
9314     }
9315     break;
9316    case '-':
9317     /* A flag is a default iff it is following a minus, so
9318     * if there is a minus, it means will be trying to
9319     * re-specify a default which is an error */
9320     if (has_use_defaults || flagsp == &negflags) {
9321      goto fail_modifiers;
9322     }
9323     flagsp = &negflags;
9324     wastedflags = 0;  /* reset so (?g-c) warns twice */
9325     break;
9326    case ':':
9327    case ')':
9328     RExC_flags |= posflags;
9329     RExC_flags &= ~negflags;
9330     set_regex_charset(&RExC_flags, cs);
9331     if (RExC_flags & RXf_PMf_FOLD) {
9332      RExC_contains_i = 1;
9333     }
9334     return;
9335     /*NOTREACHED*/
9336    default:
9337    fail_modifiers:
9338     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9339     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9340     vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9341      UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9342     /*NOTREACHED*/
9343   }
9344
9345   ++RExC_parse;
9346  }
9347 }
9348
9349 /*
9350  - reg - regular expression, i.e. main body or parenthesized thing
9351  *
9352  * Caller must absorb opening parenthesis.
9353  *
9354  * Combining parenthesis handling with the base level of regular expression
9355  * is a trifle forced, but the need to tie the tails of the branches to what
9356  * follows makes it hard to avoid.
9357  */
9358 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9359 #ifdef DEBUGGING
9360 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9361 #else
9362 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9363 #endif
9364
9365 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9366    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9367    needs to be restarted.
9368    Otherwise would only return NULL if regbranch() returns NULL, which
9369    cannot happen.  */
9370 STATIC regnode *
9371 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9372  /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9373  * 2 is like 1, but indicates that nextchar() has been called to advance
9374  * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9375  * this flag alerts us to the need to check for that */
9376 {
9377  dVAR;
9378  regnode *ret;  /* Will be the head of the group. */
9379  regnode *br;
9380  regnode *lastbr;
9381  regnode *ender = NULL;
9382  I32 parno = 0;
9383  I32 flags;
9384  U32 oregflags = RExC_flags;
9385  bool have_branch = 0;
9386  bool is_open = 0;
9387  I32 freeze_paren = 0;
9388  I32 after_freeze = 0;
9389
9390  char * parse_start = RExC_parse; /* MJD */
9391  char * const oregcomp_parse = RExC_parse;
9392
9393  GET_RE_DEBUG_FLAGS_DECL;
9394
9395  PERL_ARGS_ASSERT_REG;
9396  DEBUG_PARSE("reg ");
9397
9398  *flagp = 0;    /* Tentatively. */
9399
9400
9401  /* Make an OPEN node, if parenthesized. */
9402  if (paren) {
9403
9404   /* Under /x, space and comments can be gobbled up between the '(' and
9405   * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9406   * intervening space, as the sequence is a token, and a token should be
9407   * indivisible */
9408   bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9409
9410   if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9411    char *start_verb = RExC_parse;
9412    STRLEN verb_len = 0;
9413    char *start_arg = NULL;
9414    unsigned char op = 0;
9415    int argok = 1;
9416    int internal_argval = 0; /* internal_argval is only useful if
9417           !argok */
9418
9419    if (has_intervening_patws && SIZE_ONLY) {
9420     ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
9421    }
9422    while ( *RExC_parse && *RExC_parse != ')' ) {
9423     if ( *RExC_parse == ':' ) {
9424      start_arg = RExC_parse + 1;
9425      break;
9426     }
9427     RExC_parse++;
9428    }
9429    ++start_verb;
9430    verb_len = RExC_parse - start_verb;
9431    if ( start_arg ) {
9432     RExC_parse++;
9433     while ( *RExC_parse && *RExC_parse != ')' )
9434      RExC_parse++;
9435     if ( *RExC_parse != ')' )
9436      vFAIL("Unterminated verb pattern argument");
9437     if ( RExC_parse == start_arg )
9438      start_arg = NULL;
9439    } else {
9440     if ( *RExC_parse != ')' )
9441      vFAIL("Unterminated verb pattern");
9442    }
9443
9444    switch ( *start_verb ) {
9445    case 'A':  /* (*ACCEPT) */
9446     if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9447      op = ACCEPT;
9448      internal_argval = RExC_nestroot;
9449     }
9450     break;
9451    case 'C':  /* (*COMMIT) */
9452     if ( memEQs(start_verb,verb_len,"COMMIT") )
9453      op = COMMIT;
9454     break;
9455    case 'F':  /* (*FAIL) */
9456     if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9457      op = OPFAIL;
9458      argok = 0;
9459     }
9460     break;
9461    case ':':  /* (*:NAME) */
9462    case 'M':  /* (*MARK:NAME) */
9463     if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9464      op = MARKPOINT;
9465      argok = -1;
9466     }
9467     break;
9468    case 'P':  /* (*PRUNE) */
9469     if ( memEQs(start_verb,verb_len,"PRUNE") )
9470      op = PRUNE;
9471     break;
9472    case 'S':   /* (*SKIP) */
9473     if ( memEQs(start_verb,verb_len,"SKIP") )
9474      op = SKIP;
9475     break;
9476    case 'T':  /* (*THEN) */
9477     /* [19:06] <TimToady> :: is then */
9478     if ( memEQs(start_verb,verb_len,"THEN") ) {
9479      op = CUTGROUP;
9480      RExC_seen |= REG_CUTGROUP_SEEN;
9481     }
9482     break;
9483    }
9484    if ( ! op ) {
9485     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9486     vFAIL2utf8f(
9487      "Unknown verb pattern '%"UTF8f"'",
9488      UTF8fARG(UTF, verb_len, start_verb));
9489    }
9490    if ( argok ) {
9491     if ( start_arg && internal_argval ) {
9492      vFAIL3("Verb pattern '%.*s' may not have an argument",
9493       verb_len, start_verb);
9494     } else if ( argok < 0 && !start_arg ) {
9495      vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9496       verb_len, start_verb);
9497     } else {
9498      ret = reganode(pRExC_state, op, internal_argval);
9499      if ( ! internal_argval && ! SIZE_ONLY ) {
9500       if (start_arg) {
9501        SV *sv = newSVpvn( start_arg,
9502            RExC_parse - start_arg);
9503        ARG(ret) = add_data( pRExC_state,
9504             STR_WITH_LEN("S"));
9505        RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9506        ret->flags = 0;
9507       } else {
9508        ret->flags = 1;
9509       }
9510      }
9511     }
9512     if (!internal_argval)
9513      RExC_seen |= REG_VERBARG_SEEN;
9514    } else if ( start_arg ) {
9515     vFAIL3("Verb pattern '%.*s' may not have an argument",
9516       verb_len, start_verb);
9517    } else {
9518     ret = reg_node(pRExC_state, op);
9519    }
9520    nextchar(pRExC_state);
9521    return ret;
9522   }
9523   else if (*RExC_parse == '?') { /* (?...) */
9524    bool is_logical = 0;
9525    const char * const seqstart = RExC_parse;
9526    if (has_intervening_patws && SIZE_ONLY) {
9527     ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
9528    }
9529
9530    RExC_parse++;
9531    paren = *RExC_parse++;
9532    ret = NULL;   /* For look-ahead/behind. */
9533    switch (paren) {
9534
9535    case 'P': /* (?P...) variants for those used to PCRE/Python */
9536     paren = *RExC_parse++;
9537     if ( paren == '<')         /* (?P<...>) named capture */
9538      goto named_capture;
9539     else if (paren == '>') {   /* (?P>name) named recursion */
9540      goto named_recursion;
9541     }
9542     else if (paren == '=') {   /* (?P=...)  named backref */
9543      /* this pretty much dupes the code for \k<NAME> in
9544      * regatom(), if you change this make sure you change that
9545      * */
9546      char* name_start = RExC_parse;
9547      U32 num = 0;
9548      SV *sv_dat = reg_scan_name(pRExC_state,
9549       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9550      if (RExC_parse == name_start || *RExC_parse != ')')
9551       /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9552       vFAIL2("Sequence %.3s... not terminated",parse_start);
9553
9554      if (!SIZE_ONLY) {
9555       num = add_data( pRExC_state, STR_WITH_LEN("S"));
9556       RExC_rxi->data->data[num]=(void*)sv_dat;
9557       SvREFCNT_inc_simple_void(sv_dat);
9558      }
9559      RExC_sawback = 1;
9560      ret = reganode(pRExC_state,
9561         ((! FOLD)
9562          ? NREF
9563          : (ASCII_FOLD_RESTRICTED)
9564          ? NREFFA
9565          : (AT_LEAST_UNI_SEMANTICS)
9566           ? NREFFU
9567           : (LOC)
9568           ? NREFFL
9569           : NREFF),
9570          num);
9571      *flagp |= HASWIDTH;
9572
9573      Set_Node_Offset(ret, parse_start+1);
9574      Set_Node_Cur_Length(ret, parse_start);
9575
9576      nextchar(pRExC_state);
9577      return ret;
9578     }
9579     RExC_parse++;
9580     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9581     vFAIL3("Sequence (%.*s...) not recognized",
9582         RExC_parse-seqstart, seqstart);
9583     /*NOTREACHED*/
9584    case '<':           /* (?<...) */
9585     if (*RExC_parse == '!')
9586      paren = ',';
9587     else if (*RExC_parse != '=')
9588    named_capture:
9589     {               /* (?<...>) */
9590      char *name_start;
9591      SV *svname;
9592      paren= '>';
9593    case '\'':          /* (?'...') */
9594       name_start= RExC_parse;
9595       svname = reg_scan_name(pRExC_state,
9596       SIZE_ONLY    /* reverse test from the others */
9597       ? REG_RSN_RETURN_NAME
9598       : REG_RSN_RETURN_NULL);
9599      if (RExC_parse == name_start || *RExC_parse != paren)
9600       vFAIL2("Sequence (?%c... not terminated",
9601        paren=='>' ? '<' : paren);
9602      if (SIZE_ONLY) {
9603       HE *he_str;
9604       SV *sv_dat = NULL;
9605       if (!svname) /* shouldn't happen */
9606        Perl_croak(aTHX_
9607         "panic: reg_scan_name returned NULL");
9608       if (!RExC_paren_names) {
9609        RExC_paren_names= newHV();
9610        sv_2mortal(MUTABLE_SV(RExC_paren_names));
9611 #ifdef DEBUGGING
9612        RExC_paren_name_list= newAV();
9613        sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9614 #endif
9615       }
9616       he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9617       if ( he_str )
9618        sv_dat = HeVAL(he_str);
9619       if ( ! sv_dat ) {
9620        /* croak baby croak */
9621        Perl_croak(aTHX_
9622         "panic: paren_name hash element allocation failed");
9623       } else if ( SvPOK(sv_dat) ) {
9624        /* (?|...) can mean we have dupes so scan to check
9625        its already been stored. Maybe a flag indicating
9626        we are inside such a construct would be useful,
9627        but the arrays are likely to be quite small, so
9628        for now we punt -- dmq */
9629        IV count = SvIV(sv_dat);
9630        I32 *pv = (I32*)SvPVX(sv_dat);
9631        IV i;
9632        for ( i = 0 ; i < count ; i++ ) {
9633         if ( pv[i] == RExC_npar ) {
9634          count = 0;
9635          break;
9636         }
9637        }
9638        if ( count ) {
9639         pv = (I32*)SvGROW(sv_dat,
9640             SvCUR(sv_dat) + sizeof(I32)+1);
9641         SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9642         pv[count] = RExC_npar;
9643         SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9644        }
9645       } else {
9646        (void)SvUPGRADE(sv_dat,SVt_PVNV);
9647        sv_setpvn(sv_dat, (char *)&(RExC_npar),
9648                 sizeof(I32));
9649        SvIOK_on(sv_dat);
9650        SvIV_set(sv_dat, 1);
9651       }
9652 #ifdef DEBUGGING
9653       /* Yes this does cause a memory leak in debugging Perls
9654       * */
9655       if (!av_store(RExC_paren_name_list,
9656          RExC_npar, SvREFCNT_inc(svname)))
9657        SvREFCNT_dec_NN(svname);
9658 #endif
9659
9660       /*sv_dump(sv_dat);*/
9661      }
9662      nextchar(pRExC_state);
9663      paren = 1;
9664      goto capturing_parens;
9665     }
9666     RExC_seen |= REG_LOOKBEHIND_SEEN;
9667     RExC_in_lookbehind++;
9668     RExC_parse++;
9669    case '=':           /* (?=...) */
9670     RExC_seen_zerolen++;
9671     break;
9672    case '!':           /* (?!...) */
9673     RExC_seen_zerolen++;
9674     if (*RExC_parse == ')') {
9675      ret=reg_node(pRExC_state, OPFAIL);
9676      nextchar(pRExC_state);
9677      return ret;
9678     }
9679     break;
9680    case '|':           /* (?|...) */
9681     /* branch reset, behave like a (?:...) except that
9682     buffers in alternations share the same numbers */
9683     paren = ':';
9684     after_freeze = freeze_paren = RExC_npar;
9685     break;
9686    case ':':           /* (?:...) */
9687    case '>':           /* (?>...) */
9688     break;
9689    case '$':           /* (?$...) */
9690    case '@':           /* (?@...) */
9691     vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9692     break;
9693    case '#':           /* (?#...) */
9694     /* XXX As soon as we disallow separating the '?' and '*' (by
9695     * spaces or (?#...) comment), it is believed that this case
9696     * will be unreachable and can be removed.  See
9697     * [perl #117327] */
9698     while (*RExC_parse && *RExC_parse != ')')
9699      RExC_parse++;
9700     if (*RExC_parse != ')')
9701      FAIL("Sequence (?#... not terminated");
9702     nextchar(pRExC_state);
9703     *flagp = TRYAGAIN;
9704     return NULL;
9705    case '0' :           /* (?0) */
9706    case 'R' :           /* (?R) */
9707     if (*RExC_parse != ')')
9708      FAIL("Sequence (?R) not terminated");
9709     ret = reg_node(pRExC_state, GOSTART);
9710      RExC_seen |= REG_GOSTART_SEEN;
9711     *flagp |= POSTPONED;
9712     nextchar(pRExC_state);
9713     return ret;
9714     /*notreached*/
9715    { /* named and numeric backreferences */
9716     I32 num;
9717    case '&':            /* (?&NAME) */
9718     parse_start = RExC_parse - 1;
9719    named_recursion:
9720     {
9721       SV *sv_dat = reg_scan_name(pRExC_state,
9722        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9723       num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9724     }
9725     if (RExC_parse == RExC_end || *RExC_parse != ')')
9726      vFAIL("Sequence (?&... not terminated");
9727     goto gen_recurse_regop;
9728     assert(0); /* NOT REACHED */
9729    case '+':
9730     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9731      RExC_parse++;
9732      vFAIL("Illegal pattern");
9733     }
9734     goto parse_recursion;
9735     /* NOT REACHED*/
9736    case '-': /* (?-1) */
9737     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9738      RExC_parse--; /* rewind to let it be handled later */
9739      goto parse_flags;
9740     }
9741     /*FALLTHROUGH */
9742    case '1': case '2': case '3': case '4': /* (?1) */
9743    case '5': case '6': case '7': case '8': case '9':
9744     RExC_parse--;
9745    parse_recursion:
9746     num = atoi(RExC_parse);
9747     parse_start = RExC_parse - 1; /* MJD */
9748     if (*RExC_parse == '-')
9749      RExC_parse++;
9750     while (isDIGIT(*RExC_parse))
9751       RExC_parse++;
9752     if (*RExC_parse!=')')
9753      vFAIL("Expecting close bracket");
9754
9755    gen_recurse_regop:
9756     if ( paren == '-' ) {
9757      /*
9758      Diagram of capture buffer numbering.
9759      Top line is the normal capture buffer numbers
9760      Bottom line is the negative indexing as from
9761      the X (the (?-2))
9762
9763      +   1 2    3 4 5 X          6 7
9764      /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9765      -   5 4    3 2 1 X          x x
9766
9767      */
9768      num = RExC_npar + num;
9769      if (num < 1)  {
9770       RExC_parse++;
9771       vFAIL("Reference to nonexistent group");
9772      }
9773     } else if ( paren == '+' ) {
9774      num = RExC_npar + num - 1;
9775     }
9776
9777     ret = reganode(pRExC_state, GOSUB, num);
9778     if (!SIZE_ONLY) {
9779      if (num > (I32)RExC_rx->nparens) {
9780       RExC_parse++;
9781       vFAIL("Reference to nonexistent group");
9782      }
9783      ARG2L_SET( ret, RExC_recurse_count++);
9784      RExC_emit++;
9785      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9786       "Recurse #%"UVuf" to %"IVdf"\n",
9787        (UV)ARG(ret), (IV)ARG2L(ret)));
9788     } else {
9789      RExC_size++;
9790      }
9791      RExC_seen |= REG_RECURSE_SEEN;
9792     Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9793     Set_Node_Offset(ret, parse_start); /* MJD */
9794
9795     *flagp |= POSTPONED;
9796     nextchar(pRExC_state);
9797     return ret;
9798    } /* named and numeric backreferences */
9799    assert(0); /* NOT REACHED */
9800
9801    case '?':           /* (??...) */
9802     is_logical = 1;
9803     if (*RExC_parse != '{') {
9804      RExC_parse++;
9805      /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9806      vFAIL2utf8f(
9807       "Sequence (%"UTF8f"...) not recognized",
9808       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9809      /*NOTREACHED*/
9810     }
9811     *flagp |= POSTPONED;
9812     paren = *RExC_parse++;
9813     /* FALL THROUGH */
9814    case '{':           /* (?{...}) */
9815    {
9816     U32 n = 0;
9817     struct reg_code_block *cb;
9818
9819     RExC_seen_zerolen++;
9820
9821     if (   !pRExC_state->num_code_blocks
9822      || pRExC_state->code_index >= pRExC_state->num_code_blocks
9823      || pRExC_state->code_blocks[pRExC_state->code_index].start
9824       != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9825        - RExC_start)
9826     ) {
9827      if (RExC_pm_flags & PMf_USE_RE_EVAL)
9828       FAIL("panic: Sequence (?{...}): no code block found\n");
9829      FAIL("Eval-group not allowed at runtime, use re 'eval'");
9830     }
9831     /* this is a pre-compiled code block (?{...}) */
9832     cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9833     RExC_parse = RExC_start + cb->end;
9834     if (!SIZE_ONLY) {
9835      OP *o = cb->block;
9836      if (cb->src_regex) {
9837       n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9838       RExC_rxi->data->data[n] =
9839        (void*)SvREFCNT_inc((SV*)cb->src_regex);
9840       RExC_rxi->data->data[n+1] = (void*)o;
9841      }
9842      else {
9843       n = add_data(pRExC_state,
9844        (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9845       RExC_rxi->data->data[n] = (void*)o;
9846      }
9847     }
9848     pRExC_state->code_index++;
9849     nextchar(pRExC_state);
9850
9851     if (is_logical) {
9852      regnode *eval;
9853      ret = reg_node(pRExC_state, LOGICAL);
9854      eval = reganode(pRExC_state, EVAL, n);
9855      if (!SIZE_ONLY) {
9856       ret->flags = 2;
9857       /* for later propagation into (??{}) return value */
9858       eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9859      }
9860      REGTAIL(pRExC_state, ret, eval);
9861      /* deal with the length of this later - MJD */
9862      return ret;
9863     }
9864     ret = reganode(pRExC_state, EVAL, n);
9865     Set_Node_Length(ret, RExC_parse - parse_start + 1);
9866     Set_Node_Offset(ret, parse_start);
9867     return ret;
9868    }
9869    case '(':           /* (?(?{...})...) and (?(?=...)...) */
9870    {
9871     int is_define= 0;
9872     if (RExC_parse[0] == '?') {        /* (?(?...)) */
9873      if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9874       || RExC_parse[1] == '<'
9875       || RExC_parse[1] == '{') { /* Lookahead or eval. */
9876       I32 flag;
9877       regnode *tail;
9878
9879       ret = reg_node(pRExC_state, LOGICAL);
9880       if (!SIZE_ONLY)
9881        ret->flags = 1;
9882
9883       tail = reg(pRExC_state, 1, &flag, depth+1);
9884       if (flag & RESTART_UTF8) {
9885        *flagp = RESTART_UTF8;
9886        return NULL;
9887       }
9888       REGTAIL(pRExC_state, ret, tail);
9889       goto insert_if;
9890      }
9891     }
9892     else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9893       || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9894     {
9895      char ch = RExC_parse[0] == '<' ? '>' : '\'';
9896      char *name_start= RExC_parse++;
9897      U32 num = 0;
9898      SV *sv_dat=reg_scan_name(pRExC_state,
9899       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9900      if (RExC_parse == name_start || *RExC_parse != ch)
9901       vFAIL2("Sequence (?(%c... not terminated",
9902        (ch == '>' ? '<' : ch));
9903      RExC_parse++;
9904      if (!SIZE_ONLY) {
9905       num = add_data( pRExC_state, STR_WITH_LEN("S"));
9906       RExC_rxi->data->data[num]=(void*)sv_dat;
9907       SvREFCNT_inc_simple_void(sv_dat);
9908      }
9909      ret = reganode(pRExC_state,NGROUPP,num);
9910      goto insert_if_check_paren;
9911     }
9912     else if (RExC_parse[0] == 'D' &&
9913       RExC_parse[1] == 'E' &&
9914       RExC_parse[2] == 'F' &&
9915       RExC_parse[3] == 'I' &&
9916       RExC_parse[4] == 'N' &&
9917       RExC_parse[5] == 'E')
9918     {
9919      ret = reganode(pRExC_state,DEFINEP,0);
9920      RExC_parse +=6 ;
9921      is_define = 1;
9922      goto insert_if_check_paren;
9923     }
9924     else if (RExC_parse[0] == 'R') {
9925      RExC_parse++;
9926      parno = 0;
9927      if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9928       parno = atoi(RExC_parse++);
9929       while (isDIGIT(*RExC_parse))
9930        RExC_parse++;
9931      } else if (RExC_parse[0] == '&') {
9932       SV *sv_dat;
9933       RExC_parse++;
9934       sv_dat = reg_scan_name(pRExC_state,
9935        SIZE_ONLY
9936        ? REG_RSN_RETURN_NULL
9937        : REG_RSN_RETURN_DATA);
9938        parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9939      }
9940      ret = reganode(pRExC_state,INSUBP,parno);
9941      goto insert_if_check_paren;
9942     }
9943     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9944      /* (?(1)...) */
9945      char c;
9946      char *tmp;
9947      parno = atoi(RExC_parse++);
9948
9949      while (isDIGIT(*RExC_parse))
9950       RExC_parse++;
9951      ret = reganode(pRExC_state, GROUPP, parno);
9952
9953     insert_if_check_paren:
9954      if (*(tmp = nextchar(pRExC_state)) != ')') {
9955       /* nextchar also skips comments, so undo its work
9956       * and skip over the the next character.
9957       */
9958       RExC_parse = tmp;
9959       RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9960       vFAIL("Switch condition not recognized");
9961      }
9962     insert_if:
9963      REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9964      br = regbranch(pRExC_state, &flags, 1,depth+1);
9965      if (br == NULL) {
9966       if (flags & RESTART_UTF8) {
9967        *flagp = RESTART_UTF8;
9968        return NULL;
9969       }
9970       FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9971        (UV) flags);
9972      } else
9973       REGTAIL(pRExC_state, br, reganode(pRExC_state,
9974               LONGJMP, 0));
9975      c = *nextchar(pRExC_state);
9976      if (flags&HASWIDTH)
9977       *flagp |= HASWIDTH;
9978      if (c == '|') {
9979       if (is_define)
9980        vFAIL("(?(DEFINE)....) does not allow branches");
9981
9982       /* Fake one for optimizer.  */
9983       lastbr = reganode(pRExC_state, IFTHEN, 0);
9984
9985       if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9986        if (flags & RESTART_UTF8) {
9987         *flagp = RESTART_UTF8;
9988         return NULL;
9989        }
9990        FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9991         (UV) flags);
9992       }
9993       REGTAIL(pRExC_state, ret, lastbr);
9994       if (flags&HASWIDTH)
9995        *flagp |= HASWIDTH;
9996       c = *nextchar(pRExC_state);
9997      }
9998      else
9999       lastbr = NULL;
10000      if (c != ')')
10001       vFAIL("Switch (?(condition)... contains too many branches");
10002      ender = reg_node(pRExC_state, TAIL);
10003      REGTAIL(pRExC_state, br, ender);
10004      if (lastbr) {
10005       REGTAIL(pRExC_state, lastbr, ender);
10006       REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10007      }
10008      else
10009       REGTAIL(pRExC_state, ret, ender);
10010      RExC_size++; /* XXX WHY do we need this?!!
10011          For large programs it seems to be required
10012          but I can't figure out why. -- dmq*/
10013      return ret;
10014     }
10015     else {
10016      RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10017      vFAIL("Unknown switch condition (?(...))");
10018     }
10019    }
10020    case '[':           /* (?[ ... ]) */
10021     return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10022           oregcomp_parse);
10023    case 0:
10024     RExC_parse--; /* for vFAIL to print correctly */
10025     vFAIL("Sequence (? incomplete");
10026     break;
10027    default: /* e.g., (?i) */
10028     --RExC_parse;
10029    parse_flags:
10030     parse_lparen_question_flags(pRExC_state);
10031     if (UCHARAT(RExC_parse) != ':') {
10032      nextchar(pRExC_state);
10033      *flagp = TRYAGAIN;
10034      return NULL;
10035     }
10036     paren = ':';
10037     nextchar(pRExC_state);
10038     ret = NULL;
10039     goto parse_rest;
10040    } /* end switch */
10041   }
10042   else {                  /* (...) */
10043   capturing_parens:
10044    parno = RExC_npar;
10045    RExC_npar++;
10046
10047    ret = reganode(pRExC_state, OPEN, parno);
10048    if (!SIZE_ONLY ){
10049     if (!RExC_nestroot)
10050      RExC_nestroot = parno;
10051     if (RExC_seen & REG_RECURSE_SEEN
10052      && !RExC_open_parens[parno-1])
10053     {
10054      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10055       "Setting open paren #%"IVdf" to %d\n",
10056       (IV)parno, REG_NODE_NUM(ret)));
10057      RExC_open_parens[parno-1]= ret;
10058     }
10059    }
10060    Set_Node_Length(ret, 1); /* MJD */
10061    Set_Node_Offset(ret, RExC_parse); /* MJD */
10062    is_open = 1;
10063   }
10064  }
10065  else                        /* ! paren */
10066   ret = NULL;
10067
10068    parse_rest:
10069  /* Pick up the branches, linking them together. */
10070  parse_start = RExC_parse;   /* MJD */
10071  br = regbranch(pRExC_state, &flags, 1,depth+1);
10072
10073  /*     branch_len = (paren != 0); */
10074
10075  if (br == NULL) {
10076   if (flags & RESTART_UTF8) {
10077    *flagp = RESTART_UTF8;
10078    return NULL;
10079   }
10080   FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10081  }
10082  if (*RExC_parse == '|') {
10083   if (!SIZE_ONLY && RExC_extralen) {
10084    reginsert(pRExC_state, BRANCHJ, br, depth+1);
10085   }
10086   else {                  /* MJD */
10087    reginsert(pRExC_state, BRANCH, br, depth+1);
10088    Set_Node_Length(br, paren != 0);
10089    Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10090   }
10091   have_branch = 1;
10092   if (SIZE_ONLY)
10093    RExC_extralen += 1;  /* For BRANCHJ-BRANCH. */
10094  }
10095  else if (paren == ':') {
10096   *flagp |= flags&SIMPLE;
10097  }
10098  if (is_open) {    /* Starts with OPEN. */
10099   REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10100  }
10101  else if (paren != '?')  /* Not Conditional */
10102   ret = br;
10103  *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10104  lastbr = br;
10105  while (*RExC_parse == '|') {
10106   if (!SIZE_ONLY && RExC_extralen) {
10107    ender = reganode(pRExC_state, LONGJMP,0);
10108
10109    /* Append to the previous. */
10110    REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10111   }
10112   if (SIZE_ONLY)
10113    RExC_extralen += 2;  /* Account for LONGJMP. */
10114   nextchar(pRExC_state);
10115   if (freeze_paren) {
10116    if (RExC_npar > after_freeze)
10117     after_freeze = RExC_npar;
10118    RExC_npar = freeze_paren;
10119   }
10120   br = regbranch(pRExC_state, &flags, 0, depth+1);
10121
10122   if (br == NULL) {
10123    if (flags & RESTART_UTF8) {
10124     *flagp = RESTART_UTF8;
10125     return NULL;
10126    }
10127    FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10128   }
10129   REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10130   lastbr = br;
10131   *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10132  }
10133
10134  if (have_branch || paren != ':') {
10135   /* Make a closing node, and hook it on the end. */
10136   switch (paren) {
10137   case ':':
10138    ender = reg_node(pRExC_state, TAIL);
10139    break;
10140   case 1: case 2:
10141    ender = reganode(pRExC_state, CLOSE, parno);
10142    if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10143     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10144       "Setting close paren #%"IVdf" to %d\n",
10145       (IV)parno, REG_NODE_NUM(ender)));
10146     RExC_close_parens[parno-1]= ender;
10147     if (RExC_nestroot == parno)
10148      RExC_nestroot = 0;
10149    }
10150    Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10151    Set_Node_Length(ender,1); /* MJD */
10152    break;
10153   case '<':
10154   case ',':
10155   case '=':
10156   case '!':
10157    *flagp &= ~HASWIDTH;
10158    /* FALL THROUGH */
10159   case '>':
10160    ender = reg_node(pRExC_state, SUCCEED);
10161    break;
10162   case 0:
10163    ender = reg_node(pRExC_state, END);
10164    if (!SIZE_ONLY) {
10165     assert(!RExC_opend); /* there can only be one! */
10166     RExC_opend = ender;
10167    }
10168    break;
10169   }
10170   DEBUG_PARSE_r(if (!SIZE_ONLY) {
10171    SV * const mysv_val1=sv_newmortal();
10172    SV * const mysv_val2=sv_newmortal();
10173    DEBUG_PARSE_MSG("lsbr");
10174    regprop(RExC_rx, mysv_val1, lastbr, NULL);
10175    regprop(RExC_rx, mysv_val2, ender, NULL);
10176    PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10177       SvPV_nolen_const(mysv_val1),
10178       (IV)REG_NODE_NUM(lastbr),
10179       SvPV_nolen_const(mysv_val2),
10180       (IV)REG_NODE_NUM(ender),
10181       (IV)(ender - lastbr)
10182    );
10183   });
10184   REGTAIL(pRExC_state, lastbr, ender);
10185
10186   if (have_branch && !SIZE_ONLY) {
10187    char is_nothing= 1;
10188    if (depth==1)
10189     RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10190
10191    /* Hook the tails of the branches to the closing node. */
10192    for (br = ret; br; br = regnext(br)) {
10193     const U8 op = PL_regkind[OP(br)];
10194     if (op == BRANCH) {
10195      REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10196      if ( OP(NEXTOPER(br)) != NOTHING
10197       || regnext(NEXTOPER(br)) != ender)
10198       is_nothing= 0;
10199     }
10200     else if (op == BRANCHJ) {
10201      REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10202      /* for now we always disable this optimisation * /
10203      if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10204       || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10205      */
10206       is_nothing= 0;
10207     }
10208    }
10209    if (is_nothing) {
10210     br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10211     DEBUG_PARSE_r(if (!SIZE_ONLY) {
10212      SV * const mysv_val1=sv_newmortal();
10213      SV * const mysv_val2=sv_newmortal();
10214      DEBUG_PARSE_MSG("NADA");
10215      regprop(RExC_rx, mysv_val1, ret, NULL);
10216      regprop(RExC_rx, mysv_val2, ender, NULL);
10217      PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10218         SvPV_nolen_const(mysv_val1),
10219         (IV)REG_NODE_NUM(ret),
10220         SvPV_nolen_const(mysv_val2),
10221         (IV)REG_NODE_NUM(ender),
10222         (IV)(ender - ret)
10223      );
10224     });
10225     OP(br)= NOTHING;
10226     if (OP(ender) == TAIL) {
10227      NEXT_OFF(br)= 0;
10228      RExC_emit= br + 1;
10229     } else {
10230      regnode *opt;
10231      for ( opt= br + 1; opt < ender ; opt++ )
10232       OP(opt)= OPTIMIZED;
10233      NEXT_OFF(br)= ender - br;
10234     }
10235    }
10236   }
10237  }
10238
10239  {
10240   const char *p;
10241   static const char parens[] = "=!<,>";
10242
10243   if (paren && (p = strchr(parens, paren))) {
10244    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10245    int flag = (p - parens) > 1;
10246
10247    if (paren == '>')
10248     node = SUSPEND, flag = 0;
10249    reginsert(pRExC_state, node,ret, depth+1);
10250    Set_Node_Cur_Length(ret, parse_start);
10251    Set_Node_Offset(ret, parse_start + 1);
10252    ret->flags = flag;
10253    REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10254   }
10255  }
10256
10257  /* Check for proper termination. */
10258  if (paren) {
10259   /* restore original flags, but keep (?p) */
10260   RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10261   if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10262    RExC_parse = oregcomp_parse;
10263    vFAIL("Unmatched (");
10264   }
10265  }
10266  else if (!paren && RExC_parse < RExC_end) {
10267   if (*RExC_parse == ')') {
10268    RExC_parse++;
10269    vFAIL("Unmatched )");
10270   }
10271   else
10272    FAIL("Junk on end of regexp"); /* "Can't happen". */
10273   assert(0); /* NOTREACHED */
10274  }
10275
10276  if (RExC_in_lookbehind) {
10277   RExC_in_lookbehind--;
10278  }
10279  if (after_freeze > RExC_npar)
10280   RExC_npar = after_freeze;
10281  return(ret);
10282 }
10283
10284 /*
10285  - regbranch - one alternative of an | operator
10286  *
10287  * Implements the concatenation operator.
10288  *
10289  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10290  * restarted.
10291  */
10292 STATIC regnode *
10293 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10294 {
10295  dVAR;
10296  regnode *ret;
10297  regnode *chain = NULL;
10298  regnode *latest;
10299  I32 flags = 0, c = 0;
10300  GET_RE_DEBUG_FLAGS_DECL;
10301
10302  PERL_ARGS_ASSERT_REGBRANCH;
10303
10304  DEBUG_PARSE("brnc");
10305
10306  if (first)
10307   ret = NULL;
10308  else {
10309   if (!SIZE_ONLY && RExC_extralen)
10310    ret = reganode(pRExC_state, BRANCHJ,0);
10311   else {
10312    ret = reg_node(pRExC_state, BRANCH);
10313    Set_Node_Length(ret, 1);
10314   }
10315  }
10316
10317  if (!first && SIZE_ONLY)
10318   RExC_extralen += 1;   /* BRANCHJ */
10319
10320  *flagp = WORST;   /* Tentatively. */
10321
10322  RExC_parse--;
10323  nextchar(pRExC_state);
10324  while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10325   flags &= ~TRYAGAIN;
10326   latest = regpiece(pRExC_state, &flags,depth+1);
10327   if (latest == NULL) {
10328    if (flags & TRYAGAIN)
10329     continue;
10330    if (flags & RESTART_UTF8) {
10331     *flagp = RESTART_UTF8;
10332     return NULL;
10333    }
10334    FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10335   }
10336   else if (ret == NULL)
10337    ret = latest;
10338   *flagp |= flags&(HASWIDTH|POSTPONED);
10339   if (chain == NULL)  /* First piece. */
10340    *flagp |= flags&SPSTART;
10341   else {
10342    RExC_naughty++;
10343    REGTAIL(pRExC_state, chain, latest);
10344   }
10345   chain = latest;
10346   c++;
10347  }
10348  if (chain == NULL) { /* Loop ran zero times. */
10349   chain = reg_node(pRExC_state, NOTHING);
10350   if (ret == NULL)
10351    ret = chain;
10352  }
10353  if (c == 1) {
10354   *flagp |= flags&SIMPLE;
10355  }
10356
10357  return ret;
10358 }
10359
10360 /*
10361  - regpiece - something followed by possible [*+?]
10362  *
10363  * Note that the branching code sequences used for ? and the general cases
10364  * of * and + are somewhat optimized:  they use the same NOTHING node as
10365  * both the endmarker for their branch list and the body of the last branch.
10366  * It might seem that this node could be dispensed with entirely, but the
10367  * endmarker role is not redundant.
10368  *
10369  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10370  * TRYAGAIN.
10371  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10372  * restarted.
10373  */
10374 STATIC regnode *
10375 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10376 {
10377  dVAR;
10378  regnode *ret;
10379  char op;
10380  char *next;
10381  I32 flags;
10382  const char * const origparse = RExC_parse;
10383  I32 min;
10384  I32 max = REG_INFTY;
10385 #ifdef RE_TRACK_PATTERN_OFFSETS
10386  char *parse_start;
10387 #endif
10388  const char *maxpos = NULL;
10389
10390  /* Save the original in case we change the emitted regop to a FAIL. */
10391  regnode * const orig_emit = RExC_emit;
10392
10393  GET_RE_DEBUG_FLAGS_DECL;
10394
10395  PERL_ARGS_ASSERT_REGPIECE;
10396
10397  DEBUG_PARSE("piec");
10398
10399  ret = regatom(pRExC_state, &flags,depth+1);
10400  if (ret == NULL) {
10401   if (flags & (TRYAGAIN|RESTART_UTF8))
10402    *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10403   else
10404    FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10405   return(NULL);
10406  }
10407
10408  op = *RExC_parse;
10409
10410  if (op == '{' && regcurly(RExC_parse, FALSE)) {
10411   maxpos = NULL;
10412 #ifdef RE_TRACK_PATTERN_OFFSETS
10413   parse_start = RExC_parse; /* MJD */
10414 #endif
10415   next = RExC_parse + 1;
10416   while (isDIGIT(*next) || *next == ',') {
10417    if (*next == ',') {
10418     if (maxpos)
10419      break;
10420     else
10421      maxpos = next;
10422    }
10423    next++;
10424   }
10425   if (*next == '}') {  /* got one */
10426    if (!maxpos)
10427     maxpos = next;
10428    RExC_parse++;
10429    min = atoi(RExC_parse);
10430    if (*maxpos == ',')
10431     maxpos++;
10432    else
10433     maxpos = RExC_parse;
10434    max = atoi(maxpos);
10435    if (!max && *maxpos != '0')
10436     max = REG_INFTY;  /* meaning "infinity" */
10437    else if (max >= REG_INFTY)
10438     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10439    RExC_parse = next;
10440    nextchar(pRExC_state);
10441    if (max < min) {    /* If can't match, warn and optimize to fail
10442         unconditionally */
10443     if (SIZE_ONLY) {
10444      ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10445
10446      /* We can't back off the size because we have to reserve
10447      * enough space for all the things we are about to throw
10448      * away, but we can shrink it by the ammount we are about
10449      * to re-use here */
10450      RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10451     }
10452     else {
10453      RExC_emit = orig_emit;
10454     }
10455     ret = reg_node(pRExC_state, OPFAIL);
10456     return ret;
10457    }
10458    else if (min == max
10459      && RExC_parse < RExC_end
10460      && (*RExC_parse == '?' || *RExC_parse == '+'))
10461    {
10462     if (SIZE_ONLY) {
10463      ckWARN2reg(RExC_parse + 1,
10464        "Useless use of greediness modifier '%c'",
10465        *RExC_parse);
10466     }
10467     /* Absorb the modifier, so later code doesn't see nor use
10468      * it */
10469     nextchar(pRExC_state);
10470    }
10471
10472   do_curly:
10473    if ((flags&SIMPLE)) {
10474     RExC_naughty += 2 + RExC_naughty / 2;
10475     reginsert(pRExC_state, CURLY, ret, depth+1);
10476     Set_Node_Offset(ret, parse_start+1); /* MJD */
10477     Set_Node_Cur_Length(ret, parse_start);
10478    }
10479    else {
10480     regnode * const w = reg_node(pRExC_state, WHILEM);
10481
10482     w->flags = 0;
10483     REGTAIL(pRExC_state, ret, w);
10484     if (!SIZE_ONLY && RExC_extralen) {
10485      reginsert(pRExC_state, LONGJMP,ret, depth+1);
10486      reginsert(pRExC_state, NOTHING,ret, depth+1);
10487      NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10488     }
10489     reginsert(pRExC_state, CURLYX,ret, depth+1);
10490         /* MJD hk */
10491     Set_Node_Offset(ret, parse_start+1);
10492     Set_Node_Length(ret,
10493         op == '{' ? (RExC_parse - parse_start) : 1);
10494
10495     if (!SIZE_ONLY && RExC_extralen)
10496      NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10497     REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10498     if (SIZE_ONLY)
10499      RExC_whilem_seen++, RExC_extralen += 3;
10500     RExC_naughty += 4 + RExC_naughty; /* compound interest */
10501    }
10502    ret->flags = 0;
10503
10504    if (min > 0)
10505     *flagp = WORST;
10506    if (max > 0)
10507     *flagp |= HASWIDTH;
10508    if (!SIZE_ONLY) {
10509     ARG1_SET(ret, (U16)min);
10510     ARG2_SET(ret, (U16)max);
10511    }
10512    if (max == REG_INFTY)
10513     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10514
10515    goto nest_check;
10516   }
10517  }
10518
10519  if (!ISMULT1(op)) {
10520   *flagp = flags;
10521   return(ret);
10522  }
10523
10524 #if 0    /* Now runtime fix should be reliable. */
10525
10526  /* if this is reinstated, don't forget to put this back into perldiag:
10527
10528    =item Regexp *+ operand could be empty at {#} in regex m/%s/
10529
10530   (F) The part of the regexp subject to either the * or + quantifier
10531   could match an empty string. The {#} shows in the regular
10532   expression about where the problem was discovered.
10533
10534  */
10535
10536  if (!(flags&HASWIDTH) && op != '?')
10537  vFAIL("Regexp *+ operand could be empty");
10538 #endif
10539
10540 #ifdef RE_TRACK_PATTERN_OFFSETS
10541  parse_start = RExC_parse;
10542 #endif
10543  nextchar(pRExC_state);
10544
10545  *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10546
10547  if (op == '*' && (flags&SIMPLE)) {
10548   reginsert(pRExC_state, STAR, ret, depth+1);
10549   ret->flags = 0;
10550   RExC_naughty += 4;
10551   RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10552  }
10553  else if (op == '*') {
10554   min = 0;
10555   goto do_curly;
10556  }
10557  else if (op == '+' && (flags&SIMPLE)) {
10558   reginsert(pRExC_state, PLUS, ret, depth+1);
10559   ret->flags = 0;
10560   RExC_naughty += 3;
10561   RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10562  }
10563  else if (op == '+') {
10564   min = 1;
10565   goto do_curly;
10566  }
10567  else if (op == '?') {
10568   min = 0; max = 1;
10569   goto do_curly;
10570  }
10571   nest_check:
10572  if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10573   SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10574   ckWARN2reg(RExC_parse,
10575     "%"UTF8f" matches null string many times",
10576     UTF8fARG(UTF, (RExC_parse >= origparse
10577         ? RExC_parse - origparse
10578         : 0),
10579     origparse));
10580   (void)ReREFCNT_inc(RExC_rx_sv);
10581  }
10582
10583  if (RExC_parse < RExC_end && *RExC_parse == '?') {
10584   nextchar(pRExC_state);
10585   reginsert(pRExC_state, MINMOD, ret, depth+1);
10586   REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10587  }
10588  else
10589  if (RExC_parse < RExC_end && *RExC_parse == '+') {
10590   regnode *ender;
10591   nextchar(pRExC_state);
10592   ender = reg_node(pRExC_state, SUCCEED);
10593   REGTAIL(pRExC_state, ret, ender);
10594   reginsert(pRExC_state, SUSPEND, ret, depth+1);
10595   ret->flags = 0;
10596   ender = reg_node(pRExC_state, TAIL);
10597   REGTAIL(pRExC_state, ret, ender);
10598  }
10599
10600  if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10601   RExC_parse++;
10602   vFAIL("Nested quantifiers");
10603  }
10604
10605  return(ret);
10606 }
10607
10608 STATIC bool
10609 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10610      UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10611      const bool strict   /* Apply stricter parsing rules? */
10612  )
10613 {
10614
10615  /* This is expected to be called by a parser routine that has recognized '\N'
10616    and needs to handle the rest. RExC_parse is expected to point at the first
10617    char following the N at the time of the call.  On successful return,
10618    RExC_parse has been updated to point to just after the sequence identified
10619    by this routine, and <*flagp> has been updated.
10620
10621    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10622    character class.
10623
10624    \N may begin either a named sequence, or if outside a character class, mean
10625    to match a non-newline.  For non single-quoted regexes, the tokenizer has
10626    attempted to decide which, and in the case of a named sequence, converted it
10627    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10628    where c1... are the characters in the sequence.  For single-quoted regexes,
10629    the tokenizer passes the \N sequence through unchanged; this code will not
10630    attempt to determine this nor expand those, instead raising a syntax error.
10631    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10632    or there is no '}', it signals that this \N occurrence means to match a
10633    non-newline.
10634
10635    Only the \N{U+...} form should occur in a character class, for the same
10636    reason that '.' inside a character class means to just match a period: it
10637    just doesn't make sense.
10638
10639    The function raises an error (via vFAIL), and doesn't return for various
10640    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
10641    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10642    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10643    only possible if node_p is non-NULL.
10644
10645
10646    If <valuep> is non-null, it means the caller can accept an input sequence
10647    consisting of a just a single code point; <*valuep> is set to that value
10648    if the input is such.
10649
10650    If <node_p> is non-null it signifies that the caller can accept any other
10651    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
10652    is set as follows:
10653  1) \N means not-a-NL: points to a newly created REG_ANY node;
10654  2) \N{}:              points to a new NOTHING node;
10655  3) otherwise:         points to a new EXACT node containing the resolved
10656       string.
10657    Note that FALSE is returned for single code point sequences if <valuep> is
10658    null.
10659  */
10660
10661  char * endbrace;    /* '}' following the name */
10662  char* p;
10663  char *endchar; /* Points to '.' or '}' ending cur char in the input
10664       stream */
10665  bool has_multiple_chars; /* true if the input stream contains a sequence of
10666         more than one character */
10667
10668  GET_RE_DEBUG_FLAGS_DECL;
10669
10670  PERL_ARGS_ASSERT_GROK_BSLASH_N;
10671
10672  GET_RE_DEBUG_FLAGS;
10673
10674  assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
10675
10676  /* The [^\n] meaning of \N ignores spaces and comments under the /x
10677  * modifier.  The other meaning does not, so use a temporary until we find
10678  * out which we are being called with */
10679  p = (RExC_flags & RXf_PMf_EXTENDED)
10680   ? regwhite( pRExC_state, RExC_parse )
10681   : RExC_parse;
10682
10683  /* Disambiguate between \N meaning a named character versus \N meaning
10684  * [^\n].  The former is assumed when it can't be the latter. */
10685  if (*p != '{' || regcurly(p, FALSE)) {
10686   RExC_parse = p;
10687   if (! node_p) {
10688    /* no bare \N allowed in a charclass */
10689    if (in_char_class) {
10690     vFAIL("\\N in a character class must be a named character: \\N{...}");
10691    }
10692    return FALSE;
10693   }
10694   RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
10695       current char */
10696   nextchar(pRExC_state);
10697   *node_p = reg_node(pRExC_state, REG_ANY);
10698   *flagp |= HASWIDTH|SIMPLE;
10699   RExC_naughty++;
10700   Set_Node_Length(*node_p, 1); /* MJD */
10701   return TRUE;
10702  }
10703
10704  /* Here, we have decided it should be a named character or sequence */
10705
10706  /* The test above made sure that the next real character is a '{', but
10707  * under the /x modifier, it could be separated by space (or a comment and
10708  * \n) and this is not allowed (for consistency with \x{...} and the
10709  * tokenizer handling of \N{NAME}). */
10710  if (*RExC_parse != '{') {
10711   vFAIL("Missing braces on \\N{}");
10712  }
10713
10714  RExC_parse++; /* Skip past the '{' */
10715
10716  if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10717   || ! (endbrace == RExC_parse  /* nothing between the {} */
10718    || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below
10719             */
10720     && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10721              */
10722  {
10723   if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
10724   vFAIL("\\N{NAME} must be resolved by the lexer");
10725  }
10726
10727  if (endbrace == RExC_parse) {   /* empty: \N{} */
10728   bool ret = TRUE;
10729   if (node_p) {
10730    *node_p = reg_node(pRExC_state,NOTHING);
10731   }
10732   else if (in_char_class) {
10733    if (SIZE_ONLY && in_char_class) {
10734     if (strict) {
10735      RExC_parse++;   /* Position after the "}" */
10736      vFAIL("Zero length \\N{}");
10737     }
10738     else {
10739      ckWARNreg(RExC_parse,
10740        "Ignoring zero length \\N{} in character class");
10741     }
10742    }
10743    ret = FALSE;
10744   }
10745   else {
10746    return FALSE;
10747   }
10748   nextchar(pRExC_state);
10749   return ret;
10750  }
10751
10752  RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10753  RExC_parse += 2; /* Skip past the 'U+' */
10754
10755  endchar = RExC_parse + strcspn(RExC_parse, ".}");
10756
10757  /* Code points are separated by dots.  If none, there is only one code
10758  * point, and is terminated by the brace */
10759  has_multiple_chars = (endchar < endbrace);
10760
10761  if (valuep && (! has_multiple_chars || in_char_class)) {
10762   /* We only pay attention to the first char of
10763   multichar strings being returned in char classes. I kinda wonder
10764   if this makes sense as it does change the behaviour
10765   from earlier versions, OTOH that behaviour was broken
10766   as well. XXX Solution is to recharacterize as
10767   [rest-of-class]|multi1|multi2... */
10768
10769   STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10770   I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10771    | PERL_SCAN_DISALLOW_PREFIX
10772    | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10773
10774   *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10775
10776   /* The tokenizer should have guaranteed validity, but it's possible to
10777   * bypass it by using single quoting, so check */
10778   if (length_of_hex == 0
10779    || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10780   {
10781    RExC_parse += length_of_hex; /* Includes all the valid */
10782    RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
10783        ? UTF8SKIP(RExC_parse)
10784        : 1;
10785    /* Guard against malformed utf8 */
10786    if (RExC_parse >= endchar) {
10787     RExC_parse = endchar;
10788    }
10789    vFAIL("Invalid hexadecimal number in \\N{U+...}");
10790   }
10791
10792   if (in_char_class && has_multiple_chars) {
10793    if (strict) {
10794     RExC_parse = endbrace;
10795     vFAIL("\\N{} in character class restricted to one character");
10796    }
10797    else {
10798     ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10799    }
10800   }
10801
10802   RExC_parse = endbrace + 1;
10803  }
10804  else if (! node_p || ! has_multiple_chars) {
10805
10806   /* Here, the input is legal, but not according to the caller's
10807   * options.  We fail without advancing the parse, so that the
10808   * caller can try again */
10809   RExC_parse = p;
10810   return FALSE;
10811  }
10812  else {
10813
10814   /* What is done here is to convert this to a sub-pattern of the form
10815   * (?:\x{char1}\x{char2}...)
10816   * and then call reg recursively.  That way, it retains its atomicness,
10817   * while not having to worry about special handling that some code
10818   * points may have.  toke.c has converted the original Unicode values
10819   * to native, so that we can just pass on the hex values unchanged.  We
10820   * do have to set a flag to keep recoding from happening in the
10821   * recursion */
10822
10823   SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10824   STRLEN len;
10825   char *orig_end = RExC_end;
10826   I32 flags;
10827
10828   while (RExC_parse < endbrace) {
10829
10830    /* Convert to notation the rest of the code understands */
10831    sv_catpv(substitute_parse, "\\x{");
10832    sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10833    sv_catpv(substitute_parse, "}");
10834
10835    /* Point to the beginning of the next character in the sequence. */
10836    RExC_parse = endchar + 1;
10837    endchar = RExC_parse + strcspn(RExC_parse, ".}");
10838   }
10839   sv_catpv(substitute_parse, ")");
10840
10841   RExC_parse = SvPV(substitute_parse, len);
10842
10843   /* Don't allow empty number */
10844   if (len < 8) {
10845    vFAIL("Invalid hexadecimal number in \\N{U+...}");
10846   }
10847   RExC_end = RExC_parse + len;
10848
10849   /* The values are Unicode, and therefore not subject to recoding */
10850   RExC_override_recoding = 1;
10851
10852   if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10853    if (flags & RESTART_UTF8) {
10854     *flagp = RESTART_UTF8;
10855     return FALSE;
10856    }
10857    FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10858     (UV) flags);
10859   }
10860   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10861
10862   RExC_parse = endbrace;
10863   RExC_end = orig_end;
10864   RExC_override_recoding = 0;
10865
10866   nextchar(pRExC_state);
10867  }
10868
10869  return TRUE;
10870 }
10871
10872
10873 /*
10874  * reg_recode
10875  *
10876  * It returns the code point in utf8 for the value in *encp.
10877  *    value: a code value in the source encoding
10878  *    encp:  a pointer to an Encode object
10879  *
10880  * If the result from Encode is not a single character,
10881  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10882  */
10883 STATIC UV
10884 S_reg_recode(pTHX_ const char value, SV **encp)
10885 {
10886  STRLEN numlen = 1;
10887  SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10888  const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10889  const STRLEN newlen = SvCUR(sv);
10890  UV uv = UNICODE_REPLACEMENT;
10891
10892  PERL_ARGS_ASSERT_REG_RECODE;
10893
10894  if (newlen)
10895   uv = SvUTF8(sv)
10896    ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10897    : *(U8*)s;
10898
10899  if (!newlen || numlen != newlen) {
10900   uv = UNICODE_REPLACEMENT;
10901   *encp = NULL;
10902  }
10903  return uv;
10904 }
10905
10906 PERL_STATIC_INLINE U8
10907 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10908 {
10909  U8 op;
10910
10911  PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10912
10913  if (! FOLD) {
10914   return EXACT;
10915  }
10916
10917  op = get_regex_charset(RExC_flags);
10918  if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10919   op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10920     been, so there is no hole */
10921  }
10922
10923  return op + EXACTF;
10924 }
10925
10926 PERL_STATIC_INLINE void
10927 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
10928       regnode *node, I32* flagp, STRLEN len, UV code_point,
10929       bool downgradable)
10930 {
10931  /* This knows the details about sizing an EXACTish node, setting flags for
10932  * it (by setting <*flagp>, and potentially populating it with a single
10933  * character.
10934  *
10935  * If <len> (the length in bytes) is non-zero, this function assumes that
10936  * the node has already been populated, and just does the sizing.  In this
10937  * case <code_point> should be the final code point that has already been
10938  * placed into the node.  This value will be ignored except that under some
10939  * circumstances <*flagp> is set based on it.
10940  *
10941  * If <len> is zero, the function assumes that the node is to contain only
10942  * the single character given by <code_point> and calculates what <len>
10943  * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10944  * additionally will populate the node's STRING with <code_point> or its
10945  * fold if folding.
10946  *
10947  * In both cases <*flagp> is appropriately set
10948  *
10949  * It knows that under FOLD, the Latin Sharp S and UTF characters above
10950  * 255, must be folded (the former only when the rules indicate it can
10951  * match 'ss')
10952  *
10953  * When it does the populating, it looks at the flag 'downgradable'.  If
10954  * true with a node that folds, it checks if the single code point
10955  * participates in a fold, and if not downgrades the node to an EXACT.
10956  * This helps the optimizer */
10957
10958  bool len_passed_in = cBOOL(len != 0);
10959  U8 character[UTF8_MAXBYTES_CASE+1];
10960
10961  PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10962
10963  /* Don't bother to check for downgrading in PASS1, as it doesn't make any
10964  * sizing difference, and is extra work that is thrown away */
10965  if (downgradable && ! PASS2) {
10966   downgradable = FALSE;
10967  }
10968
10969  if (! len_passed_in) {
10970   if (UTF) {
10971    if (UNI_IS_INVARIANT(code_point)) {
10972     if (LOC || ! FOLD) {    /* /l defers folding until runtime */
10973      *character = (U8) code_point;
10974     }
10975     else { /* Here is /i and not /l (toFOLD() is defined on just
10976       ASCII, which isn't the same thing as INVARIANT on
10977       EBCDIC, but it works there, as the extra invariants
10978       fold to themselves) */
10979      *character = toFOLD((U8) code_point);
10980      if (downgradable
10981       && *character == code_point
10982       && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point))
10983      {
10984       OP(node) = EXACT;
10985      }
10986     }
10987     len = 1;
10988    }
10989    else if (FOLD && (! LOC
10990        || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
10991    {   /* Folding, and ok to do so now */
10992     UV folded = _to_uni_fold_flags(
10993         code_point,
10994         character,
10995         &len,
10996         FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
10997              ? FOLD_FLAGS_NOMIX_ASCII
10998              : 0));
10999     if (downgradable
11000      && folded == code_point
11001      && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11002     {
11003      OP(node) = EXACT;
11004     }
11005    }
11006    else if (code_point <= MAX_UTF8_TWO_BYTE) {
11007
11008     /* Not folding this cp, and can output it directly */
11009     *character = UTF8_TWO_BYTE_HI(code_point);
11010     *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11011     len = 2;
11012    }
11013    else {
11014     uvchr_to_utf8( character, code_point);
11015     len = UTF8SKIP(character);
11016    }
11017   } /* Else pattern isn't UTF8.  */
11018   else if (! FOLD) {
11019    *character = (U8) code_point;
11020    len = 1;
11021   } /* Else is folded non-UTF8 */
11022   else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11023
11024    /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11025    * comments at join_exact()); */
11026    *character = (U8) code_point;
11027    len = 1;
11028
11029    /* Can turn into an EXACT node if we know the fold at compile time,
11030    * and it folds to itself and doesn't particpate in other folds */
11031    if (downgradable
11032     && ! LOC
11033     && PL_fold_latin1[code_point] == code_point
11034     && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11035      || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11036    {
11037     OP(node) = EXACT;
11038    }
11039   } /* else is Sharp s.  May need to fold it */
11040   else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11041    *character = 's';
11042    *(character + 1) = 's';
11043    len = 2;
11044   }
11045   else {
11046    *character = LATIN_SMALL_LETTER_SHARP_S;
11047    len = 1;
11048   }
11049  }
11050
11051  if (SIZE_ONLY) {
11052   RExC_size += STR_SZ(len);
11053  }
11054  else {
11055   RExC_emit += STR_SZ(len);
11056   STR_LEN(node) = len;
11057   if (! len_passed_in) {
11058    Copy((char *) character, STRING(node), len, char);
11059   }
11060  }
11061
11062  *flagp |= HASWIDTH;
11063
11064  /* A single character node is SIMPLE, except for the special-cased SHARP S
11065  * under /di. */
11066  if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11067   && (code_point != LATIN_SMALL_LETTER_SHARP_S
11068    || ! FOLD || ! DEPENDS_SEMANTICS))
11069  {
11070   *flagp |= SIMPLE;
11071  }
11072
11073  /* The OP may not be well defined in PASS1 */
11074  if (PASS2 && OP(node) == EXACTFL) {
11075   RExC_contains_locale = 1;
11076  }
11077 }
11078
11079
11080 /* return atoi(p), unless it's too big to sensibly be a backref,
11081  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11082
11083 static I32
11084 S_backref_value(char *p)
11085 {
11086  char *q = p;
11087
11088  for (;isDIGIT(*q); q++); /* calculate length of num */
11089  if (q - p == 0 || q - p > 9)
11090   return I32_MAX;
11091  return atoi(p);
11092 }
11093
11094
11095 /*
11096  - regatom - the lowest level
11097
11098    Try to identify anything special at the start of the pattern. If there
11099    is, then handle it as required. This may involve generating a single regop,
11100    such as for an assertion; or it may involve recursing, such as to
11101    handle a () structure.
11102
11103    If the string doesn't start with something special then we gobble up
11104    as much literal text as we can.
11105
11106    Once we have been able to handle whatever type of thing started the
11107    sequence, we return.
11108
11109    Note: we have to be careful with escapes, as they can be both literal
11110    and special, and in the case of \10 and friends, context determines which.
11111
11112    A summary of the code structure is:
11113
11114    switch (first_byte) {
11115   cases for each special:
11116    handle this special;
11117    break;
11118   case '\\':
11119    switch (2nd byte) {
11120     cases for each unambiguous special:
11121      handle this special;
11122      break;
11123     cases for each ambigous special/literal:
11124      disambiguate;
11125      if (special)  handle here
11126      else goto defchar;
11127     default: // unambiguously literal:
11128      goto defchar;
11129    }
11130   default:  // is a literal char
11131    // FALL THROUGH
11132   defchar:
11133    create EXACTish node for literal;
11134    while (more input and node isn't full) {
11135     switch (input_byte) {
11136     cases for each special;
11137      make sure parse pointer is set so that the next call to
11138       regatom will see this special first
11139      goto loopdone; // EXACTish node terminated by prev. char
11140     default:
11141      append char to EXACTISH node;
11142     }
11143     get next input byte;
11144    }
11145   loopdone:
11146    }
11147    return the generated node;
11148
11149    Specifically there are two separate switches for handling
11150    escape sequences, with the one for handling literal escapes requiring
11151    a dummy entry for all of the special escapes that are actually handled
11152    by the other.
11153
11154    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11155    TRYAGAIN.
11156    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11157    restarted.
11158    Otherwise does not return NULL.
11159 */
11160
11161 STATIC regnode *
11162 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11163 {
11164  dVAR;
11165  regnode *ret = NULL;
11166  I32 flags = 0;
11167  char *parse_start = RExC_parse;
11168  U8 op;
11169  int invert = 0;
11170
11171  GET_RE_DEBUG_FLAGS_DECL;
11172
11173  *flagp = WORST;  /* Tentatively. */
11174
11175  DEBUG_PARSE("atom");
11176
11177  PERL_ARGS_ASSERT_REGATOM;
11178
11179 tryagain:
11180  switch ((U8)*RExC_parse) {
11181  case '^':
11182   RExC_seen_zerolen++;
11183   nextchar(pRExC_state);
11184   if (RExC_flags & RXf_PMf_MULTILINE)
11185    ret = reg_node(pRExC_state, MBOL);
11186   else if (RExC_flags & RXf_PMf_SINGLELINE)
11187    ret = reg_node(pRExC_state, SBOL);
11188   else
11189    ret = reg_node(pRExC_state, BOL);
11190   Set_Node_Length(ret, 1); /* MJD */
11191   break;
11192  case '$':
11193   nextchar(pRExC_state);
11194   if (*RExC_parse)
11195    RExC_seen_zerolen++;
11196   if (RExC_flags & RXf_PMf_MULTILINE)
11197    ret = reg_node(pRExC_state, MEOL);
11198   else if (RExC_flags & RXf_PMf_SINGLELINE)
11199    ret = reg_node(pRExC_state, SEOL);
11200   else
11201    ret = reg_node(pRExC_state, EOL);
11202   Set_Node_Length(ret, 1); /* MJD */
11203   break;
11204  case '.':
11205   nextchar(pRExC_state);
11206   if (RExC_flags & RXf_PMf_SINGLELINE)
11207    ret = reg_node(pRExC_state, SANY);
11208   else
11209    ret = reg_node(pRExC_state, REG_ANY);
11210   *flagp |= HASWIDTH|SIMPLE;
11211   RExC_naughty++;
11212   Set_Node_Length(ret, 1); /* MJD */
11213   break;
11214  case '[':
11215  {
11216   char * const oregcomp_parse = ++RExC_parse;
11217   ret = regclass(pRExC_state, flagp,depth+1,
11218      FALSE, /* means parse the whole char class */
11219      TRUE, /* allow multi-char folds */
11220      FALSE, /* don't silence non-portable warnings. */
11221      NULL);
11222   if (*RExC_parse != ']') {
11223    RExC_parse = oregcomp_parse;
11224    vFAIL("Unmatched [");
11225   }
11226   if (ret == NULL) {
11227    if (*flagp & RESTART_UTF8)
11228     return NULL;
11229    FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11230     (UV) *flagp);
11231   }
11232   nextchar(pRExC_state);
11233   Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11234   break;
11235  }
11236  case '(':
11237   nextchar(pRExC_state);
11238   ret = reg(pRExC_state, 2, &flags,depth+1);
11239   if (ret == NULL) {
11240     if (flags & TRYAGAIN) {
11241      if (RExC_parse == RExC_end) {
11242       /* Make parent create an empty node if needed. */
11243       *flagp |= TRYAGAIN;
11244       return(NULL);
11245      }
11246      goto tryagain;
11247     }
11248     if (flags & RESTART_UTF8) {
11249      *flagp = RESTART_UTF8;
11250      return NULL;
11251     }
11252     FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11253                 (UV) flags);
11254   }
11255   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11256   break;
11257  case '|':
11258  case ')':
11259   if (flags & TRYAGAIN) {
11260    *flagp |= TRYAGAIN;
11261    return NULL;
11262   }
11263   vFAIL("Internal urp");
11264         /* Supposed to be caught earlier. */
11265   break;
11266  case '{':
11267   if (!regcurly(RExC_parse, FALSE)) {
11268    RExC_parse++;
11269    goto defchar;
11270   }
11271   /* FALL THROUGH */
11272  case '?':
11273  case '+':
11274  case '*':
11275   RExC_parse++;
11276   vFAIL("Quantifier follows nothing");
11277   break;
11278  case '\\':
11279   /* Special Escapes
11280
11281   This switch handles escape sequences that resolve to some kind
11282   of special regop and not to literal text. Escape sequnces that
11283   resolve to literal text are handled below in the switch marked
11284   "Literal Escapes".
11285
11286   Every entry in this switch *must* have a corresponding entry
11287   in the literal escape switch. However, the opposite is not
11288   required, as the default for this switch is to jump to the
11289   literal text handling code.
11290   */
11291   switch ((U8)*++RExC_parse) {
11292    U8 arg;
11293   /* Special Escapes */
11294   case 'A':
11295    RExC_seen_zerolen++;
11296    ret = reg_node(pRExC_state, SBOL);
11297    *flagp |= SIMPLE;
11298    goto finish_meta_pat;
11299   case 'G':
11300    ret = reg_node(pRExC_state, GPOS);
11301    RExC_seen |= REG_GPOS_SEEN;
11302    *flagp |= SIMPLE;
11303    goto finish_meta_pat;
11304   case 'K':
11305    RExC_seen_zerolen++;
11306    ret = reg_node(pRExC_state, KEEPS);
11307    *flagp |= SIMPLE;
11308    /* XXX:dmq : disabling in-place substitution seems to
11309    * be necessary here to avoid cases of memory corruption, as
11310    * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11311    */
11312    RExC_seen |= REG_LOOKBEHIND_SEEN;
11313    goto finish_meta_pat;
11314   case 'Z':
11315    ret = reg_node(pRExC_state, SEOL);
11316    *flagp |= SIMPLE;
11317    RExC_seen_zerolen++;  /* Do not optimize RE away */
11318    goto finish_meta_pat;
11319   case 'z':
11320    ret = reg_node(pRExC_state, EOS);
11321    *flagp |= SIMPLE;
11322    RExC_seen_zerolen++;  /* Do not optimize RE away */
11323    goto finish_meta_pat;
11324   case 'C':
11325    ret = reg_node(pRExC_state, CANY);
11326    RExC_seen |= REG_CANY_SEEN;
11327    *flagp |= HASWIDTH|SIMPLE;
11328    goto finish_meta_pat;
11329   case 'X':
11330    ret = reg_node(pRExC_state, CLUMP);
11331    *flagp |= HASWIDTH;
11332    goto finish_meta_pat;
11333
11334   case 'W':
11335    invert = 1;
11336    /* FALLTHROUGH */
11337   case 'w':
11338    arg = ANYOF_WORDCHAR;
11339    goto join_posix;
11340
11341   case 'b':
11342    RExC_seen_zerolen++;
11343    RExC_seen |= REG_LOOKBEHIND_SEEN;
11344    op = BOUND + get_regex_charset(RExC_flags);
11345    if (op > BOUNDA) {  /* /aa is same as /a */
11346     op = BOUNDA;
11347    }
11348    else if (op == BOUNDL) {
11349     RExC_contains_locale = 1;
11350    }
11351    ret = reg_node(pRExC_state, op);
11352    FLAGS(ret) = get_regex_charset(RExC_flags);
11353    *flagp |= SIMPLE;
11354    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11355     /* diag_listed_as: Use "%s" instead of "%s" */
11356     vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11357    }
11358    goto finish_meta_pat;
11359   case 'B':
11360    RExC_seen_zerolen++;
11361    RExC_seen |= REG_LOOKBEHIND_SEEN;
11362    op = NBOUND + get_regex_charset(RExC_flags);
11363    if (op > NBOUNDA) { /* /aa is same as /a */
11364     op = NBOUNDA;
11365    }
11366    else if (op == NBOUNDL) {
11367     RExC_contains_locale = 1;
11368    }
11369    ret = reg_node(pRExC_state, op);
11370    FLAGS(ret) = get_regex_charset(RExC_flags);
11371    *flagp |= SIMPLE;
11372    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11373     /* diag_listed_as: Use "%s" instead of "%s" */
11374     vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11375    }
11376    goto finish_meta_pat;
11377
11378   case 'D':
11379    invert = 1;
11380    /* FALLTHROUGH */
11381   case 'd':
11382    arg = ANYOF_DIGIT;
11383    goto join_posix;
11384
11385   case 'R':
11386    ret = reg_node(pRExC_state, LNBREAK);
11387    *flagp |= HASWIDTH|SIMPLE;
11388    goto finish_meta_pat;
11389
11390   case 'H':
11391    invert = 1;
11392    /* FALLTHROUGH */
11393   case 'h':
11394    arg = ANYOF_BLANK;
11395    op = POSIXU;
11396    goto join_posix_op_known;
11397
11398   case 'V':
11399    invert = 1;
11400    /* FALLTHROUGH */
11401   case 'v':
11402    arg = ANYOF_VERTWS;
11403    op = POSIXU;
11404    goto join_posix_op_known;
11405
11406   case 'S':
11407    invert = 1;
11408    /* FALLTHROUGH */
11409   case 's':
11410    arg = ANYOF_SPACE;
11411
11412   join_posix:
11413
11414    op = POSIXD + get_regex_charset(RExC_flags);
11415    if (op > POSIXA) {  /* /aa is same as /a */
11416     op = POSIXA;
11417    }
11418    else if (op == POSIXL) {
11419     RExC_contains_locale = 1;
11420    }
11421
11422   join_posix_op_known:
11423
11424    if (invert) {
11425     op += NPOSIXD - POSIXD;
11426    }
11427
11428    ret = reg_node(pRExC_state, op);
11429    if (! SIZE_ONLY) {
11430     FLAGS(ret) = namedclass_to_classnum(arg);
11431    }
11432
11433    *flagp |= HASWIDTH|SIMPLE;
11434    /* FALL THROUGH */
11435
11436   finish_meta_pat:
11437    nextchar(pRExC_state);
11438    Set_Node_Length(ret, 2); /* MJD */
11439    break;
11440   case 'p':
11441   case 'P':
11442    {
11443 #ifdef DEBUGGING
11444     char* parse_start = RExC_parse - 2;
11445 #endif
11446
11447     RExC_parse--;
11448
11449     ret = regclass(pRExC_state, flagp,depth+1,
11450        TRUE, /* means just parse this element */
11451        FALSE, /* don't allow multi-char folds */
11452        FALSE, /* don't silence non-portable warnings.
11453           It would be a bug if these returned
11454           non-portables */
11455        NULL);
11456     /* regclass() can only return RESTART_UTF8 if multi-char folds
11457     are allowed.  */
11458     if (!ret)
11459      FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11460       (UV) *flagp);
11461
11462     RExC_parse--;
11463
11464     Set_Node_Offset(ret, parse_start + 2);
11465     Set_Node_Cur_Length(ret, parse_start);
11466     nextchar(pRExC_state);
11467    }
11468    break;
11469   case 'N':
11470    /* Handle \N and \N{NAME} with multiple code points here and not
11471    * below because it can be multicharacter. join_exact() will join
11472    * them up later on.  Also this makes sure that things like
11473    * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11474    * The options to the grok function call causes it to fail if the
11475    * sequence is just a single code point.  We then go treat it as
11476    * just another character in the current EXACT node, and hence it
11477    * gets uniform treatment with all the other characters.  The
11478    * special treatment for quantifiers is not needed for such single
11479    * character sequences */
11480    ++RExC_parse;
11481    if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11482         FALSE /* not strict */ )) {
11483     if (*flagp & RESTART_UTF8)
11484      return NULL;
11485     RExC_parse--;
11486     goto defchar;
11487    }
11488    break;
11489   case 'k':    /* Handle \k<NAME> and \k'NAME' */
11490   parse_named_seq:
11491   {
11492    char ch= RExC_parse[1];
11493    if (ch != '<' && ch != '\'' && ch != '{') {
11494     RExC_parse++;
11495     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11496     vFAIL2("Sequence %.2s... not terminated",parse_start);
11497    } else {
11498     /* this pretty much dupes the code for (?P=...) in reg(), if
11499     you change this make sure you change that */
11500     char* name_start = (RExC_parse += 2);
11501     U32 num = 0;
11502     SV *sv_dat = reg_scan_name(pRExC_state,
11503      SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11504     ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11505     if (RExC_parse == name_start || *RExC_parse != ch)
11506      /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11507      vFAIL2("Sequence %.3s... not terminated",parse_start);
11508
11509     if (!SIZE_ONLY) {
11510      num = add_data( pRExC_state, STR_WITH_LEN("S"));
11511      RExC_rxi->data->data[num]=(void*)sv_dat;
11512      SvREFCNT_inc_simple_void(sv_dat);
11513     }
11514
11515     RExC_sawback = 1;
11516     ret = reganode(pRExC_state,
11517        ((! FOLD)
11518         ? NREF
11519         : (ASCII_FOLD_RESTRICTED)
11520         ? NREFFA
11521         : (AT_LEAST_UNI_SEMANTICS)
11522          ? NREFFU
11523          : (LOC)
11524          ? NREFFL
11525          : NREFF),
11526         num);
11527     *flagp |= HASWIDTH;
11528
11529     /* override incorrect value set in reganode MJD */
11530     Set_Node_Offset(ret, parse_start+1);
11531     Set_Node_Cur_Length(ret, parse_start);
11532     nextchar(pRExC_state);
11533
11534    }
11535    break;
11536   }
11537   case 'g':
11538   case '1': case '2': case '3': case '4':
11539   case '5': case '6': case '7': case '8': case '9':
11540    {
11541     I32 num;
11542     bool hasbrace = 0;
11543
11544     if (*RExC_parse == 'g') {
11545      bool isrel = 0;
11546
11547      RExC_parse++;
11548      if (*RExC_parse == '{') {
11549       RExC_parse++;
11550       hasbrace = 1;
11551      }
11552      if (*RExC_parse == '-') {
11553       RExC_parse++;
11554       isrel = 1;
11555      }
11556      if (hasbrace && !isDIGIT(*RExC_parse)) {
11557       if (isrel) RExC_parse--;
11558       RExC_parse -= 2;
11559       goto parse_named_seq;
11560      }
11561
11562      num = S_backref_value(RExC_parse);
11563      if (num == 0)
11564       vFAIL("Reference to invalid group 0");
11565      else if (num == I32_MAX) {
11566       if (isDIGIT(*RExC_parse))
11567        vFAIL("Reference to nonexistent group");
11568       else
11569        vFAIL("Unterminated \\g... pattern");
11570      }
11571
11572      if (isrel) {
11573       num = RExC_npar - num;
11574       if (num < 1)
11575        vFAIL("Reference to nonexistent or unclosed group");
11576      }
11577     }
11578     else {
11579      num = S_backref_value(RExC_parse);
11580      /* bare \NNN might be backref or octal - if it is larger than or equal
11581      * RExC_npar then it is assumed to be and octal escape.
11582      * Note RExC_npar is +1 from the actual number of parens*/
11583      if (num == I32_MAX || (num > 9 && num >= RExC_npar
11584        && *RExC_parse != '8' && *RExC_parse != '9'))
11585      {
11586       /* Probably a character specified in octal, e.g. \35 */
11587       goto defchar;
11588      }
11589     }
11590
11591     /* at this point RExC_parse definitely points to a backref
11592     * number */
11593     {
11594 #ifdef RE_TRACK_PATTERN_OFFSETS
11595      char * const parse_start = RExC_parse - 1; /* MJD */
11596 #endif
11597      while (isDIGIT(*RExC_parse))
11598       RExC_parse++;
11599      if (hasbrace) {
11600       if (*RExC_parse != '}')
11601        vFAIL("Unterminated \\g{...} pattern");
11602       RExC_parse++;
11603      }
11604      if (!SIZE_ONLY) {
11605       if (num > (I32)RExC_rx->nparens)
11606        vFAIL("Reference to nonexistent group");
11607      }
11608      RExC_sawback = 1;
11609      ret = reganode(pRExC_state,
11610         ((! FOLD)
11611          ? REF
11612          : (ASCII_FOLD_RESTRICTED)
11613          ? REFFA
11614          : (AT_LEAST_UNI_SEMANTICS)
11615           ? REFFU
11616           : (LOC)
11617           ? REFFL
11618           : REFF),
11619          num);
11620      *flagp |= HASWIDTH;
11621
11622      /* override incorrect value set in reganode MJD */
11623      Set_Node_Offset(ret, parse_start+1);
11624      Set_Node_Cur_Length(ret, parse_start);
11625      RExC_parse--;
11626      nextchar(pRExC_state);
11627     }
11628    }
11629    break;
11630   case '\0':
11631    if (RExC_parse >= RExC_end)
11632     FAIL("Trailing \\");
11633    /* FALL THROUGH */
11634   default:
11635    /* Do not generate "unrecognized" warnings here, we fall
11636    back into the quick-grab loop below */
11637    parse_start--;
11638    goto defchar;
11639   }
11640   break;
11641
11642  case '#':
11643   if (RExC_flags & RXf_PMf_EXTENDED) {
11644    if ( reg_skipcomment( pRExC_state ) )
11645     goto tryagain;
11646   }
11647   /* FALL THROUGH */
11648
11649  default:
11650
11651    parse_start = RExC_parse - 1;
11652
11653    RExC_parse++;
11654
11655   defchar: {
11656    STRLEN len = 0;
11657    UV ender = 0;
11658    char *p;
11659    char *s;
11660 #define MAX_NODE_STRING_SIZE 127
11661    char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11662    char *s0;
11663    U8 upper_parse = MAX_NODE_STRING_SIZE;
11664    U8 node_type = compute_EXACTish(pRExC_state);
11665    bool next_is_quantifier;
11666    char * oldp = NULL;
11667
11668    /* We can convert EXACTF nodes to EXACTFU if they contain only
11669    * characters that match identically regardless of the target
11670    * string's UTF8ness.  The reason to do this is that EXACTF is not
11671    * trie-able, EXACTFU is.
11672    *
11673    * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11674    * contain only above-Latin1 characters (hence must be in UTF8),
11675    * which don't participate in folds with Latin1-range characters,
11676    * as the latter's folds aren't known until runtime.  (We don't
11677    * need to figure this out until pass 2) */
11678    bool maybe_exactfu = PASS2
11679        && (node_type == EXACTF || node_type == EXACTFL);
11680
11681    /* If a folding node contains only code points that don't
11682    * participate in folds, it can be changed into an EXACT node,
11683    * which allows the optimizer more things to look for */
11684    bool maybe_exact;
11685
11686    ret = reg_node(pRExC_state, node_type);
11687
11688    /* In pass1, folded, we use a temporary buffer instead of the
11689    * actual node, as the node doesn't exist yet */
11690    s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11691
11692    s0 = s;
11693
11694   reparse:
11695
11696    /* We do the EXACTFish to EXACT node only if folding.  (And we
11697    * don't need to figure this out until pass 2) */
11698    maybe_exact = FOLD && PASS2;
11699
11700    /* XXX The node can hold up to 255 bytes, yet this only goes to
11701    * 127.  I (khw) do not know why.  Keeping it somewhat less than
11702    * 255 allows us to not have to worry about overflow due to
11703    * converting to utf8 and fold expansion, but that value is
11704    * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
11705    * split up by this limit into a single one using the real max of
11706    * 255.  Even at 127, this breaks under rare circumstances.  If
11707    * folding, we do not want to split a node at a character that is a
11708    * non-final in a multi-char fold, as an input string could just
11709    * happen to want to match across the node boundary.  The join
11710    * would solve that problem if the join actually happens.  But a
11711    * series of more than two nodes in a row each of 127 would cause
11712    * the first join to succeed to get to 254, but then there wouldn't
11713    * be room for the next one, which could at be one of those split
11714    * multi-char folds.  I don't know of any fool-proof solution.  One
11715    * could back off to end with only a code point that isn't such a
11716    * non-final, but it is possible for there not to be any in the
11717    * entire node. */
11718    for (p = RExC_parse - 1;
11719     len < upper_parse && p < RExC_end;
11720     len++)
11721    {
11722     oldp = p;
11723
11724     if (RExC_flags & RXf_PMf_EXTENDED)
11725      p = regwhite( pRExC_state, p );
11726     switch ((U8)*p) {
11727     case '^':
11728     case '$':
11729     case '.':
11730     case '[':
11731     case '(':
11732     case ')':
11733     case '|':
11734      goto loopdone;
11735     case '\\':
11736      /* Literal Escapes Switch
11737
11738      This switch is meant to handle escape sequences that
11739      resolve to a literal character.
11740
11741      Every escape sequence that represents something
11742      else, like an assertion or a char class, is handled
11743      in the switch marked 'Special Escapes' above in this
11744      routine, but also has an entry here as anything that
11745      isn't explicitly mentioned here will be treated as
11746      an unescaped equivalent literal.
11747      */
11748
11749      switch ((U8)*++p) {
11750      /* These are all the special escapes. */
11751      case 'A':             /* Start assertion */
11752      case 'b': case 'B':   /* Word-boundary assertion*/
11753      case 'C':             /* Single char !DANGEROUS! */
11754      case 'd': case 'D':   /* digit class */
11755      case 'g': case 'G':   /* generic-backref, pos assertion */
11756      case 'h': case 'H':   /* HORIZWS */
11757      case 'k': case 'K':   /* named backref, keep marker */
11758      case 'p': case 'P':   /* Unicode property */
11759        case 'R':   /* LNBREAK */
11760      case 's': case 'S':   /* space class */
11761      case 'v': case 'V':   /* VERTWS */
11762      case 'w': case 'W':   /* word class */
11763      case 'X':             /* eXtended Unicode "combining
11764            character sequence" */
11765      case 'z': case 'Z':   /* End of line/string assertion */
11766       --p;
11767       goto loopdone;
11768
11769      /* Anything after here is an escape that resolves to a
11770      literal. (Except digits, which may or may not)
11771      */
11772      case 'n':
11773       ender = '\n';
11774       p++;
11775       break;
11776      case 'N': /* Handle a single-code point named character. */
11777       /* The options cause it to fail if a multiple code
11778       * point sequence.  Handle those in the switch() above
11779       * */
11780       RExC_parse = p + 1;
11781       if (! grok_bslash_N(pRExC_state, NULL, &ender,
11782            flagp, depth, FALSE,
11783            FALSE /* not strict */ ))
11784       {
11785        if (*flagp & RESTART_UTF8)
11786         FAIL("panic: grok_bslash_N set RESTART_UTF8");
11787        RExC_parse = p = oldp;
11788        goto loopdone;
11789       }
11790       p = RExC_parse;
11791       if (ender > 0xff) {
11792        REQUIRE_UTF8;
11793       }
11794       break;
11795      case 'r':
11796       ender = '\r';
11797       p++;
11798       break;
11799      case 't':
11800       ender = '\t';
11801       p++;
11802       break;
11803      case 'f':
11804       ender = '\f';
11805       p++;
11806       break;
11807      case 'e':
11808       ender = ASCII_TO_NATIVE('\033');
11809       p++;
11810       break;
11811      case 'a':
11812       ender = '\a';
11813       p++;
11814       break;
11815      case 'o':
11816       {
11817        UV result;
11818        const char* error_msg;
11819
11820        bool valid = grok_bslash_o(&p,
11821              &result,
11822              &error_msg,
11823              TRUE, /* out warnings */
11824              FALSE, /* not strict */
11825              TRUE, /* Output warnings
11826                 for non-
11827                 portables */
11828              UTF);
11829        if (! valid) {
11830         RExC_parse = p; /* going to die anyway; point
11831             to exact spot of failure */
11832         vFAIL(error_msg);
11833        }
11834        ender = result;
11835        if (PL_encoding && ender < 0x100) {
11836         goto recode_encoding;
11837        }
11838        if (ender > 0xff) {
11839         REQUIRE_UTF8;
11840        }
11841        break;
11842       }
11843      case 'x':
11844       {
11845        UV result = UV_MAX; /* initialize to erroneous
11846             value */
11847        const char* error_msg;
11848
11849        bool valid = grok_bslash_x(&p,
11850              &result,
11851              &error_msg,
11852              TRUE, /* out warnings */
11853              FALSE, /* not strict */
11854              TRUE, /* Output warnings
11855                 for non-
11856                 portables */
11857              UTF);
11858        if (! valid) {
11859         RExC_parse = p; /* going to die anyway; point
11860             to exact spot of failure */
11861         vFAIL(error_msg);
11862        }
11863        ender = result;
11864
11865        if (PL_encoding && ender < 0x100) {
11866         goto recode_encoding;
11867        }
11868        if (ender > 0xff) {
11869         REQUIRE_UTF8;
11870        }
11871        break;
11872       }
11873      case 'c':
11874       p++;
11875       ender = grok_bslash_c(*p++, SIZE_ONLY);
11876       break;
11877      case '8': case '9': /* must be a backreference */
11878       --p;
11879       goto loopdone;
11880      case '1': case '2': case '3':case '4':
11881      case '5': case '6': case '7':
11882       /* When we parse backslash escapes there is ambiguity
11883       * between backreferences and octal escapes. Any escape
11884       * from \1 - \9 is a backreference, any multi-digit
11885       * escape which does not start with 0 and which when
11886       * evaluated as decimal could refer to an already
11887       * parsed capture buffer is a backslash. Anything else
11888       * is octal.
11889       *
11890       * Note this implies that \118 could be interpreted as
11891       * 118 OR as "\11" . "8" depending on whether there
11892       * were 118 capture buffers defined already in the
11893       * pattern.  */
11894
11895       /* NOTE, RExC_npar is 1 more than the actual number of
11896       * parens we have seen so far, hence the < RExC_npar below. */
11897
11898       if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
11899       {  /* Not to be treated as an octal constant, go
11900         find backref */
11901        --p;
11902        goto loopdone;
11903       }
11904      case '0':
11905       {
11906        I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11907        STRLEN numlen = 3;
11908        ender = grok_oct(p, &numlen, &flags, NULL);
11909        if (ender > 0xff) {
11910         REQUIRE_UTF8;
11911        }
11912        p += numlen;
11913        if (SIZE_ONLY   /* like \08, \178 */
11914         && numlen < 3
11915         && p < RExC_end
11916         && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11917        {
11918         reg_warn_non_literal_string(
11919           p + 1,
11920           form_short_octal_warning(p, numlen));
11921        }
11922       }
11923       if (PL_encoding && ender < 0x100)
11924        goto recode_encoding;
11925       break;
11926      recode_encoding:
11927       if (! RExC_override_recoding) {
11928        SV* enc = PL_encoding;
11929        ender = reg_recode((const char)(U8)ender, &enc);
11930        if (!enc && SIZE_ONLY)
11931         ckWARNreg(p, "Invalid escape in the specified encoding");
11932        REQUIRE_UTF8;
11933       }
11934       break;
11935      case '\0':
11936       if (p >= RExC_end)
11937        FAIL("Trailing \\");
11938       /* FALL THROUGH */
11939      default:
11940       if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11941        /* Include any { following the alpha to emphasize
11942        * that it could be part of an escape at some point
11943        * in the future */
11944        int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11945        ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11946       }
11947       goto normal_default;
11948      } /* End of switch on '\' */
11949      break;
11950     default:    /* A literal character */
11951
11952      if (! SIZE_ONLY
11953       && RExC_flags & RXf_PMf_EXTENDED
11954       && ckWARN_d(WARN_DEPRECATED)
11955       && is_PATWS_non_low_safe(p, RExC_end, UTF))
11956      {
11957       vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11958         "Escape literal pattern white space under /x");
11959      }
11960
11961     normal_default:
11962      if (UTF8_IS_START(*p) && UTF) {
11963       STRLEN numlen;
11964       ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11965            &numlen, UTF8_ALLOW_DEFAULT);
11966       p += numlen;
11967      }
11968      else
11969       ender = (U8) *p++;
11970      break;
11971     } /* End of switch on the literal */
11972
11973     /* Here, have looked at the literal character and <ender>
11974     * contains its ordinal, <p> points to the character after it
11975     */
11976
11977     if ( RExC_flags & RXf_PMf_EXTENDED)
11978      p = regwhite( pRExC_state, p );
11979
11980     /* If the next thing is a quantifier, it applies to this
11981     * character only, which means that this character has to be in
11982     * its own node and can't just be appended to the string in an
11983     * existing node, so if there are already other characters in
11984     * the node, close the node with just them, and set up to do
11985     * this character again next time through, when it will be the
11986     * only thing in its new node */
11987     if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11988     {
11989      p = oldp;
11990      goto loopdone;
11991     }
11992
11993     if (! FOLD   /* The simple case, just append the literal */
11994      || (LOC  /* Also don't fold for tricky chars under /l */
11995       && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
11996     {
11997      if (UTF) {
11998       const STRLEN unilen = reguni(pRExC_state, ender, s);
11999       if (unilen > 0) {
12000       s   += unilen;
12001       len += unilen;
12002       }
12003
12004       /* The loop increments <len> each time, as all but this
12005       * path (and one other) through it add a single byte to
12006       * the EXACTish node.  But this one has changed len to
12007       * be the correct final value, so subtract one to
12008       * cancel out the increment that follows */
12009       len--;
12010      }
12011      else {
12012       REGC((char)ender, s++);
12013      }
12014
12015      /* Can get here if folding only if is one of the /l
12016      * characters whose fold depends on the locale.  The
12017      * occurrence of any of these indicate that we can't
12018      * simplify things */
12019      if (FOLD) {
12020       maybe_exact = FALSE;
12021       maybe_exactfu = FALSE;
12022      }
12023     }
12024     else             /* FOLD */
12025      if (! ( UTF
12026       /* See comments for join_exact() as to why we fold this
12027       * non-UTF at compile time */
12028       || (node_type == EXACTFU
12029        && ender == LATIN_SMALL_LETTER_SHARP_S)))
12030     {
12031      /* Here, are folding and are not UTF-8 encoded; therefore
12032      * the character must be in the range 0-255, and is not /l
12033      * (Not /l because we already handled these under /l in
12034      * is_PROBLEMATIC_LOCALE_FOLD_cp */
12035      if (IS_IN_SOME_FOLD_L1(ender)) {
12036       maybe_exact = FALSE;
12037
12038       /* See if the character's fold differs between /d and
12039       * /u.  This includes the multi-char fold SHARP S to
12040       * 'ss' */
12041       if (maybe_exactfu
12042        && (PL_fold[ender] != PL_fold_latin1[ender]
12043         || ender == LATIN_SMALL_LETTER_SHARP_S
12044         || (len > 0
12045         && isARG2_lower_or_UPPER_ARG1('s', ender)
12046         && isARG2_lower_or_UPPER_ARG1('s',
12047                 *(s-1)))))
12048       {
12049        maybe_exactfu = FALSE;
12050       }
12051      }
12052
12053      /* Even when folding, we store just the input character, as
12054      * we have an array that finds its fold quickly */
12055      *(s++) = (char) ender;
12056     }
12057     else {  /* FOLD and UTF */
12058      /* Unlike the non-fold case, we do actually have to
12059      * calculate the results here in pass 1.  This is for two
12060      * reasons, the folded length may be longer than the
12061      * unfolded, and we have to calculate how many EXACTish
12062      * nodes it will take; and we may run out of room in a node
12063      * in the middle of a potential multi-char fold, and have
12064      * to back off accordingly.  (Hence we can't use REGC for
12065      * the simple case just below.) */
12066
12067      UV folded;
12068      if (isASCII(ender)) {
12069       folded = toFOLD(ender);
12070       *(s)++ = (U8) folded;
12071      }
12072      else {
12073       STRLEN foldlen;
12074
12075       folded = _to_uni_fold_flags(
12076          ender,
12077          (U8 *) s,
12078          &foldlen,
12079          FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12080               ? FOLD_FLAGS_NOMIX_ASCII
12081               : 0));
12082       s += foldlen;
12083
12084       /* The loop increments <len> each time, as all but this
12085       * path (and one other) through it add a single byte to
12086       * the EXACTish node.  But this one has changed len to
12087       * be the correct final value, so subtract one to
12088       * cancel out the increment that follows */
12089       len += foldlen - 1;
12090      }
12091      /* If this node only contains non-folding code points so
12092      * far, see if this new one is also non-folding */
12093      if (maybe_exact) {
12094       if (folded != ender) {
12095        maybe_exact = FALSE;
12096       }
12097       else {
12098        /* Here the fold is the original; we have to check
12099        * further to see if anything folds to it */
12100        if (_invlist_contains_cp(PL_utf8_foldable,
12101               ender))
12102        {
12103         maybe_exact = FALSE;
12104        }
12105       }
12106      }
12107      ender = folded;
12108     }
12109
12110     if (next_is_quantifier) {
12111
12112      /* Here, the next input is a quantifier, and to get here,
12113      * the current character is the only one in the node.
12114      * Also, here <len> doesn't include the final byte for this
12115      * character */
12116      len++;
12117      goto loopdone;
12118     }
12119
12120    } /* End of loop through literal characters */
12121
12122    /* Here we have either exhausted the input or ran out of room in
12123    * the node.  (If we encountered a character that can't be in the
12124    * node, transfer is made directly to <loopdone>, and so we
12125    * wouldn't have fallen off the end of the loop.)  In the latter
12126    * case, we artificially have to split the node into two, because
12127    * we just don't have enough space to hold everything.  This
12128    * creates a problem if the final character participates in a
12129    * multi-character fold in the non-final position, as a match that
12130    * should have occurred won't, due to the way nodes are matched,
12131    * and our artificial boundary.  So back off until we find a non-
12132    * problematic character -- one that isn't at the beginning or
12133    * middle of such a fold.  (Either it doesn't participate in any
12134    * folds, or appears only in the final position of all the folds it
12135    * does participate in.)  A better solution with far fewer false
12136    * positives, and that would fill the nodes more completely, would
12137    * be to actually have available all the multi-character folds to
12138    * test against, and to back-off only far enough to be sure that
12139    * this node isn't ending with a partial one.  <upper_parse> is set
12140    * further below (if we need to reparse the node) to include just
12141    * up through that final non-problematic character that this code
12142    * identifies, so when it is set to less than the full node, we can
12143    * skip the rest of this */
12144    if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12145
12146     const STRLEN full_len = len;
12147
12148     assert(len >= MAX_NODE_STRING_SIZE);
12149
12150     /* Here, <s> points to the final byte of the final character.
12151     * Look backwards through the string until find a non-
12152     * problematic character */
12153
12154     if (! UTF) {
12155
12156      /* This has no multi-char folds to non-UTF characters */
12157      if (ASCII_FOLD_RESTRICTED) {
12158       goto loopdone;
12159      }
12160
12161      while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12162      len = s - s0 + 1;
12163     }
12164     else {
12165      if (!  PL_NonL1NonFinalFold) {
12166       PL_NonL1NonFinalFold = _new_invlist_C_array(
12167           NonL1_Perl_Non_Final_Folds_invlist);
12168      }
12169
12170      /* Point to the first byte of the final character */
12171      s = (char *) utf8_hop((U8 *) s, -1);
12172
12173      while (s >= s0) {   /* Search backwards until find
12174           non-problematic char */
12175       if (UTF8_IS_INVARIANT(*s)) {
12176
12177        /* There are no ascii characters that participate
12178        * in multi-char folds under /aa.  In EBCDIC, the
12179        * non-ascii invariants are all control characters,
12180        * so don't ever participate in any folds. */
12181        if (ASCII_FOLD_RESTRICTED
12182         || ! IS_NON_FINAL_FOLD(*s))
12183        {
12184         break;
12185        }
12186       }
12187       else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12188        if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12189                 *s, *(s+1))))
12190        {
12191         break;
12192        }
12193       }
12194       else if (! _invlist_contains_cp(
12195           PL_NonL1NonFinalFold,
12196           valid_utf8_to_uvchr((U8 *) s, NULL)))
12197       {
12198        break;
12199       }
12200
12201       /* Here, the current character is problematic in that
12202       * it does occur in the non-final position of some
12203       * fold, so try the character before it, but have to
12204       * special case the very first byte in the string, so
12205       * we don't read outside the string */
12206       s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12207      } /* End of loop backwards through the string */
12208
12209      /* If there were only problematic characters in the string,
12210      * <s> will point to before s0, in which case the length
12211      * should be 0, otherwise include the length of the
12212      * non-problematic character just found */
12213      len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12214     }
12215
12216     /* Here, have found the final character, if any, that is
12217     * non-problematic as far as ending the node without splitting
12218     * it across a potential multi-char fold.  <len> contains the
12219     * number of bytes in the node up-to and including that
12220     * character, or is 0 if there is no such character, meaning
12221     * the whole node contains only problematic characters.  In
12222     * this case, give up and just take the node as-is.  We can't
12223     * do any better */
12224     if (len == 0) {
12225      len = full_len;
12226
12227      /* If the node ends in an 's' we make sure it stays EXACTF,
12228      * as if it turns into an EXACTFU, it could later get
12229      * joined with another 's' that would then wrongly match
12230      * the sharp s */
12231      if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
12232      {
12233       maybe_exactfu = FALSE;
12234      }
12235     } else {
12236
12237      /* Here, the node does contain some characters that aren't
12238      * problematic.  If one such is the final character in the
12239      * node, we are done */
12240      if (len == full_len) {
12241       goto loopdone;
12242      }
12243      else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12244
12245       /* If the final character is problematic, but the
12246       * penultimate is not, back-off that last character to
12247       * later start a new node with it */
12248       p = oldp;
12249       goto loopdone;
12250      }
12251
12252      /* Here, the final non-problematic character is earlier
12253      * in the input than the penultimate character.  What we do
12254      * is reparse from the beginning, going up only as far as
12255      * this final ok one, thus guaranteeing that the node ends
12256      * in an acceptable character.  The reason we reparse is
12257      * that we know how far in the character is, but we don't
12258      * know how to correlate its position with the input parse.
12259      * An alternate implementation would be to build that
12260      * correlation as we go along during the original parse,
12261      * but that would entail extra work for every node, whereas
12262      * this code gets executed only when the string is too
12263      * large for the node, and the final two characters are
12264      * problematic, an infrequent occurrence.  Yet another
12265      * possible strategy would be to save the tail of the
12266      * string, and the next time regatom is called, initialize
12267      * with that.  The problem with this is that unless you
12268      * back off one more character, you won't be guaranteed
12269      * regatom will get called again, unless regbranch,
12270      * regpiece ... are also changed.  If you do back off that
12271      * extra character, so that there is input guaranteed to
12272      * force calling regatom, you can't handle the case where
12273      * just the first character in the node is acceptable.  I
12274      * (khw) decided to try this method which doesn't have that
12275      * pitfall; if performance issues are found, we can do a
12276      * combination of the current approach plus that one */
12277      upper_parse = len;
12278      len = 0;
12279      s = s0;
12280      goto reparse;
12281     }
12282    }   /* End of verifying node ends with an appropriate char */
12283
12284   loopdone:   /* Jumped to when encounters something that shouldn't be in
12285      the node */
12286
12287    /* I (khw) don't know if you can get here with zero length, but the
12288    * old code handled this situation by creating a zero-length EXACT
12289    * node.  Might as well be NOTHING instead */
12290    if (len == 0) {
12291     OP(ret) = NOTHING;
12292    }
12293    else {
12294     if (FOLD) {
12295      /* If 'maybe_exact' is still set here, means there are no
12296      * code points in the node that participate in folds;
12297      * similarly for 'maybe_exactfu' and code points that match
12298      * differently depending on UTF8ness of the target string
12299      * (for /u), or depending on locale for /l */
12300      if (maybe_exact) {
12301       OP(ret) = EXACT;
12302      }
12303      else if (maybe_exactfu) {
12304       OP(ret) = EXACTFU;
12305      }
12306     }
12307     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12308           FALSE /* Don't look to see if could
12309              be turned into an EXACT
12310              node, as we have already
12311              computed that */
12312           );
12313    }
12314
12315    RExC_parse = p - 1;
12316    Set_Node_Cur_Length(ret, parse_start);
12317    nextchar(pRExC_state);
12318    {
12319     /* len is STRLEN which is unsigned, need to copy to signed */
12320     IV iv = len;
12321     if (iv < 0)
12322      vFAIL("Internal disaster");
12323    }
12324
12325   } /* End of label 'defchar:' */
12326   break;
12327  } /* End of giant switch on input character */
12328
12329  return(ret);
12330 }
12331
12332 STATIC char *
12333 S_regwhite( RExC_state_t *pRExC_state, char *p )
12334 {
12335  const char *e = RExC_end;
12336
12337  PERL_ARGS_ASSERT_REGWHITE;
12338
12339  while (p < e) {
12340   if (isSPACE(*p))
12341    ++p;
12342   else if (*p == '#') {
12343    bool ended = 0;
12344    do {
12345     if (*p++ == '\n') {
12346      ended = 1;
12347      break;
12348     }
12349    } while (p < e);
12350    if (!ended)
12351     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12352   }
12353   else
12354    break;
12355  }
12356  return p;
12357 }
12358
12359 STATIC char *
12360 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12361 {
12362  /* Returns the next non-pattern-white space, non-comment character (the
12363  * latter only if 'recognize_comment is true) in the string p, which is
12364  * ended by RExC_end.  If there is no line break ending a comment,
12365  * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */
12366  const char *e = RExC_end;
12367
12368  PERL_ARGS_ASSERT_REGPATWS;
12369
12370  while (p < e) {
12371   STRLEN len;
12372   if ((len = is_PATWS_safe(p, e, UTF))) {
12373    p += len;
12374   }
12375   else if (recognize_comment && *p == '#') {
12376    bool ended = 0;
12377    do {
12378     p++;
12379     if (is_LNBREAK_safe(p, e, UTF)) {
12380      ended = 1;
12381      break;
12382     }
12383    } while (p < e);
12384    if (!ended)
12385     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12386   }
12387   else
12388    break;
12389  }
12390  return p;
12391 }
12392
12393 STATIC void
12394 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12395 {
12396  /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12397  * sets up the bitmap and any flags, removing those code points from the
12398  * inversion list, setting it to NULL should it become completely empty */
12399
12400  PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12401  assert(PL_regkind[OP(node)] == ANYOF);
12402
12403  ANYOF_BITMAP_ZERO(node);
12404  if (*invlist_ptr) {
12405
12406   /* This gets set if we actually need to modify things */
12407   bool change_invlist = FALSE;
12408
12409   UV start, end;
12410
12411   /* Start looking through *invlist_ptr */
12412   invlist_iterinit(*invlist_ptr);
12413   while (invlist_iternext(*invlist_ptr, &start, &end)) {
12414    UV high;
12415    int i;
12416
12417    if (end == UV_MAX && start <= 256) {
12418     ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12419    }
12420    else if (end >= 256) {
12421     ANYOF_FLAGS(node) |= ANYOF_UTF8;
12422    }
12423
12424    /* Quit if are above what we should change */
12425    if (start > 255) {
12426     break;
12427    }
12428
12429    change_invlist = TRUE;
12430
12431    /* Set all the bits in the range, up to the max that we are doing */
12432    high = (end < 255) ? end : 255;
12433    for (i = start; i <= (int) high; i++) {
12434     if (! ANYOF_BITMAP_TEST(node, i)) {
12435      ANYOF_BITMAP_SET(node, i);
12436     }
12437    }
12438   }
12439   invlist_iterfinish(*invlist_ptr);
12440
12441   /* Done with loop; remove any code points that are in the bitmap from
12442   * *invlist_ptr; similarly for code points above latin1 if we have a
12443   * flag to match all of them anyways */
12444   if (change_invlist) {
12445    _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12446   }
12447   if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12448    _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12449   }
12450
12451   /* If have completely emptied it, remove it completely */
12452   if (_invlist_len(*invlist_ptr) == 0) {
12453    SvREFCNT_dec_NN(*invlist_ptr);
12454    *invlist_ptr = NULL;
12455   }
12456  }
12457 }
12458
12459 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12460    Character classes ([:foo:]) can also be negated ([:^foo:]).
12461    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12462    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12463    but trigger failures because they are currently unimplemented. */
12464
12465 #define POSIXCC_DONE(c)   ((c) == ':')
12466 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12467 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12468
12469 PERL_STATIC_INLINE I32
12470 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12471 {
12472  dVAR;
12473  I32 namedclass = OOB_NAMEDCLASS;
12474
12475  PERL_ARGS_ASSERT_REGPPOSIXCC;
12476
12477  if (value == '[' && RExC_parse + 1 < RExC_end &&
12478   /* I smell either [: or [= or [. -- POSIX has been here, right? */
12479   POSIXCC(UCHARAT(RExC_parse)))
12480  {
12481   const char c = UCHARAT(RExC_parse);
12482   char* const s = RExC_parse++;
12483
12484   while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12485    RExC_parse++;
12486   if (RExC_parse == RExC_end) {
12487    if (strict) {
12488
12489     /* Try to give a better location for the error (than the end of
12490     * the string) by looking for the matching ']' */
12491     RExC_parse = s;
12492     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12493      RExC_parse++;
12494     }
12495     vFAIL2("Unmatched '%c' in POSIX class", c);
12496    }
12497    /* Grandfather lone [:, [=, [. */
12498    RExC_parse = s;
12499   }
12500   else {
12501    const char* const t = RExC_parse++; /* skip over the c */
12502    assert(*t == c);
12503
12504    if (UCHARAT(RExC_parse) == ']') {
12505     const char *posixcc = s + 1;
12506     RExC_parse++; /* skip over the ending ] */
12507
12508     if (*s == ':') {
12509      const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12510      const I32 skip = t - posixcc;
12511
12512      /* Initially switch on the length of the name.  */
12513      switch (skip) {
12514      case 4:
12515       if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12516               this is the Perl \w
12517               */
12518        namedclass = ANYOF_WORDCHAR;
12519       break;
12520      case 5:
12521       /* Names all of length 5.  */
12522       /* alnum alpha ascii blank cntrl digit graph lower
12523       print punct space upper  */
12524       /* Offset 4 gives the best switch position.  */
12525       switch (posixcc[4]) {
12526       case 'a':
12527        if (memEQ(posixcc, "alph", 4)) /* alpha */
12528         namedclass = ANYOF_ALPHA;
12529        break;
12530       case 'e':
12531        if (memEQ(posixcc, "spac", 4)) /* space */
12532         namedclass = ANYOF_PSXSPC;
12533        break;
12534       case 'h':
12535        if (memEQ(posixcc, "grap", 4)) /* graph */
12536         namedclass = ANYOF_GRAPH;
12537        break;
12538       case 'i':
12539        if (memEQ(posixcc, "asci", 4)) /* ascii */
12540         namedclass = ANYOF_ASCII;
12541        break;
12542       case 'k':
12543        if (memEQ(posixcc, "blan", 4)) /* blank */
12544         namedclass = ANYOF_BLANK;
12545        break;
12546       case 'l':
12547        if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12548         namedclass = ANYOF_CNTRL;
12549        break;
12550       case 'm':
12551        if (memEQ(posixcc, "alnu", 4)) /* alnum */
12552         namedclass = ANYOF_ALPHANUMERIC;
12553        break;
12554       case 'r':
12555        if (memEQ(posixcc, "lowe", 4)) /* lower */
12556         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12557        else if (memEQ(posixcc, "uppe", 4)) /* upper */
12558         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12559        break;
12560       case 't':
12561        if (memEQ(posixcc, "digi", 4)) /* digit */
12562         namedclass = ANYOF_DIGIT;
12563        else if (memEQ(posixcc, "prin", 4)) /* print */
12564         namedclass = ANYOF_PRINT;
12565        else if (memEQ(posixcc, "punc", 4)) /* punct */
12566         namedclass = ANYOF_PUNCT;
12567        break;
12568       }
12569       break;
12570      case 6:
12571       if (memEQ(posixcc, "xdigit", 6))
12572        namedclass = ANYOF_XDIGIT;
12573       break;
12574      }
12575
12576      if (namedclass == OOB_NAMEDCLASS)
12577       vFAIL2utf8f(
12578        "POSIX class [:%"UTF8f":] unknown",
12579        UTF8fARG(UTF, t - s - 1, s + 1));
12580
12581      /* The #defines are structured so each complement is +1 to
12582      * the normal one */
12583      if (complement) {
12584       namedclass++;
12585      }
12586      assert (posixcc[skip] == ':');
12587      assert (posixcc[skip+1] == ']');
12588     } else if (!SIZE_ONLY) {
12589      /* [[=foo=]] and [[.foo.]] are still future. */
12590
12591      /* adjust RExC_parse so the warning shows after
12592      the class closes */
12593      while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12594       RExC_parse++;
12595      vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12596     }
12597    } else {
12598     /* Maternal grandfather:
12599     * "[:" ending in ":" but not in ":]" */
12600     if (strict) {
12601      vFAIL("Unmatched '[' in POSIX class");
12602     }
12603
12604     /* Grandfather lone [:, [=, [. */
12605     RExC_parse = s;
12606    }
12607   }
12608  }
12609
12610  return namedclass;
12611 }
12612
12613 STATIC bool
12614 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
12615 {
12616  /* This applies some heuristics at the current parse position (which should
12617  * be at a '[') to see if what follows might be intended to be a [:posix:]
12618  * class.  It returns true if it really is a posix class, of course, but it
12619  * also can return true if it thinks that what was intended was a posix
12620  * class that didn't quite make it.
12621  *
12622  * It will return true for
12623  *      [:alphanumerics:
12624  *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
12625  *                         ')' indicating the end of the (?[
12626  *      [:any garbage including %^&$ punctuation:]
12627  *
12628  * This is designed to be called only from S_handle_regex_sets; it could be
12629  * easily adapted to be called from the spot at the beginning of regclass()
12630  * that checks to see in a normal bracketed class if the surrounding []
12631  * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
12632  * change long-standing behavior, so I (khw) didn't do that */
12633  char* p = RExC_parse + 1;
12634  char first_char = *p;
12635
12636  PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12637
12638  assert(*(p - 1) == '[');
12639
12640  if (! POSIXCC(first_char)) {
12641   return FALSE;
12642  }
12643
12644  p++;
12645  while (p < RExC_end && isWORDCHAR(*p)) p++;
12646
12647  if (p >= RExC_end) {
12648   return FALSE;
12649  }
12650
12651  if (p - RExC_parse > 2    /* Got at least 1 word character */
12652   && (*p == first_char
12653    || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12654  {
12655   return TRUE;
12656  }
12657
12658  p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12659
12660  return (p
12661    && p - RExC_parse > 2 /* [:] evaluates to colon;
12662          [::] is a bad posix class. */
12663    && first_char == *(p - 1));
12664 }
12665
12666 STATIC regnode *
12667 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12668      I32 *flagp, U32 depth,
12669      char * const oregcomp_parse)
12670 {
12671  /* Handle the (?[...]) construct to do set operations */
12672
12673  U8 curchar;
12674  UV start, end; /* End points of code point ranges */
12675  SV* result_string;
12676  char *save_end, *save_parse;
12677  SV* final;
12678  STRLEN len;
12679  regnode* node;
12680  AV* stack;
12681  const bool save_fold = FOLD;
12682
12683  GET_RE_DEBUG_FLAGS_DECL;
12684
12685  PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12686
12687  if (LOC) {
12688   vFAIL("(?[...]) not valid in locale");
12689  }
12690  RExC_uni_semantics = 1;
12691
12692  /* This will return only an ANYOF regnode, or (unlikely) something smaller
12693  * (such as EXACT).  Thus we can skip most everything if just sizing.  We
12694  * call regclass to handle '[]' so as to not have to reinvent its parsing
12695  * rules here (throwing away the size it computes each time).  And, we exit
12696  * upon an unescaped ']' that isn't one ending a regclass.  To do both
12697  * these things, we need to realize that something preceded by a backslash
12698  * is escaped, so we have to keep track of backslashes */
12699  if (SIZE_ONLY) {
12700   UV depth = 0; /* how many nested (?[...]) constructs */
12701
12702   Perl_ck_warner_d(aTHX_
12703    packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12704    "The regex_sets feature is experimental" REPORT_LOCATION,
12705     UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12706     UTF8fARG(UTF,
12707       RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12708       RExC_precomp + (RExC_parse - RExC_precomp)));
12709
12710   while (RExC_parse < RExC_end) {
12711    SV* current = NULL;
12712    RExC_parse = regpatws(pRExC_state, RExC_parse,
12713         TRUE); /* means recognize comments */
12714    switch (*RExC_parse) {
12715     case '?':
12716      if (RExC_parse[1] == '[') depth++, RExC_parse++;
12717      /* FALL THROUGH */
12718     default:
12719      break;
12720     case '\\':
12721      /* Skip the next byte (which could cause us to end up in
12722      * the middle of a UTF-8 character, but since none of those
12723      * are confusable with anything we currently handle in this
12724      * switch (invariants all), it's safe.  We'll just hit the
12725      * default: case next time and keep on incrementing until
12726      * we find one of the invariants we do handle. */
12727      RExC_parse++;
12728      break;
12729     case '[':
12730     {
12731      /* If this looks like it is a [:posix:] class, leave the
12732      * parse pointer at the '[' to fool regclass() into
12733      * thinking it is part of a '[[:posix:]]'.  That function
12734      * will use strict checking to force a syntax error if it
12735      * doesn't work out to a legitimate class */
12736      bool is_posix_class
12737          = could_it_be_a_POSIX_class(pRExC_state);
12738      if (! is_posix_class) {
12739       RExC_parse++;
12740      }
12741
12742      /* regclass() can only return RESTART_UTF8 if multi-char
12743      folds are allowed.  */
12744      if (!regclass(pRExC_state, flagp,depth+1,
12745         is_posix_class, /* parse the whole char
12746              class only if not a
12747              posix class */
12748         FALSE, /* don't allow multi-char folds */
12749         TRUE, /* silence non-portable warnings. */
12750         &current))
12751       FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12752        (UV) *flagp);
12753
12754      /* function call leaves parse pointing to the ']', except
12755      * if we faked it */
12756      if (is_posix_class) {
12757       RExC_parse--;
12758      }
12759
12760      SvREFCNT_dec(current);   /* In case it returned something */
12761      break;
12762     }
12763
12764     case ']':
12765      if (depth--) break;
12766      RExC_parse++;
12767      if (RExC_parse < RExC_end
12768       && *RExC_parse == ')')
12769      {
12770       node = reganode(pRExC_state, ANYOF, 0);
12771       RExC_size += ANYOF_SKIP;
12772       nextchar(pRExC_state);
12773       Set_Node_Length(node,
12774         RExC_parse - oregcomp_parse + 1); /* MJD */
12775       return node;
12776      }
12777      goto no_close;
12778    }
12779    RExC_parse++;
12780   }
12781
12782   no_close:
12783   FAIL("Syntax error in (?[...])");
12784  }
12785
12786  /* Pass 2 only after this.  Everything in this construct is a
12787  * metacharacter.  Operands begin with either a '\' (for an escape
12788  * sequence), or a '[' for a bracketed character class.  Any other
12789  * character should be an operator, or parenthesis for grouping.  Both
12790  * types of operands are handled by calling regclass() to parse them.  It
12791  * is called with a parameter to indicate to return the computed inversion
12792  * list.  The parsing here is implemented via a stack.  Each entry on the
12793  * stack is a single character representing one of the operators, or the
12794  * '('; or else a pointer to an operand inversion list. */
12795
12796 #define IS_OPERAND(a)  (! SvIOK(a))
12797
12798  /* The stack starts empty.  It is a syntax error if the first thing parsed
12799  * is a binary operator; everything else is pushed on the stack.  When an
12800  * operand is parsed, the top of the stack is examined.  If it is a binary
12801  * operator, the item before it should be an operand, and both are replaced
12802  * by the result of doing that operation on the new operand and the one on
12803  * the stack.   Thus a sequence of binary operands is reduced to a single
12804  * one before the next one is parsed.
12805  *
12806  * A unary operator may immediately follow a binary in the input, for
12807  * example
12808  *      [a] + ! [b]
12809  * When an operand is parsed and the top of the stack is a unary operator,
12810  * the operation is performed, and then the stack is rechecked to see if
12811  * this new operand is part of a binary operation; if so, it is handled as
12812  * above.
12813  *
12814  * A '(' is simply pushed on the stack; it is valid only if the stack is
12815  * empty, or the top element of the stack is an operator or another '('
12816  * (for which the parenthesized expression will become an operand).  By the
12817  * time the corresponding ')' is parsed everything in between should have
12818  * been parsed and evaluated to a single operand (or else is a syntax
12819  * error), and is handled as a regular operand */
12820
12821  sv_2mortal((SV *)(stack = newAV()));
12822
12823  while (RExC_parse < RExC_end) {
12824   I32 top_index = av_tindex(stack);
12825   SV** top_ptr;
12826   SV* current = NULL;
12827
12828   /* Skip white space */
12829   RExC_parse = regpatws(pRExC_state, RExC_parse,
12830         TRUE); /* means recognize comments */
12831   if (RExC_parse >= RExC_end) {
12832    Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12833   }
12834   if ((curchar = UCHARAT(RExC_parse)) == ']') {
12835    break;
12836   }
12837
12838   switch (curchar) {
12839
12840    case '?':
12841     if (av_tindex(stack) >= 0   /* This makes sure that we can
12842            safely subtract 1 from
12843            RExC_parse in the next clause.
12844            If we have something on the
12845            stack, we have parsed something
12846            */
12847      && UCHARAT(RExC_parse - 1) == '('
12848      && RExC_parse < RExC_end)
12849     {
12850      /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12851      * This happens when we have some thing like
12852      *
12853      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12854      *   ...
12855      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
12856      *
12857      * Here we would be handling the interpolated
12858      * '$thai_or_lao'.  We handle this by a recursive call to
12859      * ourselves which returns the inversion list the
12860      * interpolated expression evaluates to.  We use the flags
12861      * from the interpolated pattern. */
12862      U32 save_flags = RExC_flags;
12863      const char * const save_parse = ++RExC_parse;
12864
12865      parse_lparen_question_flags(pRExC_state);
12866
12867      if (RExC_parse == save_parse  /* Makes sure there was at
12868              least one flag (or this
12869              embedding wasn't compiled)
12870             */
12871       || RExC_parse >= RExC_end - 4
12872       || UCHARAT(RExC_parse) != ':'
12873       || UCHARAT(++RExC_parse) != '('
12874       || UCHARAT(++RExC_parse) != '?'
12875       || UCHARAT(++RExC_parse) != '[')
12876      {
12877
12878       /* In combination with the above, this moves the
12879       * pointer to the point just after the first erroneous
12880       * character (or if there are no flags, to where they
12881       * should have been) */
12882       if (RExC_parse >= RExC_end - 4) {
12883        RExC_parse = RExC_end;
12884       }
12885       else if (RExC_parse != save_parse) {
12886        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12887       }
12888       vFAIL("Expecting '(?flags:(?[...'");
12889      }
12890      RExC_parse++;
12891      (void) handle_regex_sets(pRExC_state, &current, flagp,
12892              depth+1, oregcomp_parse);
12893
12894      /* Here, 'current' contains the embedded expression's
12895      * inversion list, and RExC_parse points to the trailing
12896      * ']'; the next character should be the ')' which will be
12897      * paired with the '(' that has been put on the stack, so
12898      * the whole embedded expression reduces to '(operand)' */
12899      RExC_parse++;
12900
12901      RExC_flags = save_flags;
12902      goto handle_operand;
12903     }
12904     /* FALL THROUGH */
12905
12906    default:
12907     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12908     vFAIL("Unexpected character");
12909
12910    case '\\':
12911     /* regclass() can only return RESTART_UTF8 if multi-char
12912     folds are allowed.  */
12913     if (!regclass(pRExC_state, flagp,depth+1,
12914        TRUE, /* means parse just the next thing */
12915        FALSE, /* don't allow multi-char folds */
12916        FALSE, /* don't silence non-portable warnings.  */
12917        &current))
12918      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12919       (UV) *flagp);
12920     /* regclass() will return with parsing just the \ sequence,
12921     * leaving the parse pointer at the next thing to parse */
12922     RExC_parse--;
12923     goto handle_operand;
12924
12925    case '[':   /* Is a bracketed character class */
12926    {
12927     bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12928
12929     if (! is_posix_class) {
12930      RExC_parse++;
12931     }
12932
12933     /* regclass() can only return RESTART_UTF8 if multi-char
12934     folds are allowed.  */
12935     if(!regclass(pRExC_state, flagp,depth+1,
12936        is_posix_class, /* parse the whole char class
12937             only if not a posix class */
12938        FALSE, /* don't allow multi-char folds */
12939        FALSE, /* don't silence non-portable warnings.  */
12940        &current))
12941      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12942       (UV) *flagp);
12943     /* function call leaves parse pointing to the ']', except if we
12944     * faked it */
12945     if (is_posix_class) {
12946      RExC_parse--;
12947     }
12948
12949     goto handle_operand;
12950    }
12951
12952    case '&':
12953    case '|':
12954    case '+':
12955    case '-':
12956    case '^':
12957     if (top_index < 0
12958      || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12959      || ! IS_OPERAND(*top_ptr))
12960     {
12961      RExC_parse++;
12962      vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12963     }
12964     av_push(stack, newSVuv(curchar));
12965     break;
12966
12967    case '!':
12968     av_push(stack, newSVuv(curchar));
12969     break;
12970
12971    case '(':
12972     if (top_index >= 0) {
12973      top_ptr = av_fetch(stack, top_index, FALSE);
12974      assert(top_ptr);
12975      if (IS_OPERAND(*top_ptr)) {
12976       RExC_parse++;
12977       vFAIL("Unexpected '(' with no preceding operator");
12978      }
12979     }
12980     av_push(stack, newSVuv(curchar));
12981     break;
12982
12983    case ')':
12984    {
12985     SV* lparen;
12986     if (top_index < 1
12987      || ! (current = av_pop(stack))
12988      || ! IS_OPERAND(current)
12989      || ! (lparen = av_pop(stack))
12990      || IS_OPERAND(lparen)
12991      || SvUV(lparen) != '(')
12992     {
12993      SvREFCNT_dec(current);
12994      RExC_parse++;
12995      vFAIL("Unexpected ')'");
12996     }
12997     top_index -= 2;
12998     SvREFCNT_dec_NN(lparen);
12999
13000     /* FALL THROUGH */
13001    }
13002
13003    handle_operand:
13004
13005     /* Here, we have an operand to process, in 'current' */
13006
13007     if (top_index < 0) {    /* Just push if stack is empty */
13008      av_push(stack, current);
13009     }
13010     else {
13011      SV* top = av_pop(stack);
13012      SV *prev = NULL;
13013      char current_operator;
13014
13015      if (IS_OPERAND(top)) {
13016       SvREFCNT_dec_NN(top);
13017       SvREFCNT_dec_NN(current);
13018       vFAIL("Operand with no preceding operator");
13019      }
13020      current_operator = (char) SvUV(top);
13021      switch (current_operator) {
13022       case '(':   /* Push the '(' back on followed by the new
13023          operand */
13024        av_push(stack, top);
13025        av_push(stack, current);
13026        SvREFCNT_inc(top);  /* Counters the '_dec' done
13027             just after the 'break', so
13028             it doesn't get wrongly freed
13029             */
13030        break;
13031
13032       case '!':
13033        _invlist_invert(current);
13034
13035        /* Unlike binary operators, the top of the stack,
13036        * now that this unary one has been popped off, may
13037        * legally be an operator, and we now have operand
13038        * for it. */
13039        top_index--;
13040        SvREFCNT_dec_NN(top);
13041        goto handle_operand;
13042
13043       case '&':
13044        prev = av_pop(stack);
13045        _invlist_intersection(prev,
13046             current,
13047             &current);
13048        av_push(stack, current);
13049        break;
13050
13051       case '|':
13052       case '+':
13053        prev = av_pop(stack);
13054        _invlist_union(prev, current, &current);
13055        av_push(stack, current);
13056        break;
13057
13058       case '-':
13059        prev = av_pop(stack);;
13060        _invlist_subtract(prev, current, &current);
13061        av_push(stack, current);
13062        break;
13063
13064       case '^':   /* The union minus the intersection */
13065       {
13066        SV* i = NULL;
13067        SV* u = NULL;
13068        SV* element;
13069
13070        prev = av_pop(stack);
13071        _invlist_union(prev, current, &u);
13072        _invlist_intersection(prev, current, &i);
13073        /* _invlist_subtract will overwrite current
13074         without freeing what it already contains */
13075        element = current;
13076        _invlist_subtract(u, i, &current);
13077        av_push(stack, current);
13078        SvREFCNT_dec_NN(i);
13079        SvREFCNT_dec_NN(u);
13080        SvREFCNT_dec_NN(element);
13081        break;
13082       }
13083
13084       default:
13085        Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13086     }
13087     SvREFCNT_dec_NN(top);
13088     SvREFCNT_dec(prev);
13089    }
13090   }
13091
13092   RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13093  }
13094
13095  if (av_tindex(stack) < 0   /* Was empty */
13096   || ((final = av_pop(stack)) == NULL)
13097   || ! IS_OPERAND(final)
13098   || av_tindex(stack) >= 0)  /* More left on stack */
13099  {
13100   vFAIL("Incomplete expression within '(?[ ])'");
13101  }
13102
13103  /* Here, 'final' is the resultant inversion list from evaluating the
13104  * expression.  Return it if so requested */
13105  if (return_invlist) {
13106   *return_invlist = final;
13107   return END;
13108  }
13109
13110  /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13111  * expecting a string of ranges and individual code points */
13112  invlist_iterinit(final);
13113  result_string = newSVpvs("");
13114  while (invlist_iternext(final, &start, &end)) {
13115   if (start == end) {
13116    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13117   }
13118   else {
13119    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13120              start,          end);
13121   }
13122  }
13123
13124  save_parse = RExC_parse;
13125  RExC_parse = SvPV(result_string, len);
13126  save_end = RExC_end;
13127  RExC_end = RExC_parse + len;
13128
13129  /* We turn off folding around the call, as the class we have constructed
13130  * already has all folding taken into consideration, and we don't want
13131  * regclass() to add to that */
13132  RExC_flags &= ~RXf_PMf_FOLD;
13133  /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13134  */
13135  node = regclass(pRExC_state, flagp,depth+1,
13136      FALSE, /* means parse the whole char class */
13137      FALSE, /* don't allow multi-char folds */
13138      TRUE, /* silence non-portable warnings.  The above may very
13139        well have generated non-portable code points, but
13140        they're valid on this machine */
13141      NULL);
13142  if (!node)
13143   FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13144      PTR2UV(flagp));
13145  if (save_fold) {
13146   RExC_flags |= RXf_PMf_FOLD;
13147  }
13148  RExC_parse = save_parse + 1;
13149  RExC_end = save_end;
13150  SvREFCNT_dec_NN(final);
13151  SvREFCNT_dec_NN(result_string);
13152
13153  nextchar(pRExC_state);
13154  Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13155  return node;
13156 }
13157 #undef IS_OPERAND
13158
13159 /* The names of properties whose definitions are not known at compile time are
13160  * stored in this SV, after a constant heading.  So if the length has been
13161  * changed since initialization, then there is a run-time definition. */
13162 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13163           (SvCUR(listsv) != initial_listsv_len)
13164
13165 STATIC regnode *
13166 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13167     const bool stop_at_1,  /* Just parse the next thing, don't
13168           look for a full character class */
13169     bool allow_multi_folds,
13170     const bool silence_non_portable,   /* Don't output warnings
13171              about too large
13172              characters */
13173     SV** ret_invlist)  /* Return an inversion list, not a node */
13174 {
13175  /* parse a bracketed class specification.  Most of these will produce an
13176  * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13177  * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13178  * under /i with multi-character folds: it will be rewritten following the
13179  * paradigm of this example, where the <multi-fold>s are characters which
13180  * fold to multiple character sequences:
13181  *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13182  * gets effectively rewritten as:
13183  *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13184  * reg() gets called (recursively) on the rewritten version, and this
13185  * function will return what it constructs.  (Actually the <multi-fold>s
13186  * aren't physically removed from the [abcdefghi], it's just that they are
13187  * ignored in the recursion by means of a flag:
13188  * <RExC_in_multi_char_class>.)
13189  *
13190  * ANYOF nodes contain a bit map for the first 256 characters, with the
13191  * corresponding bit set if that character is in the list.  For characters
13192  * above 255, a range list or swash is used.  There are extra bits for \w,
13193  * etc. in locale ANYOFs, as what these match is not determinable at
13194  * compile time
13195  *
13196  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13197  * to be restarted.  This can only happen if ret_invlist is non-NULL.
13198  */
13199
13200  dVAR;
13201  UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13202  IV range = 0;
13203  UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13204  regnode *ret;
13205  STRLEN numlen;
13206  IV namedclass = OOB_NAMEDCLASS;
13207  char *rangebegin = NULL;
13208  bool need_class = 0;
13209  SV *listsv = NULL;
13210  STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13211          than just initialized.  */
13212  SV* properties = NULL;    /* Code points that match \p{} \P{} */
13213  SV* posixes = NULL;     /* Code points that match classes like [:word:],
13214        extended beyond the Latin1 range.  These have to
13215        be kept separate from other code points for much
13216        of this function because their handling  is
13217        different under /i, and for most classes under
13218        /d as well */
13219  SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13220        separate for a while from the non-complemented
13221        versions because of complications with /d
13222        matching */
13223  UV element_count = 0;   /* Number of distinct elements in the class.
13224        Optimizations may be possible if this is tiny */
13225  AV * multi_char_matches = NULL; /* Code points that fold to more than one
13226          character; used under /i */
13227  UV n;
13228  char * stop_ptr = RExC_end;    /* where to stop parsing */
13229  const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13230             space? */
13231  const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13232
13233  /* Unicode properties are stored in a swash; this holds the current one
13234  * being parsed.  If this swash is the only above-latin1 component of the
13235  * character class, an optimization is to pass it directly on to the
13236  * execution engine.  Otherwise, it is set to NULL to indicate that there
13237  * are other things in the class that have to be dealt with at execution
13238  * time */
13239  SV* swash = NULL;  /* Code points that match \p{} \P{} */
13240
13241  /* Set if a component of this character class is user-defined; just passed
13242  * on to the engine */
13243  bool has_user_defined_property = FALSE;
13244
13245  /* inversion list of code points this node matches only when the target
13246  * string is in UTF-8.  (Because is under /d) */
13247  SV* depends_list = NULL;
13248
13249  /* Inversion list of code points this node matches regardless of things
13250  * like locale, folding, utf8ness of the target string */
13251  SV* cp_list = NULL;
13252
13253  /* Like cp_list, but code points on this list need to be checked for things
13254  * that fold to/from them under /i */
13255  SV* cp_foldable_list = NULL;
13256
13257  /* Like cp_list, but code points on this list are valid only when the
13258  * runtime locale is UTF-8 */
13259  SV* only_utf8_locale_list = NULL;
13260
13261 #ifdef EBCDIC
13262  /* In a range, counts how many 0-2 of the ends of it came from literals,
13263  * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13264  UV literal_endpoint = 0;
13265 #endif
13266  bool invert = FALSE;    /* Is this class to be complemented */
13267
13268  bool warn_super = ALWAYS_WARN_SUPER;
13269
13270  regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13271   case we need to change the emitted regop to an EXACT. */
13272  const char * orig_parse = RExC_parse;
13273  const SSize_t orig_size = RExC_size;
13274  bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13275  GET_RE_DEBUG_FLAGS_DECL;
13276
13277  PERL_ARGS_ASSERT_REGCLASS;
13278 #ifndef DEBUGGING
13279  PERL_UNUSED_ARG(depth);
13280 #endif
13281
13282  DEBUG_PARSE("clas");
13283
13284  /* Assume we are going to generate an ANYOF node. */
13285  ret = reganode(pRExC_state, ANYOF, 0);
13286
13287  if (SIZE_ONLY) {
13288   RExC_size += ANYOF_SKIP;
13289   listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13290  }
13291  else {
13292   ANYOF_FLAGS(ret) = 0;
13293
13294   RExC_emit += ANYOF_SKIP;
13295   listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13296   initial_listsv_len = SvCUR(listsv);
13297   SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13298  }
13299
13300  if (skip_white) {
13301   RExC_parse = regpatws(pRExC_state, RExC_parse,
13302        FALSE /* means don't recognize comments */);
13303  }
13304
13305  if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
13306   RExC_parse++;
13307   invert = TRUE;
13308   allow_multi_folds = FALSE;
13309   RExC_naughty++;
13310   if (skip_white) {
13311    RExC_parse = regpatws(pRExC_state, RExC_parse,
13312         FALSE /* means don't recognize comments */);
13313   }
13314  }
13315
13316  /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13317  if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13318   const char *s = RExC_parse;
13319   const char  c = *s++;
13320
13321   while (isWORDCHAR(*s))
13322    s++;
13323   if (*s && c == *s && s[1] == ']') {
13324    SAVEFREESV(RExC_rx_sv);
13325    ckWARN3reg(s+2,
13326      "POSIX syntax [%c %c] belongs inside character classes",
13327      c, c);
13328    (void)ReREFCNT_inc(RExC_rx_sv);
13329   }
13330  }
13331
13332  /* If the caller wants us to just parse a single element, accomplish this
13333  * by faking the loop ending condition */
13334  if (stop_at_1 && RExC_end > RExC_parse) {
13335   stop_ptr = RExC_parse + 1;
13336  }
13337
13338  /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13339  if (UCHARAT(RExC_parse) == ']')
13340   goto charclassloop;
13341
13342 parseit:
13343  while (1) {
13344   if  (RExC_parse >= stop_ptr) {
13345    break;
13346   }
13347
13348   if (skip_white) {
13349    RExC_parse = regpatws(pRExC_state, RExC_parse,
13350         FALSE /* means don't recognize comments */);
13351   }
13352
13353   if  (UCHARAT(RExC_parse) == ']') {
13354    break;
13355   }
13356
13357  charclassloop:
13358
13359   namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13360   save_value = value;
13361   save_prevvalue = prevvalue;
13362
13363   if (!range) {
13364    rangebegin = RExC_parse;
13365    element_count++;
13366   }
13367   if (UTF) {
13368    value = utf8n_to_uvchr((U8*)RExC_parse,
13369         RExC_end - RExC_parse,
13370         &numlen, UTF8_ALLOW_DEFAULT);
13371    RExC_parse += numlen;
13372   }
13373   else
13374    value = UCHARAT(RExC_parse++);
13375
13376   if (value == '['
13377    && RExC_parse < RExC_end
13378    && POSIXCC(UCHARAT(RExC_parse)))
13379   {
13380    namedclass = regpposixcc(pRExC_state, value, strict);
13381   }
13382   else if (value == '\\') {
13383    if (UTF) {
13384     value = utf8n_to_uvchr((U8*)RExC_parse,
13385         RExC_end - RExC_parse,
13386         &numlen, UTF8_ALLOW_DEFAULT);
13387     RExC_parse += numlen;
13388    }
13389    else
13390     value = UCHARAT(RExC_parse++);
13391
13392    /* Some compilers cannot handle switching on 64-bit integer
13393    * values, therefore value cannot be an UV.  Yes, this will
13394    * be a problem later if we want switch on Unicode.
13395    * A similar issue a little bit later when switching on
13396    * namedclass. --jhi */
13397
13398    /* If the \ is escaping white space when white space is being
13399    * skipped, it means that that white space is wanted literally, and
13400    * is already in 'value'.  Otherwise, need to translate the escape
13401    * into what it signifies. */
13402    if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13403
13404    case 'w': namedclass = ANYOF_WORDCHAR; break;
13405    case 'W': namedclass = ANYOF_NWORDCHAR; break;
13406    case 's': namedclass = ANYOF_SPACE; break;
13407    case 'S': namedclass = ANYOF_NSPACE; break;
13408    case 'd': namedclass = ANYOF_DIGIT; break;
13409    case 'D': namedclass = ANYOF_NDIGIT; break;
13410    case 'v': namedclass = ANYOF_VERTWS; break;
13411    case 'V': namedclass = ANYOF_NVERTWS; break;
13412    case 'h': namedclass = ANYOF_HORIZWS; break;
13413    case 'H': namedclass = ANYOF_NHORIZWS; break;
13414    case 'N':  /* Handle \N{NAME} in class */
13415     {
13416      /* We only pay attention to the first char of
13417      multichar strings being returned. I kinda wonder
13418      if this makes sense as it does change the behaviour
13419      from earlier versions, OTOH that behaviour was broken
13420      as well. */
13421      if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13422          TRUE, /* => charclass */
13423          strict))
13424      {
13425       if (*flagp & RESTART_UTF8)
13426        FAIL("panic: grok_bslash_N set RESTART_UTF8");
13427       goto parseit;
13428      }
13429     }
13430     break;
13431    case 'p':
13432    case 'P':
13433     {
13434     char *e;
13435
13436     /* We will handle any undefined properties ourselves */
13437     U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13438          /* And we actually would prefer to get
13439           * the straight inversion list of the
13440           * swash, since we will be accessing it
13441           * anyway, to save a little time */
13442          |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13443
13444     if (RExC_parse >= RExC_end)
13445      vFAIL2("Empty \\%c{}", (U8)value);
13446     if (*RExC_parse == '{') {
13447      const U8 c = (U8)value;
13448      e = strchr(RExC_parse++, '}');
13449      if (!e)
13450       vFAIL2("Missing right brace on \\%c{}", c);
13451      while (isSPACE(UCHARAT(RExC_parse)))
13452       RExC_parse++;
13453      if (e == RExC_parse)
13454       vFAIL2("Empty \\%c{}", c);
13455      n = e - RExC_parse;
13456      while (isSPACE(UCHARAT(RExC_parse + n - 1)))
13457       n--;
13458     }
13459     else {
13460      e = RExC_parse;
13461      n = 1;
13462     }
13463     if (!SIZE_ONLY) {
13464      SV* invlist;
13465      char* formatted;
13466      char* name;
13467
13468      if (UCHARAT(RExC_parse) == '^') {
13469       RExC_parse++;
13470       n--;
13471       /* toggle.  (The rhs xor gets the single bit that
13472       * differs between P and p; the other xor inverts just
13473       * that bit) */
13474       value ^= 'P' ^ 'p';
13475
13476       while (isSPACE(UCHARAT(RExC_parse))) {
13477        RExC_parse++;
13478        n--;
13479       }
13480      }
13481      /* Try to get the definition of the property into
13482      * <invlist>.  If /i is in effect, the effective property
13483      * will have its name be <__NAME_i>.  The design is
13484      * discussed in commit
13485      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13486      formatted = Perl_form(aTHX_
13487           "%s%.*s%s\n",
13488           (FOLD) ? "__" : "",
13489           (int)n,
13490           RExC_parse,
13491           (FOLD) ? "_i" : ""
13492         );
13493      name = savepvn(formatted, strlen(formatted));
13494
13495      /* Look up the property name, and get its swash and
13496      * inversion list, if the property is found  */
13497      if (swash) {
13498       SvREFCNT_dec_NN(swash);
13499      }
13500      swash = _core_swash_init("utf8", name, &PL_sv_undef,
13501            1, /* binary */
13502            0, /* not tr/// */
13503            NULL, /* No inversion list */
13504            &swash_init_flags
13505            );
13506      if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13507       if (swash) {
13508        SvREFCNT_dec_NN(swash);
13509        swash = NULL;
13510       }
13511
13512       /* Here didn't find it.  It could be a user-defined
13513       * property that will be available at run-time.  If we
13514       * accept only compile-time properties, is an error;
13515       * otherwise add it to the list for run-time look up */
13516       if (ret_invlist) {
13517        RExC_parse = e + 1;
13518        vFAIL2utf8f(
13519         "Property '%"UTF8f"' is unknown",
13520         UTF8fARG(UTF, n, name));
13521       }
13522       Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13523           (value == 'p' ? '+' : '!'),
13524           UTF8fARG(UTF, n, name));
13525       has_user_defined_property = TRUE;
13526
13527       /* We don't know yet, so have to assume that the
13528       * property could match something in the Latin1 range,
13529       * hence something that isn't utf8.  Note that this
13530       * would cause things in <depends_list> to match
13531       * inappropriately, except that any \p{}, including
13532       * this one forces Unicode semantics, which means there
13533       * is no <depends_list> */
13534       ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13535      }
13536      else {
13537
13538       /* Here, did get the swash and its inversion list.  If
13539       * the swash is from a user-defined property, then this
13540       * whole character class should be regarded as such */
13541       if (swash_init_flags
13542        & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13543       {
13544        has_user_defined_property = TRUE;
13545       }
13546       else if
13547        /* We warn on matching an above-Unicode code point
13548        * if the match would return true, except don't
13549        * warn for \p{All}, which has exactly one element
13550        * = 0 */
13551        (_invlist_contains_cp(invlist, 0x110000)
13552         && (! (_invlist_len(invlist) == 1
13553          && *invlist_array(invlist) == 0)))
13554       {
13555        warn_super = TRUE;
13556       }
13557
13558
13559       /* Invert if asking for the complement */
13560       if (value == 'P') {
13561        _invlist_union_complement_2nd(properties,
13562               invlist,
13563               &properties);
13564
13565        /* The swash can't be used as-is, because we've
13566        * inverted things; delay removing it to here after
13567        * have copied its invlist above */
13568        SvREFCNT_dec_NN(swash);
13569        swash = NULL;
13570       }
13571       else {
13572        _invlist_union(properties, invlist, &properties);
13573       }
13574      }
13575      Safefree(name);
13576     }
13577     RExC_parse = e + 1;
13578     namedclass = ANYOF_UNIPROP;  /* no official name, but it's
13579             named */
13580
13581     /* \p means they want Unicode semantics */
13582     RExC_uni_semantics = 1;
13583     }
13584     break;
13585    case 'n': value = '\n';   break;
13586    case 'r': value = '\r';   break;
13587    case 't': value = '\t';   break;
13588    case 'f': value = '\f';   break;
13589    case 'b': value = '\b';   break;
13590    case 'e': value = ASCII_TO_NATIVE('\033');break;
13591    case 'a': value = '\a';                   break;
13592    case 'o':
13593     RExC_parse--; /* function expects to be pointed at the 'o' */
13594     {
13595      const char* error_msg;
13596      bool valid = grok_bslash_o(&RExC_parse,
13597            &value,
13598            &error_msg,
13599            SIZE_ONLY,   /* warnings in pass
13600                1 only */
13601            strict,
13602            silence_non_portable,
13603            UTF);
13604      if (! valid) {
13605       vFAIL(error_msg);
13606      }
13607     }
13608     if (PL_encoding && value < 0x100) {
13609      goto recode_encoding;
13610     }
13611     break;
13612    case 'x':
13613     RExC_parse--; /* function expects to be pointed at the 'x' */
13614     {
13615      const char* error_msg;
13616      bool valid = grok_bslash_x(&RExC_parse,
13617            &value,
13618            &error_msg,
13619            TRUE, /* Output warnings */
13620            strict,
13621            silence_non_portable,
13622            UTF);
13623      if (! valid) {
13624       vFAIL(error_msg);
13625      }
13626     }
13627     if (PL_encoding && value < 0x100)
13628      goto recode_encoding;
13629     break;
13630    case 'c':
13631     value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
13632     break;
13633    case '0': case '1': case '2': case '3': case '4':
13634    case '5': case '6': case '7':
13635     {
13636      /* Take 1-3 octal digits */
13637      I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13638      numlen = (strict) ? 4 : 3;
13639      value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13640      RExC_parse += numlen;
13641      if (numlen != 3) {
13642       if (strict) {
13643        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13644        vFAIL("Need exactly 3 octal digits");
13645       }
13646       else if (! SIZE_ONLY /* like \08, \178 */
13647         && numlen < 3
13648         && RExC_parse < RExC_end
13649         && isDIGIT(*RExC_parse)
13650         && ckWARN(WARN_REGEXP))
13651       {
13652        SAVEFREESV(RExC_rx_sv);
13653        reg_warn_non_literal_string(
13654         RExC_parse + 1,
13655         form_short_octal_warning(RExC_parse, numlen));
13656        (void)ReREFCNT_inc(RExC_rx_sv);
13657       }
13658      }
13659      if (PL_encoding && value < 0x100)
13660       goto recode_encoding;
13661      break;
13662     }
13663    recode_encoding:
13664     if (! RExC_override_recoding) {
13665      SV* enc = PL_encoding;
13666      value = reg_recode((const char)(U8)value, &enc);
13667      if (!enc) {
13668       if (strict) {
13669        vFAIL("Invalid escape in the specified encoding");
13670       }
13671       else if (SIZE_ONLY) {
13672        ckWARNreg(RExC_parse,
13673         "Invalid escape in the specified encoding");
13674       }
13675      }
13676      break;
13677     }
13678    default:
13679     /* Allow \_ to not give an error */
13680     if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13681      if (strict) {
13682       vFAIL2("Unrecognized escape \\%c in character class",
13683        (int)value);
13684      }
13685      else {
13686       SAVEFREESV(RExC_rx_sv);
13687       ckWARN2reg(RExC_parse,
13688        "Unrecognized escape \\%c in character class passed through",
13689        (int)value);
13690       (void)ReREFCNT_inc(RExC_rx_sv);
13691      }
13692     }
13693     break;
13694    }   /* End of switch on char following backslash */
13695   } /* end of handling backslash escape sequences */
13696 #ifdef EBCDIC
13697   else
13698    literal_endpoint++;
13699 #endif
13700
13701   /* Here, we have the current token in 'value' */
13702
13703   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13704    U8 classnum;
13705
13706    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
13707    * literal, as is the character that began the false range, i.e.
13708    * the 'a' in the examples */
13709    if (range) {
13710     if (!SIZE_ONLY) {
13711      const int w = (RExC_parse >= rangebegin)
13712         ? RExC_parse - rangebegin
13713         : 0;
13714      if (strict) {
13715       vFAIL2utf8f(
13716        "False [] range \"%"UTF8f"\"",
13717        UTF8fARG(UTF, w, rangebegin));
13718      }
13719      else {
13720       SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13721       ckWARN2reg(RExC_parse,
13722        "False [] range \"%"UTF8f"\"",
13723        UTF8fARG(UTF, w, rangebegin));
13724       (void)ReREFCNT_inc(RExC_rx_sv);
13725       cp_list = add_cp_to_invlist(cp_list, '-');
13726       cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
13727                prevvalue);
13728      }
13729     }
13730
13731     range = 0; /* this was not a true range */
13732     element_count += 2; /* So counts for three values */
13733    }
13734
13735    classnum = namedclass_to_classnum(namedclass);
13736
13737    if (LOC && namedclass < ANYOF_POSIXL_MAX
13738 #ifndef HAS_ISASCII
13739     && classnum != _CC_ASCII
13740 #endif
13741    ) {
13742     /* What the Posix classes (like \w, [:space:]) match in locale
13743     * isn't knowable under locale until actual match time.  Room
13744     * must be reserved (one time per outer bracketed class) to
13745     * store such classes.  The space will contain a bit for each
13746     * named class that is to be matched against.  This isn't
13747     * needed for \p{} and pseudo-classes, as they are not affected
13748     * by locale, and hence are dealt with separately */
13749     if (! need_class) {
13750      need_class = 1;
13751      if (SIZE_ONLY) {
13752       RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13753      }
13754      else {
13755       RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13756      }
13757      ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13758      ANYOF_POSIXL_ZERO(ret);
13759     }
13760
13761     /* See if it already matches the complement of this POSIX
13762     * class */
13763     if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13764      && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13765                ? -1
13766                : 1)))
13767     {
13768      posixl_matches_all = TRUE;
13769      break;  /* No need to continue.  Since it matches both
13770        e.g., \w and \W, it matches everything, and the
13771        bracketed class can be optimized into qr/./s */
13772     }
13773
13774     /* Add this class to those that should be checked at runtime */
13775     ANYOF_POSIXL_SET(ret, namedclass);
13776
13777     /* The above-Latin1 characters are not subject to locale rules.
13778     * Just add them, in the second pass, to the
13779     * unconditionally-matched list */
13780     if (! SIZE_ONLY) {
13781      SV* scratch_list = NULL;
13782
13783      /* Get the list of the above-Latin1 code points this
13784      * matches */
13785      _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
13786           PL_XPosix_ptrs[classnum],
13787
13788           /* Odd numbers are complements, like
13789           * NDIGIT, NASCII, ... */
13790           namedclass % 2 != 0,
13791           &scratch_list);
13792      /* Checking if 'cp_list' is NULL first saves an extra
13793      * clone.  Its reference count will be decremented at the
13794      * next union, etc, or if this is the only instance, at the
13795      * end of the routine */
13796      if (! cp_list) {
13797       cp_list = scratch_list;
13798      }
13799      else {
13800       _invlist_union(cp_list, scratch_list, &cp_list);
13801       SvREFCNT_dec_NN(scratch_list);
13802      }
13803      continue;   /* Go get next character */
13804     }
13805    }
13806    else if (! SIZE_ONLY) {
13807
13808     /* Here, not in pass1 (in that pass we skip calculating the
13809     * contents of this class), and is /l, or is a POSIX class for
13810     * which /l doesn't matter (or is a Unicode property, which is
13811     * skipped here). */
13812     if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
13813      if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13814
13815       /* Here, should be \h, \H, \v, or \V.  None of /d, /i
13816       * nor /l make a difference in what these match,
13817       * therefore we just add what they match to cp_list. */
13818       if (classnum != _CC_VERTSPACE) {
13819        assert(   namedclass == ANYOF_HORIZWS
13820         || namedclass == ANYOF_NHORIZWS);
13821
13822        /* It turns out that \h is just a synonym for
13823        * XPosixBlank */
13824        classnum = _CC_BLANK;
13825       }
13826
13827       _invlist_union_maybe_complement_2nd(
13828         cp_list,
13829         PL_XPosix_ptrs[classnum],
13830         namedclass % 2 != 0,    /* Complement if odd
13831               (NHORIZWS, NVERTWS)
13832               */
13833         &cp_list);
13834      }
13835     }
13836     else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
13837       complement and use nposixes */
13838      SV** posixes_ptr = namedclass % 2 == 0
13839          ? &posixes
13840          : &nposixes;
13841      SV** source_ptr = &PL_XPosix_ptrs[classnum];
13842      _invlist_union_maybe_complement_2nd(
13843              *posixes_ptr,
13844              *source_ptr,
13845              namedclass % 2 != 0,
13846              posixes_ptr);
13847     }
13848     continue;   /* Go get next character */
13849    }
13850   } /* end of namedclass \blah */
13851
13852   /* Here, we have a single value.  If 'range' is set, it is the ending
13853   * of a range--check its validity.  Later, we will handle each
13854   * individual code point in the range.  If 'range' isn't set, this
13855   * could be the beginning of a range, so check for that by looking
13856   * ahead to see if the next real character to be processed is the range
13857   * indicator--the minus sign */
13858
13859   if (skip_white) {
13860    RExC_parse = regpatws(pRExC_state, RExC_parse,
13861         FALSE /* means don't recognize comments */);
13862   }
13863
13864   if (range) {
13865    if (prevvalue > value) /* b-a */ {
13866     const int w = RExC_parse - rangebegin;
13867     vFAIL2utf8f(
13868      "Invalid [] range \"%"UTF8f"\"",
13869      UTF8fARG(UTF, w, rangebegin));
13870     range = 0; /* not a valid range */
13871    }
13872   }
13873   else {
13874    prevvalue = value; /* save the beginning of the potential range */
13875    if (! stop_at_1     /* Can't be a range if parsing just one thing */
13876     && *RExC_parse == '-')
13877    {
13878     char* next_char_ptr = RExC_parse + 1;
13879     if (skip_white) {   /* Get the next real char after the '-' */
13880      next_char_ptr = regpatws(pRExC_state,
13881            RExC_parse + 1,
13882            FALSE); /* means don't recognize
13883               comments */
13884     }
13885
13886     /* If the '-' is at the end of the class (just before the ']',
13887     * it is a literal minus; otherwise it is a range */
13888     if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13889      RExC_parse = next_char_ptr;
13890
13891      /* a bad range like \w-, [:word:]- ? */
13892      if (namedclass > OOB_NAMEDCLASS) {
13893       if (strict || ckWARN(WARN_REGEXP)) {
13894        const int w =
13895         RExC_parse >= rangebegin ?
13896         RExC_parse - rangebegin : 0;
13897        if (strict) {
13898         vFAIL4("False [] range \"%*.*s\"",
13899          w, w, rangebegin);
13900        }
13901        else {
13902         vWARN4(RExC_parse,
13903          "False [] range \"%*.*s\"",
13904          w, w, rangebegin);
13905        }
13906       }
13907       if (!SIZE_ONLY) {
13908        cp_list = add_cp_to_invlist(cp_list, '-');
13909       }
13910       element_count++;
13911      } else
13912       range = 1; /* yeah, it's a range! */
13913      continue; /* but do it the next time */
13914     }
13915    }
13916   }
13917
13918   /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13919   * if not */
13920
13921   /* non-Latin1 code point implies unicode semantics.  Must be set in
13922   * pass1 so is there for the whole of pass 2 */
13923   if (value > 255) {
13924    RExC_uni_semantics = 1;
13925   }
13926
13927   /* Ready to process either the single value, or the completed range.
13928   * For single-valued non-inverted ranges, we consider the possibility
13929   * of multi-char folds.  (We made a conscious decision to not do this
13930   * for the other cases because it can often lead to non-intuitive
13931   * results.  For example, you have the peculiar case that:
13932   *  "s s" =~ /^[^\xDF]+$/i => Y
13933   *  "ss"  =~ /^[^\xDF]+$/i => N
13934   *
13935   * See [perl #89750] */
13936   if (FOLD && allow_multi_folds && value == prevvalue) {
13937    if (value == LATIN_SMALL_LETTER_SHARP_S
13938     || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13939               value)))
13940    {
13941     /* Here <value> is indeed a multi-char fold.  Get what it is */
13942
13943     U8 foldbuf[UTF8_MAXBYTES_CASE];
13944     STRLEN foldlen;
13945
13946     UV folded = _to_uni_fold_flags(
13947         value,
13948         foldbuf,
13949         &foldlen,
13950         FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
13951             ? FOLD_FLAGS_NOMIX_ASCII
13952             : 0)
13953         );
13954
13955     /* Here, <folded> should be the first character of the
13956     * multi-char fold of <value>, with <foldbuf> containing the
13957     * whole thing.  But, if this fold is not allowed (because of
13958     * the flags), <fold> will be the same as <value>, and should
13959     * be processed like any other character, so skip the special
13960     * handling */
13961     if (folded != value) {
13962
13963      /* Skip if we are recursed, currently parsing the class
13964      * again.  Otherwise add this character to the list of
13965      * multi-char folds. */
13966      if (! RExC_in_multi_char_class) {
13967       AV** this_array_ptr;
13968       AV* this_array;
13969       STRLEN cp_count = utf8_length(foldbuf,
13970              foldbuf + foldlen);
13971       SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13972
13973       Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13974
13975
13976       if (! multi_char_matches) {
13977        multi_char_matches = newAV();
13978       }
13979
13980       /* <multi_char_matches> is actually an array of arrays.
13981       * There will be one or two top-level elements: [2],
13982       * and/or [3].  The [2] element is an array, each
13983       * element thereof is a character which folds to TWO
13984       * characters; [3] is for folds to THREE characters.
13985       * (Unicode guarantees a maximum of 3 characters in any
13986       * fold.)  When we rewrite the character class below,
13987       * we will do so such that the longest folds are
13988       * written first, so that it prefers the longest
13989       * matching strings first.  This is done even if it
13990       * turns out that any quantifier is non-greedy, out of
13991       * programmer laziness.  Tom Christiansen has agreed
13992       * that this is ok.  This makes the test for the
13993       * ligature 'ffi' come before the test for 'ff' */
13994       if (av_exists(multi_char_matches, cp_count)) {
13995        this_array_ptr = (AV**) av_fetch(multi_char_matches,
13996                cp_count, FALSE);
13997        this_array = *this_array_ptr;
13998       }
13999       else {
14000        this_array = newAV();
14001        av_store(multi_char_matches, cp_count,
14002          (SV*) this_array);
14003       }
14004       av_push(this_array, multi_fold);
14005      }
14006
14007      /* This element should not be processed further in this
14008      * class */
14009      element_count--;
14010      value = save_value;
14011      prevvalue = save_prevvalue;
14012      continue;
14013     }
14014    }
14015   }
14016
14017   /* Deal with this element of the class */
14018   if (! SIZE_ONLY) {
14019 #ifndef EBCDIC
14020    cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14021              prevvalue, value);
14022 #else
14023    SV* this_range = _new_invlist(1);
14024    _append_range_to_invlist(this_range, prevvalue, value);
14025
14026    /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14027    * If this range was specified using something like 'i-j', we want
14028    * to include only the 'i' and the 'j', and not anything in
14029    * between, so exclude non-ASCII, non-alphabetics from it.
14030    * However, if the range was specified with something like
14031    * [\x89-\x91] or [\x89-j], all code points within it should be
14032    * included.  literal_endpoint==2 means both ends of the range used
14033    * a literal character, not \x{foo} */
14034    if (literal_endpoint == 2
14035     && ((prevvalue >= 'a' && value <= 'z')
14036      || (prevvalue >= 'A' && value <= 'Z')))
14037    {
14038     _invlist_intersection(this_range, PL_ASCII,
14039          &this_range);
14040
14041     /* Since this above only contains ascii, the intersection of it
14042     * with anything will still yield only ascii */
14043     _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14044          &this_range);
14045    }
14046    _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14047    literal_endpoint = 0;
14048 #endif
14049   }
14050
14051   range = 0; /* this range (if it was one) is done now */
14052  } /* End of loop through all the text within the brackets */
14053
14054  /* If anything in the class expands to more than one character, we have to
14055  * deal with them by building up a substitute parse string, and recursively
14056  * calling reg() on it, instead of proceeding */
14057  if (multi_char_matches) {
14058   SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14059   I32 cp_count;
14060   STRLEN len;
14061   char *save_end = RExC_end;
14062   char *save_parse = RExC_parse;
14063   bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14064          a "|" */
14065   I32 reg_flags;
14066
14067   assert(! invert);
14068 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14069   because too confusing */
14070   if (invert) {
14071    sv_catpv(substitute_parse, "(?:");
14072   }
14073 #endif
14074
14075   /* Look at the longest folds first */
14076   for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14077
14078    if (av_exists(multi_char_matches, cp_count)) {
14079     AV** this_array_ptr;
14080     SV* this_sequence;
14081
14082     this_array_ptr = (AV**) av_fetch(multi_char_matches,
14083             cp_count, FALSE);
14084     while ((this_sequence = av_pop(*this_array_ptr)) !=
14085                 &PL_sv_undef)
14086     {
14087      if (! first_time) {
14088       sv_catpv(substitute_parse, "|");
14089      }
14090      first_time = FALSE;
14091
14092      sv_catpv(substitute_parse, SvPVX(this_sequence));
14093     }
14094    }
14095   }
14096
14097   /* If the character class contains anything else besides these
14098   * multi-character folds, have to include it in recursive parsing */
14099   if (element_count) {
14100    sv_catpv(substitute_parse, "|[");
14101    sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14102    sv_catpv(substitute_parse, "]");
14103   }
14104
14105   sv_catpv(substitute_parse, ")");
14106 #if 0
14107   if (invert) {
14108    /* This is a way to get the parse to skip forward a whole named
14109    * sequence instead of matching the 2nd character when it fails the
14110    * first */
14111    sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14112   }
14113 #endif
14114
14115   RExC_parse = SvPV(substitute_parse, len);
14116   RExC_end = RExC_parse + len;
14117   RExC_in_multi_char_class = 1;
14118   RExC_emit = (regnode *)orig_emit;
14119
14120   ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14121
14122   *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14123
14124   RExC_parse = save_parse;
14125   RExC_end = save_end;
14126   RExC_in_multi_char_class = 0;
14127   SvREFCNT_dec_NN(multi_char_matches);
14128   return ret;
14129  }
14130
14131  /* Here, we've gone through the entire class and dealt with multi-char
14132  * folds.  We are now in a position that we can do some checks to see if we
14133  * can optimize this ANYOF node into a simpler one, even in Pass 1.
14134  * Currently we only do two checks:
14135  * 1) is in the unlikely event that the user has specified both, eg. \w and
14136  *    \W under /l, then the class matches everything.  (This optimization
14137  *    is done only to make the optimizer code run later work.)
14138  * 2) if the character class contains only a single element (including a
14139  *    single range), we see if there is an equivalent node for it.
14140  * Other checks are possible */
14141  if (! ret_invlist   /* Can't optimize if returning the constructed
14142       inversion list */
14143   && (UNLIKELY(posixl_matches_all) || element_count == 1))
14144  {
14145   U8 op = END;
14146   U8 arg = 0;
14147
14148   if (UNLIKELY(posixl_matches_all)) {
14149    op = SANY;
14150   }
14151   else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14152             \w or [:digit:] or \p{foo}
14153             */
14154
14155    /* All named classes are mapped into POSIXish nodes, with its FLAG
14156    * argument giving which class it is */
14157    switch ((I32)namedclass) {
14158     case ANYOF_UNIPROP:
14159      break;
14160
14161     /* These don't depend on the charset modifiers.  They always
14162     * match under /u rules */
14163     case ANYOF_NHORIZWS:
14164     case ANYOF_HORIZWS:
14165      namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14166      /* FALLTHROUGH */
14167
14168     case ANYOF_NVERTWS:
14169     case ANYOF_VERTWS:
14170      op = POSIXU;
14171      goto join_posix;
14172
14173     /* The actual POSIXish node for all the rest depends on the
14174     * charset modifier.  The ones in the first set depend only on
14175     * ASCII or, if available on this platform, locale */
14176     case ANYOF_ASCII:
14177     case ANYOF_NASCII:
14178 #ifdef HAS_ISASCII
14179      op = (LOC) ? POSIXL : POSIXA;
14180 #else
14181      op = POSIXA;
14182 #endif
14183      goto join_posix;
14184
14185     case ANYOF_NCASED:
14186     case ANYOF_LOWER:
14187     case ANYOF_NLOWER:
14188     case ANYOF_UPPER:
14189     case ANYOF_NUPPER:
14190      /* under /a could be alpha */
14191      if (FOLD) {
14192       if (ASCII_RESTRICTED) {
14193        namedclass = ANYOF_ALPHA + (namedclass % 2);
14194       }
14195       else if (! LOC) {
14196        break;
14197       }
14198      }
14199      /* FALLTHROUGH */
14200
14201     /* The rest have more possibilities depending on the charset.
14202     * We take advantage of the enum ordering of the charset
14203     * modifiers to get the exact node type, */
14204     default:
14205      op = POSIXD + get_regex_charset(RExC_flags);
14206      if (op > POSIXA) { /* /aa is same as /a */
14207       op = POSIXA;
14208      }
14209
14210     join_posix:
14211      /* The odd numbered ones are the complements of the
14212      * next-lower even number one */
14213      if (namedclass % 2 == 1) {
14214       invert = ! invert;
14215       namedclass--;
14216      }
14217      arg = namedclass_to_classnum(namedclass);
14218      break;
14219    }
14220   }
14221   else if (value == prevvalue) {
14222
14223    /* Here, the class consists of just a single code point */
14224
14225    if (invert) {
14226     if (! LOC && value == '\n') {
14227      op = REG_ANY; /* Optimize [^\n] */
14228      *flagp |= HASWIDTH|SIMPLE;
14229      RExC_naughty++;
14230     }
14231    }
14232    else if (value < 256 || UTF) {
14233
14234     /* Optimize a single value into an EXACTish node, but not if it
14235     * would require converting the pattern to UTF-8. */
14236     op = compute_EXACTish(pRExC_state);
14237    }
14238   } /* Otherwise is a range */
14239   else if (! LOC) {   /* locale could vary these */
14240    if (prevvalue == '0') {
14241     if (value == '9') {
14242      arg = _CC_DIGIT;
14243      op = POSIXA;
14244     }
14245    }
14246   }
14247
14248   /* Here, we have changed <op> away from its initial value iff we found
14249   * an optimization */
14250   if (op != END) {
14251
14252    /* Throw away this ANYOF regnode, and emit the calculated one,
14253    * which should correspond to the beginning, not current, state of
14254    * the parse */
14255    const char * cur_parse = RExC_parse;
14256    RExC_parse = (char *)orig_parse;
14257    if ( SIZE_ONLY) {
14258     if (! LOC) {
14259
14260      /* To get locale nodes to not use the full ANYOF size would
14261      * require moving the code above that writes the portions
14262      * of it that aren't in other nodes to after this point.
14263      * e.g.  ANYOF_POSIXL_SET */
14264      RExC_size = orig_size;
14265     }
14266    }
14267    else {
14268     RExC_emit = (regnode *)orig_emit;
14269     if (PL_regkind[op] == POSIXD) {
14270      if (op == POSIXL) {
14271       RExC_contains_locale = 1;
14272      }
14273      if (invert) {
14274       op += NPOSIXD - POSIXD;
14275      }
14276     }
14277    }
14278
14279    ret = reg_node(pRExC_state, op);
14280
14281    if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14282     if (! SIZE_ONLY) {
14283      FLAGS(ret) = arg;
14284     }
14285     *flagp |= HASWIDTH|SIMPLE;
14286    }
14287    else if (PL_regkind[op] == EXACT) {
14288     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14289           TRUE /* downgradable to EXACT */
14290           );
14291    }
14292
14293    RExC_parse = (char *) cur_parse;
14294
14295    SvREFCNT_dec(posixes);
14296    SvREFCNT_dec(nposixes);
14297    SvREFCNT_dec(cp_list);
14298    SvREFCNT_dec(cp_foldable_list);
14299    return ret;
14300   }
14301  }
14302
14303  if (SIZE_ONLY)
14304   return ret;
14305  /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14306
14307  /* If folding, we calculate all characters that could fold to or from the
14308  * ones already on the list */
14309  if (cp_foldable_list) {
14310   if (FOLD) {
14311    UV start, end; /* End points of code point ranges */
14312
14313    SV* fold_intersection = NULL;
14314    SV** use_list;
14315
14316    /* Our calculated list will be for Unicode rules.  For locale
14317    * matching, we have to keep a separate list that is consulted at
14318    * runtime only when the locale indicates Unicode rules.  For
14319    * non-locale, we just use to the general list */
14320    if (LOC) {
14321     use_list = &only_utf8_locale_list;
14322    }
14323    else {
14324     use_list = &cp_list;
14325    }
14326
14327    /* Only the characters in this class that participate in folds need
14328    * be checked.  Get the intersection of this class and all the
14329    * possible characters that are foldable.  This can quickly narrow
14330    * down a large class */
14331    _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14332         &fold_intersection);
14333
14334    /* The folds for all the Latin1 characters are hard-coded into this
14335    * program, but we have to go out to disk to get the others. */
14336    if (invlist_highest(cp_foldable_list) >= 256) {
14337
14338     /* This is a hash that for a particular fold gives all
14339     * characters that are involved in it */
14340     if (! PL_utf8_foldclosures) {
14341
14342      /* If the folds haven't been read in, call a fold function
14343      * to force that */
14344      if (! PL_utf8_tofold) {
14345       U8 dummy[UTF8_MAXBYTES_CASE+1];
14346
14347       /* This string is just a short named one above \xff */
14348       to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
14349       assert(PL_utf8_tofold); /* Verify that worked */
14350      }
14351      PL_utf8_foldclosures
14352          = _swash_inversion_hash(PL_utf8_tofold);
14353     }
14354    }
14355
14356    /* Now look at the foldable characters in this class individually */
14357    invlist_iterinit(fold_intersection);
14358    while (invlist_iternext(fold_intersection, &start, &end)) {
14359     UV j;
14360
14361     /* Look at every character in the range */
14362     for (j = start; j <= end; j++) {
14363      U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14364      STRLEN foldlen;
14365      SV** listp;
14366
14367      if (j < 256) {
14368
14369       /* We have the latin1 folding rules hard-coded here so
14370       * that an innocent-looking character class, like
14371       * /[ks]/i won't have to go out to disk to find the
14372       * possible matches.  XXX It would be better to
14373       * generate these via regen, in case a new version of
14374       * the Unicode standard adds new mappings, though that
14375       * is not really likely, and may be caught by the
14376       * default: case of the switch below. */
14377
14378       if (IS_IN_SOME_FOLD_L1(j)) {
14379
14380        /* ASCII is always matched; non-ASCII is matched
14381        * only under Unicode rules (which could happen
14382        * under /l if the locale is a UTF-8 one */
14383        if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14384         *use_list = add_cp_to_invlist(*use_list,
14385                PL_fold_latin1[j]);
14386        }
14387        else {
14388         depends_list =
14389         add_cp_to_invlist(depends_list,
14390             PL_fold_latin1[j]);
14391        }
14392       }
14393
14394       if (HAS_NONLATIN1_FOLD_CLOSURE(j)
14395        && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14396       {
14397        /* Certain Latin1 characters have matches outside
14398        * Latin1.  To get here, <j> is one of those
14399        * characters.   None of these matches is valid for
14400        * ASCII characters under /aa, which is why the 'if'
14401        * just above excludes those.  These matches only
14402        * happen when the target string is utf8.  The code
14403        * below adds the single fold closures for <j> to the
14404        * inversion list. */
14405
14406        switch (j) {
14407         case 'k':
14408         case 'K':
14409         *use_list =
14410          add_cp_to_invlist(*use_list, KELVIN_SIGN);
14411          break;
14412         case 's':
14413         case 'S':
14414         *use_list = add_cp_to_invlist(*use_list,
14415              LATIN_SMALL_LETTER_LONG_S);
14416          break;
14417         case MICRO_SIGN:
14418         *use_list = add_cp_to_invlist(*use_list,
14419              GREEK_CAPITAL_LETTER_MU);
14420         *use_list = add_cp_to_invlist(*use_list,
14421               GREEK_SMALL_LETTER_MU);
14422          break;
14423         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14424         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14425         *use_list =
14426         add_cp_to_invlist(*use_list, ANGSTROM_SIGN);
14427          break;
14428         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14429         *use_list = add_cp_to_invlist(*use_list,
14430           LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14431          break;
14432         case LATIN_SMALL_LETTER_SHARP_S:
14433         *use_list = add_cp_to_invlist(*use_list,
14434             LATIN_CAPITAL_LETTER_SHARP_S);
14435          break;
14436         case 'F': case 'f':
14437         case 'I': case 'i':
14438         case 'L': case 'l':
14439         case 'T': case 't':
14440         case 'A': case 'a':
14441         case 'H': case 'h':
14442         case 'J': case 'j':
14443         case 'N': case 'n':
14444         case 'W': case 'w':
14445         case 'Y': case 'y':
14446          /* These all are targets of multi-character
14447          * folds from code points that require UTF8
14448          * to express, so they can't match unless
14449          * the target string is in UTF-8, so no
14450          * action here is necessary, as regexec.c
14451          * properly handles the general case for
14452          * UTF-8 matching and multi-char folds */
14453          break;
14454         default:
14455          /* Use deprecated warning to increase the
14456          * chances of this being output */
14457          ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
14458          break;
14459        }
14460       }
14461       continue;
14462      }
14463
14464      /* Here is an above Latin1 character.  We don't have the
14465      * rules hard-coded for it.  First, get its fold.  This is
14466      * the simple fold, as the multi-character folds have been
14467      * handled earlier and separated out */
14468      _to_uni_fold_flags(j, foldbuf, &foldlen,
14469               (ASCII_FOLD_RESTRICTED)
14470               ? FOLD_FLAGS_NOMIX_ASCII
14471               : 0);
14472
14473      /* Single character fold of above Latin1.  Add everything in
14474      * its fold closure to the list that this node should match.
14475      * The fold closures data structure is a hash with the keys
14476      * being the UTF-8 of every character that is folded to, like
14477      * 'k', and the values each an array of all code points that
14478      * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14479      * Multi-character folds are not included */
14480      if ((listp = hv_fetch(PL_utf8_foldclosures,
14481           (char *) foldbuf, foldlen, FALSE)))
14482      {
14483       AV* list = (AV*) *listp;
14484       IV k;
14485       for (k = 0; k <= av_tindex(list); k++) {
14486        SV** c_p = av_fetch(list, k, FALSE);
14487        UV c;
14488        if (c_p == NULL) {
14489         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
14490        }
14491        c = SvUV(*c_p);
14492
14493        /* /aa doesn't allow folds between ASCII and non- */
14494        if ((ASCII_FOLD_RESTRICTED
14495         && (isASCII(c) != isASCII(j))))
14496        {
14497         continue;
14498        }
14499
14500        /* Folds under /l which cross the 255/256 boundary
14501        * are added to a separate list.  (These are valid
14502        * only when the locale is UTF-8.) */
14503        if (c < 256 && LOC) {
14504         *use_list = add_cp_to_invlist(*use_list, c);
14505         continue;
14506        }
14507
14508        if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14509        {
14510         cp_list = add_cp_to_invlist(cp_list, c);
14511        }
14512        else {
14513         /* Similarly folds involving non-ascii Latin1
14514         * characters under /d are added to their list */
14515         depends_list = add_cp_to_invlist(depends_list,
14516                 c);
14517        }
14518       }
14519      }
14520     }
14521    }
14522    SvREFCNT_dec_NN(fold_intersection);
14523   }
14524
14525   /* Now that we have finished adding all the folds, there is no reason
14526   * to keep the foldable list separate */
14527   _invlist_union(cp_list, cp_foldable_list, &cp_list);
14528   SvREFCNT_dec_NN(cp_foldable_list);
14529  }
14530
14531  /* And combine the result (if any) with any inversion list from posix
14532  * classes.  The lists are kept separate up to now because we don't want to
14533  * fold the classes (folding of those is automatically handled by the swash
14534  * fetching code) */
14535  if (posixes || nposixes) {
14536   if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14537    /* Under /a and /aa, nothing above ASCII matches these */
14538    _invlist_intersection(posixes,
14539         PL_XPosix_ptrs[_CC_ASCII],
14540         &posixes);
14541   }
14542   if (nposixes) {
14543    if (DEPENDS_SEMANTICS) {
14544     /* Under /d, everything in the upper half of the Latin1 range
14545     * matches these complements */
14546     ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
14547    }
14548    else if (AT_LEAST_ASCII_RESTRICTED) {
14549     /* Under /a and /aa, everything above ASCII matches these
14550     * complements */
14551     _invlist_union_complement_2nd(nposixes,
14552            PL_XPosix_ptrs[_CC_ASCII],
14553            &nposixes);
14554    }
14555    if (posixes) {
14556     _invlist_union(posixes, nposixes, &posixes);
14557     SvREFCNT_dec_NN(nposixes);
14558    }
14559    else {
14560     posixes = nposixes;
14561    }
14562   }
14563   if (! DEPENDS_SEMANTICS) {
14564    if (cp_list) {
14565     _invlist_union(cp_list, posixes, &cp_list);
14566     SvREFCNT_dec_NN(posixes);
14567    }
14568    else {
14569     cp_list = posixes;
14570    }
14571   }
14572   else {
14573    /* Under /d, we put into a separate list the Latin1 things that
14574    * match only when the target string is utf8 */
14575    SV* nonascii_but_latin1_properties = NULL;
14576    _invlist_intersection(posixes, PL_UpperLatin1,
14577         &nonascii_but_latin1_properties);
14578    _invlist_subtract(posixes, nonascii_but_latin1_properties,
14579        &posixes);
14580    if (cp_list) {
14581     _invlist_union(cp_list, posixes, &cp_list);
14582     SvREFCNT_dec_NN(posixes);
14583    }
14584    else {
14585     cp_list = posixes;
14586    }
14587
14588    if (depends_list) {
14589     _invlist_union(depends_list, nonascii_but_latin1_properties,
14590        &depends_list);
14591     SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14592    }
14593    else {
14594     depends_list = nonascii_but_latin1_properties;
14595    }
14596   }
14597  }
14598
14599  /* And combine the result (if any) with any inversion list from properties.
14600  * The lists are kept separate up to now so that we can distinguish the two
14601  * in regards to matching above-Unicode.  A run-time warning is generated
14602  * if a Unicode property is matched against a non-Unicode code point. But,
14603  * we allow user-defined properties to match anything, without any warning,
14604  * and we also suppress the warning if there is a portion of the character
14605  * class that isn't a Unicode property, and which matches above Unicode, \W
14606  * or [\x{110000}] for example.
14607  * (Note that in this case, unlike the Posix one above, there is no
14608  * <depends_list>, because having a Unicode property forces Unicode
14609  * semantics */
14610  if (properties) {
14611   if (cp_list) {
14612
14613    /* If it matters to the final outcome, see if a non-property
14614    * component of the class matches above Unicode.  If so, the
14615    * warning gets suppressed.  This is true even if just a single
14616    * such code point is specified, as though not strictly correct if
14617    * another such code point is matched against, the fact that they
14618    * are using above-Unicode code points indicates they should know
14619    * the issues involved */
14620    if (warn_super) {
14621     warn_super = ! (invert
14622        ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14623    }
14624
14625    _invlist_union(properties, cp_list, &cp_list);
14626    SvREFCNT_dec_NN(properties);
14627   }
14628   else {
14629    cp_list = properties;
14630   }
14631
14632   if (warn_super) {
14633    ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14634   }
14635  }
14636
14637  /* Here, we have calculated what code points should be in the character
14638  * class.
14639  *
14640  * Now we can see about various optimizations.  Fold calculation (which we
14641  * did above) needs to take place before inversion.  Otherwise /[^k]/i
14642  * would invert to include K, which under /i would match k, which it
14643  * shouldn't.  Therefore we can't invert folded locale now, as it won't be
14644  * folded until runtime */
14645
14646  /* If we didn't do folding, it's because some information isn't available
14647  * until runtime; set the run-time fold flag for these.  (We don't have to
14648  * worry about properties folding, as that is taken care of by the swash
14649  * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
14650  * locales, or the class matches at least one 0-255 range code point */
14651  if (LOC && FOLD) {
14652   if (only_utf8_locale_list) {
14653    ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14654   }
14655   else if (cp_list) { /* Look to see if there a 0-255 code point is in
14656        the list */
14657    UV start, end;
14658    invlist_iterinit(cp_list);
14659    if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14660     ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14661    }
14662    invlist_iterfinish(cp_list);
14663   }
14664  }
14665
14666  /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14667  * at compile time.  Besides not inverting folded locale now, we can't
14668  * invert if there are things such as \w, which aren't known until runtime
14669  * */
14670  if (invert
14671   && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14672   && ! depends_list
14673   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14674  {
14675   _invlist_invert(cp_list);
14676
14677   /* Any swash can't be used as-is, because we've inverted things */
14678   if (swash) {
14679    SvREFCNT_dec_NN(swash);
14680    swash = NULL;
14681   }
14682
14683   /* Clear the invert flag since have just done it here */
14684   invert = FALSE;
14685  }
14686
14687  if (ret_invlist) {
14688   *ret_invlist = cp_list;
14689   SvREFCNT_dec(swash);
14690
14691   /* Discard the generated node */
14692   if (SIZE_ONLY) {
14693    RExC_size = orig_size;
14694   }
14695   else {
14696    RExC_emit = orig_emit;
14697   }
14698   return orig_emit;
14699  }
14700
14701  /* Some character classes are equivalent to other nodes.  Such nodes take
14702  * up less room and generally fewer operations to execute than ANYOF nodes.
14703  * Above, we checked for and optimized into some such equivalents for
14704  * certain common classes that are easy to test.  Getting to this point in
14705  * the code means that the class didn't get optimized there.  Since this
14706  * code is only executed in Pass 2, it is too late to save space--it has
14707  * been allocated in Pass 1, and currently isn't given back.  But turning
14708  * things into an EXACTish node can allow the optimizer to join it to any
14709  * adjacent such nodes.  And if the class is equivalent to things like /./,
14710  * expensive run-time swashes can be avoided.  Now that we have more
14711  * complete information, we can find things necessarily missed by the
14712  * earlier code.  I (khw) am not sure how much to look for here.  It would
14713  * be easy, but perhaps too slow, to check any candidates against all the
14714  * node types they could possibly match using _invlistEQ(). */
14715
14716  if (cp_list
14717   && ! invert
14718   && ! depends_list
14719   && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14720   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14721
14722   /* We don't optimize if we are supposed to make sure all non-Unicode
14723    * code points raise a warning, as only ANYOF nodes have this check.
14724    * */
14725   && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14726  {
14727   UV start, end;
14728   U8 op = END;  /* The optimzation node-type */
14729   const char * cur_parse= RExC_parse;
14730
14731   invlist_iterinit(cp_list);
14732   if (! invlist_iternext(cp_list, &start, &end)) {
14733
14734    /* Here, the list is empty.  This happens, for example, when a
14735    * Unicode property is the only thing in the character class, and
14736    * it doesn't match anything.  (perluniprops.pod notes such
14737    * properties) */
14738    op = OPFAIL;
14739    *flagp |= HASWIDTH|SIMPLE;
14740   }
14741   else if (start == end) {    /* The range is a single code point */
14742    if (! invlist_iternext(cp_list, &start, &end)
14743
14744      /* Don't do this optimization if it would require changing
14745      * the pattern to UTF-8 */
14746     && (start < 256 || UTF))
14747    {
14748     /* Here, the list contains a single code point.  Can optimize
14749     * into an EXACTish node */
14750
14751     value = start;
14752
14753     if (! FOLD) {
14754      op = EXACT;
14755     }
14756     else if (LOC) {
14757
14758      /* A locale node under folding with one code point can be
14759      * an EXACTFL, as its fold won't be calculated until
14760      * runtime */
14761      op = EXACTFL;
14762     }
14763     else {
14764
14765      /* Here, we are generally folding, but there is only one
14766      * code point to match.  If we have to, we use an EXACT
14767      * node, but it would be better for joining with adjacent
14768      * nodes in the optimization pass if we used the same
14769      * EXACTFish node that any such are likely to be.  We can
14770      * do this iff the code point doesn't participate in any
14771      * folds.  For example, an EXACTF of a colon is the same as
14772      * an EXACT one, since nothing folds to or from a colon. */
14773      if (value < 256) {
14774       if (IS_IN_SOME_FOLD_L1(value)) {
14775        op = EXACT;
14776       }
14777      }
14778      else {
14779       if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14780        op = EXACT;
14781       }
14782      }
14783
14784      /* If we haven't found the node type, above, it means we
14785      * can use the prevailing one */
14786      if (op == END) {
14787       op = compute_EXACTish(pRExC_state);
14788      }
14789     }
14790    }
14791   }
14792   else if (start == 0) {
14793    if (end == UV_MAX) {
14794     op = SANY;
14795     *flagp |= HASWIDTH|SIMPLE;
14796     RExC_naughty++;
14797    }
14798    else if (end == '\n' - 1
14799      && invlist_iternext(cp_list, &start, &end)
14800      && start == '\n' + 1 && end == UV_MAX)
14801    {
14802     op = REG_ANY;
14803     *flagp |= HASWIDTH|SIMPLE;
14804     RExC_naughty++;
14805    }
14806   }
14807   invlist_iterfinish(cp_list);
14808
14809   if (op != END) {
14810    RExC_parse = (char *)orig_parse;
14811    RExC_emit = (regnode *)orig_emit;
14812
14813    ret = reg_node(pRExC_state, op);
14814
14815    RExC_parse = (char *)cur_parse;
14816
14817    if (PL_regkind[op] == EXACT) {
14818     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14819           TRUE /* downgradable to EXACT */
14820           );
14821    }
14822
14823    SvREFCNT_dec_NN(cp_list);
14824    return ret;
14825   }
14826  }
14827
14828  /* Here, <cp_list> contains all the code points we can determine at
14829  * compile time that match under all conditions.  Go through it, and
14830  * for things that belong in the bitmap, put them there, and delete from
14831  * <cp_list>.  While we are at it, see if everything above 255 is in the
14832  * list, and if so, set a flag to speed up execution */
14833
14834  populate_ANYOF_from_invlist(ret, &cp_list);
14835
14836  if (invert) {
14837   ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14838  }
14839
14840  /* Here, the bitmap has been populated with all the Latin1 code points that
14841  * always match.  Can now add to the overall list those that match only
14842  * when the target string is UTF-8 (<depends_list>). */
14843  if (depends_list) {
14844   if (cp_list) {
14845    _invlist_union(cp_list, depends_list, &cp_list);
14846    SvREFCNT_dec_NN(depends_list);
14847   }
14848   else {
14849    cp_list = depends_list;
14850   }
14851   ANYOF_FLAGS(ret) |= ANYOF_UTF8;
14852  }
14853
14854  /* If there is a swash and more than one element, we can't use the swash in
14855  * the optimization below. */
14856  if (swash && element_count > 1) {
14857   SvREFCNT_dec_NN(swash);
14858   swash = NULL;
14859  }
14860
14861  set_ANYOF_arg(pRExC_state, ret, cp_list,
14862     (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14863     ? listsv : NULL,
14864     only_utf8_locale_list,
14865     swash, has_user_defined_property);
14866
14867  *flagp |= HASWIDTH|SIMPLE;
14868
14869  if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
14870   RExC_contains_locale = 1;
14871  }
14872
14873  return ret;
14874 }
14875
14876 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14877
14878 STATIC void
14879 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14880     regnode* const node,
14881     SV* const cp_list,
14882     SV* const runtime_defns,
14883     SV* const only_utf8_locale_list,
14884     SV* const swash,
14885     const bool has_user_defined_property)
14886 {
14887  /* Sets the arg field of an ANYOF-type node 'node', using information about
14888  * the node passed-in.  If there is nothing outside the node's bitmap, the
14889  * arg is set to ANYOF_NONBITMAP_EMPTY.  Otherwise, it sets the argument to
14890  * the count returned by add_data(), having allocated and stored an array,
14891  * av, that that count references, as follows:
14892  *  av[0] stores the character class description in its textual form.
14893  *        This is used later (regexec.c:Perl_regclass_swash()) to
14894  *        initialize the appropriate swash, and is also useful for dumping
14895  *        the regnode.  This is set to &PL_sv_undef if the textual
14896  *        description is not needed at run-time (as happens if the other
14897  *        elements completely define the class)
14898  *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14899  *        computed from av[0].  But if no further computation need be done,
14900  *        the swash is stored here now (and av[0] is &PL_sv_undef).
14901  *  av[2] stores the inversion list of code points that match only if the
14902  *        current locale is UTF-8
14903  *  av[3] stores the cp_list inversion list for use in addition or instead
14904  *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14905  *        (Otherwise everything needed is already in av[0] and av[1])
14906  *  av[4] is set if any component of the class is from a user-defined
14907  *        property; used only if av[3] exists */
14908
14909  UV n;
14910
14911  PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14912
14913  if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
14914   assert(! (ANYOF_FLAGS(node)
14915      & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
14916   ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14917  }
14918  else {
14919   AV * const av = newAV();
14920   SV *rv;
14921
14922   assert(ANYOF_FLAGS(node)
14923      & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
14924
14925   av_store(av, 0, (runtime_defns)
14926       ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14927   if (swash) {
14928    av_store(av, 1, swash);
14929    SvREFCNT_dec_NN(cp_list);
14930   }
14931   else {
14932    av_store(av, 1, &PL_sv_undef);
14933    if (cp_list) {
14934     av_store(av, 3, cp_list);
14935     av_store(av, 4, newSVuv(has_user_defined_property));
14936    }
14937   }
14938
14939   if (only_utf8_locale_list) {
14940    av_store(av, 2, only_utf8_locale_list);
14941   }
14942   else {
14943    av_store(av, 2, &PL_sv_undef);
14944   }
14945
14946   rv = newRV_noinc(MUTABLE_SV(av));
14947   n = add_data(pRExC_state, STR_WITH_LEN("s"));
14948   RExC_rxi->data->data[n] = (void*)rv;
14949   ARG_SET(node, n);
14950  }
14951 }
14952
14953
14954 /* reg_skipcomment()
14955
14956    Absorbs an /x style # comments from the input stream.
14957    Returns true if there is more text remaining in the stream.
14958    Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment
14959    terminates the pattern without including a newline.
14960
14961    Note its the callers responsibility to ensure that we are
14962    actually in /x mode
14963
14964 */
14965
14966 STATIC bool
14967 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14968 {
14969  bool ended = 0;
14970
14971  PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14972
14973  while (RExC_parse < RExC_end)
14974   if (*RExC_parse++ == '\n') {
14975    ended = 1;
14976    break;
14977   }
14978  if (!ended) {
14979   /* we ran off the end of the pattern without ending
14980   the comment, so we have to add an \n when wrapping */
14981   RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
14982   return 0;
14983  } else
14984   return 1;
14985 }
14986
14987 /* nextchar()
14988
14989    Advances the parse position, and optionally absorbs
14990    "whitespace" from the inputstream.
14991
14992    Without /x "whitespace" means (?#...) style comments only,
14993    with /x this means (?#...) and # comments and whitespace proper.
14994
14995    Returns the RExC_parse point from BEFORE the scan occurs.
14996
14997    This is the /x friendly way of saying RExC_parse++.
14998 */
14999
15000 STATIC char*
15001 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15002 {
15003  char* const retval = RExC_parse++;
15004
15005  PERL_ARGS_ASSERT_NEXTCHAR;
15006
15007  for (;;) {
15008   if (RExC_end - RExC_parse >= 3
15009    && *RExC_parse == '('
15010    && RExC_parse[1] == '?'
15011    && RExC_parse[2] == '#')
15012   {
15013    while (*RExC_parse != ')') {
15014     if (RExC_parse == RExC_end)
15015      FAIL("Sequence (?#... not terminated");
15016     RExC_parse++;
15017    }
15018    RExC_parse++;
15019    continue;
15020   }
15021   if (RExC_flags & RXf_PMf_EXTENDED) {
15022    if (isSPACE(*RExC_parse)) {
15023     RExC_parse++;
15024     continue;
15025    }
15026    else if (*RExC_parse == '#') {
15027     if ( reg_skipcomment( pRExC_state ) )
15028      continue;
15029    }
15030   }
15031   return retval;
15032  }
15033 }
15034
15035 /*
15036 - reg_node - emit a node
15037 */
15038 STATIC regnode *   /* Location. */
15039 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15040 {
15041  dVAR;
15042  regnode *ptr;
15043  regnode * const ret = RExC_emit;
15044  GET_RE_DEBUG_FLAGS_DECL;
15045
15046  PERL_ARGS_ASSERT_REG_NODE;
15047
15048  if (SIZE_ONLY) {
15049   SIZE_ALIGN(RExC_size);
15050   RExC_size += 1;
15051   return(ret);
15052  }
15053  if (RExC_emit >= RExC_emit_bound)
15054   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15055     op, RExC_emit, RExC_emit_bound);
15056
15057  NODE_ALIGN_FILL(ret);
15058  ptr = ret;
15059  FILL_ADVANCE_NODE(ptr, op);
15060  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
15061 #ifdef RE_TRACK_PATTERN_OFFSETS
15062  if (RExC_offsets) {         /* MJD */
15063   MJD_OFFSET_DEBUG(
15064    ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15065    "reg_node", __LINE__,
15066    PL_reg_name[op],
15067    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15068     ? "Overwriting end of array!\n" : "OK",
15069    (UV)(RExC_emit - RExC_emit_start),
15070    (UV)(RExC_parse - RExC_start),
15071    (UV)RExC_offsets[0]));
15072   Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15073  }
15074 #endif
15075  RExC_emit = ptr;
15076  return(ret);
15077 }
15078
15079 /*
15080 - reganode - emit a node with an argument
15081 */
15082 STATIC regnode *   /* Location. */
15083 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15084 {
15085  dVAR;
15086  regnode *ptr;
15087  regnode * const ret = RExC_emit;
15088  GET_RE_DEBUG_FLAGS_DECL;
15089
15090  PERL_ARGS_ASSERT_REGANODE;
15091
15092  if (SIZE_ONLY) {
15093   SIZE_ALIGN(RExC_size);
15094   RExC_size += 2;
15095   /*
15096   We can't do this:
15097
15098   assert(2==regarglen[op]+1);
15099
15100   Anything larger than this has to allocate the extra amount.
15101   If we changed this to be:
15102
15103   RExC_size += (1 + regarglen[op]);
15104
15105   then it wouldn't matter. Its not clear what side effect
15106   might come from that so its not done so far.
15107   -- dmq
15108   */
15109   return(ret);
15110  }
15111  if (RExC_emit >= RExC_emit_bound)
15112   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15113     op, RExC_emit, RExC_emit_bound);
15114
15115  NODE_ALIGN_FILL(ret);
15116  ptr = ret;
15117  FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15118  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
15119 #ifdef RE_TRACK_PATTERN_OFFSETS
15120  if (RExC_offsets) {         /* MJD */
15121   MJD_OFFSET_DEBUG(
15122    ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15123    "reganode",
15124    __LINE__,
15125    PL_reg_name[op],
15126    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15127    "Overwriting end of array!\n" : "OK",
15128    (UV)(RExC_emit - RExC_emit_start),
15129    (UV)(RExC_parse - RExC_start),
15130    (UV)RExC_offsets[0]));
15131   Set_Cur_Node_Offset;
15132  }
15133 #endif
15134  RExC_emit = ptr;
15135  return(ret);
15136 }
15137
15138 /*
15139 - reguni - emit (if appropriate) a Unicode character
15140 */
15141 PERL_STATIC_INLINE STRLEN
15142 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15143 {
15144  dVAR;
15145
15146  PERL_ARGS_ASSERT_REGUNI;
15147
15148  return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15149 }
15150
15151 /*
15152 - reginsert - insert an operator in front of already-emitted operand
15153 *
15154 * Means relocating the operand.
15155 */
15156 STATIC void
15157 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15158 {
15159  dVAR;
15160  regnode *src;
15161  regnode *dst;
15162  regnode *place;
15163  const int offset = regarglen[(U8)op];
15164  const int size = NODE_STEP_REGNODE + offset;
15165  GET_RE_DEBUG_FLAGS_DECL;
15166
15167  PERL_ARGS_ASSERT_REGINSERT;
15168  PERL_UNUSED_ARG(depth);
15169 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15170  DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15171  if (SIZE_ONLY) {
15172   RExC_size += size;
15173   return;
15174  }
15175
15176  src = RExC_emit;
15177  RExC_emit += size;
15178  dst = RExC_emit;
15179  if (RExC_open_parens) {
15180   int paren;
15181   /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15182   for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15183    if ( RExC_open_parens[paren] >= opnd ) {
15184     /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15185     RExC_open_parens[paren] += size;
15186    } else {
15187     /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15188    }
15189    if ( RExC_close_parens[paren] >= opnd ) {
15190     /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15191     RExC_close_parens[paren] += size;
15192    } else {
15193     /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15194    }
15195   }
15196  }
15197
15198  while (src > opnd) {
15199   StructCopy(--src, --dst, regnode);
15200 #ifdef RE_TRACK_PATTERN_OFFSETS
15201   if (RExC_offsets) {     /* MJD 20010112 */
15202    MJD_OFFSET_DEBUG(
15203     ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15204     "reg_insert",
15205     __LINE__,
15206     PL_reg_name[op],
15207     (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15208      ? "Overwriting end of array!\n" : "OK",
15209     (UV)(src - RExC_emit_start),
15210     (UV)(dst - RExC_emit_start),
15211     (UV)RExC_offsets[0]));
15212    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15213    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15214   }
15215 #endif
15216  }
15217
15218
15219  place = opnd;  /* Op node, where operand used to be. */
15220 #ifdef RE_TRACK_PATTERN_OFFSETS
15221  if (RExC_offsets) {         /* MJD */
15222   MJD_OFFSET_DEBUG(
15223    ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15224    "reginsert",
15225    __LINE__,
15226    PL_reg_name[op],
15227    (UV)(place - RExC_emit_start) > RExC_offsets[0]
15228    ? "Overwriting end of array!\n" : "OK",
15229    (UV)(place - RExC_emit_start),
15230    (UV)(RExC_parse - RExC_start),
15231    (UV)RExC_offsets[0]));
15232   Set_Node_Offset(place, RExC_parse);
15233   Set_Node_Length(place, 1);
15234  }
15235 #endif
15236  src = NEXTOPER(place);
15237  FILL_ADVANCE_NODE(place, op);
15238  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
15239  Zero(src, offset, regnode);
15240 }
15241
15242 /*
15243 - regtail - set the next-pointer at the end of a node chain of p to val.
15244 - SEE ALSO: regtail_study
15245 */
15246 /* TODO: All three parms should be const */
15247 STATIC void
15248 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15249     const regnode *val,U32 depth)
15250 {
15251  dVAR;
15252  regnode *scan;
15253  GET_RE_DEBUG_FLAGS_DECL;
15254
15255  PERL_ARGS_ASSERT_REGTAIL;
15256 #ifndef DEBUGGING
15257  PERL_UNUSED_ARG(depth);
15258 #endif
15259
15260  if (SIZE_ONLY)
15261   return;
15262
15263  /* Find last node. */
15264  scan = p;
15265  for (;;) {
15266   regnode * const temp = regnext(scan);
15267   DEBUG_PARSE_r({
15268    SV * const mysv=sv_newmortal();
15269    DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15270    regprop(RExC_rx, mysv, scan, NULL);
15271    PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15272     SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15273      (temp == NULL ? "->" : ""),
15274      (temp == NULL ? PL_reg_name[OP(val)] : "")
15275    );
15276   });
15277   if (temp == NULL)
15278    break;
15279   scan = temp;
15280  }
15281
15282  if (reg_off_by_arg[OP(scan)]) {
15283   ARG_SET(scan, val - scan);
15284  }
15285  else {
15286   NEXT_OFF(scan) = val - scan;
15287  }
15288 }
15289
15290 #ifdef DEBUGGING
15291 /*
15292 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15293 - Look for optimizable sequences at the same time.
15294 - currently only looks for EXACT chains.
15295
15296 This is experimental code. The idea is to use this routine to perform
15297 in place optimizations on branches and groups as they are constructed,
15298 with the long term intention of removing optimization from study_chunk so
15299 that it is purely analytical.
15300
15301 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15302 to control which is which.
15303
15304 */
15305 /* TODO: All four parms should be const */
15306
15307 STATIC U8
15308 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15309      const regnode *val,U32 depth)
15310 {
15311  dVAR;
15312  regnode *scan;
15313  U8 exact = PSEUDO;
15314 #ifdef EXPERIMENTAL_INPLACESCAN
15315  I32 min = 0;
15316 #endif
15317  GET_RE_DEBUG_FLAGS_DECL;
15318
15319  PERL_ARGS_ASSERT_REGTAIL_STUDY;
15320
15321
15322  if (SIZE_ONLY)
15323   return exact;
15324
15325  /* Find last node. */
15326
15327  scan = p;
15328  for (;;) {
15329   regnode * const temp = regnext(scan);
15330 #ifdef EXPERIMENTAL_INPLACESCAN
15331   if (PL_regkind[OP(scan)] == EXACT) {
15332    bool unfolded_multi_char; /* Unexamined in this routine */
15333    if (join_exact(pRExC_state, scan, &min,
15334       &unfolded_multi_char, 1, val, depth+1))
15335     return EXACT;
15336   }
15337 #endif
15338   if ( exact ) {
15339    switch (OP(scan)) {
15340     case EXACT:
15341     case EXACTF:
15342     case EXACTFA_NO_TRIE:
15343     case EXACTFA:
15344     case EXACTFU:
15345     case EXACTFU_SS:
15346     case EXACTFL:
15347       if( exact == PSEUDO )
15348        exact= OP(scan);
15349       else if ( exact != OP(scan) )
15350        exact= 0;
15351     case NOTHING:
15352      break;
15353     default:
15354      exact= 0;
15355    }
15356   }
15357   DEBUG_PARSE_r({
15358    SV * const mysv=sv_newmortal();
15359    DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15360    regprop(RExC_rx, mysv, scan, NULL);
15361    PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15362     SvPV_nolen_const(mysv),
15363     REG_NODE_NUM(scan),
15364     PL_reg_name[exact]);
15365   });
15366   if (temp == NULL)
15367    break;
15368   scan = temp;
15369  }
15370  DEBUG_PARSE_r({
15371   SV * const mysv_val=sv_newmortal();
15372   DEBUG_PARSE_MSG("");
15373   regprop(RExC_rx, mysv_val, val, NULL);
15374   PerlIO_printf(Perl_debug_log,
15375      "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15376      SvPV_nolen_const(mysv_val),
15377      (IV)REG_NODE_NUM(val),
15378      (IV)(val - scan)
15379   );
15380  });
15381  if (reg_off_by_arg[OP(scan)]) {
15382   ARG_SET(scan, val - scan);
15383  }
15384  else {
15385   NEXT_OFF(scan) = val - scan;
15386  }
15387
15388  return exact;
15389 }
15390 #endif
15391
15392 /*
15393  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15394  */
15395 #ifdef DEBUGGING
15396
15397 static void
15398 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15399 {
15400  int bit;
15401  int set=0;
15402
15403  ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15404
15405  for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15406   if (flags & (1<<bit)) {
15407    if (!set++ && lead)
15408     PerlIO_printf(Perl_debug_log, "%s",lead);
15409    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15410   }
15411  }
15412  if (lead)  {
15413   if (set)
15414    PerlIO_printf(Perl_debug_log, "\n");
15415   else
15416    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15417  }
15418 }
15419
15420 static void
15421 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15422 {
15423  int bit;
15424  int set=0;
15425  regex_charset cs;
15426
15427  ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15428
15429  for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15430   if (flags & (1<<bit)) {
15431    if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
15432     continue;
15433    }
15434    if (!set++ && lead)
15435     PerlIO_printf(Perl_debug_log, "%s",lead);
15436    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15437   }
15438  }
15439  if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15440    if (!set++ && lead) {
15441     PerlIO_printf(Perl_debug_log, "%s",lead);
15442    }
15443    switch (cs) {
15444     case REGEX_UNICODE_CHARSET:
15445      PerlIO_printf(Perl_debug_log, "UNICODE");
15446      break;
15447     case REGEX_LOCALE_CHARSET:
15448      PerlIO_printf(Perl_debug_log, "LOCALE");
15449      break;
15450     case REGEX_ASCII_RESTRICTED_CHARSET:
15451      PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15452      break;
15453     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15454      PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15455      break;
15456     default:
15457      PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15458      break;
15459    }
15460  }
15461  if (lead)  {
15462   if (set)
15463    PerlIO_printf(Perl_debug_log, "\n");
15464   else
15465    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15466  }
15467 }
15468 #endif
15469
15470 void
15471 Perl_regdump(pTHX_ const regexp *r)
15472 {
15473 #ifdef DEBUGGING
15474  dVAR;
15475  SV * const sv = sv_newmortal();
15476  SV *dsv= sv_newmortal();
15477  RXi_GET_DECL(r,ri);
15478  GET_RE_DEBUG_FLAGS_DECL;
15479
15480  PERL_ARGS_ASSERT_REGDUMP;
15481
15482  (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15483
15484  /* Header fields of interest. */
15485  if (r->anchored_substr) {
15486   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15487    RE_SV_DUMPLEN(r->anchored_substr), 30);
15488   PerlIO_printf(Perl_debug_log,
15489      "anchored %s%s at %"IVdf" ",
15490      s, RE_SV_TAIL(r->anchored_substr),
15491      (IV)r->anchored_offset);
15492  } else if (r->anchored_utf8) {
15493   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15494    RE_SV_DUMPLEN(r->anchored_utf8), 30);
15495   PerlIO_printf(Perl_debug_log,
15496      "anchored utf8 %s%s at %"IVdf" ",
15497      s, RE_SV_TAIL(r->anchored_utf8),
15498      (IV)r->anchored_offset);
15499  }
15500  if (r->float_substr) {
15501   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15502    RE_SV_DUMPLEN(r->float_substr), 30);
15503   PerlIO_printf(Perl_debug_log,
15504      "floating %s%s at %"IVdf"..%"UVuf" ",
15505      s, RE_SV_TAIL(r->float_substr),
15506      (IV)r->float_min_offset, (UV)r->float_max_offset);
15507  } else if (r->float_utf8) {
15508   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15509    RE_SV_DUMPLEN(r->float_utf8), 30);
15510   PerlIO_printf(Perl_debug_log,
15511      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15512      s, RE_SV_TAIL(r->float_utf8),
15513      (IV)r->float_min_offset, (UV)r->float_max_offset);
15514  }
15515  if (r->check_substr || r->check_utf8)
15516   PerlIO_printf(Perl_debug_log,
15517      (const char *)
15518      (r->check_substr == r->float_substr
15519      && r->check_utf8 == r->float_utf8
15520      ? "(checking floating" : "(checking anchored"));
15521  if (r->intflags & PREGf_NOSCAN)
15522   PerlIO_printf(Perl_debug_log, " noscan");
15523  if (r->extflags & RXf_CHECK_ALL)
15524   PerlIO_printf(Perl_debug_log, " isall");
15525  if (r->check_substr || r->check_utf8)
15526   PerlIO_printf(Perl_debug_log, ") ");
15527
15528  if (ri->regstclass) {
15529   regprop(r, sv, ri->regstclass, NULL);
15530   PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15531  }
15532  if (r->intflags & PREGf_ANCH) {
15533   PerlIO_printf(Perl_debug_log, "anchored");
15534   if (r->intflags & PREGf_ANCH_BOL)
15535    PerlIO_printf(Perl_debug_log, "(BOL)");
15536   if (r->intflags & PREGf_ANCH_MBOL)
15537    PerlIO_printf(Perl_debug_log, "(MBOL)");
15538   if (r->intflags & PREGf_ANCH_SBOL)
15539    PerlIO_printf(Perl_debug_log, "(SBOL)");
15540   if (r->intflags & PREGf_ANCH_GPOS)
15541    PerlIO_printf(Perl_debug_log, "(GPOS)");
15542   PerlIO_putc(Perl_debug_log, ' ');
15543  }
15544  if (r->intflags & PREGf_GPOS_SEEN)
15545   PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15546  if (r->intflags & PREGf_SKIP)
15547   PerlIO_printf(Perl_debug_log, "plus ");
15548  if (r->intflags & PREGf_IMPLICIT)
15549   PerlIO_printf(Perl_debug_log, "implicit ");
15550  PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15551  if (r->extflags & RXf_EVAL_SEEN)
15552   PerlIO_printf(Perl_debug_log, "with eval ");
15553  PerlIO_printf(Perl_debug_log, "\n");
15554  DEBUG_FLAGS_r({
15555   regdump_extflags("r->extflags: ",r->extflags);
15556   regdump_intflags("r->intflags: ",r->intflags);
15557  });
15558 #else
15559  PERL_ARGS_ASSERT_REGDUMP;
15560  PERL_UNUSED_CONTEXT;
15561  PERL_UNUSED_ARG(r);
15562 #endif /* DEBUGGING */
15563 }
15564
15565 /*
15566 - regprop - printable representation of opcode, with run time support
15567 */
15568
15569 void
15570 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
15571 {
15572 #ifdef DEBUGGING
15573  dVAR;
15574  int k;
15575
15576  /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15577  static const char * const anyofs[] = {
15578 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15579  || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
15580  || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
15581  || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
15582  || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
15583  || _CC_VERTSPACE != 16
15584   #error Need to adjust order of anyofs[]
15585 #endif
15586   "\\w",
15587   "\\W",
15588   "\\d",
15589   "\\D",
15590   "[:alpha:]",
15591   "[:^alpha:]",
15592   "[:lower:]",
15593   "[:^lower:]",
15594   "[:upper:]",
15595   "[:^upper:]",
15596   "[:punct:]",
15597   "[:^punct:]",
15598   "[:print:]",
15599   "[:^print:]",
15600   "[:alnum:]",
15601   "[:^alnum:]",
15602   "[:graph:]",
15603   "[:^graph:]",
15604   "[:cased:]",
15605   "[:^cased:]",
15606   "\\s",
15607   "\\S",
15608   "[:blank:]",
15609   "[:^blank:]",
15610   "[:xdigit:]",
15611   "[:^xdigit:]",
15612   "[:space:]",
15613   "[:^space:]",
15614   "[:cntrl:]",
15615   "[:^cntrl:]",
15616   "[:ascii:]",
15617   "[:^ascii:]",
15618   "\\v",
15619   "\\V"
15620  };
15621  RXi_GET_DECL(prog,progi);
15622  GET_RE_DEBUG_FLAGS_DECL;
15623
15624  PERL_ARGS_ASSERT_REGPROP;
15625
15626  sv_setpvs(sv, "");
15627
15628  if (OP(o) > REGNODE_MAX)  /* regnode.type is unsigned */
15629   /* It would be nice to FAIL() here, but this may be called from
15630   regexec.c, and it would be hard to supply pRExC_state. */
15631   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15632            (int)OP(o), (int)REGNODE_MAX);
15633  sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15634
15635  k = PL_regkind[OP(o)];
15636
15637  if (k == EXACT) {
15638   sv_catpvs(sv, " ");
15639   /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15640   * is a crude hack but it may be the best for now since
15641   * we have no flag "this EXACTish node was UTF-8"
15642   * --jhi */
15643   pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15644     PERL_PV_ESCAPE_UNI_DETECT |
15645     PERL_PV_ESCAPE_NONASCII   |
15646     PERL_PV_PRETTY_ELLIPSES   |
15647     PERL_PV_PRETTY_LTGT       |
15648     PERL_PV_PRETTY_NOCLEAR
15649     );
15650  } else if (k == TRIE) {
15651   /* print the details of the trie in dumpuntil instead, as
15652   * progi->data isn't available here */
15653   const char op = OP(o);
15654   const U32 n = ARG(o);
15655   const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15656    (reg_ac_data *)progi->data->data[n] :
15657    NULL;
15658   const reg_trie_data * const trie
15659    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15660
15661   Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15662   DEBUG_TRIE_COMPILE_r(
15663   Perl_sv_catpvf(aTHX_ sv,
15664    "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15665    (UV)trie->startstate,
15666    (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15667    (UV)trie->wordcount,
15668    (UV)trie->minlen,
15669    (UV)trie->maxlen,
15670    (UV)TRIE_CHARCOUNT(trie),
15671    (UV)trie->uniquecharcount
15672   );
15673   );
15674   if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15675    sv_catpvs(sv, "[");
15676    (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15677             ? ANYOF_BITMAP(o)
15678             : TRIE_BITMAP(trie));
15679    sv_catpvs(sv, "]");
15680   }
15681
15682  } else if (k == CURLY) {
15683   if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15684    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15685   Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15686  }
15687  else if (k == WHILEM && o->flags)   /* Ordinal/of */
15688   Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15689  else if (k == REF || k == OPEN || k == CLOSE
15690    || k == GROUPP || OP(o)==ACCEPT)
15691  {
15692   Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
15693   if ( RXp_PAREN_NAMES(prog) ) {
15694    if ( k != REF || (OP(o) < NREF)) {
15695     AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15696     SV **name= av_fetch(list, ARG(o), 0 );
15697     if (name)
15698      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15699    }
15700    else {
15701     AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15702     SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15703     I32 *nums=(I32*)SvPVX(sv_dat);
15704     SV **name= av_fetch(list, nums[0], 0 );
15705     I32 n;
15706     if (name) {
15707      for ( n=0; n<SvIVX(sv_dat); n++ ) {
15708       Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15709          (n ? "," : ""), (IV)nums[n]);
15710      }
15711      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15712     }
15713    }
15714   }
15715   if ( k == REF && reginfo) {
15716    U32 n = ARG(o);  /* which paren pair */
15717    I32 ln = prog->offs[n].start;
15718    if (prog->lastparen < n || ln == -1)
15719     Perl_sv_catpvf(aTHX_ sv, ": FAIL");
15720    else if (ln == prog->offs[n].end)
15721     Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
15722    else {
15723     const char *s = reginfo->strbeg + ln;
15724     Perl_sv_catpvf(aTHX_ sv, ": ");
15725     Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
15726      PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
15727    }
15728   }
15729  } else if (k == GOSUB)
15730   /* Paren and offset */
15731   Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
15732  else if (k == VERB) {
15733   if (!o->flags)
15734    Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15735       SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15736  } else if (k == LOGICAL)
15737   /* 2: embedded, otherwise 1 */
15738   Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
15739  else if (k == ANYOF) {
15740   const U8 flags = ANYOF_FLAGS(o);
15741   int do_sep = 0;
15742
15743
15744   if (flags & ANYOF_LOCALE_FLAGS)
15745    sv_catpvs(sv, "{loc}");
15746   if (flags & ANYOF_LOC_FOLD)
15747    sv_catpvs(sv, "{i}");
15748   Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15749   if (flags & ANYOF_INVERT)
15750    sv_catpvs(sv, "^");
15751
15752   /* output what the standard cp 0-255 bitmap matches */
15753   do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15754
15755   /* output any special charclass tests (used entirely under use
15756   * locale) * */
15757   if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15758    int i;
15759    for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15760     if (ANYOF_POSIXL_TEST(o,i)) {
15761      sv_catpv(sv, anyofs[i]);
15762      do_sep = 1;
15763     }
15764    }
15765   }
15766
15767   if ((flags & (ANYOF_ABOVE_LATIN1_ALL
15768      |ANYOF_UTF8
15769      |ANYOF_NONBITMAP_NON_UTF8
15770      |ANYOF_LOC_FOLD)))
15771   {
15772    if (do_sep) {
15773     Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15774     if (flags & ANYOF_INVERT)
15775      /*make sure the invert info is in each */
15776      sv_catpvs(sv, "^");
15777    }
15778
15779    if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
15780     sv_catpvs(sv, "{non-utf8-latin1-all}");
15781    }
15782
15783    /* output information about the unicode matching */
15784    if (flags & ANYOF_ABOVE_LATIN1_ALL)
15785     sv_catpvs(sv, "{unicode_all}");
15786    else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
15787     SV *lv; /* Set if there is something outside the bit map. */
15788     bool byte_output = FALSE;   /* If something in the bitmap has
15789            been output */
15790     SV *only_utf8_locale;
15791
15792     /* Get the stuff that wasn't in the bitmap */
15793     (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
15794              &lv, &only_utf8_locale);
15795     if (lv && lv != &PL_sv_undef) {
15796      char *s = savesvpv(lv);
15797      char * const origs = s;
15798
15799      while (*s && *s != '\n')
15800       s++;
15801
15802      if (*s == '\n') {
15803       const char * const t = ++s;
15804
15805       if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15806        sv_catpvs(sv, "{outside bitmap}");
15807       }
15808       else {
15809        sv_catpvs(sv, "{utf8}");
15810       }
15811
15812       if (byte_output) {
15813        sv_catpvs(sv, " ");
15814       }
15815
15816       while (*s) {
15817        if (*s == '\n') {
15818
15819         /* Truncate very long output */
15820         if (s - origs > 256) {
15821          Perl_sv_catpvf(aTHX_ sv,
15822             "%.*s...",
15823             (int) (s - origs - 1),
15824             t);
15825          goto out_dump;
15826         }
15827         *s = ' ';
15828        }
15829        else if (*s == '\t') {
15830         *s = '-';
15831        }
15832        s++;
15833       }
15834       if (s[-1] == ' ')
15835        s[-1] = 0;
15836
15837       sv_catpv(sv, t);
15838      }
15839
15840     out_dump:
15841
15842      Safefree(origs);
15843      SvREFCNT_dec_NN(lv);
15844     }
15845
15846     if ((flags & ANYOF_LOC_FOLD)
15847      && only_utf8_locale
15848      && only_utf8_locale != &PL_sv_undef)
15849     {
15850      UV start, end;
15851      int max_entries = 256;
15852
15853      sv_catpvs(sv, "{utf8 locale}");
15854      invlist_iterinit(only_utf8_locale);
15855      while (invlist_iternext(only_utf8_locale,
15856            &start, &end)) {
15857       put_range(sv, start, end);
15858       max_entries --;
15859       if (max_entries < 0) {
15860        sv_catpvs(sv, "...");
15861        break;
15862       }
15863      }
15864      invlist_iterfinish(only_utf8_locale);
15865     }
15866    }
15867   }
15868
15869   Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15870  }
15871  else if (k == POSIXD || k == NPOSIXD) {
15872   U8 index = FLAGS(o) * 2;
15873   if (index < C_ARRAY_LENGTH(anyofs)) {
15874    if (*anyofs[index] != '[')  {
15875     sv_catpv(sv, "[");
15876    }
15877    sv_catpv(sv, anyofs[index]);
15878    if (*anyofs[index] != '[')  {
15879     sv_catpv(sv, "]");
15880    }
15881   }
15882   else {
15883    Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15884   }
15885  }
15886  else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15887   Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15888 #else
15889  PERL_UNUSED_CONTEXT;
15890  PERL_UNUSED_ARG(sv);
15891  PERL_UNUSED_ARG(o);
15892  PERL_UNUSED_ARG(prog);
15893  PERL_UNUSED_ARG(reginfo);
15894 #endif /* DEBUGGING */
15895 }
15896
15897
15898
15899 SV *
15900 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15901 {    /* Assume that RE_INTUIT is set */
15902  dVAR;
15903  struct regexp *const prog = ReANY(r);
15904  GET_RE_DEBUG_FLAGS_DECL;
15905
15906  PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15907  PERL_UNUSED_CONTEXT;
15908
15909  DEBUG_COMPILE_r(
15910   {
15911    const char * const s = SvPV_nolen_const(prog->check_substr
15912      ? prog->check_substr : prog->check_utf8);
15913
15914    if (!PL_colorset) reginitcolors();
15915    PerlIO_printf(Perl_debug_log,
15916      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15917      PL_colors[4],
15918      prog->check_substr ? "" : "utf8 ",
15919      PL_colors[5],PL_colors[0],
15920      s,
15921      PL_colors[1],
15922      (strlen(s) > 60 ? "..." : ""));
15923   } );
15924
15925  return prog->check_substr ? prog->check_substr : prog->check_utf8;
15926 }
15927
15928 /*
15929    pregfree()
15930
15931    handles refcounting and freeing the perl core regexp structure. When
15932    it is necessary to actually free the structure the first thing it
15933    does is call the 'free' method of the regexp_engine associated to
15934    the regexp, allowing the handling of the void *pprivate; member
15935    first. (This routine is not overridable by extensions, which is why
15936    the extensions free is called first.)
15937
15938    See regdupe and regdupe_internal if you change anything here.
15939 */
15940 #ifndef PERL_IN_XSUB_RE
15941 void
15942 Perl_pregfree(pTHX_ REGEXP *r)
15943 {
15944  SvREFCNT_dec(r);
15945 }
15946
15947 void
15948 Perl_pregfree2(pTHX_ REGEXP *rx)
15949 {
15950  dVAR;
15951  struct regexp *const r = ReANY(rx);
15952  GET_RE_DEBUG_FLAGS_DECL;
15953
15954  PERL_ARGS_ASSERT_PREGFREE2;
15955
15956  if (r->mother_re) {
15957   ReREFCNT_dec(r->mother_re);
15958  } else {
15959   CALLREGFREE_PVT(rx); /* free the private data */
15960   SvREFCNT_dec(RXp_PAREN_NAMES(r));
15961   Safefree(r->xpv_len_u.xpvlenu_pv);
15962  }
15963  if (r->substrs) {
15964   SvREFCNT_dec(r->anchored_substr);
15965   SvREFCNT_dec(r->anchored_utf8);
15966   SvREFCNT_dec(r->float_substr);
15967   SvREFCNT_dec(r->float_utf8);
15968   Safefree(r->substrs);
15969  }
15970  RX_MATCH_COPY_FREE(rx);
15971 #ifdef PERL_ANY_COW
15972  SvREFCNT_dec(r->saved_copy);
15973 #endif
15974  Safefree(r->offs);
15975  SvREFCNT_dec(r->qr_anoncv);
15976  rx->sv_u.svu_rx = 0;
15977 }
15978
15979 /*  reg_temp_copy()
15980
15981  This is a hacky workaround to the structural issue of match results
15982  being stored in the regexp structure which is in turn stored in
15983  PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15984  could be PL_curpm in multiple contexts, and could require multiple
15985  result sets being associated with the pattern simultaneously, such
15986  as when doing a recursive match with (??{$qr})
15987
15988  The solution is to make a lightweight copy of the regexp structure
15989  when a qr// is returned from the code executed by (??{$qr}) this
15990  lightweight copy doesn't actually own any of its data except for
15991  the starp/end and the actual regexp structure itself.
15992
15993 */
15994
15995
15996 REGEXP *
15997 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15998 {
15999  struct regexp *ret;
16000  struct regexp *const r = ReANY(rx);
16001  const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16002
16003  PERL_ARGS_ASSERT_REG_TEMP_COPY;
16004
16005  if (!ret_x)
16006   ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16007  else {
16008   SvOK_off((SV *)ret_x);
16009   if (islv) {
16010    /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16011    to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
16012    made both spots point to the same regexp body.) */
16013    REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16014    assert(!SvPVX(ret_x));
16015    ret_x->sv_u.svu_rx = temp->sv_any;
16016    temp->sv_any = NULL;
16017    SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16018    SvREFCNT_dec_NN(temp);
16019    /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16020    ing below will not set it. */
16021    SvCUR_set(ret_x, SvCUR(rx));
16022   }
16023  }
16024  /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16025  sv_force_normal(sv) is called.  */
16026  SvFAKE_on(ret_x);
16027  ret = ReANY(ret_x);
16028
16029  SvFLAGS(ret_x) |= SvUTF8(rx);
16030  /* We share the same string buffer as the original regexp, on which we
16031  hold a reference count, incremented when mother_re is set below.
16032  The string pointer is copied here, being part of the regexp struct.
16033  */
16034  memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16035   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16036  if (r->offs) {
16037   const I32 npar = r->nparens+1;
16038   Newx(ret->offs, npar, regexp_paren_pair);
16039   Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16040  }
16041  if (r->substrs) {
16042   Newx(ret->substrs, 1, struct reg_substr_data);
16043   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16044
16045   SvREFCNT_inc_void(ret->anchored_substr);
16046   SvREFCNT_inc_void(ret->anchored_utf8);
16047   SvREFCNT_inc_void(ret->float_substr);
16048   SvREFCNT_inc_void(ret->float_utf8);
16049
16050   /* check_substr and check_utf8, if non-NULL, point to either their
16051   anchored or float namesakes, and don't hold a second reference.  */
16052  }
16053  RX_MATCH_COPIED_off(ret_x);
16054 #ifdef PERL_ANY_COW
16055  ret->saved_copy = NULL;
16056 #endif
16057  ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16058  SvREFCNT_inc_void(ret->qr_anoncv);
16059
16060  return ret_x;
16061 }
16062 #endif
16063
16064 /* regfree_internal()
16065
16066    Free the private data in a regexp. This is overloadable by
16067    extensions. Perl takes care of the regexp structure in pregfree(),
16068    this covers the *pprivate pointer which technically perl doesn't
16069    know about, however of course we have to handle the
16070    regexp_internal structure when no extension is in use.
16071
16072    Note this is called before freeing anything in the regexp
16073    structure.
16074  */
16075
16076 void
16077 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16078 {
16079  dVAR;
16080  struct regexp *const r = ReANY(rx);
16081  RXi_GET_DECL(r,ri);
16082  GET_RE_DEBUG_FLAGS_DECL;
16083
16084  PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16085
16086  DEBUG_COMPILE_r({
16087   if (!PL_colorset)
16088    reginitcolors();
16089   {
16090    SV *dsv= sv_newmortal();
16091    RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16092     dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16093    PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16094     PL_colors[4],PL_colors[5],s);
16095   }
16096  });
16097 #ifdef RE_TRACK_PATTERN_OFFSETS
16098  if (ri->u.offsets)
16099   Safefree(ri->u.offsets);             /* 20010421 MJD */
16100 #endif
16101  if (ri->code_blocks) {
16102   int n;
16103   for (n = 0; n < ri->num_code_blocks; n++)
16104    SvREFCNT_dec(ri->code_blocks[n].src_regex);
16105   Safefree(ri->code_blocks);
16106  }
16107
16108  if (ri->data) {
16109   int n = ri->data->count;
16110
16111   while (--n >= 0) {
16112   /* If you add a ->what type here, update the comment in regcomp.h */
16113    switch (ri->data->what[n]) {
16114    case 'a':
16115    case 'r':
16116    case 's':
16117    case 'S':
16118    case 'u':
16119     SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16120     break;
16121    case 'f':
16122     Safefree(ri->data->data[n]);
16123     break;
16124    case 'l':
16125    case 'L':
16126     break;
16127    case 'T':
16128     { /* Aho Corasick add-on structure for a trie node.
16129      Used in stclass optimization only */
16130      U32 refcount;
16131      reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16132      OP_REFCNT_LOCK;
16133      refcount = --aho->refcount;
16134      OP_REFCNT_UNLOCK;
16135      if ( !refcount ) {
16136       PerlMemShared_free(aho->states);
16137       PerlMemShared_free(aho->fail);
16138       /* do this last!!!! */
16139       PerlMemShared_free(ri->data->data[n]);
16140       PerlMemShared_free(ri->regstclass);
16141      }
16142     }
16143     break;
16144    case 't':
16145     {
16146      /* trie structure. */
16147      U32 refcount;
16148      reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16149      OP_REFCNT_LOCK;
16150      refcount = --trie->refcount;
16151      OP_REFCNT_UNLOCK;
16152      if ( !refcount ) {
16153       PerlMemShared_free(trie->charmap);
16154       PerlMemShared_free(trie->states);
16155       PerlMemShared_free(trie->trans);
16156       if (trie->bitmap)
16157        PerlMemShared_free(trie->bitmap);
16158       if (trie->jump)
16159        PerlMemShared_free(trie->jump);
16160       PerlMemShared_free(trie->wordinfo);
16161       /* do this last!!!! */
16162       PerlMemShared_free(ri->data->data[n]);
16163      }
16164     }
16165     break;
16166    default:
16167     Perl_croak(aTHX_ "panic: regfree data code '%c'",
16168              ri->data->what[n]);
16169    }
16170   }
16171   Safefree(ri->data->what);
16172   Safefree(ri->data);
16173  }
16174
16175  Safefree(ri);
16176 }
16177
16178 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16179 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16180 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
16181
16182 /*
16183    re_dup - duplicate a regexp.
16184
16185    This routine is expected to clone a given regexp structure. It is only
16186    compiled under USE_ITHREADS.
16187
16188    After all of the core data stored in struct regexp is duplicated
16189    the regexp_engine.dupe method is used to copy any private data
16190    stored in the *pprivate pointer. This allows extensions to handle
16191    any duplication it needs to do.
16192
16193    See pregfree() and regfree_internal() if you change anything here.
16194 */
16195 #if defined(USE_ITHREADS)
16196 #ifndef PERL_IN_XSUB_RE
16197 void
16198 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16199 {
16200  dVAR;
16201  I32 npar;
16202  const struct regexp *r = ReANY(sstr);
16203  struct regexp *ret = ReANY(dstr);
16204
16205  PERL_ARGS_ASSERT_RE_DUP_GUTS;
16206
16207  npar = r->nparens+1;
16208  Newx(ret->offs, npar, regexp_paren_pair);
16209  Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16210
16211  if (ret->substrs) {
16212   /* Do it this way to avoid reading from *r after the StructCopy().
16213   That way, if any of the sv_dup_inc()s dislodge *r from the L1
16214   cache, it doesn't matter.  */
16215   const bool anchored = r->check_substr
16216    ? r->check_substr == r->anchored_substr
16217    : r->check_utf8 == r->anchored_utf8;
16218   Newx(ret->substrs, 1, struct reg_substr_data);
16219   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16220
16221   ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16222   ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16223   ret->float_substr = sv_dup_inc(ret->float_substr, param);
16224   ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16225
16226   /* check_substr and check_utf8, if non-NULL, point to either their
16227   anchored or float namesakes, and don't hold a second reference.  */
16228
16229   if (ret->check_substr) {
16230    if (anchored) {
16231     assert(r->check_utf8 == r->anchored_utf8);
16232     ret->check_substr = ret->anchored_substr;
16233     ret->check_utf8 = ret->anchored_utf8;
16234    } else {
16235     assert(r->check_substr == r->float_substr);
16236     assert(r->check_utf8 == r->float_utf8);
16237     ret->check_substr = ret->float_substr;
16238     ret->check_utf8 = ret->float_utf8;
16239    }
16240   } else if (ret->check_utf8) {
16241    if (anchored) {
16242     ret->check_utf8 = ret->anchored_utf8;
16243    } else {
16244     ret->check_utf8 = ret->float_utf8;
16245    }
16246   }
16247  }
16248
16249  RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16250  ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16251
16252  if (ret->pprivate)
16253   RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16254
16255  if (RX_MATCH_COPIED(dstr))
16256   ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
16257  else
16258   ret->subbeg = NULL;
16259 #ifdef PERL_ANY_COW
16260  ret->saved_copy = NULL;
16261 #endif
16262
16263  /* Whether mother_re be set or no, we need to copy the string.  We
16264  cannot refrain from copying it when the storage points directly to
16265  our mother regexp, because that's
16266    1: a buffer in a different thread
16267    2: something we no longer hold a reference on
16268    so we need to copy it locally.  */
16269  RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16270  ret->mother_re   = NULL;
16271 }
16272 #endif /* PERL_IN_XSUB_RE */
16273
16274 /*
16275    regdupe_internal()
16276
16277    This is the internal complement to regdupe() which is used to copy
16278    the structure pointed to by the *pprivate pointer in the regexp.
16279    This is the core version of the extension overridable cloning hook.
16280    The regexp structure being duplicated will be copied by perl prior
16281    to this and will be provided as the regexp *r argument, however
16282    with the /old/ structures pprivate pointer value. Thus this routine
16283    may override any copying normally done by perl.
16284
16285    It returns a pointer to the new regexp_internal structure.
16286 */
16287
16288 void *
16289 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16290 {
16291  dVAR;
16292  struct regexp *const r = ReANY(rx);
16293  regexp_internal *reti;
16294  int len;
16295  RXi_GET_DECL(r,ri);
16296
16297  PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16298
16299  len = ProgLen(ri);
16300
16301  Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16302   char, regexp_internal);
16303  Copy(ri->program, reti->program, len+1, regnode);
16304
16305  reti->num_code_blocks = ri->num_code_blocks;
16306  if (ri->code_blocks) {
16307   int n;
16308   Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16309     struct reg_code_block);
16310   Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16311     struct reg_code_block);
16312   for (n = 0; n < ri->num_code_blocks; n++)
16313    reti->code_blocks[n].src_regex = (REGEXP*)
16314      sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16315  }
16316  else
16317   reti->code_blocks = NULL;
16318
16319  reti->regstclass = NULL;
16320
16321  if (ri->data) {
16322   struct reg_data *d;
16323   const int count = ri->data->count;
16324   int i;
16325
16326   Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16327     char, struct reg_data);
16328   Newx(d->what, count, U8);
16329
16330   d->count = count;
16331   for (i = 0; i < count; i++) {
16332    d->what[i] = ri->data->what[i];
16333    switch (d->what[i]) {
16334     /* see also regcomp.h and regfree_internal() */
16335    case 'a': /* actually an AV, but the dup function is identical.  */
16336    case 'r':
16337    case 's':
16338    case 'S':
16339    case 'u': /* actually an HV, but the dup function is identical.  */
16340     d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16341     break;
16342    case 'f':
16343     /* This is cheating. */
16344     Newx(d->data[i], 1, regnode_ssc);
16345     StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16346     reti->regstclass = (regnode*)d->data[i];
16347     break;
16348    case 'T':
16349     /* Trie stclasses are readonly and can thus be shared
16350     * without duplication. We free the stclass in pregfree
16351     * when the corresponding reg_ac_data struct is freed.
16352     */
16353     reti->regstclass= ri->regstclass;
16354     /* Fall through */
16355    case 't':
16356     OP_REFCNT_LOCK;
16357     ((reg_trie_data*)ri->data->data[i])->refcount++;
16358     OP_REFCNT_UNLOCK;
16359     /* Fall through */
16360    case 'l':
16361    case 'L':
16362     d->data[i] = ri->data->data[i];
16363     break;
16364    default:
16365     Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16366               ri->data->what[i]);
16367    }
16368   }
16369
16370   reti->data = d;
16371  }
16372  else
16373   reti->data = NULL;
16374
16375  reti->name_list_idx = ri->name_list_idx;
16376
16377 #ifdef RE_TRACK_PATTERN_OFFSETS
16378  if (ri->u.offsets) {
16379   Newx(reti->u.offsets, 2*len+1, U32);
16380   Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16381  }
16382 #else
16383  SetProgLen(reti,len);
16384 #endif
16385
16386  return (void*)reti;
16387 }
16388
16389 #endif    /* USE_ITHREADS */
16390
16391 #ifndef PERL_IN_XSUB_RE
16392
16393 /*
16394  - regnext - dig the "next" pointer out of a node
16395  */
16396 regnode *
16397 Perl_regnext(pTHX_ regnode *p)
16398 {
16399  dVAR;
16400  I32 offset;
16401
16402  if (!p)
16403   return(NULL);
16404
16405  if (OP(p) > REGNODE_MAX) {  /* regnode.type is unsigned */
16406   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16407             (int)OP(p), (int)REGNODE_MAX);
16408  }
16409
16410  offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16411  if (offset == 0)
16412   return(NULL);
16413
16414  return(p+offset);
16415 }
16416 #endif
16417
16418 STATIC void
16419 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16420 {
16421  va_list args;
16422  STRLEN l1 = strlen(pat1);
16423  STRLEN l2 = strlen(pat2);
16424  char buf[512];
16425  SV *msv;
16426  const char *message;
16427
16428  PERL_ARGS_ASSERT_RE_CROAK2;
16429
16430  if (l1 > 510)
16431   l1 = 510;
16432  if (l1 + l2 > 510)
16433   l2 = 510 - l1;
16434  Copy(pat1, buf, l1 , char);
16435  Copy(pat2, buf + l1, l2 , char);
16436  buf[l1 + l2] = '\n';
16437  buf[l1 + l2 + 1] = '\0';
16438  va_start(args, pat2);
16439  msv = vmess(buf, &args);
16440  va_end(args);
16441  message = SvPV_const(msv,l1);
16442  if (l1 > 512)
16443   l1 = 512;
16444  Copy(message, buf, l1 , char);
16445  /* l1-1 to avoid \n */
16446  Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16447 }
16448
16449 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
16450
16451 #ifndef PERL_IN_XSUB_RE
16452 void
16453 Perl_save_re_context(pTHX)
16454 {
16455  dVAR;
16456
16457  /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16458  if (PL_curpm) {
16459   const REGEXP * const rx = PM_GETRE(PL_curpm);
16460   if (rx) {
16461    U32 i;
16462    for (i = 1; i <= RX_NPARENS(rx); i++) {
16463     char digits[TYPE_CHARS(long)];
16464     const STRLEN len = my_snprintf(digits, sizeof(digits),
16465            "%lu", (long)i);
16466     GV *const *const gvp
16467      = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16468
16469     if (gvp) {
16470      GV * const gv = *gvp;
16471      if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16472       save_scalar(gv);
16473     }
16474    }
16475   }
16476  }
16477 }
16478 #endif
16479
16480 #ifdef DEBUGGING
16481
16482 STATIC void
16483 S_put_byte(pTHX_ SV *sv, int c)
16484 {
16485  PERL_ARGS_ASSERT_PUT_BYTE;
16486
16487  if (!isPRINT(c)) {
16488   switch (c) {
16489    case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16490    case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16491    case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16492    case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16493    case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16494
16495    default:
16496     Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16497     break;
16498   }
16499  }
16500  else {
16501   const char string = c;
16502   if (c == '-' || c == ']' || c == '\\' || c == '^')
16503    sv_catpvs(sv, "\\");
16504   sv_catpvn(sv, &string, 1);
16505  }
16506 }
16507
16508 STATIC void
16509 S_put_range(pTHX_ SV *sv, UV start, UV end)
16510 {
16511
16512  /* Appends to 'sv' a displayable version of the range of code points from
16513  * 'start' to 'end' */
16514
16515  assert(start <= end);
16516
16517  PERL_ARGS_ASSERT_PUT_RANGE;
16518
16519  if (end - start < 3) {  /* Individual chars in short ranges */
16520   for (; start <= end; start++)
16521    put_byte(sv, start);
16522  }
16523  else if (   end > 255
16524    || ! isALPHANUMERIC(start)
16525    || ! isALPHANUMERIC(end)
16526    || isDIGIT(start) != isDIGIT(end)
16527    || isUPPER(start) != isUPPER(end)
16528    || isLOWER(start) != isLOWER(end)
16529
16530     /* This final test should get optimized out except on EBCDIC
16531     * platforms, where it causes ranges that cross discontinuities
16532     * like i/j to be shown as hex instead of the misleading,
16533     * e.g. H-K (since that range includes more than H, I, J, K).
16534     * */
16535    || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start))
16536  {
16537   Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
16538      start,
16539      (end < 256) ? end : 255);
16540  }
16541  else { /* Here, the ends of the range are both digits, or both uppercase,
16542    or both lowercase; and there's no discontinuity in the range
16543    (which could happen on EBCDIC platforms) */
16544   put_byte(sv, start);
16545   sv_catpvs(sv, "-");
16546   put_byte(sv, end);
16547  }
16548 }
16549
16550 STATIC bool
16551 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16552 {
16553  /* Appends to 'sv' a displayable version of the innards of the bracketed
16554  * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
16555  * output anything */
16556
16557  int i;
16558  bool has_output_anything = FALSE;
16559
16560  PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16561
16562  for (i = 0; i < 256; i++) {
16563   if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
16564
16565    /* The character at index i should be output.  Find the next
16566    * character that should NOT be output */
16567    int j;
16568    for (j = i + 1; j <= 256; j++) {
16569     if (! BITMAP_TEST((U8 *) bitmap, j)) {
16570      break;
16571     }
16572    }
16573
16574    /* Everything between them is a single range that should be output
16575    * */
16576    put_range(sv, i, j - 1);
16577    has_output_anything = TRUE;
16578    i = j;
16579   }
16580  }
16581
16582  return has_output_anything;
16583 }
16584
16585 #define CLEAR_OPTSTART \
16586  if (optstart) STMT_START {                                               \
16587   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
16588        " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16589   optstart=NULL;                                                       \
16590  } STMT_END
16591
16592 #define DUMPUNTIL(b,e)                                                       \
16593      CLEAR_OPTSTART;                                          \
16594      node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16595
16596 STATIC const regnode *
16597 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16598    const regnode *last, const regnode *plast,
16599    SV* sv, I32 indent, U32 depth)
16600 {
16601  dVAR;
16602  U8 op = PSEUDO; /* Arbitrary non-END op. */
16603  const regnode *next;
16604  const regnode *optstart= NULL;
16605
16606  RXi_GET_DECL(r,ri);
16607  GET_RE_DEBUG_FLAGS_DECL;
16608
16609  PERL_ARGS_ASSERT_DUMPUNTIL;
16610
16611 #ifdef DEBUG_DUMPUNTIL
16612  PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16613   last ? last-start : 0,plast ? plast-start : 0);
16614 #endif
16615
16616  if (plast && plast < last)
16617   last= plast;
16618
16619  while (PL_regkind[op] != END && (!last || node < last)) {
16620   /* While that wasn't END last time... */
16621   NODE_ALIGN(node);
16622   op = OP(node);
16623   if (op == CLOSE || op == WHILEM)
16624    indent--;
16625   next = regnext((regnode *)node);
16626
16627   /* Where, what. */
16628   if (OP(node) == OPTIMIZED) {
16629    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16630     optstart = node;
16631    else
16632     goto after_print;
16633   } else
16634    CLEAR_OPTSTART;
16635
16636   regprop(r, sv, node, NULL);
16637   PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16638      (int)(2*indent + 1), "", SvPVX_const(sv));
16639
16640   if (OP(node) != OPTIMIZED) {
16641    if (next == NULL)  /* Next ptr. */
16642     PerlIO_printf(Perl_debug_log, " (0)");
16643    else if (PL_regkind[(U8)op] == BRANCH
16644      && PL_regkind[OP(next)] != BRANCH )
16645     PerlIO_printf(Perl_debug_log, " (FAIL)");
16646    else
16647     PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16648    (void)PerlIO_putc(Perl_debug_log, '\n');
16649   }
16650
16651  after_print:
16652   if (PL_regkind[(U8)op] == BRANCHJ) {
16653    assert(next);
16654    {
16655     const regnode *nnode = (OP(next) == LONGJMP
16656          ? regnext((regnode *)next)
16657          : next);
16658     if (last && nnode > last)
16659      nnode = last;
16660     DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16661    }
16662   }
16663   else if (PL_regkind[(U8)op] == BRANCH) {
16664    assert(next);
16665    DUMPUNTIL(NEXTOPER(node), next);
16666   }
16667   else if ( PL_regkind[(U8)op]  == TRIE ) {
16668    const regnode *this_trie = node;
16669    const char op = OP(node);
16670    const U32 n = ARG(node);
16671    const reg_ac_data * const ac = op>=AHOCORASICK ?
16672    (reg_ac_data *)ri->data->data[n] :
16673    NULL;
16674    const reg_trie_data * const trie =
16675     (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16676 #ifdef DEBUGGING
16677    AV *const trie_words
16678       = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16679 #endif
16680    const regnode *nextbranch= NULL;
16681    I32 word_idx;
16682    sv_setpvs(sv, "");
16683    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16684     SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16685
16686     PerlIO_printf(Perl_debug_log, "%*s%s ",
16687     (int)(2*(indent+3)), "",
16688      elem_ptr
16689      ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
16690         SvCUR(*elem_ptr), 60,
16691         PL_colors[0], PL_colors[1],
16692         (SvUTF8(*elem_ptr)
16693         ? PERL_PV_ESCAPE_UNI
16694         : 0)
16695         | PERL_PV_PRETTY_ELLIPSES
16696         | PERL_PV_PRETTY_LTGT
16697        )
16698      : "???"
16699     );
16700     if (trie->jump) {
16701      U16 dist= trie->jump[word_idx+1];
16702      PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16703        (UV)((dist ? this_trie + dist : next) - start));
16704      if (dist) {
16705       if (!nextbranch)
16706        nextbranch= this_trie + trie->jump[0];
16707       DUMPUNTIL(this_trie + dist, nextbranch);
16708      }
16709      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16710       nextbranch= regnext((regnode *)nextbranch);
16711     } else {
16712      PerlIO_printf(Perl_debug_log, "\n");
16713     }
16714    }
16715    if (last && next > last)
16716     node= last;
16717    else
16718     node= next;
16719   }
16720   else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
16721    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16722      NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16723   }
16724   else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16725    assert(next);
16726    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16727   }
16728   else if ( op == PLUS || op == STAR) {
16729    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16730   }
16731   else if (PL_regkind[(U8)op] == ANYOF) {
16732    /* arglen 1 + class block */
16733    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16734       ? ANYOF_POSIXL_SKIP
16735       : ANYOF_SKIP);
16736    node = NEXTOPER(node);
16737   }
16738   else if (PL_regkind[(U8)op] == EXACT) {
16739    /* Literal string, where present. */
16740    node += NODE_SZ_STR(node) - 1;
16741    node = NEXTOPER(node);
16742   }
16743   else {
16744    node = NEXTOPER(node);
16745    node += regarglen[(U8)op];
16746   }
16747   if (op == CURLYX || op == OPEN)
16748    indent++;
16749  }
16750  CLEAR_OPTSTART;
16751 #ifdef DEBUG_DUMPUNTIL
16752  PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16753 #endif
16754  return node;
16755 }
16756
16757 #endif /* DEBUGGING */
16758
16759 /*
16760  * Local variables:
16761  * c-indentation-style: bsd
16762  * c-basic-offset: 4
16763  * indent-tabs-mode: nil
16764  * End:
16765  *
16766  * ex: set ts=8 sts=4 sw=4 et:
16767  */