5020001/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  * Copyright (c) 1986 by University of Toronto.
  42  * Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  * Permission is granted to anyone to use this software for any
  45  * purpose on any computer system, and to redistribute it freely,
  46  * subject to the following restrictions:
  47  *
  48  * 1. The author is not responsible for the consequences of use of
  49  *  this software, no matter how awful, even if they arise
  50  *  from defects in it.
  51  *
  52  * 2. The origin of this software must not be misrepresented, either
  53  *  by explicit claim or by omission.
  54  *
  55  * 3. Altered versions must be plainly marked as such, and must not
  56  *  be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 EXTERN_C const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
  95  _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  96 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  98
  99 #ifndef STATIC
 100 #define STATIC static
 101 #endif
 102
 103
 104 struct RExC_state_t {
 105  U32  flags;   /* RXf_* are we folding, multilining? */
 106  U32  pm_flags;  /* PMf_* stuff from the calling PMOP */
 107  char *precomp;  /* uncompiled string. */
 108  REGEXP *rx_sv;   /* The SV that is the regexp. */
 109  regexp *rx;                    /* perl core regexp structure */
 110  regexp_internal *rxi;           /* internal data for regexp object
 111           pprivate field */
 112  char *start;   /* Start of input for compile */
 113  char *end;   /* End of input for compile */
 114  char *parse;   /* Input-scan pointer. */
 115  SSize_t whilem_seen;  /* number of WHILEM in this expr */
 116  regnode *emit_start;  /* Start of emitted-code area */
 117  regnode *emit_bound;  /* First regnode outside of the
 118           allocated space */
 119  regnode *emit;   /* Code-emit pointer; if = &emit_dummy,
 120           implies compiling, so don't emit */
 121  regnode_ssc emit_dummy;  /* placeholder for emit to point to;
 122           large enough for the largest
 123           non-EXACTish node, so can use it as
 124           scratch in pass1 */
 125  I32  naughty;  /* How bad is this pattern? */
 126  I32  sawback;  /* Did we see \1, ...? */
 127  U32  seen;
 128  SSize_t size;   /* Code size. */
 129  I32                npar;            /* Capture buffer count, (OPEN) plus
 130           one. ("par" 0 is the whole
 131           pattern)*/
 132  I32  nestroot;  /* root parens we are in - used by
 133           accept */
 134  I32  extralen;
 135  I32  seen_zerolen;
 136  regnode **open_parens;  /* pointers to open parens */
 137  regnode **close_parens;  /* pointers to close parens */
 138  regnode *opend;   /* END node in program */
 139  I32  utf8;  /* whether the pattern is utf8 or not */
 140  I32  orig_utf8; /* whether the pattern was originally in utf8 */
 141         /* XXX use this for future optimisation of case
 142         * where pattern must be upgraded to utf8. */
 143  I32  uni_semantics; /* If a d charset modifier should use unicode
 144         rules, even if the pattern is not in
 145         utf8 */
 146  HV  *paren_names;  /* Paren names */
 147
 148  regnode **recurse;  /* Recurse regops */
 149  I32  recurse_count;  /* Number of recurse regops */
 150  U8          *study_chunk_recursed;  /* bitmap of which parens we have moved
 151           through */
 152  U32         study_chunk_recursed_bytes;  /* bytes in bitmap */
 153  I32  in_lookbehind;
 154  I32  contains_locale;
 155  I32  contains_i;
 156  I32  override_recoding;
 157  I32  in_multi_char_class;
 158  struct reg_code_block *code_blocks; /* positions of literal (?{})
 159            within pattern */
 160  int  num_code_blocks; /* size of code_blocks[] */
 161  int  code_index;  /* next code_blocks[] slot */
 162  SSize_t     maxlen;                        /* mininum possible number of chars in string to match */
 163 #ifdef ADD_TO_REGEXEC
 164  char  *starttry;  /* -Dr: where regtry was called. */
 165 #define RExC_starttry (pRExC_state->starttry)
 166 #endif
 167  SV  *runtime_code_qr; /* qr with the runtime code blocks */
 168 #ifdef DEBUGGING
 169  const char  *lastparse;
 170  I32         lastnum;
 171  AV          *paren_name_list;       /* idx -> name */
 172 #define RExC_lastparse (pRExC_state->lastparse)
 173 #define RExC_lastnum (pRExC_state->lastnum)
 174 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 175 #endif
 176 };
 177
 178 #define RExC_flags (pRExC_state->flags)
 179 #define RExC_pm_flags (pRExC_state->pm_flags)
 180 #define RExC_precomp (pRExC_state->precomp)
 181 #define RExC_rx_sv (pRExC_state->rx_sv)
 182 #define RExC_rx  (pRExC_state->rx)
 183 #define RExC_rxi (pRExC_state->rxi)
 184 #define RExC_start (pRExC_state->start)
 185 #define RExC_end (pRExC_state->end)
 186 #define RExC_parse (pRExC_state->parse)
 187 #define RExC_whilem_seen (pRExC_state->whilem_seen)
 188 #ifdef RE_TRACK_PATTERN_OFFSETS
 189 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
 190               others */
 191 #endif
 192 #define RExC_emit (pRExC_state->emit)
 193 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 194 #define RExC_emit_start (pRExC_state->emit_start)
 195 #define RExC_emit_bound (pRExC_state->emit_bound)
 196 #define RExC_naughty (pRExC_state->naughty)
 197 #define RExC_sawback (pRExC_state->sawback)
 198 #define RExC_seen (pRExC_state->seen)
 199 #define RExC_size (pRExC_state->size)
 200 #define RExC_maxlen        (pRExC_state->maxlen)
 201 #define RExC_npar (pRExC_state->npar)
 202 #define RExC_nestroot   (pRExC_state->nestroot)
 203 #define RExC_extralen (pRExC_state->extralen)
 204 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
 205 #define RExC_utf8 (pRExC_state->utf8)
 206 #define RExC_uni_semantics (pRExC_state->uni_semantics)
 207 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
 208 #define RExC_open_parens (pRExC_state->open_parens)
 209 #define RExC_close_parens (pRExC_state->close_parens)
 210 #define RExC_opend (pRExC_state->opend)
 211 #define RExC_paren_names (pRExC_state->paren_names)
 212 #define RExC_recurse (pRExC_state->recurse)
 213 #define RExC_recurse_count (pRExC_state->recurse_count)
 214 #define RExC_study_chunk_recursed        (pRExC_state->study_chunk_recursed)
 215 #define RExC_study_chunk_recursed_bytes  \
 216         (pRExC_state->study_chunk_recursed_bytes)
 217 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
 218 #define RExC_contains_locale (pRExC_state->contains_locale)
 219 #define RExC_contains_i (pRExC_state->contains_i)
 220 #define RExC_override_recoding (pRExC_state->override_recoding)
 221 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 222
 223
 224 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
 225 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 226   ((*s) == '{' && regcurly(s, FALSE)))
 227
 228 /*
 229  * Flags to be passed up and down.
 230  */
 231 #define WORST  0 /* Worst case. */
 232 #define HASWIDTH 0x01 /* Known to match non-null strings. */
 233
 234 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 235  * character.  (There needs to be a case: in the switch statement in regexec.c
 236  * for any node marked SIMPLE.)  Note that this is not the same thing as
 237  * REGNODE_SIMPLE */
 238 #define SIMPLE  0x02
 239 #define SPSTART  0x04 /* Starts with * or + */
 240 #define POSTPONED 0x08    /* (?1),(?&name), (??{...}) or similar */
 241 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
 242 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 243
 244 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 245
 246 /* whether trie related optimizations are enabled */
 247 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 248 #define TRIE_STUDY_OPT
 249 #define FULL_TRIE_STUDY
 250 #define TRIE_STCLASS
 251 #endif
 252
 253
 254
 255 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 256 #define PBITVAL(paren) (1 << ((paren) & 7))
 257 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 258 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 259 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 260
 261 #define REQUIRE_UTF8 STMT_START {                                       \
 262          if (!UTF) {                           \
 263           *flagp = RESTART_UTF8;            \
 264           return NULL;                      \
 265          }                                     \
 266       } STMT_END
 267
 268 /* This converts the named class defined in regcomp.h to its equivalent class
 269  * number defined in handy.h. */
 270 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 271 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 272
 273 #define _invlist_union_complement_2nd(a, b, output) \
 274       _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
 275 #define _invlist_intersection_complement_2nd(a, b, output) \
 276     _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
 277
 278 /* About scan_data_t.
 279
 280   During optimisation we recurse through the regexp program performing
 281   various inplace (keyhole style) optimisations. In addition study_chunk
 282   and scan_commit populate this data structure with information about
 283   what strings MUST appear in the pattern. We look for the longest
 284   string that must appear at a fixed location, and we look for the
 285   longest string that may appear at a floating location. So for instance
 286   in the pattern:
 287
 288  /FOO[xX]A.*B[xX]BAR/
 289
 290   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 291   strings (because they follow a .* construct). study_chunk will identify
 292   both FOO and BAR as being the longest fixed and floating strings respectively.
 293
 294   The strings can be composites, for instance
 295
 296  /(f)(o)(o)/
 297
 298   will result in a composite fixed substring 'foo'.
 299
 300   For each string some basic information is maintained:
 301
 302   - offset or min_offset
 303  This is the position the string must appear at, or not before.
 304  It also implicitly (when combined with minlenp) tells us how many
 305  characters must match before the string we are searching for.
 306  Likewise when combined with minlenp and the length of the string it
 307  tells us how many characters must appear after the string we have
 308  found.
 309
 310   - max_offset
 311  Only used for floating strings. This is the rightmost point that
 312  the string can appear at. If set to SSize_t_MAX it indicates that the
 313  string can occur infinitely far to the right.
 314
 315   - minlenp
 316  A pointer to the minimum number of characters of the pattern that the
 317  string was found inside. This is important as in the case of positive
 318  lookahead or positive lookbehind we can have multiple patterns
 319  involved. Consider
 320
 321  /(?=FOO).*F/
 322
 323  The minimum length of the pattern overall is 3, the minimum length
 324  of the lookahead part is 3, but the minimum length of the part that
 325  will actually match is 1. So 'FOO's minimum length is 3, but the
 326  minimum length for the F is 1. This is important as the minimum length
 327  is used to determine offsets in front of and behind the string being
 328  looked for.  Since strings can be composites this is the length of the
 329  pattern at the time it was committed with a scan_commit. Note that
 330  the length is calculated by study_chunk, so that the minimum lengths
 331  are not known until the full pattern has been compiled, thus the
 332  pointer to the value.
 333
 334   - lookbehind
 335
 336  In the case of lookbehind the string being searched for can be
 337  offset past the start point of the final matching string.
 338  If this value was just blithely removed from the min_offset it would
 339  invalidate some of the calculations for how many chars must match
 340  before or after (as they are derived from min_offset and minlen and
 341  the length of the string being searched for).
 342  When the final pattern is compiled and the data is moved from the
 343  scan_data_t structure into the regexp structure the information
 344  about lookbehind is factored in, with the information that would
 345  have been lost precalculated in the end_shift field for the
 346  associated string.
 347
 348   The fields pos_min and pos_delta are used to store the minimum offset
 349   and the delta to the maximum offset at the current point in the pattern.
 350
 351 */
 352
 353 typedef struct scan_data_t {
 354  /*I32 len_min;      unused */
 355  /*I32 len_delta;    unused */
 356  SSize_t pos_min;
 357  SSize_t pos_delta;
 358  SV *last_found;
 359  SSize_t last_end;     /* min value, <0 unless valid. */
 360  SSize_t last_start_min;
 361  SSize_t last_start_max;
 362  SV **longest;     /* Either &l_fixed, or &l_float. */
 363  SV *longest_fixed;      /* longest fixed string found in pattern */
 364  SSize_t offset_fixed;   /* offset where it starts */
 365  SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
 366  I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 367  SV *longest_float;      /* longest floating string found in pattern */
 368  SSize_t offset_float_min; /* earliest point in string it can appear */
 369  SSize_t offset_float_max; /* latest point in string it can appear */
 370  SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
 371  SSize_t lookbehind_float; /* is the pos of the string modified by LB */
 372  I32 flags;
 373  I32 whilem_c;
 374  SSize_t *last_closep;
 375  regnode_ssc *start_class;
 376 } scan_data_t;
 377
 378 /* The below is perhaps overboard, but this allows us to save a test at the
 379  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 380  * and 'a' differ by a single bit; the same with the upper and lower case of
 381  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 382  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 383  * then inverts it to form a mask, with just a single 0, in the bit position
 384  * where the upper- and lowercase differ.  XXX There are about 40 other
 385  * instances in the Perl core where this micro-optimization could be used.
 386  * Should decide if maintenance cost is worse, before changing those
 387  *
 388  * Returns a boolean as to whether or not 'v' is either a lowercase or
 389  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 390  * compile-time constant, the generated code is better than some optimizing
 391  * compilers figure out, amounting to a mask and test.  The results are
 392  * meaningless if 'c' is not one of [A-Za-z] */
 393 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 394        (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 395
 396 /*
 397  * Forward declarations for pregcomp()'s friends.
 398  */
 399
 400 static const scan_data_t zero_scan_data =
 401   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 402
 403 #define SF_BEFORE_EOL  (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 404 #define SF_BEFORE_SEOL  0x0001
 405 #define SF_BEFORE_MEOL  0x0002
 406 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 407 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 408
 409 #define SF_FIX_SHIFT_EOL (+2)
 410 #define SF_FL_SHIFT_EOL  (+4)
 411
 412 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 413 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 414
 415 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 416 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 417 #define SF_IS_INF  0x0040
 418 #define SF_HAS_PAR  0x0080
 419 #define SF_IN_PAR  0x0100
 420 #define SF_HAS_EVAL  0x0200
 421 #define SCF_DO_SUBSTR  0x0400
 422 #define SCF_DO_STCLASS_AND 0x0800
 423 #define SCF_DO_STCLASS_OR 0x1000
 424 #define SCF_DO_STCLASS  (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 425 #define SCF_WHILEM_VISITED_POS 0x2000
 426
 427 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 428 #define SCF_SEEN_ACCEPT         0x8000
 429 #define SCF_TRIE_DOING_RESTUDY 0x10000
 430
 431 #define UTF cBOOL(RExC_utf8)
 432
 433 /* The enums for all these are ordered so things work out correctly */
 434 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 435 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags)                    \
 436              == REGEX_DEPENDS_CHARSET)
 437 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 438 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags)                \
 439              >= REGEX_UNICODE_CHARSET)
 440 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags)                      \
 441            == REGEX_ASCII_RESTRICTED_CHARSET)
 442 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags)             \
 443            >= REGEX_ASCII_RESTRICTED_CHARSET)
 444 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags)                 \
 445           == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 446
 447 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 448
 449 /* For programs that want to be strictly Unicode compatible by dying if any
 450  * attempt is made to match a non-Unicode code point against a Unicode
 451  * property.  */
 452 #define ALWAYS_WARN_SUPER  ckDEAD(packWARN(WARN_NON_UNICODE))
 453
 454 #define OOB_NAMEDCLASS  -1
 455
 456 /* There is no code point that is out-of-bounds, so this is problematic.  But
 457  * its only current use is to initialize a variable that is always set before
 458  * looked at. */
 459 #define OOB_UNICODE  0xDEADBEEF
 460
 461 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 462 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 463
 464
 465 /* length of regex to show in messages that don't mark a position within */
 466 #define RegexLengthToShowInErrorMessages 127
 467
 468 /*
 469  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 470  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 471  * op/pragma/warn/regcomp.
 472  */
 473 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 474 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 475
 476 #define REPORT_LOCATION " in regex; marked by " MARKER1    \
 477       " in m/%"UTF8f MARKER2 "%"UTF8f"/"
 478
 479 #define REPORT_LOCATION_ARGS(offset)            \
 480     UTF8fARG(UTF, offset, RExC_precomp), \
 481     UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
 482
 483 /*
 484  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 485  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 486  * "...".
 487  */
 488 #define _FAIL(code) STMT_START {     \
 489  const char *ellipses = "";      \
 490  IV len = RExC_end - RExC_precomp;     \
 491                   \
 492  if (!SIZE_ONLY)       \
 493   SAVEFREESV(RExC_rx_sv);      \
 494  if (len > RegexLengthToShowInErrorMessages) {   \
 495   /* chop 10 shorter than the max, to ensure meaning of "..." */ \
 496   len = RegexLengthToShowInErrorMessages - 10;   \
 497   ellipses = "...";      \
 498  }         \
 499  code;                                                               \
 500 } STMT_END
 501
 502 #define FAIL(msg) _FAIL(       \
 503  Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/",     \
 504    msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 505
 506 #define FAIL2(msg,arg) _FAIL(       \
 507  Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/",     \
 508    arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
 509
 510 /*
 511  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 512  */
 513 #define Simple_vFAIL(m) STMT_START {     \
 514  const IV offset = RExC_parse - RExC_precomp;   \
 515  Perl_croak(aTHX_ "%s" REPORT_LOCATION,    \
 516    m, REPORT_LOCATION_ARGS(offset)); \
 517 } STMT_END
 518
 519 /*
 520  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 521  */
 522 #define vFAIL(m) STMT_START {    \
 523  if (!SIZE_ONLY)     \
 524   SAVEFREESV(RExC_rx_sv);    \
 525  Simple_vFAIL(m);     \
 526 } STMT_END
 527
 528 /*
 529  * Like Simple_vFAIL(), but accepts two arguments.
 530  */
 531 #define Simple_vFAIL2(m,a1) STMT_START {   \
 532  const IV offset = RExC_parse - RExC_precomp;   \
 533  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1,   \
 534      REPORT_LOCATION_ARGS(offset)); \
 535 } STMT_END
 536
 537 /*
 538  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 539  */
 540 #define vFAIL2(m,a1) STMT_START {   \
 541  if (!SIZE_ONLY)     \
 542   SAVEFREESV(RExC_rx_sv);    \
 543  Simple_vFAIL2(m, a1);    \
 544 } STMT_END
 545
 546
 547 /*
 548  * Like Simple_vFAIL(), but accepts three arguments.
 549  */
 550 #define Simple_vFAIL3(m, a1, a2) STMT_START {   \
 551  const IV offset = RExC_parse - RExC_precomp;  \
 552  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2,  \
 553    REPORT_LOCATION_ARGS(offset)); \
 554 } STMT_END
 555
 556 /*
 557  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 558  */
 559 #define vFAIL3(m,a1,a2) STMT_START {   \
 560  if (!SIZE_ONLY)     \
 561   SAVEFREESV(RExC_rx_sv);    \
 562  Simple_vFAIL3(m, a1, a2);    \
 563 } STMT_END
 564
 565 /*
 566  * Like Simple_vFAIL(), but accepts four arguments.
 567  */
 568 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {  \
 569  const IV offset = RExC_parse - RExC_precomp;  \
 570  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3,  \
 571    REPORT_LOCATION_ARGS(offset)); \
 572 } STMT_END
 573
 574 #define vFAIL4(m,a1,a2,a3) STMT_START {   \
 575  if (!SIZE_ONLY)     \
 576   SAVEFREESV(RExC_rx_sv);    \
 577  Simple_vFAIL4(m, a1, a2, a3);   \
 578 } STMT_END
 579
 580 /* A specialized version of vFAIL2 that works with UTF8f */
 581 #define vFAIL2utf8f(m, a1) STMT_START { \
 582  const IV offset = RExC_parse - RExC_precomp;   \
 583  if (!SIZE_ONLY)                                \
 584   SAVEFREESV(RExC_rx_sv);                    \
 585  S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
 586    REPORT_LOCATION_ARGS(offset));         \
 587 } STMT_END
 588
 589
 590 /* m is not necessarily a "literal string", in this macro */
 591 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 592  const IV offset = loc - RExC_precomp;                               \
 593  Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 594    m, REPORT_LOCATION_ARGS(offset));       \
 595 } STMT_END
 596
 597 #define ckWARNreg(loc,m) STMT_START {     \
 598  const IV offset = loc - RExC_precomp;    \
 599  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 600    REPORT_LOCATION_ARGS(offset));  \
 601 } STMT_END
 602
 603 #define vWARN_dep(loc, m) STMT_START {            \
 604  const IV offset = loc - RExC_precomp;    \
 605  Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
 606    REPORT_LOCATION_ARGS(offset));         \
 607 } STMT_END
 608
 609 #define ckWARNdep(loc,m) STMT_START {            \
 610  const IV offset = loc - RExC_precomp;    \
 611  Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                 \
 612    m REPORT_LOCATION,      \
 613    REPORT_LOCATION_ARGS(offset));  \
 614 } STMT_END
 615
 616 #define ckWARNregdep(loc,m) STMT_START {    \
 617  const IV offset = loc - RExC_precomp;    \
 618  Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 619    m REPORT_LOCATION,      \
 620    REPORT_LOCATION_ARGS(offset));  \
 621 } STMT_END
 622
 623 #define ckWARN2reg_d(loc,m, a1) STMT_START {    \
 624  const IV offset = loc - RExC_precomp;    \
 625  Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),   \
 626    m REPORT_LOCATION,      \
 627    a1, REPORT_LOCATION_ARGS(offset)); \
 628 } STMT_END
 629
 630 #define ckWARN2reg(loc, m, a1) STMT_START {    \
 631  const IV offset = loc - RExC_precomp;    \
 632  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 633    a1, REPORT_LOCATION_ARGS(offset)); \
 634 } STMT_END
 635
 636 #define vWARN3(loc, m, a1, a2) STMT_START {    \
 637  const IV offset = loc - RExC_precomp;    \
 638  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 639    a1, a2, REPORT_LOCATION_ARGS(offset)); \
 640 } STMT_END
 641
 642 #define ckWARN3reg(loc, m, a1, a2) STMT_START {    \
 643  const IV offset = loc - RExC_precomp;    \
 644  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 645    a1, a2, REPORT_LOCATION_ARGS(offset)); \
 646 } STMT_END
 647
 648 #define vWARN4(loc, m, a1, a2, a3) STMT_START {    \
 649  const IV offset = loc - RExC_precomp;    \
 650  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 651    a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 652 } STMT_END
 653
 654 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {   \
 655  const IV offset = loc - RExC_precomp;    \
 656  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 657    a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
 658 } STMT_END
 659
 660 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {   \
 661  const IV offset = loc - RExC_precomp;    \
 662  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 663    a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
 664 } STMT_END
 665
 666
 667 /* Allow for side effects in s */
 668 #define REGC(c,s) STMT_START {   \
 669  if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 670 } STMT_END
 671
 672 /* Macros for recording node offsets.   20001227 mjd@plover.com
 673  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 674  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 675  * Element 0 holds the number n.
 676  * Position is 1 indexed.
 677  */
 678 #ifndef RE_TRACK_PATTERN_OFFSETS
 679 #define Set_Node_Offset_To_R(node,byte)
 680 #define Set_Node_Offset(node,byte)
 681 #define Set_Cur_Node_Offset
 682 #define Set_Node_Length_To_R(node,len)
 683 #define Set_Node_Length(node,len)
 684 #define Set_Node_Cur_Length(node,start)
 685 #define Node_Offset(n)
 686 #define Node_Length(n)
 687 #define Set_Node_Offset_Length(node,offset,len)
 688 #define ProgLen(ri) ri->u.proglen
 689 #define SetProgLen(ri,x) ri->u.proglen = x
 690 #else
 691 #define ProgLen(ri) ri->u.offsets[0]
 692 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 693 #define Set_Node_Offset_To_R(node,byte) STMT_START {   \
 694  if (! SIZE_ONLY) {       \
 695   MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",  \
 696      __LINE__, (int)(node), (int)(byte)));  \
 697   if((node) < 0) {      \
 698    Perl_croak(aTHX_ "value of node is %d in Offset macro",     \
 699           (int)(node));                  \
 700   } else {       \
 701    RExC_offsets[2*(node)-1] = (byte);    \
 702   }        \
 703  }         \
 704 } STMT_END
 705
 706 #define Set_Node_Offset(node,byte) \
 707  Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 708 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 709
 710 #define Set_Node_Length_To_R(node,len) STMT_START {   \
 711  if (! SIZE_ONLY) {       \
 712   MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",  \
 713     __LINE__, (int)(node), (int)(len)));   \
 714   if((node) < 0) {      \
 715    Perl_croak(aTHX_ "value of node is %d in Length macro",     \
 716           (int)(node));                  \
 717   } else {       \
 718    RExC_offsets[2*(node)] = (len);    \
 719   }        \
 720  }         \
 721 } STMT_END
 722
 723 #define Set_Node_Length(node,len) \
 724  Set_Node_Length_To_R((node)-RExC_emit_start, len)
 725 #define Set_Node_Cur_Length(node, start)                \
 726  Set_Node_Length(node, RExC_parse - start)
 727
 728 /* Get offsets and lengths */
 729 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 730 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 731
 732 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
 733  Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
 734  Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
 735 } STMT_END
 736 #endif
 737
 738 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 739 #define EXPERIMENTAL_INPLACESCAN
 740 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 741
 742 #define DEBUG_RExC_seen() \
 743   DEBUG_OPTIMISE_MORE_r({                                             \
 744    PerlIO_printf(Perl_debug_log,"RExC_seen: ");                    \
 745                    \
 746    if (RExC_seen & REG_ZERO_LEN_SEEN)                              \
 747     PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN ");         \
 748                    \
 749    if (RExC_seen & REG_LOOKBEHIND_SEEN)                            \
 750     PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN ");       \
 751                    \
 752    if (RExC_seen & REG_GPOS_SEEN)                                  \
 753     PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN ");             \
 754                    \
 755    if (RExC_seen & REG_CANY_SEEN)                                  \
 756     PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN ");             \
 757                    \
 758    if (RExC_seen & REG_RECURSE_SEEN)                               \
 759     PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN ");          \
 760                    \
 761    if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)                         \
 762     PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN ");    \
 763                    \
 764    if (RExC_seen & REG_VERBARG_SEEN)                               \
 765     PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN ");          \
 766                    \
 767    if (RExC_seen & REG_CUTGROUP_SEEN)                              \
 768     PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN ");         \
 769                    \
 770    if (RExC_seen & REG_RUN_ON_COMMENT_SEEN)                        \
 771     PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN ");   \
 772                    \
 773    if (RExC_seen & REG_UNFOLDED_MULTI_SEEN)                        \
 774     PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN ");   \
 775                    \
 776    if (RExC_seen & REG_GOSTART_SEEN)                               \
 777     PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN ");          \
 778                    \
 779    if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)                               \
 780     PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN ");          \
 781                    \
 782    PerlIO_printf(Perl_debug_log,"\n");                             \
 783   });
 784
 785 #define DEBUG_STUDYDATA(str,data,depth)                              \
 786 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 787  PerlIO_printf(Perl_debug_log,                                    \
 788   "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 789   " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 790   (int)(depth)*2, "",                                          \
 791   (IV)((data)->pos_min),                                       \
 792   (IV)((data)->pos_delta),                                     \
 793   (UV)((data)->flags),                                         \
 794   (IV)((data)->whilem_c),                                      \
 795   (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 796   is_inf ? "INF " : ""                                         \
 797  );                                                               \
 798  if ((data)->last_found)                                          \
 799   PerlIO_printf(Perl_debug_log,                                \
 800    "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 801    " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 802    SvPVX_const((data)->last_found),                         \
 803    (IV)((data)->last_end),                                  \
 804    (IV)((data)->last_start_min),                            \
 805    (IV)((data)->last_start_max),                            \
 806    ((data)->longest &&                                      \
 807    (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 808    SvPVX_const((data)->longest_fixed),                      \
 809    (IV)((data)->offset_fixed),                              \
 810    ((data)->longest &&                                      \
 811    (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 812    SvPVX_const((data)->longest_float),                      \
 813    (IV)((data)->offset_float_min),                          \
 814    (IV)((data)->offset_float_max)                           \
 815   );                                                           \
 816  PerlIO_printf(Perl_debug_log,"\n");                              \
 817 });
 818
 819 /* Mark that we cannot extend a found fixed substring at this point.
 820    Update the longest found anchored substring and the longest found
 821    floating substrings if needed. */
 822
 823 STATIC void
 824 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
 825      SSize_t *minlenp, int is_inf)
 826 {
 827  const STRLEN l = CHR_SVLEN(data->last_found);
 828  const STRLEN old_l = CHR_SVLEN(*data->longest);
 829  GET_RE_DEBUG_FLAGS_DECL;
 830
 831  PERL_ARGS_ASSERT_SCAN_COMMIT;
 832
 833  if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 834   SvSetMagicSV(*data->longest, data->last_found);
 835   if (*data->longest == data->longest_fixed) {
 836    data->offset_fixed = l ? data->last_start_min : data->pos_min;
 837    if (data->flags & SF_BEFORE_EOL)
 838     data->flags
 839      |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 840    else
 841     data->flags &= ~SF_FIX_BEFORE_EOL;
 842    data->minlen_fixed=minlenp;
 843    data->lookbehind_fixed=0;
 844   }
 845   else { /* *data->longest == data->longest_float */
 846    data->offset_float_min = l ? data->last_start_min : data->pos_min;
 847    data->offset_float_max = (l
 848          ? data->last_start_max
 849          : (data->pos_delta == SSize_t_MAX
 850           ? SSize_t_MAX
 851           : data->pos_min + data->pos_delta));
 852    if (is_inf
 853     || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
 854     data->offset_float_max = SSize_t_MAX;
 855    if (data->flags & SF_BEFORE_EOL)
 856     data->flags
 857      |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 858    else
 859     data->flags &= ~SF_FL_BEFORE_EOL;
 860    data->minlen_float=minlenp;
 861    data->lookbehind_float=0;
 862   }
 863  }
 864  SvCUR_set(data->last_found, 0);
 865  {
 866   SV * const sv = data->last_found;
 867   if (SvUTF8(sv) && SvMAGICAL(sv)) {
 868    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 869    if (mg)
 870     mg->mg_len = 0;
 871   }
 872  }
 873  data->last_end = -1;
 874  data->flags &= ~SF_BEFORE_EOL;
 875  DEBUG_STUDYDATA("commit: ",data,0);
 876 }
 877
 878 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
 879  * list that describes which code points it matches */
 880
 881 STATIC void
 882 S_ssc_anything(pTHX_ regnode_ssc *ssc)
 883 {
 884  /* Set the SSC 'ssc' to match an empty string or any code point */
 885
 886  PERL_ARGS_ASSERT_SSC_ANYTHING;
 887
 888  assert(is_ANYOF_SYNTHETIC(ssc));
 889
 890  ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
 891  _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
 892  ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING;    /* Plus match empty string */
 893 }
 894
 895 STATIC int
 896 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
 897 {
 898  /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
 899  * point; FALSE otherwise.  Thus, this is used to see if using 'ssc' buys
 900  * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
 901  * in any way, so there's no point in using it */
 902
 903  UV start, end;
 904  bool ret;
 905
 906  PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
 907
 908  assert(is_ANYOF_SYNTHETIC(ssc));
 909
 910  if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
 911   return FALSE;
 912  }
 913
 914  /* See if the list consists solely of the range 0 - Infinity */
 915  invlist_iterinit(ssc->invlist);
 916  ret = invlist_iternext(ssc->invlist, &start, &end)
 917   && start == 0
 918   && end == UV_MAX;
 919
 920  invlist_iterfinish(ssc->invlist);
 921
 922  if (ret) {
 923   return TRUE;
 924  }
 925
 926  /* If e.g., both \w and \W are set, matches everything */
 927  if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
 928   int i;
 929   for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
 930    if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
 931     return TRUE;
 932    }
 933   }
 934  }
 935
 936  return FALSE;
 937 }
 938
 939 STATIC void
 940 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
 941 {
 942  /* Initializes the SSC 'ssc'.  This includes setting it to match an empty
 943  * string, any code point, or any posix class under locale */
 944
 945  PERL_ARGS_ASSERT_SSC_INIT;
 946
 947  Zero(ssc, 1, regnode_ssc);
 948  set_ANYOF_SYNTHETIC(ssc);
 949  ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
 950  ssc_anything(ssc);
 951
 952  /* If any portion of the regex is to operate under locale rules,
 953  * initialization includes it.  The reason this isn't done for all regexes
 954  * is that the optimizer was written under the assumption that locale was
 955  * all-or-nothing.  Given the complexity and lack of documentation in the
 956  * optimizer, and that there are inadequate test cases for locale, many
 957  * parts of it may not work properly, it is safest to avoid locale unless
 958  * necessary. */
 959  if (RExC_contains_locale) {
 960   ANYOF_POSIXL_SETALL(ssc);
 961  }
 962  else {
 963   ANYOF_POSIXL_ZERO(ssc);
 964  }
 965 }
 966
 967 STATIC int
 968 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
 969        const regnode_ssc *ssc)
 970 {
 971  /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
 972  * to the list of code points matched, and locale posix classes; hence does
 973  * not check its flags) */
 974
 975  UV start, end;
 976  bool ret;
 977
 978  PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
 979
 980  assert(is_ANYOF_SYNTHETIC(ssc));
 981
 982  invlist_iterinit(ssc->invlist);
 983  ret = invlist_iternext(ssc->invlist, &start, &end)
 984   && start == 0
 985   && end == UV_MAX;
 986
 987  invlist_iterfinish(ssc->invlist);
 988
 989  if (! ret) {
 990   return FALSE;
 991  }
 992
 993  if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
 994   return FALSE;
 995  }
 996
 997  return TRUE;
 998 }
 999
1000 STATIC SV*
1001 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1002        const regnode_charclass* const node)
1003 {
1004  /* Returns a mortal inversion list defining which code points are matched
1005  * by 'node', which is of type ANYOF.  Handles complementing the result if
1006  * appropriate.  If some code points aren't knowable at this time, the
1007  * returned list must, and will, contain every code point that is a
1008  * possibility. */
1009
1010  SV* invlist = sv_2mortal(_new_invlist(0));
1011  SV* only_utf8_locale_invlist = NULL;
1012  unsigned int i;
1013  const U32 n = ARG(node);
1014  bool new_node_has_latin1 = FALSE;
1015
1016  PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1017
1018  /* Look at the data structure created by S_set_ANYOF_arg() */
1019  if (n != ANYOF_NONBITMAP_EMPTY) {
1020   SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1021   AV * const av = MUTABLE_AV(SvRV(rv));
1022   SV **const ary = AvARRAY(av);
1023   assert(RExC_rxi->data->what[n] == 's');
1024
1025   if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1026    invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1027   }
1028   else if (ary[0] && ary[0] != &PL_sv_undef) {
1029
1030    /* Here, no compile-time swash, and there are things that won't be
1031    * known until runtime -- we have to assume it could be anything */
1032    return _add_range_to_invlist(invlist, 0, UV_MAX);
1033   }
1034   else if (ary[3] && ary[3] != &PL_sv_undef) {
1035
1036    /* Here no compile-time swash, and no run-time only data.  Use the
1037    * node's inversion list */
1038    invlist = sv_2mortal(invlist_clone(ary[3]));
1039   }
1040
1041   /* Get the code points valid only under UTF-8 locales */
1042   if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1043    && ary[2] && ary[2] != &PL_sv_undef)
1044   {
1045    only_utf8_locale_invlist = ary[2];
1046   }
1047  }
1048
1049  /* An ANYOF node contains a bitmap for the first 256 code points, and an
1050  * inversion list for the others, but if there are code points that should
1051  * match only conditionally on the target string being UTF-8, those are
1052  * placed in the inversion list, and not the bitmap.  Since there are
1053  * circumstances under which they could match, they are included in the
1054  * SSC.  But if the ANYOF node is to be inverted, we have to exclude them
1055  * here, so that when we invert below, the end result actually does include
1056  * them.  (Think about "\xe0" =~ /[^\xc0]/di;).  We have to do this here
1057  * before we add the unconditionally matched code points */
1058  if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1059   _invlist_intersection_complement_2nd(invlist,
1060            PL_UpperLatin1,
1061            &invlist);
1062  }
1063
1064  /* Add in the points from the bit map */
1065  for (i = 0; i < 256; i++) {
1066   if (ANYOF_BITMAP_TEST(node, i)) {
1067    invlist = add_cp_to_invlist(invlist, i);
1068    new_node_has_latin1 = TRUE;
1069   }
1070  }
1071
1072  /* If this can match all upper Latin1 code points, have to add them
1073  * as well */
1074  if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
1075   _invlist_union(invlist, PL_UpperLatin1, &invlist);
1076  }
1077
1078  /* Similarly for these */
1079  if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1080   invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1081  }
1082
1083  if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1084   _invlist_invert(invlist);
1085  }
1086  else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1087
1088   /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1089   * locale.  We can skip this if there are no 0-255 at all. */
1090   _invlist_union(invlist, PL_Latin1, &invlist);
1091  }
1092
1093  /* Similarly add the UTF-8 locale possible matches.  These have to be
1094  * deferred until after the non-UTF-8 locale ones are taken care of just
1095  * above, or it leads to wrong results under ANYOF_INVERT */
1096  if (only_utf8_locale_invlist) {
1097   _invlist_union_maybe_complement_2nd(invlist,
1098            only_utf8_locale_invlist,
1099            ANYOF_FLAGS(node) & ANYOF_INVERT,
1100            &invlist);
1101  }
1102
1103  return invlist;
1104 }
1105
1106 /* These two functions currently do the exact same thing */
1107 #define ssc_init_zero  ssc_init
1108
1109 #define ssc_add_cp(ssc, cp)   ssc_add_range((ssc), (cp), (cp))
1110 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1111
1112 /* 'AND' a given class with another one.  Can create false positives.  'ssc'
1113  * should not be inverted.  'and_with->flags & ANYOF_POSIXL' should be 0 if
1114  * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1115
1116 STATIC void
1117 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1118     const regnode_charclass *and_with)
1119 {
1120  /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1121  * another SSC or a regular ANYOF class.  Can create false positives. */
1122
1123  SV* anded_cp_list;
1124  U8  anded_flags;
1125
1126  PERL_ARGS_ASSERT_SSC_AND;
1127
1128  assert(is_ANYOF_SYNTHETIC(ssc));
1129
1130  /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1131  * the code point inversion list and just the relevant flags */
1132  if (is_ANYOF_SYNTHETIC(and_with)) {
1133   anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1134   anded_flags = ANYOF_FLAGS(and_with);
1135
1136   /* XXX This is a kludge around what appears to be deficiencies in the
1137   * optimizer.  If we make S_ssc_anything() add in the WARN_SUPER flag,
1138   * there are paths through the optimizer where it doesn't get weeded
1139   * out when it should.  And if we don't make some extra provision for
1140   * it like the code just below, it doesn't get added when it should.
1141   * This solution is to add it only when AND'ing, which is here, and
1142   * only when what is being AND'ed is the pristine, original node
1143   * matching anything.  Thus it is like adding it to ssc_anything() but
1144   * only when the result is to be AND'ed.  Probably the same solution
1145   * could be adopted for the same problem we have with /l matching,
1146   * which is solved differently in S_ssc_init(), and that would lead to
1147   * fewer false positives than that solution has.  But if this solution
1148   * creates bugs, the consequences are only that a warning isn't raised
1149   * that should be; while the consequences for having /l bugs is
1150   * incorrect matches */
1151   if (ssc_is_anything((regnode_ssc *)and_with)) {
1152    anded_flags |= ANYOF_WARN_SUPER;
1153   }
1154  }
1155  else {
1156   anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1157   anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1158  }
1159
1160  ANYOF_FLAGS(ssc) &= anded_flags;
1161
1162  /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1163  * C2 is the list of code points in 'and-with'; P2, its posix classes.
1164  * 'and_with' may be inverted.  When not inverted, we have the situation of
1165  * computing:
1166  *  (C1 | P1) & (C2 | P2)
1167  *                     =  (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1168  *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1169  *                    <=  ((C1 & C2) |       P2)) | ( P1       | (P1 & P2))
1170  *                    <=  ((C1 & C2) | P1 | P2)
1171  * Alternatively, the last few steps could be:
1172  *                     =  ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1173  *                    <=  ((C1 & C2) |  C1      ) | (      C2  | (P1 & P2))
1174  *                    <=  (C1 | C2 | (P1 & P2))
1175  * We favor the second approach if either P1 or P2 is non-empty.  This is
1176  * because these components are a barrier to doing optimizations, as what
1177  * they match cannot be known until the moment of matching as they are
1178  * dependent on the current locale, 'AND"ing them likely will reduce or
1179  * eliminate them.
1180  * But we can do better if we know that C1,P1 are in their initial state (a
1181  * frequent occurrence), each matching everything:
1182  *  (<everything>) & (C2 | P2) =  C2 | P2
1183  * Similarly, if C2,P2 are in their initial state (again a frequent
1184  * occurrence), the result is a no-op
1185  *  (C1 | P1) & (<everything>) =  C1 | P1
1186  *
1187  * Inverted, we have
1188  *  (C1 | P1) & ~(C2 | P2)  =  (C1 | P1) & (~C2 & ~P2)
1189  *                          =  (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1190  *                         <=  (C1 & ~C2) | (P1 & ~P2)
1191  * */
1192
1193  if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1194   && ! is_ANYOF_SYNTHETIC(and_with))
1195  {
1196   unsigned int i;
1197
1198   ssc_intersection(ssc,
1199       anded_cp_list,
1200       FALSE /* Has already been inverted */
1201       );
1202
1203   /* If either P1 or P2 is empty, the intersection will be also; can skip
1204   * the loop */
1205   if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1206    ANYOF_POSIXL_ZERO(ssc);
1207   }
1208   else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1209
1210    /* Note that the Posix class component P from 'and_with' actually
1211    * looks like:
1212    *      P = Pa | Pb | ... | Pn
1213    * where each component is one posix class, such as in [\w\s].
1214    * Thus
1215    *      ~P = ~(Pa | Pb | ... | Pn)
1216    *         = ~Pa & ~Pb & ... & ~Pn
1217    *        <= ~Pa | ~Pb | ... | ~Pn
1218    * The last is something we can easily calculate, but unfortunately
1219    * is likely to have many false positives.  We could do better
1220    * in some (but certainly not all) instances if two classes in
1221    * P have known relationships.  For example
1222    *      :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1223    * So
1224    *      :lower: & :print: = :lower:
1225    * And similarly for classes that must be disjoint.  For example,
1226    * since \s and \w can have no elements in common based on rules in
1227    * the POSIX standard,
1228    *      \w & ^\S = nothing
1229    * Unfortunately, some vendor locales do not meet the Posix
1230    * standard, in particular almost everything by Microsoft.
1231    * The loop below just changes e.g., \w into \W and vice versa */
1232
1233    regnode_charclass_posixl temp;
1234    int add = 1;    /* To calculate the index of the complement */
1235
1236    ANYOF_POSIXL_ZERO(&temp);
1237    for (i = 0; i < ANYOF_MAX; i++) {
1238     assert(i % 2 != 0
1239      || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1240      || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1241
1242     if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1243      ANYOF_POSIXL_SET(&temp, i + add);
1244     }
1245     add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1246    }
1247    ANYOF_POSIXL_AND(&temp, ssc);
1248
1249   } /* else ssc already has no posixes */
1250  } /* else: Not inverted.  This routine is a no-op if 'and_with' is an SSC
1251   in its initial state */
1252  else if (! is_ANYOF_SYNTHETIC(and_with)
1253    || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1254  {
1255   /* But if 'ssc' is in its initial state, the result is just 'and_with';
1256   * copy it over 'ssc' */
1257   if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1258    if (is_ANYOF_SYNTHETIC(and_with)) {
1259     StructCopy(and_with, ssc, regnode_ssc);
1260    }
1261    else {
1262     ssc->invlist = anded_cp_list;
1263     ANYOF_POSIXL_ZERO(ssc);
1264     if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1265      ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1266     }
1267    }
1268   }
1269   else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1270     || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1271   {
1272    /* One or the other of P1, P2 is non-empty. */
1273    if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1274     ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1275    }
1276    ssc_union(ssc, anded_cp_list, FALSE);
1277   }
1278   else { /* P1 = P2 = empty */
1279    ssc_intersection(ssc, anded_cp_list, FALSE);
1280   }
1281  }
1282 }
1283
1284 STATIC void
1285 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1286    const regnode_charclass *or_with)
1287 {
1288  /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1289  * another SSC or a regular ANYOF class.  Can create false positives if
1290  * 'or_with' is to be inverted. */
1291
1292  SV* ored_cp_list;
1293  U8 ored_flags;
1294
1295  PERL_ARGS_ASSERT_SSC_OR;
1296
1297  assert(is_ANYOF_SYNTHETIC(ssc));
1298
1299  /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1300  * the code point inversion list and just the relevant flags */
1301  if (is_ANYOF_SYNTHETIC(or_with)) {
1302   ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1303   ored_flags = ANYOF_FLAGS(or_with);
1304  }
1305  else {
1306   ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1307   ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1308  }
1309
1310  ANYOF_FLAGS(ssc) |= ored_flags;
1311
1312  /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1313  * C2 is the list of code points in 'or-with'; P2, its posix classes.
1314  * 'or_with' may be inverted.  When not inverted, we have the simple
1315  * situation of computing:
1316  *  (C1 | P1) | (C2 | P2)  =  (C1 | C2) | (P1 | P2)
1317  * If P1|P2 yields a situation with both a class and its complement are
1318  * set, like having both \w and \W, this matches all code points, and we
1319  * can delete these from the P component of the ssc going forward.  XXX We
1320  * might be able to delete all the P components, but I (khw) am not certain
1321  * about this, and it is better to be safe.
1322  *
1323  * Inverted, we have
1324  *  (C1 | P1) | ~(C2 | P2)  =  (C1 | P1) | (~C2 & ~P2)
1325  *                         <=  (C1 | P1) | ~C2
1326  *                         <=  (C1 | ~C2) | P1
1327  * (which results in actually simpler code than the non-inverted case)
1328  * */
1329
1330  if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1331   && ! is_ANYOF_SYNTHETIC(or_with))
1332  {
1333   /* We ignore P2, leaving P1 going forward */
1334  }   /* else  Not inverted */
1335  else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
1336   ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1337   if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1338    unsigned int i;
1339    for (i = 0; i < ANYOF_MAX; i += 2) {
1340     if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1341     {
1342      ssc_match_all_cp(ssc);
1343      ANYOF_POSIXL_CLEAR(ssc, i);
1344      ANYOF_POSIXL_CLEAR(ssc, i+1);
1345     }
1346    }
1347   }
1348  }
1349
1350  ssc_union(ssc,
1351    ored_cp_list,
1352    FALSE /* Already has been inverted */
1353    );
1354 }
1355
1356 PERL_STATIC_INLINE void
1357 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1358 {
1359  PERL_ARGS_ASSERT_SSC_UNION;
1360
1361  assert(is_ANYOF_SYNTHETIC(ssc));
1362
1363  _invlist_union_maybe_complement_2nd(ssc->invlist,
1364           invlist,
1365           invert2nd,
1366           &ssc->invlist);
1367 }
1368
1369 PERL_STATIC_INLINE void
1370 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1371       SV* const invlist,
1372       const bool invert2nd)
1373 {
1374  PERL_ARGS_ASSERT_SSC_INTERSECTION;
1375
1376  assert(is_ANYOF_SYNTHETIC(ssc));
1377
1378  _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1379            invlist,
1380            invert2nd,
1381            &ssc->invlist);
1382 }
1383
1384 PERL_STATIC_INLINE void
1385 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1386 {
1387  PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1388
1389  assert(is_ANYOF_SYNTHETIC(ssc));
1390
1391  ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1392 }
1393
1394 PERL_STATIC_INLINE void
1395 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1396 {
1397  /* AND just the single code point 'cp' into the SSC 'ssc' */
1398
1399  SV* cp_list = _new_invlist(2);
1400
1401  PERL_ARGS_ASSERT_SSC_CP_AND;
1402
1403  assert(is_ANYOF_SYNTHETIC(ssc));
1404
1405  cp_list = add_cp_to_invlist(cp_list, cp);
1406  ssc_intersection(ssc, cp_list,
1407      FALSE /* Not inverted */
1408      );
1409  SvREFCNT_dec_NN(cp_list);
1410 }
1411
1412 PERL_STATIC_INLINE void
1413 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1414 {
1415  /* Set the SSC 'ssc' to not match any locale things */
1416
1417  PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1418
1419  assert(is_ANYOF_SYNTHETIC(ssc));
1420
1421  ANYOF_POSIXL_ZERO(ssc);
1422  ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1423 }
1424
1425 STATIC void
1426 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1427 {
1428  /* The inversion list in the SSC is marked mortal; now we need a more
1429  * permanent copy, which is stored the same way that is done in a regular
1430  * ANYOF node, with the first 256 code points in a bit map */
1431
1432  SV* invlist = invlist_clone(ssc->invlist);
1433
1434  PERL_ARGS_ASSERT_SSC_FINALIZE;
1435
1436  assert(is_ANYOF_SYNTHETIC(ssc));
1437
1438  /* The code in this file assumes that all but these flags aren't relevant
1439  * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1440  * time we reach here */
1441  assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1442
1443  populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1444
1445  set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1446         NULL, NULL, NULL, FALSE);
1447
1448  /* Make sure is clone-safe */
1449  ssc->invlist = NULL;
1450
1451  if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1452   ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
1453  }
1454
1455  assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1456 }
1457
1458 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1459 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1460 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1461 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
1462        ? (TRIE_LIST_CUR( idx ) - 1)           \
1463        : 0 )
1464
1465
1466 #ifdef DEBUGGING
1467 /*
1468    dump_trie(trie,widecharmap,revcharmap)
1469    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1470    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1471
1472    These routines dump out a trie in a somewhat readable format.
1473    The _interim_ variants are used for debugging the interim
1474    tables that are used to generate the final compressed
1475    representation which is what dump_trie expects.
1476
1477    Part of the reason for their existence is to provide a form
1478    of documentation as to how the different representations function.
1479
1480 */
1481
1482 /*
1483   Dumps the final compressed table form of the trie to Perl_debug_log.
1484   Used for debugging make_trie().
1485 */
1486
1487 STATIC void
1488 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1489    AV *revcharmap, U32 depth)
1490 {
1491  U32 state;
1492  SV *sv=sv_newmortal();
1493  int colwidth= widecharmap ? 6 : 4;
1494  U16 word;
1495  GET_RE_DEBUG_FLAGS_DECL;
1496
1497  PERL_ARGS_ASSERT_DUMP_TRIE;
1498
1499  PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1500   (int)depth * 2 + 2,"",
1501   "Match","Base","Ofs" );
1502
1503  for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1504   SV ** const tmp = av_fetch( revcharmap, state, 0);
1505   if ( tmp ) {
1506    PerlIO_printf( Perl_debug_log, "%*s",
1507     colwidth,
1508     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1509        PL_colors[0], PL_colors[1],
1510        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1511        PERL_PV_ESCAPE_FIRSTCHAR
1512     )
1513    );
1514   }
1515  }
1516  PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1517   (int)depth * 2 + 2,"");
1518
1519  for( state = 0 ; state < trie->uniquecharcount ; state++ )
1520   PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1521  PerlIO_printf( Perl_debug_log, "\n");
1522
1523  for( state = 1 ; state < trie->statecount ; state++ ) {
1524   const U32 base = trie->states[ state ].trans.base;
1525
1526   PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1527          (int)depth * 2 + 2,"", (UV)state);
1528
1529   if ( trie->states[ state ].wordnum ) {
1530    PerlIO_printf( Perl_debug_log, " W%4X",
1531           trie->states[ state ].wordnum );
1532   } else {
1533    PerlIO_printf( Perl_debug_log, "%6s", "" );
1534   }
1535
1536   PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1537
1538   if ( base ) {
1539    U32 ofs = 0;
1540
1541    while( ( base + ofs  < trie->uniquecharcount ) ||
1542     ( base + ofs - trie->uniquecharcount < trie->lasttrans
1543      && trie->trans[ base + ofs - trie->uniquecharcount ].check
1544                  != state))
1545      ofs++;
1546
1547    PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1548
1549    for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1550     if ( ( base + ofs >= trie->uniquecharcount )
1551       && ( base + ofs - trie->uniquecharcount
1552               < trie->lasttrans )
1553       && trie->trans[ base + ofs
1554          - trie->uniquecharcount ].check == state )
1555     {
1556     PerlIO_printf( Perl_debug_log, "%*"UVXf,
1557      colwidth,
1558      (UV)trie->trans[ base + ofs
1559            - trie->uniquecharcount ].next );
1560     } else {
1561      PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1562     }
1563    }
1564
1565    PerlIO_printf( Perl_debug_log, "]");
1566
1567   }
1568   PerlIO_printf( Perl_debug_log, "\n" );
1569  }
1570  PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1571         (int)depth*2, "");
1572  for (word=1; word <= trie->wordcount; word++) {
1573   PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1574    (int)word, (int)(trie->wordinfo[word].prev),
1575    (int)(trie->wordinfo[word].len));
1576  }
1577  PerlIO_printf(Perl_debug_log, "\n" );
1578 }
1579 /*
1580   Dumps a fully constructed but uncompressed trie in list form.
1581   List tries normally only are used for construction when the number of
1582   possible chars (trie->uniquecharcount) is very high.
1583   Used for debugging make_trie().
1584 */
1585 STATIC void
1586 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1587       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1588       U32 depth)
1589 {
1590  U32 state;
1591  SV *sv=sv_newmortal();
1592  int colwidth= widecharmap ? 6 : 4;
1593  GET_RE_DEBUG_FLAGS_DECL;
1594
1595  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1596
1597  /* print out the table precompression.  */
1598  PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1599   (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1600   "------:-----+-----------------\n" );
1601
1602  for( state=1 ; state < next_alloc ; state ++ ) {
1603   U16 charid;
1604
1605   PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1606    (int)depth * 2 + 2,"", (UV)state  );
1607   if ( ! trie->states[ state ].wordnum ) {
1608    PerlIO_printf( Perl_debug_log, "%5s| ","");
1609   } else {
1610    PerlIO_printf( Perl_debug_log, "W%4x| ",
1611     trie->states[ state ].wordnum
1612    );
1613   }
1614   for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1615    SV ** const tmp = av_fetch( revcharmap,
1616           TRIE_LIST_ITEM(state,charid).forid, 0);
1617    if ( tmp ) {
1618     PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1619      colwidth,
1620      pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1621        colwidth,
1622        PL_colors[0], PL_colors[1],
1623        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1624        | PERL_PV_ESCAPE_FIRSTCHAR
1625      ) ,
1626      TRIE_LIST_ITEM(state,charid).forid,
1627      (UV)TRIE_LIST_ITEM(state,charid).newstate
1628     );
1629     if (!(charid % 10))
1630      PerlIO_printf(Perl_debug_log, "\n%*s| ",
1631       (int)((depth * 2) + 14), "");
1632    }
1633   }
1634   PerlIO_printf( Perl_debug_log, "\n");
1635  }
1636 }
1637
1638 /*
1639   Dumps a fully constructed but uncompressed trie in table form.
1640   This is the normal DFA style state transition table, with a few
1641   twists to facilitate compression later.
1642   Used for debugging make_trie().
1643 */
1644 STATIC void
1645 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1646       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1647       U32 depth)
1648 {
1649  U32 state;
1650  U16 charid;
1651  SV *sv=sv_newmortal();
1652  int colwidth= widecharmap ? 6 : 4;
1653  GET_RE_DEBUG_FLAGS_DECL;
1654
1655  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1656
1657  /*
1658  print out the table precompression so that we can do a visual check
1659  that they are identical.
1660  */
1661
1662  PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1663
1664  for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1665   SV ** const tmp = av_fetch( revcharmap, charid, 0);
1666   if ( tmp ) {
1667    PerlIO_printf( Perl_debug_log, "%*s",
1668     colwidth,
1669     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1670        PL_colors[0], PL_colors[1],
1671        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1672        PERL_PV_ESCAPE_FIRSTCHAR
1673     )
1674    );
1675   }
1676  }
1677
1678  PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1679
1680  for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1681   PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1682  }
1683
1684  PerlIO_printf( Perl_debug_log, "\n" );
1685
1686  for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1687
1688   PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1689    (int)depth * 2 + 2,"",
1690    (UV)TRIE_NODENUM( state ) );
1691
1692   for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1693    UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1694    if (v)
1695     PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1696    else
1697     PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1698   }
1699   if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1700    PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1701            (UV)trie->trans[ state ].check );
1702   } else {
1703    PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1704            (UV)trie->trans[ state ].check,
1705    trie->states[ TRIE_NODENUM( state ) ].wordnum );
1706   }
1707  }
1708 }
1709
1710 #endif
1711
1712
1713 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1714   startbranch: the first branch in the whole branch sequence
1715   first      : start branch of sequence of branch-exact nodes.
1716    May be the same as startbranch
1717   last       : Thing following the last branch.
1718    May be the same as tail.
1719   tail       : item following the branch sequence
1720   count      : words in the sequence
1721   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1722   depth      : indent depth
1723
1724 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1725
1726 A trie is an N'ary tree where the branches are determined by digital
1727 decomposition of the key. IE, at the root node you look up the 1st character and
1728 follow that branch repeat until you find the end of the branches. Nodes can be
1729 marked as "accepting" meaning they represent a complete word. Eg:
1730
1731   /he|she|his|hers/
1732
1733 would convert into the following structure. Numbers represent states, letters
1734 following numbers represent valid transitions on the letter from that state, if
1735 the number is in square brackets it represents an accepting state, otherwise it
1736 will be in parenthesis.
1737
1738  +-h->+-e->[3]-+-r->(8)-+-s->[9]
1739  |    |
1740  |   (2)
1741  |    |
1742  (1)   +-i->(6)-+-s->[7]
1743  |
1744  +-s->(3)-+-h->(4)-+-e->[5]
1745
1746  Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1747
1748 This shows that when matching against the string 'hers' we will begin at state 1
1749 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1750 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1751 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1752 single traverse. We store a mapping from accepting to state to which word was
1753 matched, and then when we have multiple possibilities we try to complete the
1754 rest of the regex in the order in which they occured in the alternation.
1755
1756 The only prior NFA like behaviour that would be changed by the TRIE support is
1757 the silent ignoring of duplicate alternations which are of the form:
1758
1759  / (DUPE|DUPE) X? (?{ ... }) Y /x
1760
1761 Thus EVAL blocks following a trie may be called a different number of times with
1762 and without the optimisation. With the optimisations dupes will be silently
1763 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1764 the following demonstrates:
1765
1766  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1767
1768 which prints out 'word' three times, but
1769
1770  'words'=~/(word|word|word)(?{ print $1 })S/
1771
1772 which doesnt print it out at all. This is due to other optimisations kicking in.
1773
1774 Example of what happens on a structural level:
1775
1776 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1777
1778    1: CURLYM[1] {1,32767}(18)
1779    5:   BRANCH(8)
1780    6:     EXACT <ac>(16)
1781    8:   BRANCH(11)
1782    9:     EXACT <ad>(16)
1783   11:   BRANCH(14)
1784   12:     EXACT <ab>(16)
1785   16:   SUCCEED(0)
1786   17:   NOTHING(18)
1787   18: END(0)
1788
1789 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1790 and should turn into:
1791
1792    1: CURLYM[1] {1,32767}(18)
1793    5:   TRIE(16)
1794   [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1795   <ac>
1796   <ad>
1797   <ab>
1798   16:   SUCCEED(0)
1799   17:   NOTHING(18)
1800   18: END(0)
1801
1802 Cases where tail != last would be like /(?foo|bar)baz/:
1803
1804    1: BRANCH(4)
1805    2:   EXACT <foo>(8)
1806    4: BRANCH(7)
1807    5:   EXACT <bar>(8)
1808    7: TAIL(8)
1809    8: EXACT <baz>(10)
1810   10: END(0)
1811
1812 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1813 and would end up looking like:
1814
1815  1: TRIE(8)
1816  [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1817   <foo>
1818   <bar>
1819    7: TAIL(8)
1820    8: EXACT <baz>(10)
1821   10: END(0)
1822
1823  d = uvchr_to_utf8_flags(d, uv, 0);
1824
1825 is the recommended Unicode-aware way of saying
1826
1827  *(d++) = uv;
1828 */
1829
1830 #define TRIE_STORE_REVCHAR(val)                                            \
1831  STMT_START {                                                           \
1832   if (UTF) {          \
1833    SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1834    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);    \
1835    unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1836    SvCUR_set(zlopp, kapow - flrbbbbb);       \
1837    SvPOK_on(zlopp);         \
1838    SvUTF8_on(zlopp);         \
1839    av_push(revcharmap, zlopp);        \
1840   } else {          \
1841    char ooooff = (char)val;                                           \
1842    av_push(revcharmap, newSVpvn(&ooooff, 1));      \
1843   }           \
1844   } STMT_END
1845
1846 /* This gets the next character from the input, folding it if not already
1847  * folded. */
1848 #define TRIE_READ_CHAR STMT_START {                                           \
1849  wordlen++;                                                                \
1850  if ( UTF ) {                                                              \
1851   /* if it is UTF then it is either already folded, or does not need    \
1852   * folding */                                                         \
1853   uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
1854  }                                                                         \
1855  else if (folder == PL_fold_latin1) {                                      \
1856   /* This folder implies Unicode rules, which in the range expressible  \
1857   *  by not UTF is the lower case, with the two exceptions, one of     \
1858   *  which should have been taken care of before calling this */       \
1859   assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
1860   uvc = toLOWER_L1(*uc);                                                \
1861   if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
1862   len = 1;                                                              \
1863  } else {                                                                  \
1864   /* raw data, will be folded later if needed */                        \
1865   uvc = (U32)*uc;                                                       \
1866   len = 1;                                                              \
1867  }                                                                         \
1868 } STMT_END
1869
1870
1871
1872 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1873  if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1874   U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1875   Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1876  }                                                           \
1877  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1878  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1879  TRIE_LIST_CUR( state )++;                                   \
1880 } STMT_END
1881
1882 #define TRIE_LIST_NEW(state) STMT_START {                       \
1883  Newxz( trie->states[ state ].trans.list,               \
1884   4, reg_trie_trans_le );                                 \
1885  TRIE_LIST_CUR( state ) = 1;                                \
1886  TRIE_LIST_LEN( state ) = 4;                                \
1887 } STMT_END
1888
1889 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1890  U16 dupe= trie->states[ state ].wordnum;                    \
1891  regnode * const noper_next = regnext( noper );              \
1892                 \
1893  DEBUG_r({                                                   \
1894   /* store the word for dumping */                        \
1895   SV* tmp;                                                \
1896   if (OP(noper) != NOTHING)                               \
1897    tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1898   else                                                    \
1899    tmp = newSVpvn_utf8( "", 0, UTF );   \
1900   av_push( trie_words, tmp );                             \
1901  });                                                         \
1902                 \
1903  curword++;                                                  \
1904  trie->wordinfo[curword].prev   = 0;                         \
1905  trie->wordinfo[curword].len    = wordlen;                   \
1906  trie->wordinfo[curword].accept = state;                     \
1907                 \
1908  if ( noper_next < tail ) {                                  \
1909   if (!trie->jump)                                        \
1910    trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1911             sizeof(U16) ); \
1912   trie->jump[curword] = (U16)(noper_next - convert);      \
1913   if (!jumper)                                            \
1914    jumper = noper_next;                                \
1915   if (!nextbranch)                                        \
1916    nextbranch= regnext(cur);                           \
1917  }                                                           \
1918                 \
1919  if ( dupe ) {                                               \
1920   /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1921   /* chain, so that when the bits of chain are later    */\
1922   /* linked together, the dups appear in the chain      */\
1923   trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1924   trie->wordinfo[dupe].prev = curword;                    \
1925  } else {                                                    \
1926   /* we haven't inserted this word yet.                */ \
1927   trie->states[ state ].wordnum = curword;                \
1928  }                                                           \
1929 } STMT_END
1930
1931
1932 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)  \
1933  ( ( base + charid >=  ucharcount     \
1934   && base + charid < ubound     \
1935   && state == trie->trans[ base - ucharcount + charid ].check \
1936   && trie->trans[ base - ucharcount + charid ].next )  \
1937   ? trie->trans[ base - ucharcount + charid ].next  \
1938   : ( state==1 ? special : 0 )     \
1939  )
1940
1941 #define MADE_TRIE       1
1942 #define MADE_JUMP_TRIE  2
1943 #define MADE_EXACT_TRIE 4
1944
1945 STATIC I32
1946 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1947     regnode *first, regnode *last, regnode *tail,
1948     U32 word_count, U32 flags, U32 depth)
1949 {
1950  dVAR;
1951  /* first pass, loop through and scan words */
1952  reg_trie_data *trie;
1953  HV *widecharmap = NULL;
1954  AV *revcharmap = newAV();
1955  regnode *cur;
1956  STRLEN len = 0;
1957  UV uvc = 0;
1958  U16 curword = 0;
1959  U32 next_alloc = 0;
1960  regnode *jumper = NULL;
1961  regnode *nextbranch = NULL;
1962  regnode *convert = NULL;
1963  U32 *prev_states; /* temp array mapping each state to previous one */
1964  /* we just use folder as a flag in utf8 */
1965  const U8 * folder = NULL;
1966
1967 #ifdef DEBUGGING
1968  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1969  AV *trie_words = NULL;
1970  /* along with revcharmap, this only used during construction but both are
1971  * useful during debugging so we store them in the struct when debugging.
1972  */
1973 #else
1974  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1975  STRLEN trie_charcount=0;
1976 #endif
1977  SV *re_trie_maxbuff;
1978  GET_RE_DEBUG_FLAGS_DECL;
1979
1980  PERL_ARGS_ASSERT_MAKE_TRIE;
1981 #ifndef DEBUGGING
1982  PERL_UNUSED_ARG(depth);
1983 #endif
1984
1985  switch (flags) {
1986   case EXACT: break;
1987   case EXACTFA:
1988   case EXACTFU_SS:
1989   case EXACTFU: folder = PL_fold_latin1; break;
1990   case EXACTF:  folder = PL_fold; break;
1991   default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1992  }
1993
1994  trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1995  trie->refcount = 1;
1996  trie->startstate = 1;
1997  trie->wordcount = word_count;
1998  RExC_rxi->data->data[ data_slot ] = (void*)trie;
1999  trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2000  if (flags == EXACT)
2001   trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2002  trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2003      trie->wordcount+1, sizeof(reg_trie_wordinfo));
2004
2005  DEBUG_r({
2006   trie_words = newAV();
2007  });
2008
2009  re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2010  if (!SvIOK(re_trie_maxbuff)) {
2011   sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2012  }
2013  DEBUG_TRIE_COMPILE_r({
2014   PerlIO_printf( Perl_debug_log,
2015   "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2016   (int)depth * 2 + 2, "",
2017   REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2018   REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2019  });
2020
2021    /* Find the node we are going to overwrite */
2022  if ( first == startbranch && OP( last ) != BRANCH ) {
2023   /* whole branch chain */
2024   convert = first;
2025  } else {
2026   /* branch sub-chain */
2027   convert = NEXTOPER( first );
2028  }
2029
2030  /*  -- First loop and Setup --
2031
2032  We first traverse the branches and scan each word to determine if it
2033  contains widechars, and how many unique chars there are, this is
2034  important as we have to build a table with at least as many columns as we
2035  have unique chars.
2036
2037  We use an array of integers to represent the character codes 0..255
2038  (trie->charmap) and we use a an HV* to store Unicode characters. We use
2039  the native representation of the character value as the key and IV's for
2040  the coded index.
2041
2042  *TODO* If we keep track of how many times each character is used we can
2043  remap the columns so that the table compression later on is more
2044  efficient in terms of memory by ensuring the most common value is in the
2045  middle and the least common are on the outside.  IMO this would be better
2046  than a most to least common mapping as theres a decent chance the most
2047  common letter will share a node with the least common, meaning the node
2048  will not be compressible. With a middle is most common approach the worst
2049  case is when we have the least common nodes twice.
2050
2051  */
2052
2053  for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2054   regnode *noper = NEXTOPER( cur );
2055   const U8 *uc = (U8*)STRING( noper );
2056   const U8 *e  = uc + STR_LEN( noper );
2057   int foldlen = 0;
2058   U32 wordlen      = 0;         /* required init */
2059   STRLEN minchars = 0;
2060   STRLEN maxchars = 0;
2061   bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2062            bitmap?*/
2063
2064   if (OP(noper) == NOTHING) {
2065    regnode *noper_next= regnext(noper);
2066    if (noper_next != tail && OP(noper_next) == flags) {
2067     noper = noper_next;
2068     uc= (U8*)STRING(noper);
2069     e= uc + STR_LEN(noper);
2070     trie->minlen= STR_LEN(noper);
2071    } else {
2072     trie->minlen= 0;
2073     continue;
2074    }
2075   }
2076
2077   if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2078    TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2079           regardless of encoding */
2080    if (OP( noper ) == EXACTFU_SS) {
2081     /* false positives are ok, so just set this */
2082     TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2083    }
2084   }
2085   for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
2086           branch */
2087    TRIE_CHARCOUNT(trie)++;
2088    TRIE_READ_CHAR;
2089
2090    /* TRIE_READ_CHAR returns the current character, or its fold if /i
2091    * is in effect.  Under /i, this character can match itself, or
2092    * anything that folds to it.  If not under /i, it can match just
2093    * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
2094    * all fold to k, and all are single characters.   But some folds
2095    * expand to more than one character, so for example LATIN SMALL
2096    * LIGATURE FFI folds to the three character sequence 'ffi'.  If
2097    * the string beginning at 'uc' is 'ffi', it could be matched by
2098    * three characters, or just by the one ligature character. (It
2099    * could also be matched by two characters: LATIN SMALL LIGATURE FF
2100    * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2101    * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2102    * match.)  The trie needs to know the minimum and maximum number
2103    * of characters that could match so that it can use size alone to
2104    * quickly reject many match attempts.  The max is simple: it is
2105    * the number of folded characters in this branch (since a fold is
2106    * never shorter than what folds to it. */
2107
2108    maxchars++;
2109
2110    /* And the min is equal to the max if not under /i (indicated by
2111    * 'folder' being NULL), or there are no multi-character folds.  If
2112    * there is a multi-character fold, the min is incremented just
2113    * once, for the character that folds to the sequence.  Each
2114    * character in the sequence needs to be added to the list below of
2115    * characters in the trie, but we count only the first towards the
2116    * min number of characters needed.  This is done through the
2117    * variable 'foldlen', which is returned by the macros that look
2118    * for these sequences as the number of bytes the sequence
2119    * occupies.  Each time through the loop, we decrement 'foldlen' by
2120    * how many bytes the current char occupies.  Only when it reaches
2121    * 0 do we increment 'minchars' or look for another multi-character
2122    * sequence. */
2123    if (folder == NULL) {
2124     minchars++;
2125    }
2126    else if (foldlen > 0) {
2127     foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2128    }
2129    else {
2130     minchars++;
2131
2132     /* See if *uc is the beginning of a multi-character fold.  If
2133     * so, we decrement the length remaining to look at, to account
2134     * for the current character this iteration.  (We can use 'uc'
2135     * instead of the fold returned by TRIE_READ_CHAR because for
2136     * non-UTF, the latin1_safe macro is smart enough to account
2137     * for all the unfolded characters, and because for UTF, the
2138     * string will already have been folded earlier in the
2139     * compilation process */
2140     if (UTF) {
2141      if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2142       foldlen -= UTF8SKIP(uc);
2143      }
2144     }
2145     else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2146      foldlen--;
2147     }
2148    }
2149
2150    /* The current character (and any potential folds) should be added
2151    * to the possible matching characters for this position in this
2152    * branch */
2153    if ( uvc < 256 ) {
2154     if ( folder ) {
2155      U8 folded= folder[ (U8) uvc ];
2156      if ( !trie->charmap[ folded ] ) {
2157       trie->charmap[ folded ]=( ++trie->uniquecharcount );
2158       TRIE_STORE_REVCHAR( folded );
2159      }
2160     }
2161     if ( !trie->charmap[ uvc ] ) {
2162      trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2163      TRIE_STORE_REVCHAR( uvc );
2164     }
2165     if ( set_bit ) {
2166      /* store the codepoint in the bitmap, and its folded
2167      * equivalent. */
2168      TRIE_BITMAP_SET(trie, uvc);
2169
2170      /* store the folded codepoint */
2171      if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2172
2173      if ( !UTF ) {
2174       /* store first byte of utf8 representation of
2175       variant codepoints */
2176       if (! UVCHR_IS_INVARIANT(uvc)) {
2177        TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2178       }
2179      }
2180      set_bit = 0; /* We've done our bit :-) */
2181     }
2182    } else {
2183
2184     /* XXX We could come up with the list of code points that fold
2185     * to this using PL_utf8_foldclosures, except not for
2186     * multi-char folds, as there may be multiple combinations
2187     * there that could work, which needs to wait until runtime to
2188     * resolve (The comment about LIGATURE FFI above is such an
2189     * example */
2190
2191     SV** svpp;
2192     if ( !widecharmap )
2193      widecharmap = newHV();
2194
2195     svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2196
2197     if ( !svpp )
2198      Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2199
2200     if ( !SvTRUE( *svpp ) ) {
2201      sv_setiv( *svpp, ++trie->uniquecharcount );
2202      TRIE_STORE_REVCHAR(uvc);
2203     }
2204    }
2205   } /* end loop through characters in this branch of the trie */
2206
2207   /* We take the min and max for this branch and combine to find the min
2208   * and max for all branches processed so far */
2209   if( cur == first ) {
2210    trie->minlen = minchars;
2211    trie->maxlen = maxchars;
2212   } else if (minchars < trie->minlen) {
2213    trie->minlen = minchars;
2214   } else if (maxchars > trie->maxlen) {
2215    trie->maxlen = maxchars;
2216   }
2217  } /* end first pass */
2218  DEBUG_TRIE_COMPILE_r(
2219   PerlIO_printf( Perl_debug_log,
2220     "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2221     (int)depth * 2 + 2,"",
2222     ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2223     (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2224     (int)trie->minlen, (int)trie->maxlen )
2225  );
2226
2227  /*
2228   We now know what we are dealing with in terms of unique chars and
2229   string sizes so we can calculate how much memory a naive
2230   representation using a flat table  will take. If it's over a reasonable
2231   limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2232   conservative but potentially much slower representation using an array
2233   of lists.
2234
2235   At the end we convert both representations into the same compressed
2236   form that will be used in regexec.c for matching with. The latter
2237   is a form that cannot be used to construct with but has memory
2238   properties similar to the list form and access properties similar
2239   to the table form making it both suitable for fast searches and
2240   small enough that its feasable to store for the duration of a program.
2241
2242   See the comment in the code where the compressed table is produced
2243   inplace from the flat tabe representation for an explanation of how
2244   the compression works.
2245
2246  */
2247
2248
2249  Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2250  prev_states[1] = 0;
2251
2252  if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2253              > SvIV(re_trie_maxbuff) )
2254  {
2255   /*
2256    Second Pass -- Array Of Lists Representation
2257
2258    Each state will be represented by a list of charid:state records
2259    (reg_trie_trans_le) the first such element holds the CUR and LEN
2260    points of the allocated array. (See defines above).
2261
2262    We build the initial structure using the lists, and then convert
2263    it into the compressed table form which allows faster lookups
2264    (but cant be modified once converted).
2265   */
2266
2267   STRLEN transcount = 1;
2268
2269   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2270    "%*sCompiling trie using list compiler\n",
2271    (int)depth * 2 + 2, ""));
2272
2273   trie->states = (reg_trie_state *)
2274    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2275         sizeof(reg_trie_state) );
2276   TRIE_LIST_NEW(1);
2277   next_alloc = 2;
2278
2279   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2280
2281    regnode *noper   = NEXTOPER( cur );
2282    U8 *uc           = (U8*)STRING( noper );
2283    const U8 *e      = uc + STR_LEN( noper );
2284    U32 state        = 1;         /* required init */
2285    U16 charid       = 0;         /* sanity init */
2286    U32 wordlen      = 0;         /* required init */
2287
2288    if (OP(noper) == NOTHING) {
2289     regnode *noper_next= regnext(noper);
2290     if (noper_next != tail && OP(noper_next) == flags) {
2291      noper = noper_next;
2292      uc= (U8*)STRING(noper);
2293      e= uc + STR_LEN(noper);
2294     }
2295    }
2296
2297    if (OP(noper) != NOTHING) {
2298     for ( ; uc < e ; uc += len ) {
2299
2300      TRIE_READ_CHAR;
2301
2302      if ( uvc < 256 ) {
2303       charid = trie->charmap[ uvc ];
2304      } else {
2305       SV** const svpp = hv_fetch( widecharmap,
2306              (char*)&uvc,
2307              sizeof( UV ),
2308              0);
2309       if ( !svpp ) {
2310        charid = 0;
2311       } else {
2312        charid=(U16)SvIV( *svpp );
2313       }
2314      }
2315      /* charid is now 0 if we dont know the char read, or
2316      * nonzero if we do */
2317      if ( charid ) {
2318
2319       U16 check;
2320       U32 newstate = 0;
2321
2322       charid--;
2323       if ( !trie->states[ state ].trans.list ) {
2324        TRIE_LIST_NEW( state );
2325       }
2326       for ( check = 1;
2327        check <= TRIE_LIST_USED( state );
2328        check++ )
2329       {
2330        if ( TRIE_LIST_ITEM( state, check ).forid
2331                  == charid )
2332        {
2333         newstate = TRIE_LIST_ITEM( state, check ).newstate;
2334         break;
2335        }
2336       }
2337       if ( ! newstate ) {
2338        newstate = next_alloc++;
2339        prev_states[newstate] = state;
2340        TRIE_LIST_PUSH( state, charid, newstate );
2341        transcount++;
2342       }
2343       state = newstate;
2344      } else {
2345       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2346      }
2347     }
2348    }
2349    TRIE_HANDLE_WORD(state);
2350
2351   } /* end second pass */
2352
2353   /* next alloc is the NEXT state to be allocated */
2354   trie->statecount = next_alloc;
2355   trie->states = (reg_trie_state *)
2356    PerlMemShared_realloc( trie->states,
2357         next_alloc
2358         * sizeof(reg_trie_state) );
2359
2360   /* and now dump it out before we compress it */
2361   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2362               revcharmap, next_alloc,
2363               depth+1)
2364   );
2365
2366   trie->trans = (reg_trie_trans *)
2367    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2368   {
2369    U32 state;
2370    U32 tp = 0;
2371    U32 zp = 0;
2372
2373
2374    for( state=1 ; state < next_alloc ; state ++ ) {
2375     U32 base=0;
2376
2377     /*
2378     DEBUG_TRIE_COMPILE_MORE_r(
2379      PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2380     );
2381     */
2382
2383     if (trie->states[state].trans.list) {
2384      U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2385      U16 maxid=minid;
2386      U16 idx;
2387
2388      for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2389       const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2390       if ( forid < minid ) {
2391        minid=forid;
2392       } else if ( forid > maxid ) {
2393        maxid=forid;
2394       }
2395      }
2396      if ( transcount < tp + maxid - minid + 1) {
2397       transcount *= 2;
2398       trie->trans = (reg_trie_trans *)
2399        PerlMemShared_realloc( trie->trans,
2400              transcount
2401              * sizeof(reg_trie_trans) );
2402       Zero( trie->trans + (transcount / 2),
2403        transcount / 2,
2404        reg_trie_trans );
2405      }
2406      base = trie->uniquecharcount + tp - minid;
2407      if ( maxid == minid ) {
2408       U32 set = 0;
2409       for ( ; zp < tp ; zp++ ) {
2410        if ( ! trie->trans[ zp ].next ) {
2411         base = trie->uniquecharcount + zp - minid;
2412         trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2413                 1).newstate;
2414         trie->trans[ zp ].check = state;
2415         set = 1;
2416         break;
2417        }
2418       }
2419       if ( !set ) {
2420        trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2421                 1).newstate;
2422        trie->trans[ tp ].check = state;
2423        tp++;
2424        zp = tp;
2425       }
2426      } else {
2427       for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2428        const U32 tid = base
2429           - trie->uniquecharcount
2430           + TRIE_LIST_ITEM( state, idx ).forid;
2431        trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2432                 idx ).newstate;
2433        trie->trans[ tid ].check = state;
2434       }
2435       tp += ( maxid - minid + 1 );
2436      }
2437      Safefree(trie->states[ state ].trans.list);
2438     }
2439     /*
2440     DEBUG_TRIE_COMPILE_MORE_r(
2441      PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2442     );
2443     */
2444     trie->states[ state ].trans.base=base;
2445    }
2446    trie->lasttrans = tp + 1;
2447   }
2448  } else {
2449   /*
2450   Second Pass -- Flat Table Representation.
2451
2452   we dont use the 0 slot of either trans[] or states[] so we add 1 to
2453   each.  We know that we will need Charcount+1 trans at most to store
2454   the data (one row per char at worst case) So we preallocate both
2455   structures assuming worst case.
2456
2457   We then construct the trie using only the .next slots of the entry
2458   structs.
2459
2460   We use the .check field of the first entry of the node temporarily
2461   to make compression both faster and easier by keeping track of how
2462   many non zero fields are in the node.
2463
2464   Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2465   transition.
2466
2467   There are two terms at use here: state as a TRIE_NODEIDX() which is
2468   a number representing the first entry of the node, and state as a
2469   TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2470   and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2471   if there are 2 entrys per node. eg:
2472
2473    A B       A B
2474   1. 2 4    1. 3 7
2475   2. 0 3    3. 0 5
2476   3. 0 0    5. 0 0
2477   4. 0 0    7. 0 0
2478
2479   The table is internally in the right hand, idx form. However as we
2480   also have to deal with the states array which is indexed by nodenum
2481   we have to use TRIE_NODENUM() to convert.
2482
2483   */
2484   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2485    "%*sCompiling trie using table compiler\n",
2486    (int)depth * 2 + 2, ""));
2487
2488   trie->trans = (reg_trie_trans *)
2489    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2490         * trie->uniquecharcount + 1,
2491         sizeof(reg_trie_trans) );
2492   trie->states = (reg_trie_state *)
2493    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2494         sizeof(reg_trie_state) );
2495   next_alloc = trie->uniquecharcount + 1;
2496
2497
2498   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2499
2500    regnode *noper   = NEXTOPER( cur );
2501    const U8 *uc     = (U8*)STRING( noper );
2502    const U8 *e      = uc + STR_LEN( noper );
2503
2504    U32 state        = 1;         /* required init */
2505
2506    U16 charid       = 0;         /* sanity init */
2507    U32 accept_state = 0;         /* sanity init */
2508
2509    U32 wordlen      = 0;         /* required init */
2510
2511    if (OP(noper) == NOTHING) {
2512     regnode *noper_next= regnext(noper);
2513     if (noper_next != tail && OP(noper_next) == flags) {
2514      noper = noper_next;
2515      uc= (U8*)STRING(noper);
2516      e= uc + STR_LEN(noper);
2517     }
2518    }
2519
2520    if ( OP(noper) != NOTHING ) {
2521     for ( ; uc < e ; uc += len ) {
2522
2523      TRIE_READ_CHAR;
2524
2525      if ( uvc < 256 ) {
2526       charid = trie->charmap[ uvc ];
2527      } else {
2528       SV* const * const svpp = hv_fetch( widecharmap,
2529               (char*)&uvc,
2530               sizeof( UV ),
2531               0);
2532       charid = svpp ? (U16)SvIV(*svpp) : 0;
2533      }
2534      if ( charid ) {
2535       charid--;
2536       if ( !trie->trans[ state + charid ].next ) {
2537        trie->trans[ state + charid ].next = next_alloc;
2538        trie->trans[ state ].check++;
2539        prev_states[TRIE_NODENUM(next_alloc)]
2540          = TRIE_NODENUM(state);
2541        next_alloc += trie->uniquecharcount;
2542       }
2543       state = trie->trans[ state + charid ].next;
2544      } else {
2545       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2546      }
2547      /* charid is now 0 if we dont know the char read, or
2548      * nonzero if we do */
2549     }
2550    }
2551    accept_state = TRIE_NODENUM( state );
2552    TRIE_HANDLE_WORD(accept_state);
2553
2554   } /* end second pass */
2555
2556   /* and now dump it out before we compress it */
2557   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2558               revcharmap,
2559               next_alloc, depth+1));
2560
2561   {
2562   /*
2563   * Inplace compress the table.*
2564
2565   For sparse data sets the table constructed by the trie algorithm will
2566   be mostly 0/FAIL transitions or to put it another way mostly empty.
2567   (Note that leaf nodes will not contain any transitions.)
2568
2569   This algorithm compresses the tables by eliminating most such
2570   transitions, at the cost of a modest bit of extra work during lookup:
2571
2572   - Each states[] entry contains a .base field which indicates the
2573   index in the state[] array wheres its transition data is stored.
2574
2575   - If .base is 0 there are no valid transitions from that node.
2576
2577   - If .base is nonzero then charid is added to it to find an entry in
2578   the trans array.
2579
2580   -If trans[states[state].base+charid].check!=state then the
2581   transition is taken to be a 0/Fail transition. Thus if there are fail
2582   transitions at the front of the node then the .base offset will point
2583   somewhere inside the previous nodes data (or maybe even into a node
2584   even earlier), but the .check field determines if the transition is
2585   valid.
2586
2587   XXX - wrong maybe?
2588   The following process inplace converts the table to the compressed
2589   table: We first do not compress the root node 1,and mark all its
2590   .check pointers as 1 and set its .base pointer as 1 as well. This
2591   allows us to do a DFA construction from the compressed table later,
2592   and ensures that any .base pointers we calculate later are greater
2593   than 0.
2594
2595   - We set 'pos' to indicate the first entry of the second node.
2596
2597   - We then iterate over the columns of the node, finding the first and
2598   last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2599   and set the .check pointers accordingly, and advance pos
2600   appropriately and repreat for the next node. Note that when we copy
2601   the next pointers we have to convert them from the original
2602   NODEIDX form to NODENUM form as the former is not valid post
2603   compression.
2604
2605   - If a node has no transitions used we mark its base as 0 and do not
2606   advance the pos pointer.
2607
2608   - If a node only has one transition we use a second pointer into the
2609   structure to fill in allocated fail transitions from other states.
2610   This pointer is independent of the main pointer and scans forward
2611   looking for null transitions that are allocated to a state. When it
2612   finds one it writes the single transition into the "hole".  If the
2613   pointer doesnt find one the single transition is appended as normal.
2614
2615   - Once compressed we can Renew/realloc the structures to release the
2616   excess space.
2617
2618   See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2619   specifically Fig 3.47 and the associated pseudocode.
2620
2621   demq
2622   */
2623   const U32 laststate = TRIE_NODENUM( next_alloc );
2624   U32 state, charid;
2625   U32 pos = 0, zp=0;
2626   trie->statecount = laststate;
2627
2628   for ( state = 1 ; state < laststate ; state++ ) {
2629    U8 flag = 0;
2630    const U32 stateidx = TRIE_NODEIDX( state );
2631    const U32 o_used = trie->trans[ stateidx ].check;
2632    U32 used = trie->trans[ stateidx ].check;
2633    trie->trans[ stateidx ].check = 0;
2634
2635    for ( charid = 0;
2636     used && charid < trie->uniquecharcount;
2637     charid++ )
2638    {
2639     if ( flag || trie->trans[ stateidx + charid ].next ) {
2640      if ( trie->trans[ stateidx + charid ].next ) {
2641       if (o_used == 1) {
2642        for ( ; zp < pos ; zp++ ) {
2643         if ( ! trie->trans[ zp ].next ) {
2644          break;
2645         }
2646        }
2647        trie->states[ state ].trans.base
2648              = zp
2649              + trie->uniquecharcount
2650              - charid ;
2651        trie->trans[ zp ].next
2652         = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2653                + charid ].next );
2654        trie->trans[ zp ].check = state;
2655        if ( ++zp > pos ) pos = zp;
2656        break;
2657       }
2658       used--;
2659      }
2660      if ( !flag ) {
2661       flag = 1;
2662       trie->states[ state ].trans.base
2663          = pos + trie->uniquecharcount - charid ;
2664      }
2665      trie->trans[ pos ].next
2666       = SAFE_TRIE_NODENUM(
2667          trie->trans[ stateidx + charid ].next );
2668      trie->trans[ pos ].check = state;
2669      pos++;
2670     }
2671    }
2672   }
2673   trie->lasttrans = pos + 1;
2674   trie->states = (reg_trie_state *)
2675    PerlMemShared_realloc( trie->states, laststate
2676         * sizeof(reg_trie_state) );
2677   DEBUG_TRIE_COMPILE_MORE_r(
2678    PerlIO_printf( Perl_debug_log,
2679     "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2680     (int)depth * 2 + 2,"",
2681     (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2682      + 1 ),
2683     (IV)next_alloc,
2684     (IV)pos,
2685     ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2686    );
2687
2688   } /* end table compress */
2689  }
2690  DEBUG_TRIE_COMPILE_MORE_r(
2691    PerlIO_printf(Perl_debug_log,
2692     "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2693     (int)depth * 2 + 2, "",
2694     (UV)trie->statecount,
2695     (UV)trie->lasttrans)
2696  );
2697  /* resize the trans array to remove unused space */
2698  trie->trans = (reg_trie_trans *)
2699   PerlMemShared_realloc( trie->trans, trie->lasttrans
2700        * sizeof(reg_trie_trans) );
2701
2702  {   /* Modify the program and insert the new TRIE node */
2703   U8 nodetype =(U8)(flags & 0xFF);
2704   char *str=NULL;
2705
2706 #ifdef DEBUGGING
2707   regnode *optimize = NULL;
2708 #ifdef RE_TRACK_PATTERN_OFFSETS
2709
2710   U32 mjd_offset = 0;
2711   U32 mjd_nodelen = 0;
2712 #endif /* RE_TRACK_PATTERN_OFFSETS */
2713 #endif /* DEBUGGING */
2714   /*
2715   This means we convert either the first branch or the first Exact,
2716   depending on whether the thing following (in 'last') is a branch
2717   or not and whther first is the startbranch (ie is it a sub part of
2718   the alternation or is it the whole thing.)
2719   Assuming its a sub part we convert the EXACT otherwise we convert
2720   the whole branch sequence, including the first.
2721   */
2722   /* Find the node we are going to overwrite */
2723   if ( first != startbranch || OP( last ) == BRANCH ) {
2724    /* branch sub-chain */
2725    NEXT_OFF( first ) = (U16)(last - first);
2726 #ifdef RE_TRACK_PATTERN_OFFSETS
2727    DEBUG_r({
2728     mjd_offset= Node_Offset((convert));
2729     mjd_nodelen= Node_Length((convert));
2730    });
2731 #endif
2732    /* whole branch chain */
2733   }
2734 #ifdef RE_TRACK_PATTERN_OFFSETS
2735   else {
2736    DEBUG_r({
2737     const  regnode *nop = NEXTOPER( convert );
2738     mjd_offset= Node_Offset((nop));
2739     mjd_nodelen= Node_Length((nop));
2740    });
2741   }
2742   DEBUG_OPTIMISE_r(
2743    PerlIO_printf(Perl_debug_log,
2744     "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2745     (int)depth * 2 + 2, "",
2746     (UV)mjd_offset, (UV)mjd_nodelen)
2747   );
2748 #endif
2749   /* But first we check to see if there is a common prefix we can
2750   split out as an EXACT and put in front of the TRIE node.  */
2751   trie->startstate= 1;
2752   if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2753    U32 state;
2754    for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2755     U32 ofs = 0;
2756     I32 idx = -1;
2757     U32 count = 0;
2758     const U32 base = trie->states[ state ].trans.base;
2759
2760     if ( trie->states[state].wordnum )
2761       count = 1;
2762
2763     for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2764      if ( ( base + ofs >= trie->uniquecharcount ) &&
2765       ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2766       trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2767      {
2768       if ( ++count > 1 ) {
2769        SV **tmp = av_fetch( revcharmap, ofs, 0);
2770        const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2771        if ( state == 1 ) break;
2772        if ( count == 2 ) {
2773         Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2774         DEBUG_OPTIMISE_r(
2775          PerlIO_printf(Perl_debug_log,
2776           "%*sNew Start State=%"UVuf" Class: [",
2777           (int)depth * 2 + 2, "",
2778           (UV)state));
2779         if (idx >= 0) {
2780          SV ** const tmp = av_fetch( revcharmap, idx, 0);
2781          const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2782
2783          TRIE_BITMAP_SET(trie,*ch);
2784          if ( folder )
2785           TRIE_BITMAP_SET(trie, folder[ *ch ]);
2786          DEBUG_OPTIMISE_r(
2787           PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2788          );
2789         }
2790        }
2791        TRIE_BITMAP_SET(trie,*ch);
2792        if ( folder )
2793         TRIE_BITMAP_SET(trie,folder[ *ch ]);
2794        DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2795       }
2796       idx = ofs;
2797      }
2798     }
2799     if ( count == 1 ) {
2800      SV **tmp = av_fetch( revcharmap, idx, 0);
2801      STRLEN len;
2802      char *ch = SvPV( *tmp, len );
2803      DEBUG_OPTIMISE_r({
2804       SV *sv=sv_newmortal();
2805       PerlIO_printf( Perl_debug_log,
2806        "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2807        (int)depth * 2 + 2, "",
2808        (UV)state, (UV)idx,
2809        pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2810         PL_colors[0], PL_colors[1],
2811         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2812         PERL_PV_ESCAPE_FIRSTCHAR
2813        )
2814       );
2815      });
2816      if ( state==1 ) {
2817       OP( convert ) = nodetype;
2818       str=STRING(convert);
2819       STR_LEN(convert)=0;
2820      }
2821      STR_LEN(convert) += len;
2822      while (len--)
2823       *str++ = *ch++;
2824     } else {
2825 #ifdef DEBUGGING
2826      if (state>1)
2827       DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2828 #endif
2829      break;
2830     }
2831    }
2832    trie->prefixlen = (state-1);
2833    if (str) {
2834     regnode *n = convert+NODE_SZ_STR(convert);
2835     NEXT_OFF(convert) = NODE_SZ_STR(convert);
2836     trie->startstate = state;
2837     trie->minlen -= (state - 1);
2838     trie->maxlen -= (state - 1);
2839 #ifdef DEBUGGING
2840    /* At least the UNICOS C compiler choked on this
2841     * being argument to DEBUG_r(), so let's just have
2842     * it right here. */
2843    if (
2844 #ifdef PERL_EXT_RE_BUILD
2845     1
2846 #else
2847     DEBUG_r_TEST
2848 #endif
2849     ) {
2850     regnode *fix = convert;
2851     U32 word = trie->wordcount;
2852     mjd_nodelen++;
2853     Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2854     while( ++fix < n ) {
2855      Set_Node_Offset_Length(fix, 0, 0);
2856     }
2857     while (word--) {
2858      SV ** const tmp = av_fetch( trie_words, word, 0 );
2859      if (tmp) {
2860       if ( STR_LEN(convert) <= SvCUR(*tmp) )
2861        sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2862       else
2863        sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2864      }
2865     }
2866    }
2867 #endif
2868     if (trie->maxlen) {
2869      convert = n;
2870     } else {
2871      NEXT_OFF(convert) = (U16)(tail - convert);
2872      DEBUG_r(optimize= n);
2873     }
2874    }
2875   }
2876   if (!jumper)
2877    jumper = last;
2878   if ( trie->maxlen ) {
2879    NEXT_OFF( convert ) = (U16)(tail - convert);
2880    ARG_SET( convert, data_slot );
2881    /* Store the offset to the first unabsorbed branch in
2882    jump[0], which is otherwise unused by the jump logic.
2883    We use this when dumping a trie and during optimisation. */
2884    if (trie->jump)
2885     trie->jump[0] = (U16)(nextbranch - convert);
2886
2887    /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2888    *   and there is a bitmap
2889    *   and the first "jump target" node we found leaves enough room
2890    * then convert the TRIE node into a TRIEC node, with the bitmap
2891    * embedded inline in the opcode - this is hypothetically faster.
2892    */
2893    if ( !trie->states[trie->startstate].wordnum
2894     && trie->bitmap
2895     && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2896    {
2897     OP( convert ) = TRIEC;
2898     Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2899     PerlMemShared_free(trie->bitmap);
2900     trie->bitmap= NULL;
2901    } else
2902     OP( convert ) = TRIE;
2903
2904    /* store the type in the flags */
2905    convert->flags = nodetype;
2906    DEBUG_r({
2907    optimize = convert
2908      + NODE_STEP_REGNODE
2909      + regarglen[ OP( convert ) ];
2910    });
2911    /* XXX We really should free up the resource in trie now,
2912     as we won't use them - (which resources?) dmq */
2913   }
2914   /* needed for dumping*/
2915   DEBUG_r(if (optimize) {
2916    regnode *opt = convert;
2917
2918    while ( ++opt < optimize) {
2919     Set_Node_Offset_Length(opt,0,0);
2920    }
2921    /*
2922     Try to clean up some of the debris left after the
2923     optimisation.
2924    */
2925    while( optimize < jumper ) {
2926     mjd_nodelen += Node_Length((optimize));
2927     OP( optimize ) = OPTIMIZED;
2928     Set_Node_Offset_Length(optimize,0,0);
2929     optimize++;
2930    }
2931    Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2932   });
2933  } /* end node insert */
2934  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2935
2936  /*  Finish populating the prev field of the wordinfo array.  Walk back
2937  *  from each accept state until we find another accept state, and if
2938  *  so, point the first word's .prev field at the second word. If the
2939  *  second already has a .prev field set, stop now. This will be the
2940  *  case either if we've already processed that word's accept state,
2941  *  or that state had multiple words, and the overspill words were
2942  *  already linked up earlier.
2943  */
2944  {
2945   U16 word;
2946   U32 state;
2947   U16 prev;
2948
2949   for (word=1; word <= trie->wordcount; word++) {
2950    prev = 0;
2951    if (trie->wordinfo[word].prev)
2952     continue;
2953    state = trie->wordinfo[word].accept;
2954    while (state) {
2955     state = prev_states[state];
2956     if (!state)
2957      break;
2958     prev = trie->states[state].wordnum;
2959     if (prev)
2960      break;
2961    }
2962    trie->wordinfo[word].prev = prev;
2963   }
2964   Safefree(prev_states);
2965  }
2966
2967
2968  /* and now dump out the compressed format */
2969  DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2970
2971  RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2972 #ifdef DEBUGGING
2973  RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2974  RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2975 #else
2976  SvREFCNT_dec_NN(revcharmap);
2977 #endif
2978  return trie->jump
2979   ? MADE_JUMP_TRIE
2980   : trie->startstate>1
2981    ? MADE_EXACT_TRIE
2982    : MADE_TRIE;
2983 }
2984
2985 STATIC void
2986 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2987 {
2988 /* The Trie is constructed and compressed now so we can build a fail array if
2989  * it's needed
2990
2991    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2992    3.32 in the
2993    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2994    Ullman 1985/88
2995    ISBN 0-201-10088-6
2996
2997    We find the fail state for each state in the trie, this state is the longest
2998    proper suffix of the current state's 'word' that is also a proper prefix of
2999    another word in our trie. State 1 represents the word '' and is thus the
3000    default fail state. This allows the DFA not to have to restart after its
3001    tried and failed a word at a given point, it simply continues as though it
3002    had been matching the other word in the first place.
3003    Consider
3004  'abcdgu'=~/abcdefg|cdgu/
3005    When we get to 'd' we are still matching the first word, we would encounter
3006    'g' which would fail, which would bring us to the state representing 'd' in
3007    the second word where we would try 'g' and succeed, proceeding to match
3008    'cdgu'.
3009  */
3010  /* add a fail transition */
3011  const U32 trie_offset = ARG(source);
3012  reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3013  U32 *q;
3014  const U32 ucharcount = trie->uniquecharcount;
3015  const U32 numstates = trie->statecount;
3016  const U32 ubound = trie->lasttrans + ucharcount;
3017  U32 q_read = 0;
3018  U32 q_write = 0;
3019  U32 charid;
3020  U32 base = trie->states[ 1 ].trans.base;
3021  U32 *fail;
3022  reg_ac_data *aho;
3023  const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3024  GET_RE_DEBUG_FLAGS_DECL;
3025
3026  PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
3027 #ifndef DEBUGGING
3028  PERL_UNUSED_ARG(depth);
3029 #endif
3030
3031
3032  ARG_SET( stclass, data_slot );
3033  aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3034  RExC_rxi->data->data[ data_slot ] = (void*)aho;
3035  aho->trie=trie_offset;
3036  aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3037  Copy( trie->states, aho->states, numstates, reg_trie_state );
3038  Newxz( q, numstates, U32);
3039  aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3040  aho->refcount = 1;
3041  fail = aho->fail;
3042  /* initialize fail[0..1] to be 1 so that we always have
3043  a valid final fail state */
3044  fail[ 0 ] = fail[ 1 ] = 1;
3045
3046  for ( charid = 0; charid < ucharcount ; charid++ ) {
3047   const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3048   if ( newstate ) {
3049    q[ q_write ] = newstate;
3050    /* set to point at the root */
3051    fail[ q[ q_write++ ] ]=1;
3052   }
3053  }
3054  while ( q_read < q_write) {
3055   const U32 cur = q[ q_read++ % numstates ];
3056   base = trie->states[ cur ].trans.base;
3057
3058   for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3059    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3060    if (ch_state) {
3061     U32 fail_state = cur;
3062     U32 fail_base;
3063     do {
3064      fail_state = fail[ fail_state ];
3065      fail_base = aho->states[ fail_state ].trans.base;
3066     } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3067
3068     fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3069     fail[ ch_state ] = fail_state;
3070     if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3071     {
3072       aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
3073     }
3074     q[ q_write++ % numstates] = ch_state;
3075    }
3076   }
3077  }
3078  /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3079  when we fail in state 1, this allows us to use the
3080  charclass scan to find a valid start char. This is based on the principle
3081  that theres a good chance the string being searched contains lots of stuff
3082  that cant be a start char.
3083  */
3084  fail[ 0 ] = fail[ 1 ] = 0;
3085  DEBUG_TRIE_COMPILE_r({
3086   PerlIO_printf(Perl_debug_log,
3087      "%*sStclass Failtable (%"UVuf" states): 0",
3088      (int)(depth * 2), "", (UV)numstates
3089   );
3090   for( q_read=1; q_read<numstates; q_read++ ) {
3091    PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3092   }
3093   PerlIO_printf(Perl_debug_log, "\n");
3094  });
3095  Safefree(q);
3096  /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3097 }
3098
3099
3100 #define DEBUG_PEEP(str,scan,depth) \
3101  DEBUG_OPTIMISE_r({if (scan){ \
3102  SV * const mysv=sv_newmortal(); \
3103  regnode *Next = regnext(scan); \
3104  regprop(RExC_rx, mysv, scan, NULL); \
3105  PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3106  (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3107  Next ? (REG_NODE_NUM(Next)) : 0 ); \
3108    }});
3109
3110
3111 /* The below joins as many adjacent EXACTish nodes as possible into a single
3112  * one.  The regop may be changed if the node(s) contain certain sequences that
3113  * require special handling.  The joining is only done if:
3114  * 1) there is room in the current conglomerated node to entirely contain the
3115  *    next one.
3116  * 2) they are the exact same node type
3117  *
3118  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3119  * these get optimized out
3120  *
3121  * If a node is to match under /i (folded), the number of characters it matches
3122  * can be different than its character length if it contains a multi-character
3123  * fold.  *min_subtract is set to the total delta number of characters of the
3124  * input nodes.
3125  *
3126  * And *unfolded_multi_char is set to indicate whether or not the node contains
3127  * an unfolded multi-char fold.  This happens when whether the fold is valid or
3128  * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3129  * SMALL LETTER SHARP S, as only if the target string being matched against
3130  * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3131  * folding rules depend on the locale in force at runtime.  (Multi-char folds
3132  * whose components are all above the Latin1 range are not run-time locale
3133  * dependent, and have already been folded by the time this function is
3134  * called.)
3135  *
3136  * This is as good a place as any to discuss the design of handling these
3137  * multi-character fold sequences.  It's been wrong in Perl for a very long
3138  * time.  There are three code points in Unicode whose multi-character folds
3139  * were long ago discovered to mess things up.  The previous designs for
3140  * dealing with these involved assigning a special node for them.  This
3141  * approach doesn't always work, as evidenced by this example:
3142  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
3143  * Both sides fold to "sss", but if the pattern is parsed to create a node that
3144  * would match just the \xDF, it won't be able to handle the case where a
3145  * successful match would have to cross the node's boundary.  The new approach
3146  * that hopefully generally solves the problem generates an EXACTFU_SS node
3147  * that is "sss" in this case.
3148  *
3149  * It turns out that there are problems with all multi-character folds, and not
3150  * just these three.  Now the code is general, for all such cases.  The
3151  * approach taken is:
3152  * 1)   This routine examines each EXACTFish node that could contain multi-
3153  *      character folded sequences.  Since a single character can fold into
3154  *      such a sequence, the minimum match length for this node is less than
3155  *      the number of characters in the node.  This routine returns in
3156  *      *min_subtract how many characters to subtract from the the actual
3157  *      length of the string to get a real minimum match length; it is 0 if
3158  *      there are no multi-char foldeds.  This delta is used by the caller to
3159  *      adjust the min length of the match, and the delta between min and max,
3160  *      so that the optimizer doesn't reject these possibilities based on size
3161  *      constraints.
3162  * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3163  *      is used for an EXACTFU node that contains at least one "ss" sequence in
3164  *      it.  For non-UTF-8 patterns and strings, this is the only case where
3165  *      there is a possible fold length change.  That means that a regular
3166  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
3167  *      with length changes, and so can be processed faster.  regexec.c takes
3168  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
3169  *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3170  *      known until runtime).  This saves effort in regex matching.  However,
3171  *      the pre-folding isn't done for non-UTF8 patterns because the fold of
3172  *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3173  *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
3174  *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
3175  *      possibilities for the non-UTF8 patterns are quite simple, except for
3176  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
3177  *      members of a fold-pair, and arrays are set up for all of them so that
3178  *      the other member of the pair can be found quickly.  Code elsewhere in
3179  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3180  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
3181  *      described in the next item.
3182  * 3)   A problem remains for unfolded multi-char folds. (These occur when the
3183  *      validity of the fold won't be known until runtime, and so must remain
3184  *      unfolded for now.  This happens for the sharp s in EXACTF and EXACTFA
3185  *      nodes when the pattern isn't in UTF-8.  (Note, BTW, that there cannot
3186  *      be an EXACTF node with a UTF-8 pattern.)  They also occur for various
3187  *      folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3188  *      The reason this is a problem is that the optimizer part of regexec.c
3189  *      (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3190  *      that a character in the pattern corresponds to at most a single
3191  *      character in the target string.  (And I do mean character, and not byte
3192  *      here, unlike other parts of the documentation that have never been
3193  *      updated to account for multibyte Unicode.)  sharp s in EXACTF and
3194  *      EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3195  *      it can match "\x{17F}\x{17F}".  These, along with other ones in EXACTFL
3196  *      nodes, violate the assumption, and they are the only instances where it
3197  *      is violated.  I'm reluctant to try to change the assumption, as the
3198  *      code involved is impenetrable to me (khw), so instead the code here
3199  *      punts.  This routine examines EXACTFL nodes, and (when the pattern
3200  *      isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3201  *      boolean indicating whether or not the node contains such a fold.  When
3202  *      it is true, the caller sets a flag that later causes the optimizer in
3203  *      this file to not set values for the floating and fixed string lengths,
3204  *      and thus avoids the optimizer code in regexec.c that makes the invalid
3205  *      assumption.  Thus, there is no optimization based on string lengths for
3206  *      EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3207  *      EXACTF and EXACTFA nodes that contain the sharp s.  (The reason the
3208  *      assumption is wrong only in these cases is that all other non-UTF-8
3209  *      folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3210  *      their expanded versions.  (Again, we can't prefold sharp s to 'ss' in
3211  *      EXACTF nodes because we don't know at compile time if it actually
3212  *      matches 'ss' or not.  For EXACTF nodes it will match iff the target
3213  *      string is in UTF-8.  This is in contrast to EXACTFU nodes, where it
3214  *      always matches; and EXACTFA where it never does.  In an EXACTFA node in
3215  *      a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3216  *      problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3217  *      string would require the pattern to be forced into UTF-8, the overhead
3218  *      of which we want to avoid.  Similarly the unfolded multi-char folds in
3219  *      EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3220  *      locale.)
3221  *
3222  *      Similarly, the code that generates tries doesn't currently handle
3223  *      not-already-folded multi-char folds, and it looks like a pain to change
3224  *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
3225  *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
3226  *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
3227  *      using /iaa matching will be doing so almost entirely with ASCII
3228  *      strings, so this should rarely be encountered in practice */
3229
3230 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3231  if (PL_regkind[OP(scan)] == EXACT) \
3232   join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3233
3234 STATIC U32
3235 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3236     UV *min_subtract, bool *unfolded_multi_char,
3237     U32 flags,regnode *val, U32 depth)
3238 {
3239  /* Merge several consecutive EXACTish nodes into one. */
3240  regnode *n = regnext(scan);
3241  U32 stringok = 1;
3242  regnode *next = scan + NODE_SZ_STR(scan);
3243  U32 merged = 0;
3244  U32 stopnow = 0;
3245 #ifdef DEBUGGING
3246  regnode *stop = scan;
3247  GET_RE_DEBUG_FLAGS_DECL;
3248 #else
3249  PERL_UNUSED_ARG(depth);
3250 #endif
3251
3252  PERL_ARGS_ASSERT_JOIN_EXACT;
3253 #ifndef EXPERIMENTAL_INPLACESCAN
3254  PERL_UNUSED_ARG(flags);
3255  PERL_UNUSED_ARG(val);
3256 #endif
3257  DEBUG_PEEP("join",scan,depth);
3258
3259  /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
3260  * EXACT ones that are mergeable to the current one. */
3261  while (n
3262   && (PL_regkind[OP(n)] == NOTHING
3263    || (stringok && OP(n) == OP(scan)))
3264   && NEXT_OFF(n)
3265   && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3266  {
3267
3268   if (OP(n) == TAIL || n > next)
3269    stringok = 0;
3270   if (PL_regkind[OP(n)] == NOTHING) {
3271    DEBUG_PEEP("skip:",n,depth);
3272    NEXT_OFF(scan) += NEXT_OFF(n);
3273    next = n + NODE_STEP_REGNODE;
3274 #ifdef DEBUGGING
3275    if (stringok)
3276     stop = n;
3277 #endif
3278    n = regnext(n);
3279   }
3280   else if (stringok) {
3281    const unsigned int oldl = STR_LEN(scan);
3282    regnode * const nnext = regnext(n);
3283
3284    /* XXX I (khw) kind of doubt that this works on platforms (should
3285    * Perl ever run on one) where U8_MAX is above 255 because of lots
3286    * of other assumptions */
3287    /* Don't join if the sum can't fit into a single node */
3288    if (oldl + STR_LEN(n) > U8_MAX)
3289     break;
3290
3291    DEBUG_PEEP("merg",n,depth);
3292    merged++;
3293
3294    NEXT_OFF(scan) += NEXT_OFF(n);
3295    STR_LEN(scan) += STR_LEN(n);
3296    next = n + NODE_SZ_STR(n);
3297    /* Now we can overwrite *n : */
3298    Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3299 #ifdef DEBUGGING
3300    stop = next - 1;
3301 #endif
3302    n = nnext;
3303    if (stopnow) break;
3304   }
3305
3306 #ifdef EXPERIMENTAL_INPLACESCAN
3307   if (flags && !NEXT_OFF(n)) {
3308    DEBUG_PEEP("atch", val, depth);
3309    if (reg_off_by_arg[OP(n)]) {
3310     ARG_SET(n, val - n);
3311    }
3312    else {
3313     NEXT_OFF(n) = val - n;
3314    }
3315    stopnow = 1;
3316   }
3317 #endif
3318  }
3319
3320  *min_subtract = 0;
3321  *unfolded_multi_char = FALSE;
3322
3323  /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
3324  * can now analyze for sequences of problematic code points.  (Prior to
3325  * this final joining, sequences could have been split over boundaries, and
3326  * hence missed).  The sequences only happen in folding, hence for any
3327  * non-EXACT EXACTish node */
3328  if (OP(scan) != EXACT) {
3329   U8* s0 = (U8*) STRING(scan);
3330   U8* s = s0;
3331   U8* s_end = s0 + STR_LEN(scan);
3332
3333   int total_count_delta = 0;  /* Total delta number of characters that
3334          multi-char folds expand to */
3335
3336   /* One pass is made over the node's string looking for all the
3337   * possibilities.  To avoid some tests in the loop, there are two main
3338   * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3339   * non-UTF-8 */
3340   if (UTF) {
3341    U8* folded = NULL;
3342
3343    if (OP(scan) == EXACTFL) {
3344     U8 *d;
3345
3346     /* An EXACTFL node would already have been changed to another
3347     * node type unless there is at least one character in it that
3348     * is problematic; likely a character whose fold definition
3349     * won't be known until runtime, and so has yet to be folded.
3350     * For all but the UTF-8 locale, folds are 1-1 in length, but
3351     * to handle the UTF-8 case, we need to create a temporary
3352     * folded copy using UTF-8 locale rules in order to analyze it.
3353     * This is because our macros that look to see if a sequence is
3354     * a multi-char fold assume everything is folded (otherwise the
3355     * tests in those macros would be too complicated and slow).
3356     * Note that here, the non-problematic folds will have already
3357     * been done, so we can just copy such characters.  We actually
3358     * don't completely fold the EXACTFL string.  We skip the
3359     * unfolded multi-char folds, as that would just create work
3360     * below to figure out the size they already are */
3361
3362     Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3363     d = folded;
3364     while (s < s_end) {
3365      STRLEN s_len = UTF8SKIP(s);
3366      if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3367       Copy(s, d, s_len, U8);
3368       d += s_len;
3369      }
3370      else if (is_FOLDS_TO_MULTI_utf8(s)) {
3371       *unfolded_multi_char = TRUE;
3372       Copy(s, d, s_len, U8);
3373       d += s_len;
3374      }
3375      else if (isASCII(*s)) {
3376       *(d++) = toFOLD(*s);
3377      }
3378      else {
3379       STRLEN len;
3380       _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3381       d += len;
3382      }
3383      s += s_len;
3384     }
3385
3386     /* Point the remainder of the routine to look at our temporary
3387     * folded copy */
3388     s = folded;
3389     s_end = d;
3390    } /* End of creating folded copy of EXACTFL string */
3391
3392    /* Examine the string for a multi-character fold sequence.  UTF-8
3393    * patterns have all characters pre-folded by the time this code is
3394    * executed */
3395    while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3396          length sequence we are looking for is 2 */
3397    {
3398     int count = 0;  /* How many characters in a multi-char fold */
3399     int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3400     if (! len) {    /* Not a multi-char fold: get next char */
3401      s += UTF8SKIP(s);
3402      continue;
3403     }
3404
3405     /* Nodes with 'ss' require special handling, except for
3406     * EXACTFA-ish for which there is no multi-char fold to this */
3407     if (len == 2 && *s == 's' && *(s+1) == 's'
3408      && OP(scan) != EXACTFA
3409      && OP(scan) != EXACTFA_NO_TRIE)
3410     {
3411      count = 2;
3412      if (OP(scan) != EXACTFL) {
3413       OP(scan) = EXACTFU_SS;
3414      }
3415      s += 2;
3416     }
3417     else { /* Here is a generic multi-char fold. */
3418      U8* multi_end  = s + len;
3419
3420      /* Count how many characters in it.  In the case of /aa, no
3421      * folds which contain ASCII code points are allowed, so
3422      * check for those, and skip if found. */
3423      if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3424       count = utf8_length(s, multi_end);
3425       s = multi_end;
3426      }
3427      else {
3428       while (s < multi_end) {
3429        if (isASCII(*s)) {
3430         s++;
3431         goto next_iteration;
3432        }
3433        else {
3434         s += UTF8SKIP(s);
3435        }
3436        count++;
3437       }
3438      }
3439     }
3440
3441     /* The delta is how long the sequence is minus 1 (1 is how long
3442     * the character that folds to the sequence is) */
3443     total_count_delta += count - 1;
3444    next_iteration: ;
3445    }
3446
3447    /* We created a temporary folded copy of the string in EXACTFL
3448    * nodes.  Therefore we need to be sure it doesn't go below zero,
3449    * as the real string could be shorter */
3450    if (OP(scan) == EXACTFL) {
3451     int total_chars = utf8_length((U8*) STRING(scan),
3452           (U8*) STRING(scan) + STR_LEN(scan));
3453     if (total_count_delta > total_chars) {
3454      total_count_delta = total_chars;
3455     }
3456    }
3457
3458    *min_subtract += total_count_delta;
3459    Safefree(folded);
3460   }
3461   else if (OP(scan) == EXACTFA) {
3462
3463    /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
3464    * fold to the ASCII range (and there are no existing ones in the
3465    * upper latin1 range).  But, as outlined in the comments preceding
3466    * this function, we need to flag any occurrences of the sharp s.
3467    * This character forbids trie formation (because of added
3468    * complexity) */
3469    while (s < s_end) {
3470     if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3471      OP(scan) = EXACTFA_NO_TRIE;
3472      *unfolded_multi_char = TRUE;
3473      break;
3474     }
3475     s++;
3476     continue;
3477    }
3478   }
3479   else {
3480
3481    /* Non-UTF-8 pattern, not EXACTFA node.  Look for the multi-char
3482    * folds that are all Latin1.  As explained in the comments
3483    * preceding this function, we look also for the sharp s in EXACTF
3484    * and EXACTFL nodes; it can be in the final position.  Otherwise
3485    * we can stop looking 1 byte earlier because have to find at least
3486    * two characters for a multi-fold */
3487    const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3488        ? s_end
3489        : s_end -1;
3490
3491    while (s < upper) {
3492     int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3493     if (! len) {    /* Not a multi-char fold. */
3494      if (*s == LATIN_SMALL_LETTER_SHARP_S
3495       && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3496      {
3497       *unfolded_multi_char = TRUE;
3498      }
3499      s++;
3500      continue;
3501     }
3502
3503     if (len == 2
3504      && isARG2_lower_or_UPPER_ARG1('s', *s)
3505      && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3506     {
3507
3508      /* EXACTF nodes need to know that the minimum length
3509      * changed so that a sharp s in the string can match this
3510      * ss in the pattern, but they remain EXACTF nodes, as they
3511      * won't match this unless the target string is is UTF-8,
3512      * which we don't know until runtime.  EXACTFL nodes can't
3513      * transform into EXACTFU nodes */
3514      if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3515       OP(scan) = EXACTFU_SS;
3516      }
3517     }
3518
3519     *min_subtract += len - 1;
3520     s += len;
3521    }
3522   }
3523  }
3524
3525 #ifdef DEBUGGING
3526  /* Allow dumping but overwriting the collection of skipped
3527  * ops and/or strings with fake optimized ops */
3528  n = scan + NODE_SZ_STR(scan);
3529  while (n <= stop) {
3530   OP(n) = OPTIMIZED;
3531   FLAGS(n) = 0;
3532   NEXT_OFF(n) = 0;
3533   n++;
3534  }
3535 #endif
3536  DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3537  return stopnow;
3538 }
3539
3540 /* REx optimizer.  Converts nodes into quicker variants "in place".
3541    Finds fixed substrings.  */
3542
3543 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3544    to the position after last scanned or to NULL. */
3545
3546 #define INIT_AND_WITHP \
3547  assert(!and_withp); \
3548  Newx(and_withp,1, regnode_ssc); \
3549  SAVEFREEPV(and_withp)
3550
3551 /* this is a chain of data about sub patterns we are processing that
3552    need to be handled separately/specially in study_chunk. Its so
3553    we can simulate recursion without losing state.  */
3554 struct scan_frame;
3555 typedef struct scan_frame {
3556  regnode *last;  /* last node to process in this frame */
3557  regnode *next;  /* next node to process when last is reached */
3558  struct scan_frame *prev; /*previous frame*/
3559  U32 prev_recursed_depth;
3560  I32 stop; /* what stopparen do we use */
3561 } scan_frame;
3562
3563
3564 STATIC SSize_t
3565 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3566       SSize_t *minlenp, SSize_t *deltap,
3567       regnode *last,
3568       scan_data_t *data,
3569       I32 stopparen,
3570       U32 recursed_depth,
3571       regnode_ssc *and_withp,
3572       U32 flags, U32 depth)
3573       /* scanp: Start here (read-write). */
3574       /* deltap: Write maxlen-minlen here. */
3575       /* last: Stop before this one. */
3576       /* data: string data about the pattern */
3577       /* stopparen: treat close N as END */
3578       /* recursed: which subroutines have we recursed into */
3579       /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3580 {
3581  dVAR;
3582  /* There must be at least this number of characters to match */
3583  SSize_t min = 0;
3584  I32 pars = 0, code;
3585  regnode *scan = *scanp, *next;
3586  SSize_t delta = 0;
3587  int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3588  int is_inf_internal = 0;  /* The studied chunk is infinite */
3589  I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3590  scan_data_t data_fake;
3591  SV *re_trie_maxbuff = NULL;
3592  regnode *first_non_open = scan;
3593  SSize_t stopmin = SSize_t_MAX;
3594  scan_frame *frame = NULL;
3595  GET_RE_DEBUG_FLAGS_DECL;
3596
3597  PERL_ARGS_ASSERT_STUDY_CHUNK;
3598
3599 #ifdef DEBUGGING
3600  StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3601 #endif
3602  if ( depth == 0 ) {
3603   while (first_non_open && OP(first_non_open) == OPEN)
3604    first_non_open=regnext(first_non_open);
3605  }
3606
3607
3608   fake_study_recurse:
3609  while ( scan && OP(scan) != END && scan < last ){
3610   UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3611         node length to get a real minimum (because
3612         the folded version may be shorter) */
3613   bool unfolded_multi_char = FALSE;
3614   /* Peephole optimizer: */
3615   DEBUG_OPTIMISE_MORE_r(
3616   {
3617    PerlIO_printf(Perl_debug_log,
3618     "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3619     ((int) depth*2), "", (long)stopparen,
3620     (unsigned long)depth, (unsigned long)recursed_depth);
3621    if (recursed_depth) {
3622     U32 i;
3623     U32 j;
3624     for ( j = 0 ; j < recursed_depth ; j++ ) {
3625      PerlIO_printf(Perl_debug_log,"[");
3626      for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3627       PerlIO_printf(Perl_debug_log,"%d",
3628        PAREN_TEST(RExC_study_chunk_recursed +
3629          (j * RExC_study_chunk_recursed_bytes), i)
3630        ? 1 : 0
3631       );
3632      PerlIO_printf(Perl_debug_log,"]");
3633     }
3634    }
3635    PerlIO_printf(Perl_debug_log,"\n");
3636   }
3637   );
3638   DEBUG_STUDYDATA("Peep:", data, depth);
3639   DEBUG_PEEP("Peep", scan, depth);
3640
3641
3642   /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3643   * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3644   * by a different invocation of reg() -- Yves
3645   */
3646   JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3647
3648   /* Follow the next-chain of the current node and optimize
3649   away all the NOTHINGs from it.  */
3650   if (OP(scan) != CURLYX) {
3651    const int max = (reg_off_by_arg[OP(scan)]
3652      ? I32_MAX
3653      /* I32 may be smaller than U16 on CRAYs! */
3654      : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3655    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3656    int noff;
3657    regnode *n = scan;
3658
3659    /* Skip NOTHING and LONGJMP. */
3660    while ((n = regnext(n))
3661     && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3662      || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3663     && off + noff < max)
3664     off += noff;
3665    if (reg_off_by_arg[OP(scan)])
3666     ARG(scan) = off;
3667    else
3668     NEXT_OFF(scan) = off;
3669   }
3670
3671
3672
3673   /* The principal pseudo-switch.  Cannot be a switch, since we
3674   look into several different things.  */
3675   if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3676     || OP(scan) == IFTHEN) {
3677    next = regnext(scan);
3678    code = OP(scan);
3679    /* demq: the op(next)==code check is to see if we have
3680    * "branch-branch" AFAICT */
3681
3682    if (OP(next) == code || code == IFTHEN) {
3683     /* NOTE - There is similar code to this block below for
3684     * handling TRIE nodes on a re-study.  If you change stuff here
3685     * check there too. */
3686     SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3687     regnode_ssc accum;
3688     regnode * const startbranch=scan;
3689
3690     if (flags & SCF_DO_SUBSTR) {
3691      /* Cannot merge strings after this. */
3692      scan_commit(pRExC_state, data, minlenp, is_inf);
3693     }
3694
3695     if (flags & SCF_DO_STCLASS)
3696      ssc_init_zero(pRExC_state, &accum);
3697
3698     while (OP(scan) == code) {
3699      SSize_t deltanext, minnext, fake;
3700      I32 f = 0;
3701      regnode_ssc this_class;
3702
3703      num++;
3704      data_fake.flags = 0;
3705      if (data) {
3706       data_fake.whilem_c = data->whilem_c;
3707       data_fake.last_closep = data->last_closep;
3708      }
3709      else
3710       data_fake.last_closep = &fake;
3711
3712      data_fake.pos_delta = delta;
3713      next = regnext(scan);
3714      scan = NEXTOPER(scan);
3715      if (code != BRANCH)
3716       scan = NEXTOPER(scan);
3717      if (flags & SCF_DO_STCLASS) {
3718       ssc_init(pRExC_state, &this_class);
3719       data_fake.start_class = &this_class;
3720       f = SCF_DO_STCLASS_AND;
3721      }
3722      if (flags & SCF_WHILEM_VISITED_POS)
3723       f |= SCF_WHILEM_VISITED_POS;
3724
3725      /* we suppose the run is continuous, last=next...*/
3726      minnext = study_chunk(pRExC_state, &scan, minlenp,
3727          &deltanext, next, &data_fake, stopparen,
3728          recursed_depth, NULL, f,depth+1);
3729      if (min1 > minnext)
3730       min1 = minnext;
3731      if (deltanext == SSize_t_MAX) {
3732       is_inf = is_inf_internal = 1;
3733       max1 = SSize_t_MAX;
3734      } else if (max1 < minnext + deltanext)
3735       max1 = minnext + deltanext;
3736      scan = next;
3737      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3738       pars++;
3739      if (data_fake.flags & SCF_SEEN_ACCEPT) {
3740       if ( stopmin > minnext)
3741        stopmin = min + min1;
3742       flags &= ~SCF_DO_SUBSTR;
3743       if (data)
3744        data->flags |= SCF_SEEN_ACCEPT;
3745      }
3746      if (data) {
3747       if (data_fake.flags & SF_HAS_EVAL)
3748        data->flags |= SF_HAS_EVAL;
3749       data->whilem_c = data_fake.whilem_c;
3750      }
3751      if (flags & SCF_DO_STCLASS)
3752       ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3753     }
3754     if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3755      min1 = 0;
3756     if (flags & SCF_DO_SUBSTR) {
3757      data->pos_min += min1;
3758      if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3759       data->pos_delta = SSize_t_MAX;
3760      else
3761       data->pos_delta += max1 - min1;
3762      if (max1 != min1 || is_inf)
3763       data->longest = &(data->longest_float);
3764     }
3765     min += min1;
3766     if (delta == SSize_t_MAX
3767     || SSize_t_MAX - delta - (max1 - min1) < 0)
3768      delta = SSize_t_MAX;
3769     else
3770      delta += max1 - min1;
3771     if (flags & SCF_DO_STCLASS_OR) {
3772      ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3773      if (min1) {
3774       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3775       flags &= ~SCF_DO_STCLASS;
3776      }
3777     }
3778     else if (flags & SCF_DO_STCLASS_AND) {
3779      if (min1) {
3780       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3781       flags &= ~SCF_DO_STCLASS;
3782      }
3783      else {
3784       /* Switch to OR mode: cache the old value of
3785       * data->start_class */
3786       INIT_AND_WITHP;
3787       StructCopy(data->start_class, and_withp, regnode_ssc);
3788       flags &= ~SCF_DO_STCLASS_AND;
3789       StructCopy(&accum, data->start_class, regnode_ssc);
3790       flags |= SCF_DO_STCLASS_OR;
3791      }
3792     }
3793
3794     if (PERL_ENABLE_TRIE_OPTIMISATION &&
3795       OP( startbranch ) == BRANCH )
3796     {
3797     /* demq.
3798
3799     Assuming this was/is a branch we are dealing with: 'scan'
3800     now points at the item that follows the branch sequence,
3801     whatever it is. We now start at the beginning of the
3802     sequence and look for subsequences of
3803
3804     BRANCH->EXACT=>x1
3805     BRANCH->EXACT=>x2
3806     tail
3807
3808     which would be constructed from a pattern like
3809     /A|LIST|OF|WORDS/
3810
3811     If we can find such a subsequence we need to turn the first
3812     element into a trie and then add the subsequent branch exact
3813     strings to the trie.
3814
3815     We have two cases
3816
3817      1. patterns where the whole set of branches can be
3818       converted.
3819
3820      2. patterns where only a subset can be converted.
3821
3822     In case 1 we can replace the whole set with a single regop
3823     for the trie. In case 2 we need to keep the start and end
3824     branches so
3825
3826      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3827      becomes BRANCH TRIE; BRANCH X;
3828
3829     There is an additional case, that being where there is a
3830     common prefix, which gets split out into an EXACT like node
3831     preceding the TRIE node.
3832
3833     If x(1..n)==tail then we can do a simple trie, if not we make
3834     a "jump" trie, such that when we match the appropriate word
3835     we "jump" to the appropriate tail node. Essentially we turn
3836     a nested if into a case structure of sorts.
3837
3838     */
3839
3840      int made=0;
3841      if (!re_trie_maxbuff) {
3842       re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3843       if (!SvIOK(re_trie_maxbuff))
3844        sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3845      }
3846      if ( SvIV(re_trie_maxbuff)>=0  ) {
3847       regnode *cur;
3848       regnode *first = (regnode *)NULL;
3849       regnode *last = (regnode *)NULL;
3850       regnode *tail = scan;
3851       U8 trietype = 0;
3852       U32 count=0;
3853
3854 #ifdef DEBUGGING
3855       SV * const mysv = sv_newmortal();   /* for dumping */
3856 #endif
3857       /* var tail is used because there may be a TAIL
3858       regop in the way. Ie, the exacts will point to the
3859       thing following the TAIL, but the last branch will
3860       point at the TAIL. So we advance tail. If we
3861       have nested (?:) we may have to move through several
3862       tails.
3863       */
3864
3865       while ( OP( tail ) == TAIL ) {
3866        /* this is the TAIL generated by (?:) */
3867        tail = regnext( tail );
3868       }
3869
3870
3871       DEBUG_TRIE_COMPILE_r({
3872        regprop(RExC_rx, mysv, tail, NULL);
3873        PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3874        (int)depth * 2 + 2, "",
3875        "Looking for TRIE'able sequences. Tail node is: ",
3876        SvPV_nolen_const( mysv )
3877        );
3878       });
3879
3880       /*
3881
3882        Step through the branches
3883         cur represents each branch,
3884         noper is the first thing to be matched as part
3885          of that branch
3886         noper_next is the regnext() of that node.
3887
3888        We normally handle a case like this
3889        /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3890        support building with NOJUMPTRIE, which restricts
3891        the trie logic to structures like /FOO|BAR/.
3892
3893        If noper is a trieable nodetype then the branch is
3894        a possible optimization target. If we are building
3895        under NOJUMPTRIE then we require that noper_next is
3896        the same as scan (our current position in the regex
3897        program).
3898
3899        Once we have two or more consecutive such branches
3900        we can create a trie of the EXACT's contents and
3901        stitch it in place into the program.
3902
3903        If the sequence represents all of the branches in
3904        the alternation we replace the entire thing with a
3905        single TRIE node.
3906
3907        Otherwise when it is a subsequence we need to
3908        stitch it in place and replace only the relevant
3909        branches. This means the first branch has to remain
3910        as it is used by the alternation logic, and its
3911        next pointer, and needs to be repointed at the item
3912        on the branch chain following the last branch we
3913        have optimized away.
3914
3915        This could be either a BRANCH, in which case the
3916        subsequence is internal, or it could be the item
3917        following the branch sequence in which case the
3918        subsequence is at the end (which does not
3919        necessarily mean the first node is the start of the
3920        alternation).
3921
3922        TRIE_TYPE(X) is a define which maps the optype to a
3923        trietype.
3924
3925         optype          |  trietype
3926         ----------------+-----------
3927         NOTHING         | NOTHING
3928         EXACT           | EXACT
3929         EXACTFU         | EXACTFU
3930         EXACTFU_SS      | EXACTFU
3931         EXACTFA         | EXACTFA
3932
3933
3934       */
3935 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3936      ( EXACT == (X) )   ? EXACT :        \
3937      ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
3938      ( EXACTFA == (X) ) ? EXACTFA :        \
3939      0 )
3940
3941       /* dont use tail as the end marker for this traverse */
3942       for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3943        regnode * const noper = NEXTOPER( cur );
3944        U8 noper_type = OP( noper );
3945        U8 noper_trietype = TRIE_TYPE( noper_type );
3946 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3947        regnode * const noper_next = regnext( noper );
3948        U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3949        U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3950 #endif
3951
3952        DEBUG_TRIE_COMPILE_r({
3953         regprop(RExC_rx, mysv, cur, NULL);
3954         PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3955         (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3956
3957         regprop(RExC_rx, mysv, noper, NULL);
3958         PerlIO_printf( Perl_debug_log, " -> %s",
3959          SvPV_nolen_const(mysv));
3960
3961         if ( noper_next ) {
3962         regprop(RExC_rx, mysv, noper_next, NULL);
3963         PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3964          SvPV_nolen_const(mysv));
3965         }
3966         PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3967         REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3968         PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3969         );
3970        });
3971
3972        /* Is noper a trieable nodetype that can be merged
3973        * with the current trie (if there is one)? */
3974        if ( noper_trietype
3975         &&
3976         (
3977           ( noper_trietype == NOTHING)
3978           || ( trietype == NOTHING )
3979           || ( trietype == noper_trietype )
3980         )
3981 #ifdef NOJUMPTRIE
3982         && noper_next == tail
3983 #endif
3984         && count < U16_MAX)
3985        {
3986         /* Handle mergable triable node Either we are
3987         * the first node in a new trieable sequence,
3988         * in which case we do some bookkeeping,
3989         * otherwise we update the end pointer. */
3990         if ( !first ) {
3991          first = cur;
3992          if ( noper_trietype == NOTHING ) {
3993 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3994           regnode * const noper_next = regnext( noper );
3995           U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3996           U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3997 #endif
3998
3999           if ( noper_next_trietype ) {
4000            trietype = noper_next_trietype;
4001           } else if (noper_next_type)  {
4002            /* a NOTHING regop is 1 regop wide.
4003            * We need at least two for a trie
4004            * so we can't merge this in */
4005            first = NULL;
4006           }
4007          } else {
4008           trietype = noper_trietype;
4009          }
4010         } else {
4011          if ( trietype == NOTHING )
4012           trietype = noper_trietype;
4013          last = cur;
4014         }
4015         if (first)
4016          count++;
4017        } /* end handle mergable triable node */
4018        else {
4019         /* handle unmergable node -
4020         * noper may either be a triable node which can
4021         * not be tried together with the current trie,
4022         * or a non triable node */
4023         if ( last ) {
4024          /* If last is set and trietype is not
4025          * NOTHING then we have found at least two
4026          * triable branch sequences in a row of a
4027          * similar trietype so we can turn them
4028          * into a trie. If/when we allow NOTHING to
4029          * start a trie sequence this condition
4030          * will be required, and it isn't expensive
4031          * so we leave it in for now. */
4032          if ( trietype && trietype != NOTHING )
4033           make_trie( pRExC_state,
4034             startbranch, first, cur, tail,
4035             count, trietype, depth+1 );
4036          last = NULL; /* note: we clear/update
4037              first, trietype etc below,
4038              so we dont do it here */
4039         }
4040         if ( noper_trietype
4041 #ifdef NOJUMPTRIE
4042          && noper_next == tail
4043 #endif
4044         ){
4045          /* noper is triable, so we can start a new
4046          * trie sequence */
4047          count = 1;
4048          first = cur;
4049          trietype = noper_trietype;
4050         } else if (first) {
4051          /* if we already saw a first but the
4052          * current node is not triable then we have
4053          * to reset the first information. */
4054          count = 0;
4055          first = NULL;
4056          trietype = 0;
4057         }
4058        } /* end handle unmergable node */
4059       } /* loop over branches */
4060       DEBUG_TRIE_COMPILE_r({
4061        regprop(RExC_rx, mysv, cur, NULL);
4062        PerlIO_printf( Perl_debug_log,
4063        "%*s- %s (%d) <SCAN FINISHED>\n",
4064        (int)depth * 2 + 2,
4065        "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4066
4067       });
4068       if ( last && trietype ) {
4069        if ( trietype != NOTHING ) {
4070         /* the last branch of the sequence was part of
4071         * a trie, so we have to construct it here
4072         * outside of the loop */
4073         made= make_trie( pRExC_state, startbranch,
4074             first, scan, tail, count,
4075             trietype, depth+1 );
4076 #ifdef TRIE_STUDY_OPT
4077         if ( ((made == MADE_EXACT_TRIE &&
4078          startbranch == first)
4079          || ( first_non_open == first )) &&
4080          depth==0 ) {
4081          flags |= SCF_TRIE_RESTUDY;
4082          if ( startbranch == first
4083           && scan == tail )
4084          {
4085           RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4086          }
4087         }
4088 #endif
4089        } else {
4090         /* at this point we know whatever we have is a
4091         * NOTHING sequence/branch AND if 'startbranch'
4092         * is 'first' then we can turn the whole thing
4093         * into a NOTHING
4094         */
4095         if ( startbranch == first ) {
4096          regnode *opt;
4097          /* the entire thing is a NOTHING sequence,
4098          * something like this: (?:|) So we can
4099          * turn it into a plain NOTHING op. */
4100          DEBUG_TRIE_COMPILE_r({
4101           regprop(RExC_rx, mysv, cur, NULL);
4102           PerlIO_printf( Perl_debug_log,
4103           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4104           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4105
4106          });
4107          OP(startbranch)= NOTHING;
4108          NEXT_OFF(startbranch)= tail - startbranch;
4109          for ( opt= startbranch + 1; opt < tail ; opt++ )
4110           OP(opt)= OPTIMIZED;
4111         }
4112        }
4113       } /* end if ( last) */
4114      } /* TRIE_MAXBUF is non zero */
4115
4116     } /* do trie */
4117
4118    }
4119    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
4120     scan = NEXTOPER(NEXTOPER(scan));
4121    } else   /* single branch is optimized. */
4122     scan = NEXTOPER(scan);
4123    continue;
4124   } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4125    scan_frame *newframe = NULL;
4126    I32 paren;
4127    regnode *start;
4128    regnode *end;
4129    U32 my_recursed_depth= recursed_depth;
4130
4131    if (OP(scan) != SUSPEND) {
4132     /* set the pointer */
4133     if (OP(scan) == GOSUB) {
4134      paren = ARG(scan);
4135      RExC_recurse[ARG2L(scan)] = scan;
4136      start = RExC_open_parens[paren-1];
4137      end   = RExC_close_parens[paren-1];
4138     } else {
4139      paren = 0;
4140      start = RExC_rxi->program + 1;
4141      end   = RExC_opend;
4142     }
4143     if (!recursed_depth
4144      ||
4145      !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4146     ) {
4147      if (!recursed_depth) {
4148       Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4149      } else {
4150       Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4151        RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4152        RExC_study_chunk_recursed_bytes, U8);
4153      }
4154      /* we havent recursed into this paren yet, so recurse into it */
4155      DEBUG_STUDYDATA("set:", data,depth);
4156      PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4157      my_recursed_depth= recursed_depth + 1;
4158      Newx(newframe,1,scan_frame);
4159     } else {
4160      DEBUG_STUDYDATA("inf:", data,depth);
4161      /* some form of infinite recursion, assume infinite length
4162      * */
4163      if (flags & SCF_DO_SUBSTR) {
4164       scan_commit(pRExC_state, data, minlenp, is_inf);
4165       data->longest = &(data->longest_float);
4166      }
4167      is_inf = is_inf_internal = 1;
4168      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4169       ssc_anything(data->start_class);
4170      flags &= ~SCF_DO_STCLASS;
4171     }
4172    } else {
4173     Newx(newframe,1,scan_frame);
4174     paren = stopparen;
4175     start = scan+2;
4176     end = regnext(scan);
4177    }
4178    if (newframe) {
4179     assert(start);
4180     assert(end);
4181     SAVEFREEPV(newframe);
4182     newframe->next = regnext(scan);
4183     newframe->last = last;
4184     newframe->stop = stopparen;
4185     newframe->prev = frame;
4186     newframe->prev_recursed_depth = recursed_depth;
4187
4188     DEBUG_STUDYDATA("frame-new:",data,depth);
4189     DEBUG_PEEP("fnew", scan, depth);
4190
4191     frame = newframe;
4192     scan =  start;
4193     stopparen = paren;
4194     last = end;
4195     depth = depth + 1;
4196     recursed_depth= my_recursed_depth;
4197
4198     continue;
4199    }
4200   }
4201   else if (OP(scan) == EXACT) {
4202    SSize_t l = STR_LEN(scan);
4203    UV uc;
4204    if (UTF) {
4205     const U8 * const s = (U8*)STRING(scan);
4206     uc = utf8_to_uvchr_buf(s, s + l, NULL);
4207     l = utf8_length(s, s + l);
4208    } else {
4209     uc = *((U8*)STRING(scan));
4210    }
4211    min += l;
4212    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4213     /* The code below prefers earlier match for fixed
4214     offset, later match for variable offset.  */
4215     if (data->last_end == -1) { /* Update the start info. */
4216      data->last_start_min = data->pos_min;
4217      data->last_start_max = is_inf
4218       ? SSize_t_MAX : data->pos_min + data->pos_delta;
4219     }
4220     sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4221     if (UTF)
4222      SvUTF8_on(data->last_found);
4223     {
4224      SV * const sv = data->last_found;
4225      MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4226       mg_find(sv, PERL_MAGIC_utf8) : NULL;
4227      if (mg && mg->mg_len >= 0)
4228       mg->mg_len += utf8_length((U8*)STRING(scan),
4229            (U8*)STRING(scan)+STR_LEN(scan));
4230     }
4231     data->last_end = data->pos_min + l;
4232     data->pos_min += l; /* As in the first entry. */
4233     data->flags &= ~SF_BEFORE_EOL;
4234    }
4235
4236    /* ANDing the code point leaves at most it, and not in locale, and
4237    * can't match null string */
4238    if (flags & SCF_DO_STCLASS_AND) {
4239     ssc_cp_and(data->start_class, uc);
4240     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4241     ssc_clear_locale(data->start_class);
4242    }
4243    else if (flags & SCF_DO_STCLASS_OR) {
4244     ssc_add_cp(data->start_class, uc);
4245     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4246
4247     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4248     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4249    }
4250    flags &= ~SCF_DO_STCLASS;
4251   }
4252   else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
4253    SSize_t l = STR_LEN(scan);
4254    UV uc = *((U8*)STRING(scan));
4255    SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4256              separate code points */
4257
4258    /* Search for fixed substrings supports EXACT only. */
4259    if (flags & SCF_DO_SUBSTR) {
4260     assert(data);
4261     scan_commit(pRExC_state, data, minlenp, is_inf);
4262    }
4263    if (UTF) {
4264     const U8 * const s = (U8 *)STRING(scan);
4265     uc = utf8_to_uvchr_buf(s, s + l, NULL);
4266     l = utf8_length(s, s + l);
4267    }
4268    if (unfolded_multi_char) {
4269     RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4270    }
4271    min += l - min_subtract;
4272    assert (min >= 0);
4273    delta += min_subtract;
4274    if (flags & SCF_DO_SUBSTR) {
4275     data->pos_min += l - min_subtract;
4276     if (data->pos_min < 0) {
4277      data->pos_min = 0;
4278     }
4279     data->pos_delta += min_subtract;
4280     if (min_subtract) {
4281      data->longest = &(data->longest_float);
4282     }
4283    }
4284    if (OP(scan) == EXACTFL) {
4285
4286     /* We don't know what the folds are; it could be anything. XXX
4287     * Actually, we only support UTF-8 encoding for code points
4288     * above Latin1, so we could know what those folds are. */
4289     EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4290              0,
4291              UV_MAX);
4292    }
4293    else {  /* Non-locale EXACTFish */
4294     EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4295     if (flags & SCF_DO_STCLASS_AND) {
4296      ssc_clear_locale(data->start_class);
4297     }
4298     if (uc < 256) { /* We know what the Latin1 folds are ... */
4299      if (IS_IN_SOME_FOLD_L1(uc)) {   /* For instance, we
4300              know if anything folds
4301              with this */
4302       EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4303               PL_fold_latin1[uc]);
4304       if (OP(scan) != EXACTFA) { /* The folds below aren't
4305              legal under /iaa */
4306        if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
4307         EXACTF_invlist
4308          = add_cp_to_invlist(EXACTF_invlist,
4309             LATIN_SMALL_LETTER_SHARP_S);
4310        }
4311        else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
4312         EXACTF_invlist
4313          = add_cp_to_invlist(EXACTF_invlist, 's');
4314         EXACTF_invlist
4315          = add_cp_to_invlist(EXACTF_invlist, 'S');
4316        }
4317       }
4318
4319       /* We also know if there are above-Latin1 code points
4320       * that fold to this (none legal for ASCII and /iaa) */
4321       if ((! isASCII(uc) || OP(scan) != EXACTFA)
4322        && HAS_NONLATIN1_FOLD_CLOSURE(uc))
4323       {
4324        /* XXX We could know exactly what does fold to this
4325        * if the reverse folds are loaded, as currently in
4326        * S_regclass() */
4327        _invlist_union(EXACTF_invlist,
4328           PL_AboveLatin1,
4329           &EXACTF_invlist);
4330       }
4331      }
4332     }
4333     else {  /* Non-locale, above Latin1.  XXX We don't currently
4334       know what participates in folds with this, so have
4335       to assume anything could */
4336
4337      /* XXX We could know exactly what does fold to this if the
4338      * reverse folds are loaded, as currently in S_regclass().
4339      * But we do know that under /iaa nothing in the ASCII
4340      * range can participate */
4341      if (OP(scan) == EXACTFA) {
4342       _invlist_union_complement_2nd(EXACTF_invlist,
4343              PL_XPosix_ptrs[_CC_ASCII],
4344              &EXACTF_invlist);
4345      }
4346      else {
4347       EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4348                0, UV_MAX);
4349      }
4350     }
4351    }
4352    if (flags & SCF_DO_STCLASS_AND) {
4353     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4354     ANYOF_POSIXL_ZERO(data->start_class);
4355     ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4356    }
4357    else if (flags & SCF_DO_STCLASS_OR) {
4358     ssc_union(data->start_class, EXACTF_invlist, FALSE);
4359     ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4360
4361     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4362     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4363    }
4364    flags &= ~SCF_DO_STCLASS;
4365    SvREFCNT_dec(EXACTF_invlist);
4366   }
4367   else if (REGNODE_VARIES(OP(scan))) {
4368    SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4369    I32 fl = 0, f = flags;
4370    regnode * const oscan = scan;
4371    regnode_ssc this_class;
4372    regnode_ssc *oclass = NULL;
4373    I32 next_is_eval = 0;
4374
4375    switch (PL_regkind[OP(scan)]) {
4376    case WHILEM:  /* End of (?:...)* . */
4377     scan = NEXTOPER(scan);
4378     goto finish;
4379    case PLUS:
4380     if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4381      next = NEXTOPER(scan);
4382      if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4383       mincount = 1;
4384       maxcount = REG_INFTY;
4385       next = regnext(scan);
4386       scan = NEXTOPER(scan);
4387       goto do_curly;
4388      }
4389     }
4390     if (flags & SCF_DO_SUBSTR)
4391      data->pos_min++;
4392     min++;
4393     /* Fall through. */
4394    case STAR:
4395     if (flags & SCF_DO_STCLASS) {
4396      mincount = 0;
4397      maxcount = REG_INFTY;
4398      next = regnext(scan);
4399      scan = NEXTOPER(scan);
4400      goto do_curly;
4401     }
4402     if (flags & SCF_DO_SUBSTR) {
4403      scan_commit(pRExC_state, data, minlenp, is_inf);
4404      /* Cannot extend fixed substrings */
4405      data->longest = &(data->longest_float);
4406     }
4407     is_inf = is_inf_internal = 1;
4408     scan = regnext(scan);
4409     goto optimize_curly_tail;
4410    case CURLY:
4411     if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4412      && (scan->flags == stopparen))
4413     {
4414      mincount = 1;
4415      maxcount = 1;
4416     } else {
4417      mincount = ARG1(scan);
4418      maxcount = ARG2(scan);
4419     }
4420     next = regnext(scan);
4421     if (OP(scan) == CURLYX) {
4422      I32 lp = (data ? *(data->last_closep) : 0);
4423      scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4424     }
4425     scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4426     next_is_eval = (OP(scan) == EVAL);
4427    do_curly:
4428     if (flags & SCF_DO_SUBSTR) {
4429      if (mincount == 0)
4430       scan_commit(pRExC_state, data, minlenp, is_inf);
4431      /* Cannot extend fixed substrings */
4432      pos_before = data->pos_min;
4433     }
4434     if (data) {
4435      fl = data->flags;
4436      data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4437      if (is_inf)
4438       data->flags |= SF_IS_INF;
4439     }
4440     if (flags & SCF_DO_STCLASS) {
4441      ssc_init(pRExC_state, &this_class);
4442      oclass = data->start_class;
4443      data->start_class = &this_class;
4444      f |= SCF_DO_STCLASS_AND;
4445      f &= ~SCF_DO_STCLASS_OR;
4446     }
4447     /* Exclude from super-linear cache processing any {n,m}
4448     regops for which the combination of input pos and regex
4449     pos is not enough information to determine if a match
4450     will be possible.
4451
4452     For example, in the regex /foo(bar\s*){4,8}baz/ with the
4453     regex pos at the \s*, the prospects for a match depend not
4454     only on the input position but also on how many (bar\s*)
4455     repeats into the {4,8} we are. */
4456    if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4457      f &= ~SCF_WHILEM_VISITED_POS;
4458
4459     /* This will finish on WHILEM, setting scan, or on NULL: */
4460     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4461         last, data, stopparen, recursed_depth, NULL,
4462         (mincount == 0
4463         ? (f & ~SCF_DO_SUBSTR)
4464         : f)
4465         ,depth+1);
4466
4467     if (flags & SCF_DO_STCLASS)
4468      data->start_class = oclass;
4469     if (mincount == 0 || minnext == 0) {
4470      if (flags & SCF_DO_STCLASS_OR) {
4471       ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4472      }
4473      else if (flags & SCF_DO_STCLASS_AND) {
4474       /* Switch to OR mode: cache the old value of
4475       * data->start_class */
4476       INIT_AND_WITHP;
4477       StructCopy(data->start_class, and_withp, regnode_ssc);
4478       flags &= ~SCF_DO_STCLASS_AND;
4479       StructCopy(&this_class, data->start_class, regnode_ssc);
4480       flags |= SCF_DO_STCLASS_OR;
4481       ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4482      }
4483     } else {  /* Non-zero len */
4484      if (flags & SCF_DO_STCLASS_OR) {
4485       ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4486       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4487      }
4488      else if (flags & SCF_DO_STCLASS_AND)
4489       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4490      flags &= ~SCF_DO_STCLASS;
4491     }
4492     if (!scan)   /* It was not CURLYX, but CURLY. */
4493      scan = next;
4494     if (!(flags & SCF_TRIE_DOING_RESTUDY)
4495      /* ? quantifier ok, except for (?{ ... }) */
4496      && (next_is_eval || !(mincount == 0 && maxcount == 1))
4497      && (minnext == 0) && (deltanext == 0)
4498      && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4499      && maxcount <= REG_INFTY/3) /* Complement check for big
4500             count */
4501     {
4502      /* Fatal warnings may leak the regexp without this: */
4503      SAVEFREESV(RExC_rx_sv);
4504      ckWARNreg(RExC_parse,
4505        "Quantifier unexpected on zero-length expression");
4506      (void)ReREFCNT_inc(RExC_rx_sv);
4507     }
4508
4509     min += minnext * mincount;
4510     is_inf_internal |= deltanext == SSize_t_MAX
4511       || (maxcount == REG_INFTY && minnext + deltanext > 0);
4512     is_inf |= is_inf_internal;
4513     if (is_inf) {
4514      delta = SSize_t_MAX;
4515     } else {
4516      delta += (minnext + deltanext) * maxcount
4517        - minnext * mincount;
4518     }
4519     /* Try powerful optimization CURLYX => CURLYN. */
4520     if (  OP(oscan) == CURLYX && data
4521      && data->flags & SF_IN_PAR
4522      && !(data->flags & SF_HAS_EVAL)
4523      && !deltanext && minnext == 1 ) {
4524      /* Try to optimize to CURLYN.  */
4525      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4526      regnode * const nxt1 = nxt;
4527 #ifdef DEBUGGING
4528      regnode *nxt2;
4529 #endif
4530
4531      /* Skip open. */
4532      nxt = regnext(nxt);
4533      if (!REGNODE_SIMPLE(OP(nxt))
4534       && !(PL_regkind[OP(nxt)] == EXACT
4535        && STR_LEN(nxt) == 1))
4536       goto nogo;
4537 #ifdef DEBUGGING
4538      nxt2 = nxt;
4539 #endif
4540      nxt = regnext(nxt);
4541      if (OP(nxt) != CLOSE)
4542       goto nogo;
4543      if (RExC_open_parens) {
4544       RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4545       RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4546      }
4547      /* Now we know that nxt2 is the only contents: */
4548      oscan->flags = (U8)ARG(nxt);
4549      OP(oscan) = CURLYN;
4550      OP(nxt1) = NOTHING; /* was OPEN. */
4551
4552 #ifdef DEBUGGING
4553      OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4554      NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4555      NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4556      OP(nxt) = OPTIMIZED; /* was CLOSE. */
4557      OP(nxt + 1) = OPTIMIZED; /* was count. */
4558      NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4559 #endif
4560     }
4561    nogo:
4562
4563     /* Try optimization CURLYX => CURLYM. */
4564     if (  OP(oscan) == CURLYX && data
4565      && !(data->flags & SF_HAS_PAR)
4566      && !(data->flags & SF_HAS_EVAL)
4567      && !deltanext /* atom is fixed width */
4568      && minnext != 0 /* CURLYM can't handle zero width */
4569
4570       /* Nor characters whose fold at run-time may be
4571       * multi-character */
4572      && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4573     ) {
4574      /* XXXX How to optimize if data == 0? */
4575      /* Optimize to a simpler form.  */
4576      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4577      regnode *nxt2;
4578
4579      OP(oscan) = CURLYM;
4580      while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4581        && (OP(nxt2) != WHILEM))
4582       nxt = nxt2;
4583      OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4584      /* Need to optimize away parenths. */
4585      if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4586       /* Set the parenth number.  */
4587       regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4588
4589       oscan->flags = (U8)ARG(nxt);
4590       if (RExC_open_parens) {
4591        RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4592        RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4593       }
4594       OP(nxt1) = OPTIMIZED; /* was OPEN. */
4595       OP(nxt) = OPTIMIZED; /* was CLOSE. */
4596
4597 #ifdef DEBUGGING
4598       OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4599       OP(nxt + 1) = OPTIMIZED; /* was count. */
4600       NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4601       NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4602 #endif
4603 #if 0
4604       while ( nxt1 && (OP(nxt1) != WHILEM)) {
4605        regnode *nnxt = regnext(nxt1);
4606        if (nnxt == nxt) {
4607         if (reg_off_by_arg[OP(nxt1)])
4608          ARG_SET(nxt1, nxt2 - nxt1);
4609         else if (nxt2 - nxt1 < U16_MAX)
4610          NEXT_OFF(nxt1) = nxt2 - nxt1;
4611         else
4612          OP(nxt) = NOTHING; /* Cannot beautify */
4613        }
4614        nxt1 = nnxt;
4615       }
4616 #endif
4617       /* Optimize again: */
4618       study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4619          NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4620      }
4621      else
4622       oscan->flags = 0;
4623     }
4624     else if ((OP(oscan) == CURLYX)
4625       && (flags & SCF_WHILEM_VISITED_POS)
4626       /* See the comment on a similar expression above.
4627        However, this time it's not a subexpression
4628        we care about, but the expression itself. */
4629       && (maxcount == REG_INFTY)
4630       && data && ++data->whilem_c < 16) {
4631      /* This stays as CURLYX, we can put the count/of pair. */
4632      /* Find WHILEM (as in regexec.c) */
4633      regnode *nxt = oscan + NEXT_OFF(oscan);
4634
4635      if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4636       nxt += ARG(nxt);
4637      PREVOPER(nxt)->flags = (U8)(data->whilem_c
4638       | (RExC_whilem_seen << 4)); /* On WHILEM */
4639     }
4640     if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4641      pars++;
4642     if (flags & SCF_DO_SUBSTR) {
4643      SV *last_str = NULL;
4644      STRLEN last_chrs = 0;
4645      int counted = mincount != 0;
4646
4647      if (data->last_end > 0 && mincount != 0) { /* Ends with a
4648                 string. */
4649       SSize_t b = pos_before >= data->last_start_min
4650        ? pos_before : data->last_start_min;
4651       STRLEN l;
4652       const char * const s = SvPV_const(data->last_found, l);
4653       SSize_t old = b - data->last_start_min;
4654
4655       if (UTF)
4656        old = utf8_hop((U8*)s, old) - (U8*)s;
4657       l -= old;
4658       /* Get the added string: */
4659       last_str = newSVpvn_utf8(s  + old, l, UTF);
4660       last_chrs = UTF ? utf8_length((U8*)(s + old),
4661            (U8*)(s + old + l)) : l;
4662       if (deltanext == 0 && pos_before == b) {
4663        /* What was added is a constant string */
4664        if (mincount > 1) {
4665
4666         SvGROW(last_str, (mincount * l) + 1);
4667         repeatcpy(SvPVX(last_str) + l,
4668           SvPVX_const(last_str), l,
4669           mincount - 1);
4670         SvCUR_set(last_str, SvCUR(last_str) * mincount);
4671         /* Add additional parts. */
4672         SvCUR_set(data->last_found,
4673           SvCUR(data->last_found) - l);
4674         sv_catsv(data->last_found, last_str);
4675         {
4676          SV * sv = data->last_found;
4677          MAGIC *mg =
4678           SvUTF8(sv) && SvMAGICAL(sv) ?
4679           mg_find(sv, PERL_MAGIC_utf8) : NULL;
4680          if (mg && mg->mg_len >= 0)
4681           mg->mg_len += last_chrs * (mincount-1);
4682         }
4683         last_chrs *= mincount;
4684         data->last_end += l * (mincount - 1);
4685        }
4686       } else {
4687        /* start offset must point into the last copy */
4688        data->last_start_min += minnext * (mincount - 1);
4689        data->last_start_max += is_inf ? SSize_t_MAX
4690         : (maxcount - 1) * (minnext + data->pos_delta);
4691       }
4692      }
4693      /* It is counted once already... */
4694      data->pos_min += minnext * (mincount - counted);
4695 #if 0
4696 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4697        " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4698        " maxcount=%"UVdf" mincount=%"UVdf"\n",
4699  (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4700  (UV)mincount);
4701 if (deltanext != SSize_t_MAX)
4702 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4703  (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4704   - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4705 #endif
4706      if (deltanext == SSize_t_MAX
4707       || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4708       data->pos_delta = SSize_t_MAX;
4709      else
4710       data->pos_delta += - counted * deltanext +
4711       (minnext + deltanext) * maxcount - minnext * mincount;
4712      if (mincount != maxcount) {
4713       /* Cannot extend fixed substrings found inside
4714        the group.  */
4715       scan_commit(pRExC_state, data, minlenp, is_inf);
4716       if (mincount && last_str) {
4717        SV * const sv = data->last_found;
4718        MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4719         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4720
4721        if (mg)
4722         mg->mg_len = -1;
4723        sv_setsv(sv, last_str);
4724        data->last_end = data->pos_min;
4725        data->last_start_min = data->pos_min - last_chrs;
4726        data->last_start_max = is_inf
4727         ? SSize_t_MAX
4728         : data->pos_min + data->pos_delta - last_chrs;
4729       }
4730       data->longest = &(data->longest_float);
4731      }
4732      SvREFCNT_dec(last_str);
4733     }
4734     if (data && (fl & SF_HAS_EVAL))
4735      data->flags |= SF_HAS_EVAL;
4736    optimize_curly_tail:
4737     if (OP(oscan) != CURLYX) {
4738      while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4739       && NEXT_OFF(next))
4740       NEXT_OFF(oscan) += NEXT_OFF(next);
4741     }
4742     continue;
4743
4744    default:
4745 #ifdef DEBUGGING
4746     Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4747                  OP(scan));
4748 #endif
4749    case REF:
4750    case CLUMP:
4751     if (flags & SCF_DO_SUBSTR) {
4752      /* Cannot expect anything... */
4753      scan_commit(pRExC_state, data, minlenp, is_inf);
4754      data->longest = &(data->longest_float);
4755     }
4756     is_inf = is_inf_internal = 1;
4757     if (flags & SCF_DO_STCLASS_OR) {
4758      if (OP(scan) == CLUMP) {
4759       /* Actually is any start char, but very few code points
4760       * aren't start characters */
4761       ssc_match_all_cp(data->start_class);
4762      }
4763      else {
4764       ssc_anything(data->start_class);
4765      }
4766     }
4767     flags &= ~SCF_DO_STCLASS;
4768     break;
4769    }
4770   }
4771   else if (OP(scan) == LNBREAK) {
4772    if (flags & SCF_DO_STCLASS) {
4773      if (flags & SCF_DO_STCLASS_AND) {
4774      ssc_intersection(data->start_class,
4775          PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4776      ssc_clear_locale(data->start_class);
4777      ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4778     }
4779     else if (flags & SCF_DO_STCLASS_OR) {
4780      ssc_union(data->start_class,
4781        PL_XPosix_ptrs[_CC_VERTSPACE],
4782        FALSE);
4783      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4784
4785      /* See commit msg for
4786      * 749e076fceedeb708a624933726e7989f2302f6a */
4787      ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4788     }
4789     flags &= ~SCF_DO_STCLASS;
4790    }
4791    min++;
4792    delta++;    /* Because of the 2 char string cr-lf */
4793    if (flags & SCF_DO_SUBSTR) {
4794     /* Cannot expect anything... */
4795     scan_commit(pRExC_state, data, minlenp, is_inf);
4796      data->pos_min += 1;
4797     data->pos_delta += 1;
4798     data->longest = &(data->longest_float);
4799     }
4800   }
4801   else if (REGNODE_SIMPLE(OP(scan))) {
4802
4803    if (flags & SCF_DO_SUBSTR) {
4804     scan_commit(pRExC_state, data, minlenp, is_inf);
4805     data->pos_min++;
4806    }
4807    min++;
4808    if (flags & SCF_DO_STCLASS) {
4809     bool invert = 0;
4810     SV* my_invlist = sv_2mortal(_new_invlist(0));
4811     U8 namedclass;
4812
4813     /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4814     ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4815
4816     /* Some of the logic below assumes that switching
4817     locale on will only add false positives. */
4818     switch (OP(scan)) {
4819
4820     default:
4821 #ifdef DEBUGGING
4822     Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4823                  OP(scan));
4824 #endif
4825     case CANY:
4826     case SANY:
4827      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4828       ssc_match_all_cp(data->start_class);
4829      break;
4830
4831     case REG_ANY:
4832      {
4833       SV* REG_ANY_invlist = _new_invlist(2);
4834       REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4835                '\n');
4836       if (flags & SCF_DO_STCLASS_OR) {
4837        ssc_union(data->start_class,
4838          REG_ANY_invlist,
4839          TRUE /* TRUE => invert, hence all but \n
4840            */
4841          );
4842       }
4843       else if (flags & SCF_DO_STCLASS_AND) {
4844        ssc_intersection(data->start_class,
4845            REG_ANY_invlist,
4846            TRUE  /* TRUE => invert */
4847            );
4848        ssc_clear_locale(data->start_class);
4849       }
4850       SvREFCNT_dec_NN(REG_ANY_invlist);
4851      }
4852      break;
4853
4854     case ANYOF:
4855      if (flags & SCF_DO_STCLASS_AND)
4856       ssc_and(pRExC_state, data->start_class,
4857         (regnode_charclass *) scan);
4858      else
4859       ssc_or(pRExC_state, data->start_class,
4860               (regnode_charclass *) scan);
4861      break;
4862
4863     case NPOSIXL:
4864      invert = 1;
4865      /* FALL THROUGH */
4866
4867     case POSIXL:
4868      namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4869      if (flags & SCF_DO_STCLASS_AND) {
4870       bool was_there = cBOOL(
4871           ANYOF_POSIXL_TEST(data->start_class,
4872                 namedclass));
4873       ANYOF_POSIXL_ZERO(data->start_class);
4874       if (was_there) {    /* Do an AND */
4875        ANYOF_POSIXL_SET(data->start_class, namedclass);
4876       }
4877       /* No individual code points can now match */
4878       data->start_class->invlist
4879             = sv_2mortal(_new_invlist(0));
4880      }
4881      else {
4882       int complement = namedclass + ((invert) ? -1 : 1);
4883
4884       assert(flags & SCF_DO_STCLASS_OR);
4885
4886       /* If the complement of this class was already there,
4887       * the result is that they match all code points,
4888       * (\d + \D == everything).  Remove the classes from
4889       * future consideration.  Locale is not relevant in
4890       * this case */
4891       if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4892        ssc_match_all_cp(data->start_class);
4893        ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4894        ANYOF_POSIXL_CLEAR(data->start_class, complement);
4895       }
4896       else {  /* The usual case; just add this class to the
4897         existing set */
4898        ANYOF_POSIXL_SET(data->start_class, namedclass);
4899       }
4900      }
4901      break;
4902
4903     case NPOSIXA:   /* For these, we always know the exact set of
4904         what's matched */
4905      invert = 1;
4906      /* FALL THROUGH */
4907     case POSIXA:
4908      if (FLAGS(scan) == _CC_ASCII) {
4909       my_invlist = PL_XPosix_ptrs[_CC_ASCII];
4910      }
4911      else {
4912       _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
4913            PL_XPosix_ptrs[_CC_ASCII],
4914            &my_invlist);
4915      }
4916      goto join_posix;
4917
4918     case NPOSIXD:
4919     case NPOSIXU:
4920      invert = 1;
4921      /* FALL THROUGH */
4922     case POSIXD:
4923     case POSIXU:
4924      my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
4925
4926      /* NPOSIXD matches all upper Latin1 code points unless the
4927      * target string being matched is UTF-8, which is
4928      * unknowable until match time.  Since we are going to
4929      * invert, we want to get rid of all of them so that the
4930      * inversion will match all */
4931      if (OP(scan) == NPOSIXD) {
4932       _invlist_subtract(my_invlist, PL_UpperLatin1,
4933           &my_invlist);
4934      }
4935
4936     join_posix:
4937
4938      if (flags & SCF_DO_STCLASS_AND) {
4939       ssc_intersection(data->start_class, my_invlist, invert);
4940       ssc_clear_locale(data->start_class);
4941      }
4942      else {
4943       assert(flags & SCF_DO_STCLASS_OR);
4944       ssc_union(data->start_class, my_invlist, invert);
4945      }
4946     }
4947     if (flags & SCF_DO_STCLASS_OR)
4948      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4949     flags &= ~SCF_DO_STCLASS;
4950    }
4951   }
4952   else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4953    data->flags |= (OP(scan) == MEOL
4954        ? SF_BEFORE_MEOL
4955        : SF_BEFORE_SEOL);
4956    scan_commit(pRExC_state, data, minlenp, is_inf);
4957
4958   }
4959   else if (  PL_regkind[OP(scan)] == BRANCHJ
4960     /* Lookbehind, or need to calculate parens/evals/stclass: */
4961     && (scan->flags || data || (flags & SCF_DO_STCLASS))
4962     && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4963    if ( OP(scan) == UNLESSM &&
4964     scan->flags == 0 &&
4965     OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4966     OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4967    ) {
4968     regnode *opt;
4969     regnode *upto= regnext(scan);
4970     DEBUG_PARSE_r({
4971      SV * const mysv_val=sv_newmortal();
4972      DEBUG_STUDYDATA("OPFAIL",data,depth);
4973
4974      /*DEBUG_PARSE_MSG("opfail");*/
4975      regprop(RExC_rx, mysv_val, upto, NULL);
4976      PerlIO_printf(Perl_debug_log,
4977       "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4978       SvPV_nolen_const(mysv_val),
4979       (IV)REG_NODE_NUM(upto),
4980       (IV)(upto - scan)
4981      );
4982     });
4983     OP(scan) = OPFAIL;
4984     NEXT_OFF(scan) = upto - scan;
4985     for (opt= scan + 1; opt < upto ; opt++)
4986      OP(opt) = OPTIMIZED;
4987     scan= upto;
4988     continue;
4989    }
4990    if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4991     || OP(scan) == UNLESSM )
4992    {
4993     /* Negative Lookahead/lookbehind
4994     In this case we can't do fixed string optimisation.
4995     */
4996
4997     SSize_t deltanext, minnext, fake = 0;
4998     regnode *nscan;
4999     regnode_ssc intrnl;
5000     int f = 0;
5001
5002     data_fake.flags = 0;
5003     if (data) {
5004      data_fake.whilem_c = data->whilem_c;
5005      data_fake.last_closep = data->last_closep;
5006     }
5007     else
5008      data_fake.last_closep = &fake;
5009     data_fake.pos_delta = delta;
5010     if ( flags & SCF_DO_STCLASS && !scan->flags
5011      && OP(scan) == IFMATCH ) { /* Lookahead */
5012      ssc_init(pRExC_state, &intrnl);
5013      data_fake.start_class = &intrnl;
5014      f |= SCF_DO_STCLASS_AND;
5015     }
5016     if (flags & SCF_WHILEM_VISITED_POS)
5017      f |= SCF_WHILEM_VISITED_POS;
5018     next = regnext(scan);
5019     nscan = NEXTOPER(NEXTOPER(scan));
5020     minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5021          last, &data_fake, stopparen,
5022          recursed_depth, NULL, f, depth+1);
5023     if (scan->flags) {
5024      if (deltanext) {
5025       FAIL("Variable length lookbehind not implemented");
5026      }
5027      else if (minnext > (I32)U8_MAX) {
5028       FAIL2("Lookbehind longer than %"UVuf" not implemented",
5029        (UV)U8_MAX);
5030      }
5031      scan->flags = (U8)minnext;
5032     }
5033     if (data) {
5034      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5035       pars++;
5036      if (data_fake.flags & SF_HAS_EVAL)
5037       data->flags |= SF_HAS_EVAL;
5038      data->whilem_c = data_fake.whilem_c;
5039     }
5040     if (f & SCF_DO_STCLASS_AND) {
5041      if (flags & SCF_DO_STCLASS_OR) {
5042       /* OR before, AND after: ideally we would recurse with
5043       * data_fake to get the AND applied by study of the
5044       * remainder of the pattern, and then derecurse;
5045       * *** HACK *** for now just treat as "no information".
5046       * See [perl #56690].
5047       */
5048       ssc_init(pRExC_state, data->start_class);
5049      }  else {
5050       /* AND before and after: combine and continue.  These
5051       * assertions are zero-length, so can match an EMPTY
5052       * string */
5053       ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5054       ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
5055      }
5056     }
5057    }
5058 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5059    else {
5060     /* Positive Lookahead/lookbehind
5061     In this case we can do fixed string optimisation,
5062     but we must be careful about it. Note in the case of
5063     lookbehind the positions will be offset by the minimum
5064     length of the pattern, something we won't know about
5065     until after the recurse.
5066     */
5067     SSize_t deltanext, fake = 0;
5068     regnode *nscan;
5069     regnode_ssc intrnl;
5070     int f = 0;
5071     /* We use SAVEFREEPV so that when the full compile
5072      is finished perl will clean up the allocated
5073      minlens when it's all done. This way we don't
5074      have to worry about freeing them when we know
5075      they wont be used, which would be a pain.
5076     */
5077     SSize_t *minnextp;
5078     Newx( minnextp, 1, SSize_t );
5079     SAVEFREEPV(minnextp);
5080
5081     if (data) {
5082      StructCopy(data, &data_fake, scan_data_t);
5083      if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5084       f |= SCF_DO_SUBSTR;
5085       if (scan->flags)
5086        scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5087       data_fake.last_found=newSVsv(data->last_found);
5088      }
5089     }
5090     else
5091      data_fake.last_closep = &fake;
5092     data_fake.flags = 0;
5093     data_fake.pos_delta = delta;
5094     if (is_inf)
5095      data_fake.flags |= SF_IS_INF;
5096     if ( flags & SCF_DO_STCLASS && !scan->flags
5097      && OP(scan) == IFMATCH ) { /* Lookahead */
5098      ssc_init(pRExC_state, &intrnl);
5099      data_fake.start_class = &intrnl;
5100      f |= SCF_DO_STCLASS_AND;
5101     }
5102     if (flags & SCF_WHILEM_VISITED_POS)
5103      f |= SCF_WHILEM_VISITED_POS;
5104     next = regnext(scan);
5105     nscan = NEXTOPER(NEXTOPER(scan));
5106
5107     *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5108           &deltanext, last, &data_fake,
5109           stopparen, recursed_depth, NULL,
5110           f,depth+1);
5111     if (scan->flags) {
5112      if (deltanext) {
5113       FAIL("Variable length lookbehind not implemented");
5114      }
5115      else if (*minnextp > (I32)U8_MAX) {
5116       FAIL2("Lookbehind longer than %"UVuf" not implemented",
5117        (UV)U8_MAX);
5118      }
5119      scan->flags = (U8)*minnextp;
5120     }
5121
5122     *minnextp += min;
5123
5124     if (f & SCF_DO_STCLASS_AND) {
5125      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5126      ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
5127     }
5128     if (data) {
5129      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5130       pars++;
5131      if (data_fake.flags & SF_HAS_EVAL)
5132       data->flags |= SF_HAS_EVAL;
5133      data->whilem_c = data_fake.whilem_c;
5134      if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5135       if (RExC_rx->minlen<*minnextp)
5136        RExC_rx->minlen=*minnextp;
5137       scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5138       SvREFCNT_dec_NN(data_fake.last_found);
5139
5140       if ( data_fake.minlen_fixed != minlenp )
5141       {
5142        data->offset_fixed= data_fake.offset_fixed;
5143        data->minlen_fixed= data_fake.minlen_fixed;
5144        data->lookbehind_fixed+= scan->flags;
5145       }
5146       if ( data_fake.minlen_float != minlenp )
5147       {
5148        data->minlen_float= data_fake.minlen_float;
5149        data->offset_float_min=data_fake.offset_float_min;
5150        data->offset_float_max=data_fake.offset_float_max;
5151        data->lookbehind_float+= scan->flags;
5152       }
5153      }
5154     }
5155    }
5156 #endif
5157   }
5158   else if (OP(scan) == OPEN) {
5159    if (stopparen != (I32)ARG(scan))
5160     pars++;
5161   }
5162   else if (OP(scan) == CLOSE) {
5163    if (stopparen == (I32)ARG(scan)) {
5164     break;
5165    }
5166    if ((I32)ARG(scan) == is_par) {
5167     next = regnext(scan);
5168
5169     if ( next && (OP(next) != WHILEM) && next < last)
5170      is_par = 0;  /* Disable optimization */
5171    }
5172    if (data)
5173     *(data->last_closep) = ARG(scan);
5174   }
5175   else if (OP(scan) == EVAL) {
5176     if (data)
5177      data->flags |= SF_HAS_EVAL;
5178   }
5179   else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5180    if (flags & SCF_DO_SUBSTR) {
5181     scan_commit(pRExC_state, data, minlenp, is_inf);
5182     flags &= ~SCF_DO_SUBSTR;
5183    }
5184    if (data && OP(scan)==ACCEPT) {
5185     data->flags |= SCF_SEEN_ACCEPT;
5186     if (stopmin > min)
5187      stopmin = min;
5188    }
5189   }
5190   else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5191   {
5192     if (flags & SCF_DO_SUBSTR) {
5193      scan_commit(pRExC_state, data, minlenp, is_inf);
5194      data->longest = &(data->longest_float);
5195     }
5196     is_inf = is_inf_internal = 1;
5197     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5198      ssc_anything(data->start_class);
5199     flags &= ~SCF_DO_STCLASS;
5200   }
5201   else if (OP(scan) == GPOS) {
5202    if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5203     !(delta || is_inf || (data && data->pos_delta)))
5204    {
5205     if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5206      RExC_rx->intflags |= PREGf_ANCH_GPOS;
5207     if (RExC_rx->gofs < (STRLEN)min)
5208      RExC_rx->gofs = min;
5209    } else {
5210     RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5211     RExC_rx->gofs = 0;
5212    }
5213   }
5214 #ifdef TRIE_STUDY_OPT
5215 #ifdef FULL_TRIE_STUDY
5216   else if (PL_regkind[OP(scan)] == TRIE) {
5217    /* NOTE - There is similar code to this block above for handling
5218    BRANCH nodes on the initial study.  If you change stuff here
5219    check there too. */
5220    regnode *trie_node= scan;
5221    regnode *tail= regnext(scan);
5222    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5223    SSize_t max1 = 0, min1 = SSize_t_MAX;
5224    regnode_ssc accum;
5225
5226    if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5227     /* Cannot merge strings after this. */
5228     scan_commit(pRExC_state, data, minlenp, is_inf);
5229    }
5230    if (flags & SCF_DO_STCLASS)
5231     ssc_init_zero(pRExC_state, &accum);
5232
5233    if (!trie->jump) {
5234     min1= trie->minlen;
5235     max1= trie->maxlen;
5236    } else {
5237     const regnode *nextbranch= NULL;
5238     U32 word;
5239
5240     for ( word=1 ; word <= trie->wordcount ; word++)
5241     {
5242      SSize_t deltanext=0, minnext=0, f = 0, fake;
5243      regnode_ssc this_class;
5244
5245      data_fake.flags = 0;
5246      if (data) {
5247       data_fake.whilem_c = data->whilem_c;
5248       data_fake.last_closep = data->last_closep;
5249      }
5250      else
5251       data_fake.last_closep = &fake;
5252      data_fake.pos_delta = delta;
5253      if (flags & SCF_DO_STCLASS) {
5254       ssc_init(pRExC_state, &this_class);
5255       data_fake.start_class = &this_class;
5256       f = SCF_DO_STCLASS_AND;
5257      }
5258      if (flags & SCF_WHILEM_VISITED_POS)
5259       f |= SCF_WHILEM_VISITED_POS;
5260
5261      if (trie->jump[word]) {
5262       if (!nextbranch)
5263        nextbranch = trie_node + trie->jump[0];
5264       scan= trie_node + trie->jump[word];
5265       /* We go from the jump point to the branch that follows
5266       it. Note this means we need the vestigal unused
5267       branches even though they arent otherwise used. */
5268       minnext = study_chunk(pRExC_state, &scan, minlenp,
5269        &deltanext, (regnode *)nextbranch, &data_fake,
5270        stopparen, recursed_depth, NULL, f,depth+1);
5271      }
5272      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5273       nextbranch= regnext((regnode*)nextbranch);
5274
5275      if (min1 > (SSize_t)(minnext + trie->minlen))
5276       min1 = minnext + trie->minlen;
5277      if (deltanext == SSize_t_MAX) {
5278       is_inf = is_inf_internal = 1;
5279       max1 = SSize_t_MAX;
5280      } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5281       max1 = minnext + deltanext + trie->maxlen;
5282
5283      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5284       pars++;
5285      if (data_fake.flags & SCF_SEEN_ACCEPT) {
5286       if ( stopmin > min + min1)
5287        stopmin = min + min1;
5288       flags &= ~SCF_DO_SUBSTR;
5289       if (data)
5290        data->flags |= SCF_SEEN_ACCEPT;
5291      }
5292      if (data) {
5293       if (data_fake.flags & SF_HAS_EVAL)
5294        data->flags |= SF_HAS_EVAL;
5295       data->whilem_c = data_fake.whilem_c;
5296      }
5297      if (flags & SCF_DO_STCLASS)
5298       ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5299     }
5300    }
5301    if (flags & SCF_DO_SUBSTR) {
5302     data->pos_min += min1;
5303     data->pos_delta += max1 - min1;
5304     if (max1 != min1 || is_inf)
5305      data->longest = &(data->longest_float);
5306    }
5307    min += min1;
5308    delta += max1 - min1;
5309    if (flags & SCF_DO_STCLASS_OR) {
5310     ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5311     if (min1) {
5312      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5313      flags &= ~SCF_DO_STCLASS;
5314     }
5315    }
5316    else if (flags & SCF_DO_STCLASS_AND) {
5317     if (min1) {
5318      ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5319      flags &= ~SCF_DO_STCLASS;
5320     }
5321     else {
5322      /* Switch to OR mode: cache the old value of
5323      * data->start_class */
5324      INIT_AND_WITHP;
5325      StructCopy(data->start_class, and_withp, regnode_ssc);
5326      flags &= ~SCF_DO_STCLASS_AND;
5327      StructCopy(&accum, data->start_class, regnode_ssc);
5328      flags |= SCF_DO_STCLASS_OR;
5329     }
5330    }
5331    scan= tail;
5332    continue;
5333   }
5334 #else
5335   else if (PL_regkind[OP(scan)] == TRIE) {
5336    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5337    U8*bang=NULL;
5338
5339    min += trie->minlen;
5340    delta += (trie->maxlen - trie->minlen);
5341    flags &= ~SCF_DO_STCLASS; /* xxx */
5342    if (flags & SCF_DO_SUBSTR) {
5343     /* Cannot expect anything... */
5344     scan_commit(pRExC_state, data, minlenp, is_inf);
5345      data->pos_min += trie->minlen;
5346      data->pos_delta += (trie->maxlen - trie->minlen);
5347     if (trie->maxlen != trie->minlen)
5348      data->longest = &(data->longest_float);
5349     }
5350     if (trie->jump) /* no more substrings -- for now /grr*/
5351    flags &= ~SCF_DO_SUBSTR;
5352   }
5353 #endif /* old or new */
5354 #endif /* TRIE_STUDY_OPT */
5355
5356   /* Else: zero-length, ignore. */
5357   scan = regnext(scan);
5358  }
5359  /* If we are exiting a recursion we can unset its recursed bit
5360  * and allow ourselves to enter it again - no danger of an
5361  * infinite loop there.
5362  if (stopparen > -1 && recursed) {
5363   DEBUG_STUDYDATA("unset:", data,depth);
5364   PAREN_UNSET( recursed, stopparen);
5365  }
5366  */
5367  if (frame) {
5368   DEBUG_STUDYDATA("frame-end:",data,depth);
5369   DEBUG_PEEP("fend", scan, depth);
5370   /* restore previous context */
5371   last = frame->last;
5372   scan = frame->next;
5373   stopparen = frame->stop;
5374   recursed_depth = frame->prev_recursed_depth;
5375   depth = depth - 1;
5376
5377   frame = frame->prev;
5378   goto fake_study_recurse;
5379  }
5380
5381   finish:
5382  assert(!frame);
5383  DEBUG_STUDYDATA("pre-fin:",data,depth);
5384
5385  *scanp = scan;
5386  *deltap = is_inf_internal ? SSize_t_MAX : delta;
5387
5388  if (flags & SCF_DO_SUBSTR && is_inf)
5389   data->pos_delta = SSize_t_MAX - data->pos_min;
5390  if (is_par > (I32)U8_MAX)
5391   is_par = 0;
5392  if (is_par && pars==1 && data) {
5393   data->flags |= SF_IN_PAR;
5394   data->flags &= ~SF_HAS_PAR;
5395  }
5396  else if (pars && data) {
5397   data->flags |= SF_HAS_PAR;
5398   data->flags &= ~SF_IN_PAR;
5399  }
5400  if (flags & SCF_DO_STCLASS_OR)
5401   ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5402  if (flags & SCF_TRIE_RESTUDY)
5403   data->flags |=  SCF_TRIE_RESTUDY;
5404
5405  DEBUG_STUDYDATA("post-fin:",data,depth);
5406
5407  {
5408   SSize_t final_minlen= min < stopmin ? min : stopmin;
5409
5410   if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5411    RExC_maxlen = final_minlen + delta;
5412   }
5413   return final_minlen;
5414  }
5415  /* not-reached */
5416 }
5417
5418 STATIC U32
5419 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5420 {
5421  U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5422
5423  PERL_ARGS_ASSERT_ADD_DATA;
5424
5425  Renewc(RExC_rxi->data,
5426   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5427   char, struct reg_data);
5428  if(count)
5429   Renew(RExC_rxi->data->what, count + n, U8);
5430  else
5431   Newx(RExC_rxi->data->what, n, U8);
5432  RExC_rxi->data->count = count + n;
5433  Copy(s, RExC_rxi->data->what + count, n, U8);
5434  return count;
5435 }
5436
5437 /*XXX: todo make this not included in a non debugging perl */
5438 #ifndef PERL_IN_XSUB_RE
5439 void
5440 Perl_reginitcolors(pTHX)
5441 {
5442  dVAR;
5443  const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5444  if (s) {
5445   char *t = savepv(s);
5446   int i = 0;
5447   PL_colors[0] = t;
5448   while (++i < 6) {
5449    t = strchr(t, '\t');
5450    if (t) {
5451     *t = '\0';
5452     PL_colors[i] = ++t;
5453    }
5454    else
5455     PL_colors[i] = t = (char *)"";
5456   }
5457  } else {
5458   int i = 0;
5459   while (i < 6)
5460    PL_colors[i++] = (char *)"";
5461  }
5462  PL_colorset = 1;
5463 }
5464 #endif
5465
5466
5467 #ifdef TRIE_STUDY_OPT
5468 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
5469  STMT_START {                                            \
5470   if (                                                \
5471    (data.flags & SCF_TRIE_RESTUDY)               \
5472    && ! restudied++                              \
5473   ) {                                                 \
5474    dOsomething;                                    \
5475    goto reStudy;                                   \
5476   }                                                   \
5477  } STMT_END
5478 #else
5479 #define CHECK_RESTUDY_GOTO_butfirst
5480 #endif
5481
5482 /*
5483  * pregcomp - compile a regular expression into internal code
5484  *
5485  * Decides which engine's compiler to call based on the hint currently in
5486  * scope
5487  */
5488
5489 #ifndef PERL_IN_XSUB_RE
5490
5491 /* return the currently in-scope regex engine (or the default if none)  */
5492
5493 regexp_engine const *
5494 Perl_current_re_engine(pTHX)
5495 {
5496  dVAR;
5497
5498  if (IN_PERL_COMPILETIME) {
5499   HV * const table = GvHV(PL_hintgv);
5500   SV **ptr;
5501
5502   if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5503    return &reh_regexp_engine;
5504   ptr = hv_fetchs(table, "regcomp", FALSE);
5505   if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5506    return &reh_regexp_engine;
5507   return INT2PTR(regexp_engine*,SvIV(*ptr));
5508  }
5509  else {
5510   SV *ptr;
5511   if (!PL_curcop->cop_hints_hash)
5512    return &reh_regexp_engine;
5513   ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5514   if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5515    return &reh_regexp_engine;
5516   return INT2PTR(regexp_engine*,SvIV(ptr));
5517  }
5518 }
5519
5520
5521 REGEXP *
5522 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5523 {
5524  dVAR;
5525  regexp_engine const *eng = current_re_engine();
5526  GET_RE_DEBUG_FLAGS_DECL;
5527
5528  PERL_ARGS_ASSERT_PREGCOMP;
5529
5530  /* Dispatch a request to compile a regexp to correct regexp engine. */
5531  DEBUG_COMPILE_r({
5532   PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5533       PTR2UV(eng));
5534  });
5535  return CALLREGCOMP_ENG(eng, pattern, flags);
5536 }
5537 #endif
5538
5539 /* public(ish) entry point for the perl core's own regex compiling code.
5540  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5541  * pattern rather than a list of OPs, and uses the internal engine rather
5542  * than the current one */
5543
5544 REGEXP *
5545 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5546 {
5547  SV *pat = pattern; /* defeat constness! */
5548  PERL_ARGS_ASSERT_RE_COMPILE;
5549  return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5550 #ifdef PERL_IN_XSUB_RE
5551         &my_reg_engine,
5552 #else
5553         &reh_regexp_engine,
5554 #endif
5555         NULL, NULL, rx_flags, 0);
5556 }
5557
5558
5559 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5560  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5561  * point to the realloced string and length.
5562  *
5563  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5564  * stuff added */
5565
5566 static void
5567 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5568      char **pat_p, STRLEN *plen_p, int num_code_blocks)
5569 {
5570  U8 *const src = (U8*)*pat_p;
5571  U8 *dst;
5572  int n=0;
5573  STRLEN s = 0, d = 0;
5574  bool do_end = 0;
5575  GET_RE_DEBUG_FLAGS_DECL;
5576
5577  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5578   "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5579
5580  Newx(dst, *plen_p * 2 + 1, U8);
5581
5582  while (s < *plen_p) {
5583   if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5584    dst[d]   = src[s];
5585   else {
5586    dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5587    dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
5588   }
5589   if (n < num_code_blocks) {
5590    if (!do_end && pRExC_state->code_blocks[n].start == s) {
5591     pRExC_state->code_blocks[n].start = d;
5592     assert(dst[d] == '(');
5593     do_end = 1;
5594    }
5595    else if (do_end && pRExC_state->code_blocks[n].end == s) {
5596     pRExC_state->code_blocks[n].end = d;
5597     assert(dst[d] == ')');
5598     do_end = 0;
5599     n++;
5600    }
5601   }
5602   s++;
5603   d++;
5604  }
5605  dst[d] = '\0';
5606  *plen_p = d;
5607  *pat_p = (char*) dst;
5608  SAVEFREEPV(*pat_p);
5609  RExC_orig_utf8 = RExC_utf8 = 1;
5610 }
5611
5612
5613
5614 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5615  * while recording any code block indices, and handling overloading,
5616  * nested qr// objects etc.  If pat is null, it will allocate a new
5617  * string, or just return the first arg, if there's only one.
5618  *
5619  * Returns the malloced/updated pat.
5620  * patternp and pat_count is the array of SVs to be concatted;
5621  * oplist is the optional list of ops that generated the SVs;
5622  * recompile_p is a pointer to a boolean that will be set if
5623  *   the regex will need to be recompiled.
5624  * delim, if non-null is an SV that will be inserted between each element
5625  */
5626
5627 static SV*
5628 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5629     SV *pat, SV ** const patternp, int pat_count,
5630     OP *oplist, bool *recompile_p, SV *delim)
5631 {
5632  SV **svp;
5633  int n = 0;
5634  bool use_delim = FALSE;
5635  bool alloced = FALSE;
5636
5637  /* if we know we have at least two args, create an empty string,
5638  * then concatenate args to that. For no args, return an empty string */
5639  if (!pat && pat_count != 1) {
5640   pat = newSVpvn("", 0);
5641   SAVEFREESV(pat);
5642   alloced = TRUE;
5643  }
5644
5645  for (svp = patternp; svp < patternp + pat_count; svp++) {
5646   SV *sv;
5647   SV *rx  = NULL;
5648   STRLEN orig_patlen = 0;
5649   bool code = 0;
5650   SV *msv = use_delim ? delim : *svp;
5651   if (!msv) msv = &PL_sv_undef;
5652
5653   /* if we've got a delimiter, we go round the loop twice for each
5654   * svp slot (except the last), using the delimiter the second
5655   * time round */
5656   if (use_delim) {
5657    svp--;
5658    use_delim = FALSE;
5659   }
5660   else if (delim)
5661    use_delim = TRUE;
5662
5663   if (SvTYPE(msv) == SVt_PVAV) {
5664    /* we've encountered an interpolated array within
5665    * the pattern, e.g. /...@a..../. Expand the list of elements,
5666    * then recursively append elements.
5667    * The code in this block is based on S_pushav() */
5668
5669    AV *const av = (AV*)msv;
5670    const SSize_t maxarg = AvFILL(av) + 1;
5671    SV **array;
5672
5673    if (oplist) {
5674     assert(oplist->op_type == OP_PADAV
5675      || oplist->op_type == OP_RV2AV);
5676     oplist = oplist->op_sibling;;
5677    }
5678
5679    if (SvRMAGICAL(av)) {
5680     SSize_t i;
5681
5682     Newx(array, maxarg, SV*);
5683     SAVEFREEPV(array);
5684     for (i=0; i < maxarg; i++) {
5685      SV ** const svp = av_fetch(av, i, FALSE);
5686      array[i] = svp ? *svp : &PL_sv_undef;
5687     }
5688    }
5689    else
5690     array = AvARRAY(av);
5691
5692    pat = S_concat_pat(aTHX_ pRExC_state, pat,
5693         array, maxarg, NULL, recompile_p,
5694         /* $" */
5695         GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5696
5697    continue;
5698   }
5699
5700
5701   /* we make the assumption here that each op in the list of
5702   * op_siblings maps to one SV pushed onto the stack,
5703   * except for code blocks, with have both an OP_NULL and
5704   * and OP_CONST.
5705   * This allows us to match up the list of SVs against the
5706   * list of OPs to find the next code block.
5707   *
5708   * Note that       PUSHMARK PADSV PADSV ..
5709   * is optimised to
5710   *                 PADRANGE PADSV  PADSV  ..
5711   * so the alignment still works. */
5712
5713   if (oplist) {
5714    if (oplist->op_type == OP_NULL
5715     && (oplist->op_flags & OPf_SPECIAL))
5716    {
5717     assert(n < pRExC_state->num_code_blocks);
5718     pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5719     pRExC_state->code_blocks[n].block = oplist;
5720     pRExC_state->code_blocks[n].src_regex = NULL;
5721     n++;
5722     code = 1;
5723     oplist = oplist->op_sibling; /* skip CONST */
5724     assert(oplist);
5725    }
5726    oplist = oplist->op_sibling;;
5727   }
5728
5729   /* apply magic and QR overloading to arg */
5730
5731   SvGETMAGIC(msv);
5732   if (SvROK(msv) && SvAMAGIC(msv)) {
5733    SV *sv = AMG_CALLunary(msv, regexp_amg);
5734    if (sv) {
5735     if (SvROK(sv))
5736      sv = SvRV(sv);
5737     if (SvTYPE(sv) != SVt_REGEXP)
5738      Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5739     msv = sv;
5740    }
5741   }
5742
5743   /* try concatenation overload ... */
5744   if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5745     (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5746   {
5747    sv_setsv(pat, sv);
5748    /* overloading involved: all bets are off over literal
5749    * code. Pretend we haven't seen it */
5750    pRExC_state->num_code_blocks -= n;
5751    n = 0;
5752   }
5753   else  {
5754    /* ... or failing that, try "" overload */
5755    while (SvAMAGIC(msv)
5756      && (sv = AMG_CALLunary(msv, string_amg))
5757      && sv != msv
5758      &&  !(   SvROK(msv)
5759       && SvROK(sv)
5760       && SvRV(msv) == SvRV(sv))
5761    ) {
5762     msv = sv;
5763     SvGETMAGIC(msv);
5764    }
5765    if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5766     msv = SvRV(msv);
5767
5768    if (pat) {
5769     /* this is a partially unrolled
5770     *     sv_catsv_nomg(pat, msv);
5771     * that allows us to adjust code block indices if
5772     * needed */
5773     STRLEN dlen;
5774     char *dst = SvPV_force_nomg(pat, dlen);
5775     orig_patlen = dlen;
5776     if (SvUTF8(msv) && !SvUTF8(pat)) {
5777      S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5778      sv_setpvn(pat, dst, dlen);
5779      SvUTF8_on(pat);
5780     }
5781     sv_catsv_nomg(pat, msv);
5782     rx = msv;
5783    }
5784    else
5785     pat = msv;
5786
5787    if (code)
5788     pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5789   }
5790
5791   /* extract any code blocks within any embedded qr//'s */
5792   if (rx && SvTYPE(rx) == SVt_REGEXP
5793    && RX_ENGINE((REGEXP*)rx)->op_comp)
5794   {
5795
5796    RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5797    if (ri->num_code_blocks) {
5798     int i;
5799     /* the presence of an embedded qr// with code means
5800     * we should always recompile: the text of the
5801     * qr// may not have changed, but it may be a
5802     * different closure than last time */
5803     *recompile_p = 1;
5804     Renew(pRExC_state->code_blocks,
5805      pRExC_state->num_code_blocks + ri->num_code_blocks,
5806      struct reg_code_block);
5807     pRExC_state->num_code_blocks += ri->num_code_blocks;
5808
5809     for (i=0; i < ri->num_code_blocks; i++) {
5810      struct reg_code_block *src, *dst;
5811      STRLEN offset =  orig_patlen
5812       + ReANY((REGEXP *)rx)->pre_prefix;
5813      assert(n < pRExC_state->num_code_blocks);
5814      src = &ri->code_blocks[i];
5815      dst = &pRExC_state->code_blocks[n];
5816      dst->start     = src->start + offset;
5817      dst->end     = src->end   + offset;
5818      dst->block     = src->block;
5819      dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5820            src->src_regex
5821             ? src->src_regex
5822             : (REGEXP*)rx);
5823      n++;
5824     }
5825    }
5826   }
5827  }
5828  /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5829  if (alloced)
5830   SvSETMAGIC(pat);
5831
5832  return pat;
5833 }
5834
5835
5836
5837 /* see if there are any run-time code blocks in the pattern.
5838  * False positives are allowed */
5839
5840 static bool
5841 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5842      char *pat, STRLEN plen)
5843 {
5844  int n = 0;
5845  STRLEN s;
5846
5847  for (s = 0; s < plen; s++) {
5848   if (n < pRExC_state->num_code_blocks
5849    && s == pRExC_state->code_blocks[n].start)
5850   {
5851    s = pRExC_state->code_blocks[n].end;
5852    n++;
5853    continue;
5854   }
5855   /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5856   * positives here */
5857   if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5858    (pat[s+2] == '{'
5859     || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5860   )
5861    return 1;
5862  }
5863  return 0;
5864 }
5865
5866 /* Handle run-time code blocks. We will already have compiled any direct
5867  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5868  * copy of it, but with any literal code blocks blanked out and
5869  * appropriate chars escaped; then feed it into
5870  *
5871  *    eval "qr'modified_pattern'"
5872  *
5873  * For example,
5874  *
5875  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5876  *
5877  * becomes
5878  *
5879  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5880  *
5881  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5882  * and merge them with any code blocks of the original regexp.
5883  *
5884  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5885  * instead, just save the qr and return FALSE; this tells our caller that
5886  * the original pattern needs upgrading to utf8.
5887  */
5888
5889 static bool
5890 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5891  char *pat, STRLEN plen)
5892 {
5893  SV *qr;
5894
5895  GET_RE_DEBUG_FLAGS_DECL;
5896
5897  if (pRExC_state->runtime_code_qr) {
5898   /* this is the second time we've been called; this should
5899   * only happen if the main pattern got upgraded to utf8
5900   * during compilation; re-use the qr we compiled first time
5901   * round (which should be utf8 too)
5902   */
5903   qr = pRExC_state->runtime_code_qr;
5904   pRExC_state->runtime_code_qr = NULL;
5905   assert(RExC_utf8 && SvUTF8(qr));
5906  }
5907  else {
5908   int n = 0;
5909   STRLEN s;
5910   char *p, *newpat;
5911   int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5912   SV *sv, *qr_ref;
5913   dSP;
5914
5915   /* determine how many extra chars we need for ' and \ escaping */
5916   for (s = 0; s < plen; s++) {
5917    if (pat[s] == '\'' || pat[s] == '\\')
5918     newlen++;
5919   }
5920
5921   Newx(newpat, newlen, char);
5922   p = newpat;
5923   *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5924
5925   for (s = 0; s < plen; s++) {
5926    if (n < pRExC_state->num_code_blocks
5927     && s == pRExC_state->code_blocks[n].start)
5928    {
5929     /* blank out literal code block */
5930     assert(pat[s] == '(');
5931     while (s <= pRExC_state->code_blocks[n].end) {
5932      *p++ = '_';
5933      s++;
5934     }
5935     s--;
5936     n++;
5937     continue;
5938    }
5939    if (pat[s] == '\'' || pat[s] == '\\')
5940     *p++ = '\\';
5941    *p++ = pat[s];
5942   }
5943   *p++ = '\'';
5944   if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5945    *p++ = 'x';
5946   *p++ = '\0';
5947   DEBUG_COMPILE_r({
5948    PerlIO_printf(Perl_debug_log,
5949     "%sre-parsing pattern for runtime code:%s %s\n",
5950     PL_colors[4],PL_colors[5],newpat);
5951   });
5952
5953   sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5954   Safefree(newpat);
5955
5956   ENTER;
5957   SAVETMPS;
5958   save_re_context();
5959   PUSHSTACKi(PERLSI_REQUIRE);
5960   /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5961   * parsing qr''; normally only q'' does this. It also alters
5962   * hints handling */
5963   eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5964   SvREFCNT_dec_NN(sv);
5965   SPAGAIN;
5966   qr_ref = POPs;
5967   PUTBACK;
5968   {
5969    SV * const errsv = ERRSV;
5970    if (SvTRUE_NN(errsv))
5971    {
5972     Safefree(pRExC_state->code_blocks);
5973     /* use croak_sv ? */
5974     Perl_croak_nocontext("%"SVf, SVfARG(errsv));
5975    }
5976   }
5977   assert(SvROK(qr_ref));
5978   qr = SvRV(qr_ref);
5979   assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5980   /* the leaving below frees the tmp qr_ref.
5981   * Give qr a life of its own */
5982   SvREFCNT_inc(qr);
5983   POPSTACK;
5984   FREETMPS;
5985   LEAVE;
5986
5987  }
5988
5989  if (!RExC_utf8 && SvUTF8(qr)) {
5990   /* first time through; the pattern got upgraded; save the
5991   * qr for the next time through */
5992   assert(!pRExC_state->runtime_code_qr);
5993   pRExC_state->runtime_code_qr = qr;
5994   return 0;
5995  }
5996
5997
5998  /* extract any code blocks within the returned qr//  */
5999
6000
6001  /* merge the main (r1) and run-time (r2) code blocks into one */
6002  {
6003   RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6004   struct reg_code_block *new_block, *dst;
6005   RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6006   int i1 = 0, i2 = 0;
6007
6008   if (!r2->num_code_blocks) /* we guessed wrong */
6009   {
6010    SvREFCNT_dec_NN(qr);
6011    return 1;
6012   }
6013
6014   Newx(new_block,
6015    r1->num_code_blocks + r2->num_code_blocks,
6016    struct reg_code_block);
6017   dst = new_block;
6018
6019   while (    i1 < r1->num_code_blocks
6020     || i2 < r2->num_code_blocks)
6021   {
6022    struct reg_code_block *src;
6023    bool is_qr = 0;
6024
6025    if (i1 == r1->num_code_blocks) {
6026     src = &r2->code_blocks[i2++];
6027     is_qr = 1;
6028    }
6029    else if (i2 == r2->num_code_blocks)
6030     src = &r1->code_blocks[i1++];
6031    else if (  r1->code_blocks[i1].start
6032      < r2->code_blocks[i2].start)
6033    {
6034     src = &r1->code_blocks[i1++];
6035     assert(src->end < r2->code_blocks[i2].start);
6036    }
6037    else {
6038     assert(  r1->code_blocks[i1].start
6039      > r2->code_blocks[i2].start);
6040     src = &r2->code_blocks[i2++];
6041     is_qr = 1;
6042     assert(src->end < r1->code_blocks[i1].start);
6043    }
6044
6045    assert(pat[src->start] == '(');
6046    assert(pat[src->end]   == ')');
6047    dst->start     = src->start;
6048    dst->end     = src->end;
6049    dst->block     = src->block;
6050    dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6051          : src->src_regex;
6052    dst++;
6053   }
6054   r1->num_code_blocks += r2->num_code_blocks;
6055   Safefree(r1->code_blocks);
6056   r1->code_blocks = new_block;
6057  }
6058
6059  SvREFCNT_dec_NN(qr);
6060  return 1;
6061 }
6062
6063
6064 STATIC bool
6065 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6066      SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6067      SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6068      STRLEN longest_length, bool eol, bool meol)
6069 {
6070  /* This is the common code for setting up the floating and fixed length
6071  * string data extracted from Perl_re_op_compile() below.  Returns a boolean
6072  * as to whether succeeded or not */
6073
6074  I32 t;
6075  SSize_t ml;
6076
6077  if (! (longest_length
6078   || (eol /* Can't have SEOL and MULTI */
6079    && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6080   )
6081    /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6082   || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6083  {
6084   return FALSE;
6085  }
6086
6087  /* copy the information about the longest from the reg_scan_data
6088   over to the program. */
6089  if (SvUTF8(sv_longest)) {
6090   *rx_utf8 = sv_longest;
6091   *rx_substr = NULL;
6092  } else {
6093   *rx_substr = sv_longest;
6094   *rx_utf8 = NULL;
6095  }
6096  /* end_shift is how many chars that must be matched that
6097   follow this item. We calculate it ahead of time as once the
6098   lookbehind offset is added in we lose the ability to correctly
6099   calculate it.*/
6100  ml = minlen ? *(minlen) : (SSize_t)longest_length;
6101  *rx_end_shift = ml - offset
6102   - longest_length + (SvTAIL(sv_longest) != 0)
6103   + lookbehind;
6104
6105  t = (eol/* Can't have SEOL and MULTI */
6106   && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6107  fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6108
6109  return TRUE;
6110 }
6111
6112 /*
6113  * Perl_re_op_compile - the perl internal RE engine's function to compile a
6114  * regular expression into internal code.
6115  * The pattern may be passed either as:
6116  *    a list of SVs (patternp plus pat_count)
6117  *    a list of OPs (expr)
6118  * If both are passed, the SV list is used, but the OP list indicates
6119  * which SVs are actually pre-compiled code blocks
6120  *
6121  * The SVs in the list have magic and qr overloading applied to them (and
6122  * the list may be modified in-place with replacement SVs in the latter
6123  * case).
6124  *
6125  * If the pattern hasn't changed from old_re, then old_re will be
6126  * returned.
6127  *
6128  * eng is the current engine. If that engine has an op_comp method, then
6129  * handle directly (i.e. we assume that op_comp was us); otherwise, just
6130  * do the initial concatenation of arguments and pass on to the external
6131  * engine.
6132  *
6133  * If is_bare_re is not null, set it to a boolean indicating whether the
6134  * arg list reduced (after overloading) to a single bare regex which has
6135  * been returned (i.e. /$qr/).
6136  *
6137  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6138  *
6139  * pm_flags contains the PMf_* flags, typically based on those from the
6140  * pm_flags field of the related PMOP. Currently we're only interested in
6141  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6142  *
6143  * We can't allocate space until we know how big the compiled form will be,
6144  * but we can't compile it (and thus know how big it is) until we've got a
6145  * place to put the code.  So we cheat:  we compile it twice, once with code
6146  * generation turned off and size counting turned on, and once "for real".
6147  * This also means that we don't allocate space until we are sure that the
6148  * thing really will compile successfully, and we never have to move the
6149  * code and thus invalidate pointers into it.  (Note that it has to be in
6150  * one piece because free() must be able to free it all.) [NB: not true in perl]
6151  *
6152  * Beware that the optimization-preparation code in here knows about some
6153  * of the structure of the compiled regexp.  [I'll say.]
6154  */
6155
6156 REGEXP *
6157 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6158      OP *expr, const regexp_engine* eng, REGEXP *old_re,
6159      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6160 {
6161  dVAR;
6162  REGEXP *rx;
6163  struct regexp *r;
6164  regexp_internal *ri;
6165  STRLEN plen;
6166  char *exp;
6167  regnode *scan;
6168  I32 flags;
6169  SSize_t minlen = 0;
6170  U32 rx_flags;
6171  SV *pat;
6172  SV *code_blocksv = NULL;
6173  SV** new_patternp = patternp;
6174
6175  /* these are all flags - maybe they should be turned
6176  * into a single int with different bit masks */
6177  I32 sawlookahead = 0;
6178  I32 sawplus = 0;
6179  I32 sawopen = 0;
6180  I32 sawminmod = 0;
6181
6182  regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6183  bool recompile = 0;
6184  bool runtime_code = 0;
6185  scan_data_t data;
6186  RExC_state_t RExC_state;
6187  RExC_state_t * const pRExC_state = &RExC_state;
6188 #ifdef TRIE_STUDY_OPT
6189  int restudied = 0;
6190  RExC_state_t copyRExC_state;
6191 #endif
6192  GET_RE_DEBUG_FLAGS_DECL;
6193
6194  PERL_ARGS_ASSERT_RE_OP_COMPILE;
6195
6196  DEBUG_r(if (!PL_colorset) reginitcolors());
6197
6198 #ifndef PERL_IN_XSUB_RE
6199  /* Initialize these here instead of as-needed, as is quick and avoids
6200  * having to test them each time otherwise */
6201  if (! PL_AboveLatin1) {
6202   PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6203   PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6204   PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6205   PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6206   PL_HasMultiCharFold =
6207      _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6208  }
6209 #endif
6210
6211  pRExC_state->code_blocks = NULL;
6212  pRExC_state->num_code_blocks = 0;
6213
6214  if (is_bare_re)
6215   *is_bare_re = FALSE;
6216
6217  if (expr && (expr->op_type == OP_LIST ||
6218     (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6219   /* allocate code_blocks if needed */
6220   OP *o;
6221   int ncode = 0;
6222
6223   for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6224    if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6225     ncode++; /* count of DO blocks */
6226   if (ncode) {
6227    pRExC_state->num_code_blocks = ncode;
6228    Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6229   }
6230  }
6231
6232  if (!pat_count) {
6233   /* compile-time pattern with just OP_CONSTs and DO blocks */
6234
6235   int n;
6236   OP *o;
6237
6238   /* find how many CONSTs there are */
6239   assert(expr);
6240   n = 0;
6241   if (expr->op_type == OP_CONST)
6242    n = 1;
6243   else
6244    for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6245     if (o->op_type == OP_CONST)
6246      n++;
6247    }
6248
6249   /* fake up an SV array */
6250
6251   assert(!new_patternp);
6252   Newx(new_patternp, n, SV*);
6253   SAVEFREEPV(new_patternp);
6254   pat_count = n;
6255
6256   n = 0;
6257   if (expr->op_type == OP_CONST)
6258    new_patternp[n] = cSVOPx_sv(expr);
6259   else
6260    for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6261     if (o->op_type == OP_CONST)
6262      new_patternp[n++] = cSVOPo_sv;
6263    }
6264
6265  }
6266
6267  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6268   "Assembling pattern from %d elements%s\n", pat_count,
6269    orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6270
6271  /* set expr to the first arg op */
6272
6273  if (pRExC_state->num_code_blocks
6274   && expr->op_type != OP_CONST)
6275  {
6276    expr = cLISTOPx(expr)->op_first;
6277    assert(   expr->op_type == OP_PUSHMARK
6278     || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6279     || expr->op_type == OP_PADRANGE);
6280    expr = expr->op_sibling;
6281  }
6282
6283  pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6284       expr, &recompile, NULL);
6285
6286  /* handle bare (possibly after overloading) regex: foo =~ $re */
6287  {
6288   SV *re = pat;
6289   if (SvROK(re))
6290    re = SvRV(re);
6291   if (SvTYPE(re) == SVt_REGEXP) {
6292    if (is_bare_re)
6293     *is_bare_re = TRUE;
6294    SvREFCNT_inc(re);
6295    Safefree(pRExC_state->code_blocks);
6296    DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6297     "Precompiled pattern%s\n",
6298      orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6299
6300    return (REGEXP*)re;
6301   }
6302  }
6303
6304  exp = SvPV_nomg(pat, plen);
6305
6306  if (!eng->op_comp) {
6307   if ((SvUTF8(pat) && IN_BYTES)
6308     || SvGMAGICAL(pat) || SvAMAGIC(pat))
6309   {
6310    /* make a temporary copy; either to convert to bytes,
6311    * or to avoid repeating get-magic / overloaded stringify */
6312    pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6313           (IN_BYTES ? 0 : SvUTF8(pat)));
6314   }
6315   Safefree(pRExC_state->code_blocks);
6316   return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6317  }
6318
6319  /* ignore the utf8ness if the pattern is 0 length */
6320  RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6321  RExC_uni_semantics = 0;
6322  RExC_contains_locale = 0;
6323  RExC_contains_i = 0;
6324  pRExC_state->runtime_code_qr = NULL;
6325
6326  DEBUG_COMPILE_r({
6327    SV *dsv= sv_newmortal();
6328    RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6329    PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6330       PL_colors[4],PL_colors[5],s);
6331   });
6332
6333   redo_first_pass:
6334  /* we jump here if we upgrade the pattern to utf8 and have to
6335  * recompile */
6336
6337  if ((pm_flags & PMf_USE_RE_EVAL)
6338     /* this second condition covers the non-regex literal case,
6339     * i.e.  $foo =~ '(?{})'. */
6340     || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6341  )
6342   runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6343
6344  /* return old regex if pattern hasn't changed */
6345  /* XXX: note in the below we have to check the flags as well as the
6346  * pattern.
6347  *
6348  * Things get a touch tricky as we have to compare the utf8 flag
6349  * independently from the compile flags.  */
6350
6351  if (   old_re
6352   && !recompile
6353   && !!RX_UTF8(old_re) == !!RExC_utf8
6354   && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6355   && RX_PRECOMP(old_re)
6356   && RX_PRELEN(old_re) == plen
6357   && memEQ(RX_PRECOMP(old_re), exp, plen)
6358   && !runtime_code /* with runtime code, always recompile */ )
6359  {
6360   Safefree(pRExC_state->code_blocks);
6361   return old_re;
6362  }
6363
6364  rx_flags = orig_rx_flags;
6365
6366  if (rx_flags & PMf_FOLD) {
6367   RExC_contains_i = 1;
6368  }
6369  if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6370
6371   /* Set to use unicode semantics if the pattern is in utf8 and has the
6372   * 'depends' charset specified, as it means unicode when utf8  */
6373   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6374  }
6375
6376  RExC_precomp = exp;
6377  RExC_flags = rx_flags;
6378  RExC_pm_flags = pm_flags;
6379
6380  if (runtime_code) {
6381   if (TAINTING_get && TAINT_get)
6382    Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6383
6384   if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6385    /* whoops, we have a non-utf8 pattern, whilst run-time code
6386    * got compiled as utf8. Try again with a utf8 pattern */
6387    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6388          pRExC_state->num_code_blocks);
6389    goto redo_first_pass;
6390   }
6391  }
6392  assert(!pRExC_state->runtime_code_qr);
6393
6394  RExC_sawback = 0;
6395
6396  RExC_seen = 0;
6397  RExC_maxlen = 0;
6398  RExC_in_lookbehind = 0;
6399  RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6400  RExC_extralen = 0;
6401  RExC_override_recoding = 0;
6402  RExC_in_multi_char_class = 0;
6403
6404  /* First pass: determine size, legality. */
6405  RExC_parse = exp;
6406  RExC_start = exp;
6407  RExC_end = exp + plen;
6408  RExC_naughty = 0;
6409  RExC_npar = 1;
6410  RExC_nestroot = 0;
6411  RExC_size = 0L;
6412  RExC_emit = (regnode *) &RExC_emit_dummy;
6413  RExC_whilem_seen = 0;
6414  RExC_open_parens = NULL;
6415  RExC_close_parens = NULL;
6416  RExC_opend = NULL;
6417  RExC_paren_names = NULL;
6418 #ifdef DEBUGGING
6419  RExC_paren_name_list = NULL;
6420 #endif
6421  RExC_recurse = NULL;
6422  RExC_study_chunk_recursed = NULL;
6423  RExC_study_chunk_recursed_bytes= 0;
6424  RExC_recurse_count = 0;
6425  pRExC_state->code_index = 0;
6426
6427 #if 0 /* REGC() is (currently) a NOP at the first pass.
6428  * Clever compilers notice this and complain. --jhi */
6429  REGC((U8)REG_MAGIC, (char*)RExC_emit);
6430 #endif
6431  DEBUG_PARSE_r(
6432   PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6433   RExC_lastnum=0;
6434   RExC_lastparse=NULL;
6435  );
6436  /* reg may croak on us, not giving us a chance to free
6437  pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
6438  need it to survive as long as the regexp (qr/(?{})/).
6439  We must check that code_blocksv is not already set, because we may
6440  have jumped back to restart the sizing pass. */
6441  if (pRExC_state->code_blocks && !code_blocksv) {
6442   code_blocksv = newSV_type(SVt_PV);
6443   SAVEFREESV(code_blocksv);
6444   SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6445   SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6446  }
6447  if (reg(pRExC_state, 0, &flags,1) == NULL) {
6448   /* It's possible to write a regexp in ascii that represents Unicode
6449   codepoints outside of the byte range, such as via \x{100}. If we
6450   detect such a sequence we have to convert the entire pattern to utf8
6451   and then recompile, as our sizing calculation will have been based
6452   on 1 byte == 1 character, but we will need to use utf8 to encode
6453   at least some part of the pattern, and therefore must convert the whole
6454   thing.
6455   -- dmq */
6456   if (flags & RESTART_UTF8) {
6457    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6458          pRExC_state->num_code_blocks);
6459    goto redo_first_pass;
6460   }
6461   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6462  }
6463  if (code_blocksv)
6464   SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6465
6466  DEBUG_PARSE_r({
6467   PerlIO_printf(Perl_debug_log,
6468    "Required size %"IVdf" nodes\n"
6469    "Starting second pass (creation)\n",
6470    (IV)RExC_size);
6471   RExC_lastnum=0;
6472   RExC_lastparse=NULL;
6473  });
6474
6475  /* The first pass could have found things that force Unicode semantics */
6476  if ((RExC_utf8 || RExC_uni_semantics)
6477   && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6478  {
6479   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6480  }
6481
6482  /* Small enough for pointer-storage convention?
6483  If extralen==0, this means that we will not need long jumps. */
6484  if (RExC_size >= 0x10000L && RExC_extralen)
6485   RExC_size += RExC_extralen;
6486  else
6487   RExC_extralen = 0;
6488  if (RExC_whilem_seen > 15)
6489   RExC_whilem_seen = 15;
6490
6491  /* Allocate space and zero-initialize. Note, the two step process
6492  of zeroing when in debug mode, thus anything assigned has to
6493  happen after that */
6494  rx = (REGEXP*) newSV_type(SVt_REGEXP);
6495  r = ReANY(rx);
6496  Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6497   char, regexp_internal);
6498  if ( r == NULL || ri == NULL )
6499   FAIL("Regexp out of space");
6500 #ifdef DEBUGGING
6501  /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6502  Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6503   char);
6504 #else
6505  /* bulk initialize base fields with 0. */
6506  Zero(ri, sizeof(regexp_internal), char);
6507 #endif
6508
6509  /* non-zero initialization begins here */
6510  RXi_SET( r, ri );
6511  r->engine= eng;
6512  r->extflags = rx_flags;
6513  RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6514
6515  if (pm_flags & PMf_IS_QR) {
6516   ri->code_blocks = pRExC_state->code_blocks;
6517   ri->num_code_blocks = pRExC_state->num_code_blocks;
6518  }
6519  else
6520  {
6521   int n;
6522   for (n = 0; n < pRExC_state->num_code_blocks; n++)
6523    if (pRExC_state->code_blocks[n].src_regex)
6524     SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6525   SAVEFREEPV(pRExC_state->code_blocks);
6526  }
6527
6528  {
6529   bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6530   bool has_charset = (get_regex_charset(r->extflags)
6531              != REGEX_DEPENDS_CHARSET);
6532
6533   /* The caret is output if there are any defaults: if not all the STD
6534   * flags are set, or if no character set specifier is needed */
6535   bool has_default =
6536      (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6537      || ! has_charset);
6538   bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6539             == REG_RUN_ON_COMMENT_SEEN);
6540   U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6541        >> RXf_PMf_STD_PMMOD_SHIFT);
6542   const char *fptr = STD_PAT_MODS;        /*"msix"*/
6543   char *p;
6544   /* Allocate for the worst case, which is all the std flags are turned
6545   * on.  If more precision is desired, we could do a population count of
6546   * the flags set.  This could be done with a small lookup table, or by
6547   * shifting, masking and adding, or even, when available, assembly
6548   * language for a machine-language population count.
6549   * We never output a minus, as all those are defaults, so are
6550   * covered by the caret */
6551   const STRLEN wraplen = plen + has_p + has_runon
6552    + has_default       /* If needs a caret */
6553
6554     /* If needs a character set specifier */
6555    + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6556    + (sizeof(STD_PAT_MODS) - 1)
6557    + (sizeof("(?:)") - 1);
6558
6559   Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6560   r->xpv_len_u.xpvlenu_pv = p;
6561   if (RExC_utf8)
6562    SvFLAGS(rx) |= SVf_UTF8;
6563   *p++='('; *p++='?';
6564
6565   /* If a default, cover it using the caret */
6566   if (has_default) {
6567    *p++= DEFAULT_PAT_MOD;
6568   }
6569   if (has_charset) {
6570    STRLEN len;
6571    const char* const name = get_regex_charset_name(r->extflags, &len);
6572    Copy(name, p, len, char);
6573    p += len;
6574   }
6575   if (has_p)
6576    *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6577   {
6578    char ch;
6579    while((ch = *fptr++)) {
6580     if(reganch & 1)
6581      *p++ = ch;
6582     reganch >>= 1;
6583    }
6584   }
6585
6586   *p++ = ':';
6587   Copy(RExC_precomp, p, plen, char);
6588   assert ((RX_WRAPPED(rx) - p) < 16);
6589   r->pre_prefix = p - RX_WRAPPED(rx);
6590   p += plen;
6591   if (has_runon)
6592    *p++ = '\n';
6593   *p++ = ')';
6594   *p = 0;
6595   SvCUR_set(rx, p - RX_WRAPPED(rx));
6596  }
6597
6598  r->intflags = 0;
6599  r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6600
6601  /* setup various meta data about recursion, this all requires
6602  * RExC_npar to be correctly set, and a bit later on we clear it */
6603  if (RExC_seen & REG_RECURSE_SEEN) {
6604   Newxz(RExC_open_parens, RExC_npar,regnode *);
6605   SAVEFREEPV(RExC_open_parens);
6606   Newxz(RExC_close_parens,RExC_npar,regnode *);
6607   SAVEFREEPV(RExC_close_parens);
6608  }
6609  if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6610   /* Note, RExC_npar is 1 + the number of parens in a pattern.
6611   * So its 1 if there are no parens. */
6612   RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6613           ((RExC_npar & 0x07) != 0);
6614   Newx(RExC_study_chunk_recursed,
6615    RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6616   SAVEFREEPV(RExC_study_chunk_recursed);
6617  }
6618
6619  /* Useful during FAIL. */
6620 #ifdef RE_TRACK_PATTERN_OFFSETS
6621  Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6622  DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6623       "%s %"UVuf" bytes for offset annotations.\n",
6624       ri->u.offsets ? "Got" : "Couldn't get",
6625       (UV)((2*RExC_size+1) * sizeof(U32))));
6626 #endif
6627  SetProgLen(ri,RExC_size);
6628  RExC_rx_sv = rx;
6629  RExC_rx = r;
6630  RExC_rxi = ri;
6631  REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
6632
6633  /* Second pass: emit code. */
6634  RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6635  RExC_pm_flags = pm_flags;
6636  RExC_parse = exp;
6637  RExC_end = exp + plen;
6638  RExC_naughty = 0;
6639  RExC_npar = 1;
6640  RExC_emit_start = ri->program;
6641  RExC_emit = ri->program;
6642  RExC_emit_bound = ri->program + RExC_size + 1;
6643  pRExC_state->code_index = 0;
6644
6645  REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6646  if (reg(pRExC_state, 0, &flags,1) == NULL) {
6647   ReREFCNT_dec(rx);
6648   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6649  }
6650  /* XXXX To minimize changes to RE engine we always allocate
6651  3-units-long substrs field. */
6652  Newx(r->substrs, 1, struct reg_substr_data);
6653  if (RExC_recurse_count) {
6654   Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6655   SAVEFREEPV(RExC_recurse);
6656  }
6657
6658 reStudy:
6659  r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6660  Zero(r->substrs, 1, struct reg_substr_data);
6661  if (RExC_study_chunk_recursed)
6662   Zero(RExC_study_chunk_recursed,
6663    RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6664
6665 #ifdef TRIE_STUDY_OPT
6666  if (!restudied) {
6667   StructCopy(&zero_scan_data, &data, scan_data_t);
6668   copyRExC_state = RExC_state;
6669  } else {
6670   U32 seen=RExC_seen;
6671   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6672
6673   RExC_state = copyRExC_state;
6674   if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6675    RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6676   else
6677    RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6678   StructCopy(&zero_scan_data, &data, scan_data_t);
6679  }
6680 #else
6681  StructCopy(&zero_scan_data, &data, scan_data_t);
6682 #endif
6683
6684  /* Dig out information for optimizations. */
6685  r->extflags = RExC_flags; /* was pm_op */
6686  /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6687
6688  if (UTF)
6689   SvUTF8_on(rx); /* Unicode in it? */
6690  ri->regstclass = NULL;
6691  if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6692   r->intflags |= PREGf_NAUGHTY;
6693  scan = ri->program + 1;  /* First BRANCH. */
6694
6695  /* testing for BRANCH here tells us whether there is "must appear"
6696  data in the pattern. If there is then we can use it for optimisations */
6697  if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /*  Only one top-level choice.
6698             */
6699   SSize_t fake;
6700   STRLEN longest_float_length, longest_fixed_length;
6701   regnode_ssc ch_class; /* pointed to by data */
6702   int stclass_flag;
6703   SSize_t last_close = 0; /* pointed to by data */
6704   regnode *first= scan;
6705   regnode *first_next= regnext(first);
6706   /*
6707   * Skip introductions and multiplicators >= 1
6708   * so that we can extract the 'meat' of the pattern that must
6709   * match in the large if() sequence following.
6710   * NOTE that EXACT is NOT covered here, as it is normally
6711   * picked up by the optimiser separately.
6712   *
6713   * This is unfortunate as the optimiser isnt handling lookahead
6714   * properly currently.
6715   *
6716   */
6717   while ((OP(first) == OPEN && (sawopen = 1)) ||
6718    /* An OR of *one* alternative - should not happen now. */
6719    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6720    /* for now we can't handle lookbehind IFMATCH*/
6721    (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6722    (OP(first) == PLUS) ||
6723    (OP(first) == MINMOD) ||
6724    /* An {n,m} with n>0 */
6725    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6726    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6727   {
6728     /*
6729     * the only op that could be a regnode is PLUS, all the rest
6730     * will be regnode_1 or regnode_2.
6731     *
6732     * (yves doesn't think this is true)
6733     */
6734     if (OP(first) == PLUS)
6735      sawplus = 1;
6736     else {
6737      if (OP(first) == MINMOD)
6738       sawminmod = 1;
6739      first += regarglen[OP(first)];
6740     }
6741     first = NEXTOPER(first);
6742     first_next= regnext(first);
6743   }
6744
6745   /* Starting-point info. */
6746  again:
6747   DEBUG_PEEP("first:",first,0);
6748   /* Ignore EXACT as we deal with it later. */
6749   if (PL_regkind[OP(first)] == EXACT) {
6750    if (OP(first) == EXACT)
6751     NOOP; /* Empty, get anchored substr later. */
6752    else
6753     ri->regstclass = first;
6754   }
6755 #ifdef TRIE_STCLASS
6756   else if (PL_regkind[OP(first)] == TRIE &&
6757     ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6758   {
6759    regnode *trie_op;
6760    /* this can happen only on restudy */
6761    if ( OP(first) == TRIE ) {
6762     struct regnode_1 *trieop = (struct regnode_1 *)
6763      PerlMemShared_calloc(1, sizeof(struct regnode_1));
6764     StructCopy(first,trieop,struct regnode_1);
6765     trie_op=(regnode *)trieop;
6766    } else {
6767     struct regnode_charclass *trieop = (struct regnode_charclass *)
6768      PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6769     StructCopy(first,trieop,struct regnode_charclass);
6770     trie_op=(regnode *)trieop;
6771    }
6772    OP(trie_op)+=2;
6773    make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6774    ri->regstclass = trie_op;
6775   }
6776 #endif
6777   else if (REGNODE_SIMPLE(OP(first)))
6778    ri->regstclass = first;
6779   else if (PL_regkind[OP(first)] == BOUND ||
6780     PL_regkind[OP(first)] == NBOUND)
6781    ri->regstclass = first;
6782   else if (PL_regkind[OP(first)] == BOL) {
6783    r->intflags |= (OP(first) == MBOL
6784       ? PREGf_ANCH_MBOL
6785       : (OP(first) == SBOL
6786        ? PREGf_ANCH_SBOL
6787        : PREGf_ANCH_BOL));
6788    first = NEXTOPER(first);
6789    goto again;
6790   }
6791   else if (OP(first) == GPOS) {
6792    r->intflags |= PREGf_ANCH_GPOS;
6793    first = NEXTOPER(first);
6794    goto again;
6795   }
6796   else if ((!sawopen || !RExC_sawback) &&
6797    (OP(first) == STAR &&
6798    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6799    !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
6800   {
6801    /* turn .* into ^.* with an implied $*=1 */
6802    const int type =
6803     (OP(NEXTOPER(first)) == REG_ANY)
6804      ? PREGf_ANCH_MBOL
6805      : PREGf_ANCH_SBOL;
6806    r->intflags |= (type | PREGf_IMPLICIT);
6807    first = NEXTOPER(first);
6808    goto again;
6809   }
6810   if (sawplus && !sawminmod && !sawlookahead
6811    && (!sawopen || !RExC_sawback)
6812    && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6813    /* x+ must match at the 1st pos of run of x's */
6814    r->intflags |= PREGf_SKIP;
6815
6816   /* Scan is after the zeroth branch, first is atomic matcher. */
6817 #ifdef TRIE_STUDY_OPT
6818   DEBUG_PARSE_r(
6819    if (!restudied)
6820     PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6821        (IV)(first - scan + 1))
6822   );
6823 #else
6824   DEBUG_PARSE_r(
6825    PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6826     (IV)(first - scan + 1))
6827   );
6828 #endif
6829
6830
6831   /*
6832   * If there's something expensive in the r.e., find the
6833   * longest literal string that must appear and make it the
6834   * regmust.  Resolve ties in favor of later strings, since
6835   * the regstart check works with the beginning of the r.e.
6836   * and avoiding duplication strengthens checking.  Not a
6837   * strong reason, but sufficient in the absence of others.
6838   * [Now we resolve ties in favor of the earlier string if
6839   * it happens that c_offset_min has been invalidated, since the
6840   * earlier string may buy us something the later one won't.]
6841   */
6842
6843   data.longest_fixed = newSVpvs("");
6844   data.longest_float = newSVpvs("");
6845   data.last_found = newSVpvs("");
6846   data.longest = &(data.longest_fixed);
6847   ENTER_with_name("study_chunk");
6848   SAVEFREESV(data.longest_fixed);
6849   SAVEFREESV(data.longest_float);
6850   SAVEFREESV(data.last_found);
6851   first = scan;
6852   if (!ri->regstclass) {
6853    ssc_init(pRExC_state, &ch_class);
6854    data.start_class = &ch_class;
6855    stclass_flag = SCF_DO_STCLASS_AND;
6856   } else    /* XXXX Check for BOUND? */
6857    stclass_flag = 0;
6858   data.last_closep = &last_close;
6859
6860   DEBUG_RExC_seen();
6861   minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6862        scan + RExC_size, /* Up to end */
6863    &data, -1, 0, NULL,
6864    SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6865       | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6866    0);
6867
6868
6869   CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6870
6871
6872   if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6873    && data.last_start_min == 0 && data.last_end > 0
6874    && !RExC_seen_zerolen
6875    && !(RExC_seen & REG_VERBARG_SEEN)
6876    && !(RExC_seen & REG_GPOS_SEEN)
6877   ){
6878    r->extflags |= RXf_CHECK_ALL;
6879   }
6880   scan_commit(pRExC_state, &data,&minlen,0);
6881
6882   longest_float_length = CHR_SVLEN(data.longest_float);
6883
6884   if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6885     && data.offset_fixed == data.offset_float_min
6886     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6887    && S_setup_longest (aTHX_ pRExC_state,
6888          data.longest_float,
6889          &(r->float_utf8),
6890          &(r->float_substr),
6891          &(r->float_end_shift),
6892          data.lookbehind_float,
6893          data.offset_float_min,
6894          data.minlen_float,
6895          longest_float_length,
6896          cBOOL(data.flags & SF_FL_BEFORE_EOL),
6897          cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6898   {
6899    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6900    r->float_max_offset = data.offset_float_max;
6901    if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6902     r->float_max_offset -= data.lookbehind_float;
6903    SvREFCNT_inc_simple_void_NN(data.longest_float);
6904   }
6905   else {
6906    r->float_substr = r->float_utf8 = NULL;
6907    longest_float_length = 0;
6908   }
6909
6910   longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6911
6912   if (S_setup_longest (aTHX_ pRExC_state,
6913         data.longest_fixed,
6914         &(r->anchored_utf8),
6915         &(r->anchored_substr),
6916         &(r->anchored_end_shift),
6917         data.lookbehind_fixed,
6918         data.offset_fixed,
6919         data.minlen_fixed,
6920         longest_fixed_length,
6921         cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6922         cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6923   {
6924    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6925    SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6926   }
6927   else {
6928    r->anchored_substr = r->anchored_utf8 = NULL;
6929    longest_fixed_length = 0;
6930   }
6931   LEAVE_with_name("study_chunk");
6932
6933   if (ri->regstclass
6934    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6935    ri->regstclass = NULL;
6936
6937   if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6938    && stclass_flag
6939    && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6940    && !ssc_is_anything(data.start_class))
6941   {
6942    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6943
6944    ssc_finalize(pRExC_state, data.start_class);
6945
6946    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6947    StructCopy(data.start_class,
6948      (regnode_ssc*)RExC_rxi->data->data[n],
6949      regnode_ssc);
6950    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6951    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6952    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6953      regprop(r, sv, (regnode*)data.start_class, NULL);
6954      PerlIO_printf(Perl_debug_log,
6955          "synthetic stclass \"%s\".\n",
6956          SvPVX_const(sv));});
6957    data.start_class = NULL;
6958   }
6959
6960   /* A temporary algorithm prefers floated substr to fixed one to dig
6961   * more info. */
6962   if (longest_fixed_length > longest_float_length) {
6963    r->substrs->check_ix = 0;
6964    r->check_end_shift = r->anchored_end_shift;
6965    r->check_substr = r->anchored_substr;
6966    r->check_utf8 = r->anchored_utf8;
6967    r->check_offset_min = r->check_offset_max = r->anchored_offset;
6968    if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
6969     r->intflags |= PREGf_NOSCAN;
6970   }
6971   else {
6972    r->substrs->check_ix = 1;
6973    r->check_end_shift = r->float_end_shift;
6974    r->check_substr = r->float_substr;
6975    r->check_utf8 = r->float_utf8;
6976    r->check_offset_min = r->float_min_offset;
6977    r->check_offset_max = r->float_max_offset;
6978   }
6979   if ((r->check_substr || r->check_utf8) ) {
6980    r->extflags |= RXf_USE_INTUIT;
6981    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6982     r->extflags |= RXf_INTUIT_TAIL;
6983   }
6984   r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
6985
6986   /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6987   if ( (STRLEN)minlen < longest_float_length )
6988    minlen= longest_float_length;
6989   if ( (STRLEN)minlen < longest_fixed_length )
6990    minlen= longest_fixed_length;
6991   */
6992  }
6993  else {
6994   /* Several toplevels. Best we can is to set minlen. */
6995   SSize_t fake;
6996   regnode_ssc ch_class;
6997   SSize_t last_close = 0;
6998
6999   DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7000
7001   scan = ri->program + 1;
7002   ssc_init(pRExC_state, &ch_class);
7003   data.start_class = &ch_class;
7004   data.last_closep = &last_close;
7005
7006   DEBUG_RExC_seen();
7007   minlen = study_chunk(pRExC_state,
7008    &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7009    SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7010              ? SCF_TRIE_DOING_RESTUDY
7011              : 0),
7012    0);
7013
7014   CHECK_RESTUDY_GOTO_butfirst(NOOP);
7015
7016   r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7017     = r->float_substr = r->float_utf8 = NULL;
7018
7019   if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7020    && ! ssc_is_anything(data.start_class))
7021   {
7022    const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7023
7024    ssc_finalize(pRExC_state, data.start_class);
7025
7026    Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7027    StructCopy(data.start_class,
7028      (regnode_ssc*)RExC_rxi->data->data[n],
7029      regnode_ssc);
7030    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7031    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7032    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7033      regprop(r, sv, (regnode*)data.start_class, NULL);
7034      PerlIO_printf(Perl_debug_log,
7035          "synthetic stclass \"%s\".\n",
7036          SvPVX_const(sv));});
7037    data.start_class = NULL;
7038   }
7039  }
7040
7041  if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7042   r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7043   r->maxlen = REG_INFTY;
7044  }
7045  else {
7046   r->maxlen = RExC_maxlen;
7047  }
7048
7049  /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7050  the "real" pattern. */
7051  DEBUG_OPTIMISE_r({
7052   PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
7053      (IV)minlen, (IV)r->minlen, RExC_maxlen);
7054  });
7055  r->minlenret = minlen;
7056  if (r->minlen < minlen)
7057   r->minlen = minlen;
7058
7059  if (RExC_seen & REG_GPOS_SEEN)
7060   r->intflags |= PREGf_GPOS_SEEN;
7061  if (RExC_seen & REG_LOOKBEHIND_SEEN)
7062   r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7063             lookbehind */
7064  if (pRExC_state->num_code_blocks)
7065   r->extflags |= RXf_EVAL_SEEN;
7066  if (RExC_seen & REG_CANY_SEEN)
7067   r->intflags |= PREGf_CANY_SEEN;
7068  if (RExC_seen & REG_VERBARG_SEEN)
7069  {
7070   r->intflags |= PREGf_VERBARG_SEEN;
7071   r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7072  }
7073  if (RExC_seen & REG_CUTGROUP_SEEN)
7074   r->intflags |= PREGf_CUTGROUP_SEEN;
7075  if (pm_flags & PMf_USE_RE_EVAL)
7076   r->intflags |= PREGf_USE_RE_EVAL;
7077  if (RExC_paren_names)
7078   RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7079  else
7080   RXp_PAREN_NAMES(r) = NULL;
7081
7082  /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7083  * so it can be used in pp.c */
7084  if (r->intflags & PREGf_ANCH)
7085   r->extflags |= RXf_IS_ANCHORED;
7086
7087
7088  {
7089   /* this is used to identify "special" patterns that might result
7090   * in Perl NOT calling the regex engine and instead doing the match "itself",
7091   * particularly special cases in split//. By having the regex compiler
7092   * do this pattern matching at a regop level (instead of by inspecting the pattern)
7093   * we avoid weird issues with equivalent patterns resulting in different behavior,
7094   * AND we allow non Perl engines to get the same optimizations by the setting the
7095   * flags appropriately - Yves */
7096   regnode *first = ri->program + 1;
7097   U8 fop = OP(first);
7098   regnode *next = NEXTOPER(first);
7099   U8 nop = OP(next);
7100
7101   if (PL_regkind[fop] == NOTHING && nop == END)
7102    r->extflags |= RXf_NULL;
7103   else if (PL_regkind[fop] == BOL && nop == END)
7104    r->extflags |= RXf_START_ONLY;
7105   else if (fop == PLUS
7106     && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7107     && OP(regnext(first)) == END)
7108    r->extflags |= RXf_WHITE;
7109   else if ( r->extflags & RXf_SPLIT
7110     && fop == EXACT
7111     && STR_LEN(first) == 1
7112     && *(STRING(first)) == ' '
7113     && OP(regnext(first)) == END )
7114    r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7115
7116  }
7117
7118  if (RExC_contains_locale) {
7119   RXp_EXTFLAGS(r) |= RXf_TAINTED;
7120  }
7121
7122 #ifdef DEBUGGING
7123  if (RExC_paren_names) {
7124   ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7125   ri->data->data[ri->name_list_idx]
7126         = (void*)SvREFCNT_inc(RExC_paren_name_list);
7127  } else
7128 #endif
7129   ri->name_list_idx = 0;
7130
7131  if (RExC_recurse_count) {
7132   for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7133    const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7134    ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7135   }
7136  }
7137  Newxz(r->offs, RExC_npar, regexp_paren_pair);
7138  /* assume we don't need to swap parens around before we match */
7139
7140  DEBUG_DUMP_r({
7141   DEBUG_RExC_seen();
7142   PerlIO_printf(Perl_debug_log,"Final program:\n");
7143   regdump(r);
7144  });
7145 #ifdef RE_TRACK_PATTERN_OFFSETS
7146  DEBUG_OFFSETS_r(if (ri->u.offsets) {
7147   const STRLEN len = ri->u.offsets[0];
7148   STRLEN i;
7149   GET_RE_DEBUG_FLAGS_DECL;
7150   PerlIO_printf(Perl_debug_log,
7151      "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7152   for (i = 1; i <= len; i++) {
7153    if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7154     PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7155     (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7156    }
7157   PerlIO_printf(Perl_debug_log, "\n");
7158  });
7159 #endif
7160
7161 #ifdef USE_ITHREADS
7162  /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7163  * by setting the regexp SV to readonly-only instead. If the
7164  * pattern's been recompiled, the USEDness should remain. */
7165  if (old_re && SvREADONLY(old_re))
7166   SvREADONLY_on(rx);
7167 #endif
7168  return rx;
7169 }
7170
7171
7172 SV*
7173 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7174      const U32 flags)
7175 {
7176  PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7177
7178  PERL_UNUSED_ARG(value);
7179
7180  if (flags & RXapif_FETCH) {
7181   return reg_named_buff_fetch(rx, key, flags);
7182  } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7183   Perl_croak_no_modify();
7184   return NULL;
7185  } else if (flags & RXapif_EXISTS) {
7186   return reg_named_buff_exists(rx, key, flags)
7187    ? &PL_sv_yes
7188    : &PL_sv_no;
7189  } else if (flags & RXapif_REGNAMES) {
7190   return reg_named_buff_all(rx, flags);
7191  } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7192   return reg_named_buff_scalar(rx, flags);
7193  } else {
7194   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7195   return NULL;
7196  }
7197 }
7198
7199 SV*
7200 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7201       const U32 flags)
7202 {
7203  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7204  PERL_UNUSED_ARG(lastkey);
7205
7206  if (flags & RXapif_FIRSTKEY)
7207   return reg_named_buff_firstkey(rx, flags);
7208  else if (flags & RXapif_NEXTKEY)
7209   return reg_named_buff_nextkey(rx, flags);
7210  else {
7211   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7212            (int)flags);
7213   return NULL;
7214  }
7215 }
7216
7217 SV*
7218 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7219       const U32 flags)
7220 {
7221  AV *retarray = NULL;
7222  SV *ret;
7223  struct regexp *const rx = ReANY(r);
7224
7225  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7226
7227  if (flags & RXapif_ALL)
7228   retarray=newAV();
7229
7230  if (rx && RXp_PAREN_NAMES(rx)) {
7231   HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7232   if (he_str) {
7233    IV i;
7234    SV* sv_dat=HeVAL(he_str);
7235    I32 *nums=(I32*)SvPVX(sv_dat);
7236    for ( i=0; i<SvIVX(sv_dat); i++ ) {
7237     if ((I32)(rx->nparens) >= nums[i]
7238      && rx->offs[nums[i]].start != -1
7239      && rx->offs[nums[i]].end != -1)
7240     {
7241      ret = newSVpvs("");
7242      CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7243      if (!retarray)
7244       return ret;
7245     } else {
7246      if (retarray)
7247       ret = newSVsv(&PL_sv_undef);
7248     }
7249     if (retarray)
7250      av_push(retarray, ret);
7251    }
7252    if (retarray)
7253     return newRV_noinc(MUTABLE_SV(retarray));
7254   }
7255  }
7256  return NULL;
7257 }
7258
7259 bool
7260 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7261       const U32 flags)
7262 {
7263  struct regexp *const rx = ReANY(r);
7264
7265  PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7266
7267  if (rx && RXp_PAREN_NAMES(rx)) {
7268   if (flags & RXapif_ALL) {
7269    return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7270   } else {
7271    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7272    if (sv) {
7273     SvREFCNT_dec_NN(sv);
7274     return TRUE;
7275    } else {
7276     return FALSE;
7277    }
7278   }
7279  } else {
7280   return FALSE;
7281  }
7282 }
7283
7284 SV*
7285 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7286 {
7287  struct regexp *const rx = ReANY(r);
7288
7289  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7290
7291  if ( rx && RXp_PAREN_NAMES(rx) ) {
7292   (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7293
7294   return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7295  } else {
7296   return FALSE;
7297  }
7298 }
7299
7300 SV*
7301 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7302 {
7303  struct regexp *const rx = ReANY(r);
7304  GET_RE_DEBUG_FLAGS_DECL;
7305
7306  PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7307
7308  if (rx && RXp_PAREN_NAMES(rx)) {
7309   HV *hv = RXp_PAREN_NAMES(rx);
7310   HE *temphe;
7311   while ( (temphe = hv_iternext_flags(hv,0)) ) {
7312    IV i;
7313    IV parno = 0;
7314    SV* sv_dat = HeVAL(temphe);
7315    I32 *nums = (I32*)SvPVX(sv_dat);
7316    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7317     if ((I32)(rx->lastparen) >= nums[i] &&
7318      rx->offs[nums[i]].start != -1 &&
7319      rx->offs[nums[i]].end != -1)
7320     {
7321      parno = nums[i];
7322      break;
7323     }
7324    }
7325    if (parno || flags & RXapif_ALL) {
7326     return newSVhek(HeKEY_hek(temphe));
7327    }
7328   }
7329  }
7330  return NULL;
7331 }
7332
7333 SV*
7334 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7335 {
7336  SV *ret;
7337  AV *av;
7338  SSize_t length;
7339  struct regexp *const rx = ReANY(r);
7340
7341  PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7342
7343  if (rx && RXp_PAREN_NAMES(rx)) {
7344   if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7345    return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7346   } else if (flags & RXapif_ONE) {
7347    ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7348    av = MUTABLE_AV(SvRV(ret));
7349    length = av_tindex(av);
7350    SvREFCNT_dec_NN(ret);
7351    return newSViv(length + 1);
7352   } else {
7353    Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7354             (int)flags);
7355    return NULL;
7356   }
7357  }
7358  return &PL_sv_undef;
7359 }
7360
7361 SV*
7362 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7363 {
7364  struct regexp *const rx = ReANY(r);
7365  AV *av = newAV();
7366
7367  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7368
7369  if (rx && RXp_PAREN_NAMES(rx)) {
7370   HV *hv= RXp_PAREN_NAMES(rx);
7371   HE *temphe;
7372   (void)hv_iterinit(hv);
7373   while ( (temphe = hv_iternext_flags(hv,0)) ) {
7374    IV i;
7375    IV parno = 0;
7376    SV* sv_dat = HeVAL(temphe);
7377    I32 *nums = (I32*)SvPVX(sv_dat);
7378    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7379     if ((I32)(rx->lastparen) >= nums[i] &&
7380      rx->offs[nums[i]].start != -1 &&
7381      rx->offs[nums[i]].end != -1)
7382     {
7383      parno = nums[i];
7384      break;
7385     }
7386    }
7387    if (parno || flags & RXapif_ALL) {
7388     av_push(av, newSVhek(HeKEY_hek(temphe)));
7389    }
7390   }
7391  }
7392
7393  return newRV_noinc(MUTABLE_SV(av));
7394 }
7395
7396 void
7397 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7398        SV * const sv)
7399 {
7400  struct regexp *const rx = ReANY(r);
7401  char *s = NULL;
7402  SSize_t i = 0;
7403  SSize_t s1, t1;
7404  I32 n = paren;
7405
7406  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7407
7408  if (      n == RX_BUFF_IDX_CARET_PREMATCH
7409   || n == RX_BUFF_IDX_CARET_FULLMATCH
7410   || n == RX_BUFF_IDX_CARET_POSTMATCH
7411  )
7412  {
7413   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7414   if (!keepcopy) {
7415    /* on something like
7416    *    $r = qr/.../;
7417    *    /$qr/p;
7418    * the KEEPCOPY is set on the PMOP rather than the regex */
7419    if (PL_curpm && r == PM_GETRE(PL_curpm))
7420     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7421   }
7422   if (!keepcopy)
7423    goto ret_undef;
7424  }
7425
7426  if (!rx->subbeg)
7427   goto ret_undef;
7428
7429  if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7430   /* no need to distinguish between them any more */
7431   n = RX_BUFF_IDX_FULLMATCH;
7432
7433  if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7434   && rx->offs[0].start != -1)
7435  {
7436   /* $`, ${^PREMATCH} */
7437   i = rx->offs[0].start;
7438   s = rx->subbeg;
7439  }
7440  else
7441  if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7442   && rx->offs[0].end != -1)
7443  {
7444   /* $', ${^POSTMATCH} */
7445   s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7446   i = rx->sublen + rx->suboffset - rx->offs[0].end;
7447  }
7448  else
7449  if ( 0 <= n && n <= (I32)rx->nparens &&
7450   (s1 = rx->offs[n].start) != -1 &&
7451   (t1 = rx->offs[n].end) != -1)
7452  {
7453   /* $&, ${^MATCH},  $1 ... */
7454   i = t1 - s1;
7455   s = rx->subbeg + s1 - rx->suboffset;
7456  } else {
7457   goto ret_undef;
7458  }
7459
7460  assert(s >= rx->subbeg);
7461  assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7462  if (i >= 0) {
7463 #ifdef NO_TAINT_SUPPORT
7464   sv_setpvn(sv, s, i);
7465 #else
7466   const int oldtainted = TAINT_get;
7467   TAINT_NOT;
7468   sv_setpvn(sv, s, i);
7469   TAINT_set(oldtainted);
7470 #endif
7471   if ( (rx->intflags & PREGf_CANY_SEEN)
7472    ? (RXp_MATCH_UTF8(rx)
7473       && (!i || is_utf8_string((U8*)s, i)))
7474    : (RXp_MATCH_UTF8(rx)) )
7475   {
7476    SvUTF8_on(sv);
7477   }
7478   else
7479    SvUTF8_off(sv);
7480   if (TAINTING_get) {
7481    if (RXp_MATCH_TAINTED(rx)) {
7482     if (SvTYPE(sv) >= SVt_PVMG) {
7483      MAGIC* const mg = SvMAGIC(sv);
7484      MAGIC* mgt;
7485      TAINT;
7486      SvMAGIC_set(sv, mg->mg_moremagic);
7487      SvTAINT(sv);
7488      if ((mgt = SvMAGIC(sv))) {
7489       mg->mg_moremagic = mgt;
7490       SvMAGIC_set(sv, mg);
7491      }
7492     } else {
7493      TAINT;
7494      SvTAINT(sv);
7495     }
7496    } else
7497     SvTAINTED_off(sv);
7498   }
7499  } else {
7500  ret_undef:
7501   sv_setsv(sv,&PL_sv_undef);
7502   return;
7503  }
7504 }
7505
7506 void
7507 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7508               SV const * const value)
7509 {
7510  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7511
7512  PERL_UNUSED_ARG(rx);
7513  PERL_UNUSED_ARG(paren);
7514  PERL_UNUSED_ARG(value);
7515
7516  if (!PL_localizing)
7517   Perl_croak_no_modify();
7518 }
7519
7520 I32
7521 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7522        const I32 paren)
7523 {
7524  struct regexp *const rx = ReANY(r);
7525  I32 i;
7526  I32 s1, t1;
7527
7528  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7529
7530  if (   paren == RX_BUFF_IDX_CARET_PREMATCH
7531   || paren == RX_BUFF_IDX_CARET_FULLMATCH
7532   || paren == RX_BUFF_IDX_CARET_POSTMATCH
7533  )
7534  {
7535   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7536   if (!keepcopy) {
7537    /* on something like
7538    *    $r = qr/.../;
7539    *    /$qr/p;
7540    * the KEEPCOPY is set on the PMOP rather than the regex */
7541    if (PL_curpm && r == PM_GETRE(PL_curpm))
7542     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7543   }
7544   if (!keepcopy)
7545    goto warn_undef;
7546  }
7547
7548  /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7549  switch (paren) {
7550  case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7551  case RX_BUFF_IDX_PREMATCH:       /* $` */
7552   if (rx->offs[0].start != -1) {
7553       i = rx->offs[0].start;
7554       if (i > 0) {
7555         s1 = 0;
7556         t1 = i;
7557         goto getlen;
7558       }
7559    }
7560   return 0;
7561
7562  case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7563  case RX_BUFF_IDX_POSTMATCH:       /* $' */
7564    if (rx->offs[0].end != -1) {
7565       i = rx->sublen - rx->offs[0].end;
7566       if (i > 0) {
7567         s1 = rx->offs[0].end;
7568         t1 = rx->sublen;
7569         goto getlen;
7570       }
7571    }
7572   return 0;
7573
7574  default: /* $& / ${^MATCH}, $1, $2, ... */
7575    if (paren <= (I32)rx->nparens &&
7576    (s1 = rx->offs[paren].start) != -1 &&
7577    (t1 = rx->offs[paren].end) != -1)
7578    {
7579    i = t1 - s1;
7580    goto getlen;
7581   } else {
7582   warn_undef:
7583    if (ckWARN(WARN_UNINITIALIZED))
7584     report_uninit((const SV *)sv);
7585    return 0;
7586   }
7587  }
7588   getlen:
7589  if (i > 0 && RXp_MATCH_UTF8(rx)) {
7590   const char * const s = rx->subbeg - rx->suboffset + s1;
7591   const U8 *ep;
7592   STRLEN el;
7593
7594   i = t1 - s1;
7595   if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7596       i = el;
7597  }
7598  return i;
7599 }
7600
7601 SV*
7602 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7603 {
7604  PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7605   PERL_UNUSED_ARG(rx);
7606   if (0)
7607    return NULL;
7608   else
7609    return newSVpvs("Regexp");
7610 }
7611
7612 /* Scans the name of a named buffer from the pattern.
7613  * If flags is REG_RSN_RETURN_NULL returns null.
7614  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7615  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7616  * to the parsed name as looked up in the RExC_paren_names hash.
7617  * If there is an error throws a vFAIL().. type exception.
7618  */
7619
7620 #define REG_RSN_RETURN_NULL    0
7621 #define REG_RSN_RETURN_NAME    1
7622 #define REG_RSN_RETURN_DATA    2
7623
7624 STATIC SV*
7625 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7626 {
7627  char *name_start = RExC_parse;
7628
7629  PERL_ARGS_ASSERT_REG_SCAN_NAME;
7630
7631  assert (RExC_parse <= RExC_end);
7632  if (RExC_parse == RExC_end) NOOP;
7633  else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7634   /* skip IDFIRST by using do...while */
7635   if (UTF)
7636    do {
7637     RExC_parse += UTF8SKIP(RExC_parse);
7638    } while (isWORDCHAR_utf8((U8*)RExC_parse));
7639   else
7640    do {
7641     RExC_parse++;
7642    } while (isWORDCHAR(*RExC_parse));
7643  } else {
7644   RExC_parse++; /* so the <- from the vFAIL is after the offending
7645       character */
7646   vFAIL("Group name must start with a non-digit word character");
7647  }
7648  if ( flags ) {
7649   SV* sv_name
7650    = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7651        SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7652   if ( flags == REG_RSN_RETURN_NAME)
7653    return sv_name;
7654   else if (flags==REG_RSN_RETURN_DATA) {
7655    HE *he_str = NULL;
7656    SV *sv_dat = NULL;
7657    if ( ! sv_name )      /* should not happen*/
7658     Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7659    if (RExC_paren_names)
7660     he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7661    if ( he_str )
7662     sv_dat = HeVAL(he_str);
7663    if ( ! sv_dat )
7664     vFAIL("Reference to nonexistent named group");
7665    return sv_dat;
7666   }
7667   else {
7668    Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7669      (unsigned long) flags);
7670   }
7671   assert(0); /* NOT REACHED */
7672  }
7673  return NULL;
7674 }
7675
7676 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7677  int rem=(int)(RExC_end - RExC_parse);                       \
7678  int cut;                                                    \
7679  int num;                                                    \
7680  int iscut=0;                                                \
7681  if (rem>10) {                                               \
7682   rem=10;                                                 \
7683   iscut=1;                                                \
7684  }                                                           \
7685  cut=10-rem;                                                 \
7686  if (RExC_lastparse!=RExC_parse)                             \
7687   PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
7688    rem, RExC_parse,                                    \
7689    cut + 4,                                            \
7690    iscut ? "..." : "<"                                 \
7691   );                                                      \
7692  else                                                        \
7693   PerlIO_printf(Perl_debug_log,"%16s","");                \
7694                 \
7695  if (SIZE_ONLY)                                              \
7696  num = RExC_size + 1;                                     \
7697  else                                                        \
7698  num=REG_NODE_NUM(RExC_emit);                             \
7699  if (RExC_lastnum!=num)                                      \
7700  PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7701  else                                                        \
7702  PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7703  PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7704   (int)((depth*2)), "",                                   \
7705   (funcname)                                              \
7706  );                                                          \
7707  RExC_lastnum=num;                                           \
7708  RExC_lastparse=RExC_parse;                                  \
7709 })
7710
7711
7712
7713 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7714  DEBUG_PARSE_MSG((funcname));                            \
7715  PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7716 })
7717 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7718  DEBUG_PARSE_MSG((funcname));                            \
7719  PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7720 })
7721
7722 /* This section of code defines the inversion list object and its methods.  The
7723  * interfaces are highly subject to change, so as much as possible is static to
7724  * this file.  An inversion list is here implemented as a malloc'd C UV array
7725  * as an SVt_INVLIST scalar.
7726  *
7727  * An inversion list for Unicode is an array of code points, sorted by ordinal
7728  * number.  The zeroth element is the first code point in the list.  The 1th
7729  * element is the first element beyond that not in the list.  In other words,
7730  * the first range is
7731  *  invlist[0]..(invlist[1]-1)
7732  * The other ranges follow.  Thus every element whose index is divisible by two
7733  * marks the beginning of a range that is in the list, and every element not
7734  * divisible by two marks the beginning of a range not in the list.  A single
7735  * element inversion list that contains the single code point N generally
7736  * consists of two elements
7737  *  invlist[0] == N
7738  *  invlist[1] == N+1
7739  * (The exception is when N is the highest representable value on the
7740  * machine, in which case the list containing just it would be a single
7741  * element, itself.  By extension, if the last range in the list extends to
7742  * infinity, then the first element of that range will be in the inversion list
7743  * at a position that is divisible by two, and is the final element in the
7744  * list.)
7745  * Taking the complement (inverting) an inversion list is quite simple, if the
7746  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7747  * This implementation reserves an element at the beginning of each inversion
7748  * list to always contain 0; there is an additional flag in the header which
7749  * indicates if the list begins at the 0, or is offset to begin at the next
7750  * element.
7751  *
7752  * More about inversion lists can be found in "Unicode Demystified"
7753  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7754  * More will be coming when functionality is added later.
7755  *
7756  * The inversion list data structure is currently implemented as an SV pointing
7757  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7758  * array of UV whose memory management is automatically handled by the existing
7759  * facilities for SV's.
7760  *
7761  * Some of the methods should always be private to the implementation, and some
7762  * should eventually be made public */
7763
7764 /* The header definitions are in F<inline_invlist.c> */
7765
7766 PERL_STATIC_INLINE UV*
7767 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7768 {
7769  /* Returns a pointer to the first element in the inversion list's array.
7770  * This is called upon initialization of an inversion list.  Where the
7771  * array begins depends on whether the list has the code point U+0000 in it
7772  * or not.  The other parameter tells it whether the code that follows this
7773  * call is about to put a 0 in the inversion list or not.  The first
7774  * element is either the element reserved for 0, if TRUE, or the element
7775  * after it, if FALSE */
7776
7777  bool* offset = get_invlist_offset_addr(invlist);
7778  UV* zero_addr = (UV *) SvPVX(invlist);
7779
7780  PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7781
7782  /* Must be empty */
7783  assert(! _invlist_len(invlist));
7784
7785  *zero_addr = 0;
7786
7787  /* 1^1 = 0; 1^0 = 1 */
7788  *offset = 1 ^ will_have_0;
7789  return zero_addr + *offset;
7790 }
7791
7792 PERL_STATIC_INLINE UV*
7793 S_invlist_array(pTHX_ SV* const invlist)
7794 {
7795  /* Returns the pointer to the inversion list's array.  Every time the
7796  * length changes, this needs to be called in case malloc or realloc moved
7797  * it */
7798
7799  PERL_ARGS_ASSERT_INVLIST_ARRAY;
7800
7801  /* Must not be empty.  If these fail, you probably didn't check for <len>
7802  * being non-zero before trying to get the array */
7803  assert(_invlist_len(invlist));
7804
7805  /* The very first element always contains zero, The array begins either
7806  * there, or if the inversion list is offset, at the element after it.
7807  * The offset header field determines which; it contains 0 or 1 to indicate
7808  * how much additionally to add */
7809  assert(0 == *(SvPVX(invlist)));
7810  return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7811 }
7812
7813 PERL_STATIC_INLINE void
7814 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7815 {
7816  /* Sets the current number of elements stored in the inversion list.
7817  * Updates SvCUR correspondingly */
7818
7819  PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7820
7821  assert(SvTYPE(invlist) == SVt_INVLIST);
7822
7823  SvCUR_set(invlist,
7824    (len == 0)
7825    ? 0
7826    : TO_INTERNAL_SIZE(len + offset));
7827  assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7828 }
7829
7830 PERL_STATIC_INLINE IV*
7831 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7832 {
7833  /* Return the address of the IV that is reserved to hold the cached index
7834  * */
7835
7836  PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7837
7838  assert(SvTYPE(invlist) == SVt_INVLIST);
7839
7840  return &(((XINVLIST*) SvANY(invlist))->prev_index);
7841 }
7842
7843 PERL_STATIC_INLINE IV
7844 S_invlist_previous_index(pTHX_ SV* const invlist)
7845 {
7846  /* Returns cached index of previous search */
7847
7848  PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7849
7850  return *get_invlist_previous_index_addr(invlist);
7851 }
7852
7853 PERL_STATIC_INLINE void
7854 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7855 {
7856  /* Caches <index> for later retrieval */
7857
7858  PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7859
7860  assert(index == 0 || index < (int) _invlist_len(invlist));
7861
7862  *get_invlist_previous_index_addr(invlist) = index;
7863 }
7864
7865 PERL_STATIC_INLINE UV
7866 S_invlist_max(pTHX_ SV* const invlist)
7867 {
7868  /* Returns the maximum number of elements storable in the inversion list's
7869  * array, without having to realloc() */
7870
7871  PERL_ARGS_ASSERT_INVLIST_MAX;
7872
7873  assert(SvTYPE(invlist) == SVt_INVLIST);
7874
7875  /* Assumes worst case, in which the 0 element is not counted in the
7876  * inversion list, so subtracts 1 for that */
7877  return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7878   ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7879   : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7880 }
7881
7882 #ifndef PERL_IN_XSUB_RE
7883 SV*
7884 Perl__new_invlist(pTHX_ IV initial_size)
7885 {
7886
7887  /* Return a pointer to a newly constructed inversion list, with enough
7888  * space to store 'initial_size' elements.  If that number is negative, a
7889  * system default is used instead */
7890
7891  SV* new_list;
7892
7893  if (initial_size < 0) {
7894   initial_size = 10;
7895  }
7896
7897  /* Allocate the initial space */
7898  new_list = newSV_type(SVt_INVLIST);
7899
7900  /* First 1 is in case the zero element isn't in the list; second 1 is for
7901  * trailing NUL */
7902  SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7903  invlist_set_len(new_list, 0, 0);
7904
7905  /* Force iterinit() to be used to get iteration to work */
7906  *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7907
7908  *get_invlist_previous_index_addr(new_list) = 0;
7909
7910  return new_list;
7911 }
7912
7913 SV*
7914 Perl__new_invlist_C_array(pTHX_ const UV* const list)
7915 {
7916  /* Return a pointer to a newly constructed inversion list, initialized to
7917  * point to <list>, which has to be in the exact correct inversion list
7918  * form, including internal fields.  Thus this is a dangerous routine that
7919  * should not be used in the wrong hands.  The passed in 'list' contains
7920  * several header fields at the beginning that are not part of the
7921  * inversion list body proper */
7922
7923  const STRLEN length = (STRLEN) list[0];
7924  const UV version_id =          list[1];
7925  const bool offset   =    cBOOL(list[2]);
7926 #define HEADER_LENGTH 3
7927  /* If any of the above changes in any way, you must change HEADER_LENGTH
7928  * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7929  *      perl -E 'say int(rand 2**31-1)'
7930  */
7931 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7932           data structure type, so that one being
7933           passed in can be validated to be an
7934           inversion list of the correct vintage.
7935          */
7936
7937  SV* invlist = newSV_type(SVt_INVLIST);
7938
7939  PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7940
7941  if (version_id != INVLIST_VERSION_ID) {
7942   Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7943  }
7944
7945  /* The generated array passed in includes header elements that aren't part
7946  * of the list proper, so start it just after them */
7947  SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7948
7949  SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7950        shouldn't touch it */
7951
7952  *(get_invlist_offset_addr(invlist)) = offset;
7953
7954  /* The 'length' passed to us is the physical number of elements in the
7955  * inversion list.  But if there is an offset the logical number is one
7956  * less than that */
7957  invlist_set_len(invlist, length  - offset, offset);
7958
7959  invlist_set_previous_index(invlist, 0);
7960
7961  /* Initialize the iteration pointer. */
7962  invlist_iterfinish(invlist);
7963
7964  SvREADONLY_on(invlist);
7965
7966  return invlist;
7967 }
7968 #endif /* ifndef PERL_IN_XSUB_RE */
7969
7970 STATIC void
7971 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7972 {
7973  /* Grow the maximum size of an inversion list */
7974
7975  PERL_ARGS_ASSERT_INVLIST_EXTEND;
7976
7977  assert(SvTYPE(invlist) == SVt_INVLIST);
7978
7979  /* Add one to account for the zero element at the beginning which may not
7980  * be counted by the calling parameters */
7981  SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7982 }
7983
7984 PERL_STATIC_INLINE void
7985 S_invlist_trim(pTHX_ SV* const invlist)
7986 {
7987  PERL_ARGS_ASSERT_INVLIST_TRIM;
7988
7989  assert(SvTYPE(invlist) == SVt_INVLIST);
7990
7991  /* Change the length of the inversion list to how many entries it currently
7992  * has */
7993  SvPV_shrink_to_cur((SV *) invlist);
7994 }
7995
7996 STATIC void
7997 S__append_range_to_invlist(pTHX_ SV* const invlist,
7998         const UV start, const UV end)
7999 {
8000    /* Subject to change or removal.  Append the range from 'start' to 'end' at
8001  * the end of the inversion list.  The range must be above any existing
8002  * ones. */
8003
8004  UV* array;
8005  UV max = invlist_max(invlist);
8006  UV len = _invlist_len(invlist);
8007  bool offset;
8008
8009  PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8010
8011  if (len == 0) { /* Empty lists must be initialized */
8012   offset = start != 0;
8013   array = _invlist_array_init(invlist, ! offset);
8014  }
8015  else {
8016   /* Here, the existing list is non-empty. The current max entry in the
8017   * list is generally the first value not in the set, except when the
8018   * set extends to the end of permissible values, in which case it is
8019   * the first entry in that final set, and so this call is an attempt to
8020   * append out-of-order */
8021
8022   UV final_element = len - 1;
8023   array = invlist_array(invlist);
8024   if (array[final_element] > start
8025    || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8026   {
8027    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8028      array[final_element], start,
8029      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8030   }
8031
8032   /* Here, it is a legal append.  If the new range begins with the first
8033   * value not in the set, it is extending the set, so the new first
8034   * value not in the set is one greater than the newly extended range.
8035   * */
8036   offset = *get_invlist_offset_addr(invlist);
8037   if (array[final_element] == start) {
8038    if (end != UV_MAX) {
8039     array[final_element] = end + 1;
8040    }
8041    else {
8042     /* But if the end is the maximum representable on the machine,
8043     * just let the range that this would extend to have no end */
8044     invlist_set_len(invlist, len - 1, offset);
8045    }
8046    return;
8047   }
8048  }
8049
8050  /* Here the new range doesn't extend any existing set.  Add it */
8051
8052  len += 2; /* Includes an element each for the start and end of range */
8053
8054  /* If wll overflow the existing space, extend, which may cause the array to
8055  * be moved */
8056  if (max < len) {
8057   invlist_extend(invlist, len);
8058
8059   /* Have to set len here to avoid assert failure in invlist_array() */
8060   invlist_set_len(invlist, len, offset);
8061
8062   array = invlist_array(invlist);
8063  }
8064  else {
8065   invlist_set_len(invlist, len, offset);
8066  }
8067
8068  /* The next item on the list starts the range, the one after that is
8069  * one past the new range.  */
8070  array[len - 2] = start;
8071  if (end != UV_MAX) {
8072   array[len - 1] = end + 1;
8073  }
8074  else {
8075   /* But if the end is the maximum representable on the machine, just let
8076   * the range have no end */
8077   invlist_set_len(invlist, len - 1, offset);
8078  }
8079 }
8080
8081 #ifndef PERL_IN_XSUB_RE
8082
8083 IV
8084 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
8085 {
8086  /* Searches the inversion list for the entry that contains the input code
8087  * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
8088  * return value is the index into the list's array of the range that
8089  * contains <cp> */
8090
8091  IV low = 0;
8092  IV mid;
8093  IV high = _invlist_len(invlist);
8094  const IV highest_element = high - 1;
8095  const UV* array;
8096
8097  PERL_ARGS_ASSERT__INVLIST_SEARCH;
8098
8099  /* If list is empty, return failure. */
8100  if (high == 0) {
8101   return -1;
8102  }
8103
8104  /* (We can't get the array unless we know the list is non-empty) */
8105  array = invlist_array(invlist);
8106
8107  mid = invlist_previous_index(invlist);
8108  assert(mid >=0 && mid <= highest_element);
8109
8110  /* <mid> contains the cache of the result of the previous call to this
8111  * function (0 the first time).  See if this call is for the same result,
8112  * or if it is for mid-1.  This is under the theory that calls to this
8113  * function will often be for related code points that are near each other.
8114  * And benchmarks show that caching gives better results.  We also test
8115  * here if the code point is within the bounds of the list.  These tests
8116  * replace others that would have had to be made anyway to make sure that
8117  * the array bounds were not exceeded, and these give us extra information
8118  * at the same time */
8119  if (cp >= array[mid]) {
8120   if (cp >= array[highest_element]) {
8121    return highest_element;
8122   }
8123
8124   /* Here, array[mid] <= cp < array[highest_element].  This means that
8125   * the final element is not the answer, so can exclude it; it also
8126   * means that <mid> is not the final element, so can refer to 'mid + 1'
8127   * safely */
8128   if (cp < array[mid + 1]) {
8129    return mid;
8130   }
8131   high--;
8132   low = mid + 1;
8133  }
8134  else { /* cp < aray[mid] */
8135   if (cp < array[0]) { /* Fail if outside the array */
8136    return -1;
8137   }
8138   high = mid;
8139   if (cp >= array[mid - 1]) {
8140    goto found_entry;
8141   }
8142  }
8143
8144  /* Binary search.  What we are looking for is <i> such that
8145  * array[i] <= cp < array[i+1]
8146  * The loop below converges on the i+1.  Note that there may not be an
8147  * (i+1)th element in the array, and things work nonetheless */
8148  while (low < high) {
8149   mid = (low + high) / 2;
8150   assert(mid <= highest_element);
8151   if (array[mid] <= cp) { /* cp >= array[mid] */
8152    low = mid + 1;
8153
8154    /* We could do this extra test to exit the loop early.
8155    if (cp < array[low]) {
8156     return mid;
8157    }
8158    */
8159   }
8160   else { /* cp < array[mid] */
8161    high = mid;
8162   }
8163  }
8164
8165   found_entry:
8166  high--;
8167  invlist_set_previous_index(invlist, high);
8168  return high;
8169 }
8170
8171 void
8172 Perl__invlist_populate_swatch(pTHX_ SV* const invlist,
8173          const UV start, const UV end, U8* swatch)
8174 {
8175  /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8176  * but is used when the swash has an inversion list.  This makes this much
8177  * faster, as it uses a binary search instead of a linear one.  This is
8178  * intimately tied to that function, and perhaps should be in utf8.c,
8179  * except it is intimately tied to inversion lists as well.  It assumes
8180  * that <swatch> is all 0's on input */
8181
8182  UV current = start;
8183  const IV len = _invlist_len(invlist);
8184  IV i;
8185  const UV * array;
8186
8187  PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8188
8189  if (len == 0) { /* Empty inversion list */
8190   return;
8191  }
8192
8193  array = invlist_array(invlist);
8194
8195  /* Find which element it is */
8196  i = _invlist_search(invlist, start);
8197
8198  /* We populate from <start> to <end> */
8199  while (current < end) {
8200   UV upper;
8201
8202   /* The inversion list gives the results for every possible code point
8203   * after the first one in the list.  Only those ranges whose index is
8204   * even are ones that the inversion list matches.  For the odd ones,
8205   * and if the initial code point is not in the list, we have to skip
8206   * forward to the next element */
8207   if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8208    i++;
8209    if (i >= len) { /* Finished if beyond the end of the array */
8210     return;
8211    }
8212    current = array[i];
8213    if (current >= end) {   /* Finished if beyond the end of what we
8214          are populating */
8215     if (LIKELY(end < UV_MAX)) {
8216      return;
8217     }
8218
8219     /* We get here when the upper bound is the maximum
8220     * representable on the machine, and we are looking for just
8221     * that code point.  Have to special case it */
8222     i = len;
8223     goto join_end_of_list;
8224    }
8225   }
8226   assert(current >= start);
8227
8228   /* The current range ends one below the next one, except don't go past
8229   * <end> */
8230   i++;
8231   upper = (i < len && array[i] < end) ? array[i] : end;
8232
8233   /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
8234   * for each code point in it */
8235   for (; current < upper; current++) {
8236    const STRLEN offset = (STRLEN)(current - start);
8237    swatch[offset >> 3] |= 1 << (offset & 7);
8238   }
8239
8240  join_end_of_list:
8241
8242   /* Quit if at the end of the list */
8243   if (i >= len) {
8244
8245    /* But first, have to deal with the highest possible code point on
8246    * the platform.  The previous code assumes that <end> is one
8247    * beyond where we want to populate, but that is impossible at the
8248    * platform's infinity, so have to handle it specially */
8249    if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8250    {
8251     const STRLEN offset = (STRLEN)(end - start);
8252     swatch[offset >> 3] |= 1 << (offset & 7);
8253    }
8254    return;
8255   }
8256
8257   /* Advance to the next range, which will be for code points not in the
8258   * inversion list */
8259   current = array[i];
8260  }
8261
8262  return;
8263 }
8264
8265 void
8266 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8267           const bool complement_b, SV** output)
8268 {
8269  /* Take the union of two inversion lists and point <output> to it.  *output
8270  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8271  * the reference count to that list will be decremented if not already a
8272  * temporary (mortal); otherwise *output will be made correspondingly
8273  * mortal.  The first list, <a>, may be NULL, in which case a copy of the
8274  * second list is returned.  If <complement_b> is TRUE, the union is taken
8275  * of the complement (inversion) of <b> instead of b itself.
8276  *
8277  * The basis for this comes from "Unicode Demystified" Chapter 13 by
8278  * Richard Gillam, published by Addison-Wesley, and explained at some
8279  * length there.  The preface says to incorporate its examples into your
8280  * code at your own risk.
8281  *
8282  * The algorithm is like a merge sort.
8283  *
8284  * XXX A potential performance improvement is to keep track as we go along
8285  * if only one of the inputs contributes to the result, meaning the other
8286  * is a subset of that one.  In that case, we can skip the final copy and
8287  * return the larger of the input lists, but then outside code might need
8288  * to keep track of whether to free the input list or not */
8289
8290  const UV* array_a;    /* a's array */
8291  const UV* array_b;
8292  UV len_a;     /* length of a's array */
8293  UV len_b;
8294
8295  SV* u;   /* the resulting union */
8296  UV* array_u;
8297  UV len_u;
8298
8299  UV i_a = 0;      /* current index into a's array */
8300  UV i_b = 0;
8301  UV i_u = 0;
8302
8303  /* running count, as explained in the algorithm source book; items are
8304  * stopped accumulating and are output when the count changes to/from 0.
8305  * The count is incremented when we start a range that's in the set, and
8306  * decremented when we start a range that's not in the set.  So its range
8307  * is 0 to 2.  Only when the count is zero is something not in the set.
8308  */
8309  UV count = 0;
8310
8311  PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8312  assert(a != b);
8313
8314  /* If either one is empty, the union is the other one */
8315  if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8316   bool make_temp = FALSE; /* Should we mortalize the result? */
8317
8318   if (*output == a) {
8319    if (a != NULL) {
8320     if (! (make_temp = cBOOL(SvTEMP(a)))) {
8321      SvREFCNT_dec_NN(a);
8322     }
8323    }
8324   }
8325   if (*output != b) {
8326    *output = invlist_clone(b);
8327    if (complement_b) {
8328     _invlist_invert(*output);
8329    }
8330   } /* else *output already = b; */
8331
8332   if (make_temp) {
8333    sv_2mortal(*output);
8334   }
8335   return;
8336  }
8337  else if ((len_b = _invlist_len(b)) == 0) {
8338   bool make_temp = FALSE;
8339   if (*output == b) {
8340    if (! (make_temp = cBOOL(SvTEMP(b)))) {
8341     SvREFCNT_dec_NN(b);
8342    }
8343   }
8344
8345   /* The complement of an empty list is a list that has everything in it,
8346   * so the union with <a> includes everything too */
8347   if (complement_b) {
8348    if (a == *output) {
8349     if (! (make_temp = cBOOL(SvTEMP(a)))) {
8350      SvREFCNT_dec_NN(a);
8351     }
8352    }
8353    *output = _new_invlist(1);
8354    _append_range_to_invlist(*output, 0, UV_MAX);
8355   }
8356   else if (*output != a) {
8357    *output = invlist_clone(a);
8358   }
8359   /* else *output already = a; */
8360
8361   if (make_temp) {
8362    sv_2mortal(*output);
8363   }
8364   return;
8365  }
8366
8367  /* Here both lists exist and are non-empty */
8368  array_a = invlist_array(a);
8369  array_b = invlist_array(b);
8370
8371  /* If are to take the union of 'a' with the complement of b, set it
8372  * up so are looking at b's complement. */
8373  if (complement_b) {
8374
8375   /* To complement, we invert: if the first element is 0, remove it.  To
8376   * do this, we just pretend the array starts one later */
8377   if (array_b[0] == 0) {
8378    array_b++;
8379    len_b--;
8380   }
8381   else {
8382
8383    /* But if the first element is not zero, we pretend the list starts
8384    * at the 0 that is always stored immediately before the array. */
8385    array_b--;
8386    len_b++;
8387   }
8388  }
8389
8390  /* Size the union for the worst case: that the sets are completely
8391  * disjoint */
8392  u = _new_invlist(len_a + len_b);
8393
8394  /* Will contain U+0000 if either component does */
8395  array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8396          || (len_b > 0 && array_b[0] == 0));
8397
8398  /* Go through each list item by item, stopping when exhausted one of
8399  * them */
8400  while (i_a < len_a && i_b < len_b) {
8401   UV cp;     /* The element to potentially add to the union's array */
8402   bool cp_in_set;   /* is it in the the input list's set or not */
8403
8404   /* We need to take one or the other of the two inputs for the union.
8405   * Since we are merging two sorted lists, we take the smaller of the
8406   * next items.  In case of a tie, we take the one that is in its set
8407   * first.  If we took one not in the set first, it would decrement the
8408   * count, possibly to 0 which would cause it to be output as ending the
8409   * range, and the next time through we would take the same number, and
8410   * output it again as beginning the next range.  By doing it the
8411   * opposite way, there is no possibility that the count will be
8412   * momentarily decremented to 0, and thus the two adjoining ranges will
8413   * be seamlessly merged.  (In a tie and both are in the set or both not
8414   * in the set, it doesn't matter which we take first.) */
8415   if (array_a[i_a] < array_b[i_b]
8416    || (array_a[i_a] == array_b[i_b]
8417     && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8418   {
8419    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8420    cp= array_a[i_a++];
8421   }
8422   else {
8423    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8424    cp = array_b[i_b++];
8425   }
8426
8427   /* Here, have chosen which of the two inputs to look at.  Only output
8428   * if the running count changes to/from 0, which marks the
8429   * beginning/end of a range in that's in the set */
8430   if (cp_in_set) {
8431    if (count == 0) {
8432     array_u[i_u++] = cp;
8433    }
8434    count++;
8435   }
8436   else {
8437    count--;
8438    if (count == 0) {
8439     array_u[i_u++] = cp;
8440    }
8441   }
8442  }
8443
8444  /* Here, we are finished going through at least one of the lists, which
8445  * means there is something remaining in at most one.  We check if the list
8446  * that hasn't been exhausted is positioned such that we are in the middle
8447  * of a range in its set or not.  (i_a and i_b point to the element beyond
8448  * the one we care about.) If in the set, we decrement 'count'; if 0, there
8449  * is potentially more to output.
8450  * There are four cases:
8451  * 1) Both weren't in their sets, count is 0, and remains 0.  What's left
8452  *    in the union is entirely from the non-exhausted set.
8453  * 2) Both were in their sets, count is 2.  Nothing further should
8454  *    be output, as everything that remains will be in the exhausted
8455  *    list's set, hence in the union; decrementing to 1 but not 0 insures
8456  *    that
8457  * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8458  *    Nothing further should be output because the union includes
8459  *    everything from the exhausted set.  Not decrementing ensures that.
8460  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8461  *    decrementing to 0 insures that we look at the remainder of the
8462  *    non-exhausted set */
8463  if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8464   || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8465  {
8466   count--;
8467  }
8468
8469  /* The final length is what we've output so far, plus what else is about to
8470  * be output.  (If 'count' is non-zero, then the input list we exhausted
8471  * has everything remaining up to the machine's limit in its set, and hence
8472  * in the union, so there will be no further output. */
8473  len_u = i_u;
8474  if (count == 0) {
8475   /* At most one of the subexpressions will be non-zero */
8476   len_u += (len_a - i_a) + (len_b - i_b);
8477  }
8478
8479  /* Set result to final length, which can change the pointer to array_u, so
8480  * re-find it */
8481  if (len_u != _invlist_len(u)) {
8482   invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8483   invlist_trim(u);
8484   array_u = invlist_array(u);
8485  }
8486
8487  /* When 'count' is 0, the list that was exhausted (if one was shorter than
8488  * the other) ended with everything above it not in its set.  That means
8489  * that the remaining part of the union is precisely the same as the
8490  * non-exhausted list, so can just copy it unchanged.  (If both list were
8491  * exhausted at the same time, then the operations below will be both 0.)
8492  */
8493  if (count == 0) {
8494   IV copy_count; /* At most one will have a non-zero copy count */
8495   if ((copy_count = len_a - i_a) > 0) {
8496    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8497   }
8498   else if ((copy_count = len_b - i_b) > 0) {
8499    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8500   }
8501  }
8502
8503  /*  We may be removing a reference to one of the inputs.  If so, the output
8504  *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8505  *  count decremented) */
8506  if (a == *output || b == *output) {
8507   assert(! invlist_is_iterating(*output));
8508   if ((SvTEMP(*output))) {
8509    sv_2mortal(u);
8510   }
8511   else {
8512    SvREFCNT_dec_NN(*output);
8513   }
8514  }
8515
8516  *output = u;
8517
8518  return;
8519 }
8520
8521 void
8522 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8523            const bool complement_b, SV** i)
8524 {
8525  /* Take the intersection of two inversion lists and point <i> to it.  *i
8526  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8527  * the reference count to that list will be decremented if not already a
8528  * temporary (mortal); otherwise *i will be made correspondingly mortal.
8529  * The first list, <a>, may be NULL, in which case an empty list is
8530  * returned.  If <complement_b> is TRUE, the result will be the
8531  * intersection of <a> and the complement (or inversion) of <b> instead of
8532  * <b> directly.
8533  *
8534  * The basis for this comes from "Unicode Demystified" Chapter 13 by
8535  * Richard Gillam, published by Addison-Wesley, and explained at some
8536  * length there.  The preface says to incorporate its examples into your
8537  * code at your own risk.  In fact, it had bugs
8538  *
8539  * The algorithm is like a merge sort, and is essentially the same as the
8540  * union above
8541  */
8542
8543  const UV* array_a;  /* a's array */
8544  const UV* array_b;
8545  UV len_a; /* length of a's array */
8546  UV len_b;
8547
8548  SV* r;       /* the resulting intersection */
8549  UV* array_r;
8550  UV len_r;
8551
8552  UV i_a = 0;      /* current index into a's array */
8553  UV i_b = 0;
8554  UV i_r = 0;
8555
8556  /* running count, as explained in the algorithm source book; items are
8557  * stopped accumulating and are output when the count changes to/from 2.
8558  * The count is incremented when we start a range that's in the set, and
8559  * decremented when we start a range that's not in the set.  So its range
8560  * is 0 to 2.  Only when the count is 2 is something in the intersection.
8561  */
8562  UV count = 0;
8563
8564  PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8565  assert(a != b);
8566
8567  /* Special case if either one is empty */
8568  len_a = (a == NULL) ? 0 : _invlist_len(a);
8569  if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8570   bool make_temp = FALSE;
8571
8572   if (len_a != 0 && complement_b) {
8573
8574    /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8575    * be empty.  Here, also we are using 'b's complement, which hence
8576    * must be every possible code point.  Thus the intersection is
8577    * simply 'a'. */
8578    if (*i != a) {
8579     if (*i == b) {
8580      if (! (make_temp = cBOOL(SvTEMP(b)))) {
8581       SvREFCNT_dec_NN(b);
8582      }
8583     }
8584
8585     *i = invlist_clone(a);
8586    }
8587    /* else *i is already 'a' */
8588
8589    if (make_temp) {
8590     sv_2mortal(*i);
8591    }
8592    return;
8593   }
8594
8595   /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
8596   * intersection must be empty */
8597   if (*i == a) {
8598    if (! (make_temp = cBOOL(SvTEMP(a)))) {
8599     SvREFCNT_dec_NN(a);
8600    }
8601   }
8602   else if (*i == b) {
8603    if (! (make_temp = cBOOL(SvTEMP(b)))) {
8604     SvREFCNT_dec_NN(b);
8605    }
8606   }
8607   *i = _new_invlist(0);
8608   if (make_temp) {
8609    sv_2mortal(*i);
8610   }
8611
8612   return;
8613  }
8614
8615  /* Here both lists exist and are non-empty */
8616  array_a = invlist_array(a);
8617  array_b = invlist_array(b);
8618
8619  /* If are to take the intersection of 'a' with the complement of b, set it
8620  * up so are looking at b's complement. */
8621  if (complement_b) {
8622
8623   /* To complement, we invert: if the first element is 0, remove it.  To
8624   * do this, we just pretend the array starts one later */
8625   if (array_b[0] == 0) {
8626    array_b++;
8627    len_b--;
8628   }
8629   else {
8630
8631    /* But if the first element is not zero, we pretend the list starts
8632    * at the 0 that is always stored immediately before the array. */
8633    array_b--;
8634    len_b++;
8635   }
8636  }
8637
8638  /* Size the intersection for the worst case: that the intersection ends up
8639  * fragmenting everything to be completely disjoint */
8640  r= _new_invlist(len_a + len_b);
8641
8642  /* Will contain U+0000 iff both components do */
8643  array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8644          && len_b > 0 && array_b[0] == 0);
8645
8646  /* Go through each list item by item, stopping when exhausted one of
8647  * them */
8648  while (i_a < len_a && i_b < len_b) {
8649   UV cp;     /* The element to potentially add to the intersection's
8650      array */
8651   bool cp_in_set; /* Is it in the input list's set or not */
8652
8653   /* We need to take one or the other of the two inputs for the
8654   * intersection.  Since we are merging two sorted lists, we take the
8655   * smaller of the next items.  In case of a tie, we take the one that
8656   * is not in its set first (a difference from the union algorithm).  If
8657   * we took one in the set first, it would increment the count, possibly
8658   * to 2 which would cause it to be output as starting a range in the
8659   * intersection, and the next time through we would take that same
8660   * number, and output it again as ending the set.  By doing it the
8661   * opposite of this, there is no possibility that the count will be
8662   * momentarily incremented to 2.  (In a tie and both are in the set or
8663   * both not in the set, it doesn't matter which we take first.) */
8664   if (array_a[i_a] < array_b[i_b]
8665    || (array_a[i_a] == array_b[i_b]
8666     && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8667   {
8668    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8669    cp= array_a[i_a++];
8670   }
8671   else {
8672    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8673    cp= array_b[i_b++];
8674   }
8675
8676   /* Here, have chosen which of the two inputs to look at.  Only output
8677   * if the running count changes to/from 2, which marks the
8678   * beginning/end of a range that's in the intersection */
8679   if (cp_in_set) {
8680    count++;
8681    if (count == 2) {
8682     array_r[i_r++] = cp;
8683    }
8684   }
8685   else {
8686    if (count == 2) {
8687     array_r[i_r++] = cp;
8688    }
8689    count--;
8690   }
8691  }
8692
8693  /* Here, we are finished going through at least one of the lists, which
8694  * means there is something remaining in at most one.  We check if the list
8695  * that has been exhausted is positioned such that we are in the middle
8696  * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
8697  * the ones we care about.)  There are four cases:
8698  * 1) Both weren't in their sets, count is 0, and remains 0.  There's
8699  *    nothing left in the intersection.
8700  * 2) Both were in their sets, count is 2 and perhaps is incremented to
8701  *    above 2.  What should be output is exactly that which is in the
8702  *    non-exhausted set, as everything it has is also in the intersection
8703  *    set, and everything it doesn't have can't be in the intersection
8704  * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8705  *    gets incremented to 2.  Like the previous case, the intersection is
8706  *    everything that remains in the non-exhausted set.
8707  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8708  *    remains 1.  And the intersection has nothing more. */
8709  if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8710   || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8711  {
8712   count++;
8713  }
8714
8715  /* The final length is what we've output so far plus what else is in the
8716  * intersection.  At most one of the subexpressions below will be non-zero
8717  * */
8718  len_r = i_r;
8719  if (count >= 2) {
8720   len_r += (len_a - i_a) + (len_b - i_b);
8721  }
8722
8723  /* Set result to final length, which can change the pointer to array_r, so
8724  * re-find it */
8725  if (len_r != _invlist_len(r)) {
8726   invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8727   invlist_trim(r);
8728   array_r = invlist_array(r);
8729  }
8730
8731  /* Finish outputting any remaining */
8732  if (count >= 2) { /* At most one will have a non-zero copy count */
8733   IV copy_count;
8734   if ((copy_count = len_a - i_a) > 0) {
8735    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8736   }
8737   else if ((copy_count = len_b - i_b) > 0) {
8738    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8739   }
8740  }
8741
8742  /*  We may be removing a reference to one of the inputs.  If so, the output
8743  *  is made mortal if the input was.  (Mortal SVs shouldn't have their ref
8744  *  count decremented) */
8745  if (a == *i || b == *i) {
8746   assert(! invlist_is_iterating(*i));
8747   if (SvTEMP(*i)) {
8748    sv_2mortal(r);
8749   }
8750   else {
8751    SvREFCNT_dec_NN(*i);
8752   }
8753  }
8754
8755  *i = r;
8756
8757  return;
8758 }
8759
8760 SV*
8761 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8762 {
8763  /* Add the range from 'start' to 'end' inclusive to the inversion list's
8764  * set.  A pointer to the inversion list is returned.  This may actually be
8765  * a new list, in which case the passed in one has been destroyed.  The
8766  * passed in inversion list can be NULL, in which case a new one is created
8767  * with just the one range in it */
8768
8769  SV* range_invlist;
8770  UV len;
8771
8772  if (invlist == NULL) {
8773   invlist = _new_invlist(2);
8774   len = 0;
8775  }
8776  else {
8777   len = _invlist_len(invlist);
8778  }
8779
8780  /* If comes after the final entry actually in the list, can just append it
8781  * to the end, */
8782  if (len == 0
8783   || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8784    && start >= invlist_array(invlist)[len - 1]))
8785  {
8786   _append_range_to_invlist(invlist, start, end);
8787   return invlist;
8788  }
8789
8790  /* Here, can't just append things, create and return a new inversion list
8791  * which is the union of this range and the existing inversion list */
8792  range_invlist = _new_invlist(2);
8793  _append_range_to_invlist(range_invlist, start, end);
8794
8795  _invlist_union(invlist, range_invlist, &invlist);
8796
8797  /* The temporary can be freed */
8798  SvREFCNT_dec_NN(range_invlist);
8799
8800  return invlist;
8801 }
8802
8803 SV*
8804 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8805         UV** other_elements_ptr)
8806 {
8807  /* Create and return an inversion list whose contents are to be populated
8808  * by the caller.  The caller gives the number of elements (in 'size') and
8809  * the very first element ('element0').  This function will set
8810  * '*other_elements_ptr' to an array of UVs, where the remaining elements
8811  * are to be placed.
8812  *
8813  * Obviously there is some trust involved that the caller will properly
8814  * fill in the other elements of the array.
8815  *
8816  * (The first element needs to be passed in, as the underlying code does
8817  * things differently depending on whether it is zero or non-zero) */
8818
8819  SV* invlist = _new_invlist(size);
8820  bool offset;
8821
8822  PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8823
8824  _append_range_to_invlist(invlist, element0, element0);
8825  offset = *get_invlist_offset_addr(invlist);
8826
8827  invlist_set_len(invlist, size, offset);
8828  *other_elements_ptr = invlist_array(invlist) + 1;
8829  return invlist;
8830 }
8831
8832 #endif
8833
8834 PERL_STATIC_INLINE SV*
8835 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8836  return _add_range_to_invlist(invlist, cp, cp);
8837 }
8838
8839 #ifndef PERL_IN_XSUB_RE
8840 void
8841 Perl__invlist_invert(pTHX_ SV* const invlist)
8842 {
8843  /* Complement the input inversion list.  This adds a 0 if the list didn't
8844  * have a zero; removes it otherwise.  As described above, the data
8845  * structure is set up so that this is very efficient */
8846
8847  PERL_ARGS_ASSERT__INVLIST_INVERT;
8848
8849  assert(! invlist_is_iterating(invlist));
8850
8851  /* The inverse of matching nothing is matching everything */
8852  if (_invlist_len(invlist) == 0) {
8853   _append_range_to_invlist(invlist, 0, UV_MAX);
8854   return;
8855  }
8856
8857  *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8858 }
8859
8860 #endif
8861
8862 PERL_STATIC_INLINE SV*
8863 S_invlist_clone(pTHX_ SV* const invlist)
8864 {
8865
8866  /* Return a new inversion list that is a copy of the input one, which is
8867  * unchanged.  The new list will not be mortal even if the old one was. */
8868
8869  /* Need to allocate extra space to accommodate Perl's addition of a
8870  * trailing NUL to SvPV's, since it thinks they are always strings */
8871  SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8872  STRLEN physical_length = SvCUR(invlist);
8873  bool offset = *(get_invlist_offset_addr(invlist));
8874
8875  PERL_ARGS_ASSERT_INVLIST_CLONE;
8876
8877  *(get_invlist_offset_addr(new_invlist)) = offset;
8878  invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8879  Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8880
8881  return new_invlist;
8882 }
8883
8884 PERL_STATIC_INLINE STRLEN*
8885 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8886 {
8887  /* Return the address of the UV that contains the current iteration
8888  * position */
8889
8890  PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8891
8892  assert(SvTYPE(invlist) == SVt_INVLIST);
8893
8894  return &(((XINVLIST*) SvANY(invlist))->iterator);
8895 }
8896
8897 PERL_STATIC_INLINE void
8898 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8899 {
8900  PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8901
8902  *get_invlist_iter_addr(invlist) = 0;
8903 }
8904
8905 PERL_STATIC_INLINE void
8906 S_invlist_iterfinish(pTHX_ SV* invlist)
8907 {
8908  /* Terminate iterator for invlist.  This is to catch development errors.
8909  * Any iteration that is interrupted before completed should call this
8910  * function.  Functions that add code points anywhere else but to the end
8911  * of an inversion list assert that they are not in the middle of an
8912  * iteration.  If they were, the addition would make the iteration
8913  * problematical: if the iteration hadn't reached the place where things
8914  * were being added, it would be ok */
8915
8916  PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8917
8918  *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8919 }
8920
8921 STATIC bool
8922 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8923 {
8924  /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8925  * This call sets in <*start> and <*end>, the next range in <invlist>.
8926  * Returns <TRUE> if successful and the next call will return the next
8927  * range; <FALSE> if was already at the end of the list.  If the latter,
8928  * <*start> and <*end> are unchanged, and the next call to this function
8929  * will start over at the beginning of the list */
8930
8931  STRLEN* pos = get_invlist_iter_addr(invlist);
8932  UV len = _invlist_len(invlist);
8933  UV *array;
8934
8935  PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8936
8937  if (*pos >= len) {
8938   *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8939   return FALSE;
8940  }
8941
8942  array = invlist_array(invlist);
8943
8944  *start = array[(*pos)++];
8945
8946  if (*pos >= len) {
8947   *end = UV_MAX;
8948  }
8949  else {
8950   *end = array[(*pos)++] - 1;
8951  }
8952
8953  return TRUE;
8954 }
8955
8956 PERL_STATIC_INLINE bool
8957 S_invlist_is_iterating(pTHX_ SV* const invlist)
8958 {
8959  PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8960
8961  return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8962 }
8963
8964 PERL_STATIC_INLINE UV
8965 S_invlist_highest(pTHX_ SV* const invlist)
8966 {
8967  /* Returns the highest code point that matches an inversion list.  This API
8968  * has an ambiguity, as it returns 0 under either the highest is actually
8969  * 0, or if the list is empty.  If this distinction matters to you, check
8970  * for emptiness before calling this function */
8971
8972  UV len = _invlist_len(invlist);
8973  UV *array;
8974
8975  PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8976
8977  if (len == 0) {
8978   return 0;
8979  }
8980
8981  array = invlist_array(invlist);
8982
8983  /* The last element in the array in the inversion list always starts a
8984  * range that goes to infinity.  That range may be for code points that are
8985  * matched in the inversion list, or it may be for ones that aren't
8986  * matched.  In the latter case, the highest code point in the set is one
8987  * less than the beginning of this range; otherwise it is the final element
8988  * of this range: infinity */
8989  return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8990   ? UV_MAX
8991   : array[len - 1] - 1;
8992 }
8993
8994 #ifndef PERL_IN_XSUB_RE
8995 SV *
8996 Perl__invlist_contents(pTHX_ SV* const invlist)
8997 {
8998  /* Get the contents of an inversion list into a string SV so that they can
8999  * be printed out.  It uses the format traditionally done for debug tracing
9000  */
9001
9002  UV start, end;
9003  SV* output = newSVpvs("\n");
9004
9005  PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9006
9007  assert(! invlist_is_iterating(invlist));
9008
9009  invlist_iterinit(invlist);
9010  while (invlist_iternext(invlist, &start, &end)) {
9011   if (end == UV_MAX) {
9012    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9013   }
9014   else if (end != start) {
9015    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9016      start,       end);
9017   }
9018   else {
9019    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9020   }
9021  }
9022
9023  return output;
9024 }
9025 #endif
9026
9027 #ifndef PERL_IN_XSUB_RE
9028 void
9029 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9030       const char * const indent, SV* const invlist)
9031 {
9032  /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
9033  * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
9034  * the string 'indent'.  The output looks like this:
9035   [0] 0x000A .. 0x000D
9036   [2] 0x0085
9037   [4] 0x2028 .. 0x2029
9038   [6] 0x3104 .. INFINITY
9039  * This means that the first range of code points matched by the list are
9040  * 0xA through 0xD; the second range contains only the single code point
9041  * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
9042  * are used to define each range (except if the final range extends to
9043  * infinity, only a single element is needed).  The array index of the
9044  * first element for the corresponding range is given in brackets. */
9045
9046  UV start, end;
9047  STRLEN count = 0;
9048
9049  PERL_ARGS_ASSERT__INVLIST_DUMP;
9050
9051  if (invlist_is_iterating(invlist)) {
9052   Perl_dump_indent(aTHX_ level, file,
9053    "%sCan't dump inversion list because is in middle of iterating\n",
9054    indent);
9055   return;
9056  }
9057
9058  invlist_iterinit(invlist);
9059  while (invlist_iternext(invlist, &start, &end)) {
9060   if (end == UV_MAX) {
9061    Perl_dump_indent(aTHX_ level, file,
9062          "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9063         indent, (UV)count, start);
9064   }
9065   else if (end != start) {
9066    Perl_dump_indent(aTHX_ level, file,
9067          "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9068         indent, (UV)count, start,         end);
9069   }
9070   else {
9071    Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9072            indent, (UV)count, start);
9073   }
9074   count += 2;
9075  }
9076 }
9077 #endif
9078
9079 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9080 bool
9081 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9082 {
9083  /* Return a boolean as to if the two passed in inversion lists are
9084  * identical.  The final argument, if TRUE, says to take the complement of
9085  * the second inversion list before doing the comparison */
9086
9087  const UV* array_a = invlist_array(a);
9088  const UV* array_b = invlist_array(b);
9089  UV len_a = _invlist_len(a);
9090  UV len_b = _invlist_len(b);
9091
9092  UV i = 0;      /* current index into the arrays */
9093  bool retval = TRUE;     /* Assume are identical until proven otherwise */
9094
9095  PERL_ARGS_ASSERT__INVLISTEQ;
9096
9097  /* If are to compare 'a' with the complement of b, set it
9098  * up so are looking at b's complement. */
9099  if (complement_b) {
9100
9101   /* The complement of nothing is everything, so <a> would have to have
9102   * just one element, starting at zero (ending at infinity) */
9103   if (len_b == 0) {
9104    return (len_a == 1 && array_a[0] == 0);
9105   }
9106   else if (array_b[0] == 0) {
9107
9108    /* Otherwise, to complement, we invert.  Here, the first element is
9109    * 0, just remove it.  To do this, we just pretend the array starts
9110    * one later */
9111
9112    array_b++;
9113    len_b--;
9114   }
9115   else {
9116
9117    /* But if the first element is not zero, we pretend the list starts
9118    * at the 0 that is always stored immediately before the array. */
9119    array_b--;
9120    len_b++;
9121   }
9122  }
9123
9124  /* Make sure that the lengths are the same, as well as the final element
9125  * before looping through the remainder.  (Thus we test the length, final,
9126  * and first elements right off the bat) */
9127  if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9128   retval = FALSE;
9129  }
9130  else for (i = 0; i < len_a - 1; i++) {
9131   if (array_a[i] != array_b[i]) {
9132    retval = FALSE;
9133    break;
9134   }
9135  }
9136
9137  return retval;
9138 }
9139 #endif
9140
9141 #undef HEADER_LENGTH
9142 #undef TO_INTERNAL_SIZE
9143 #undef FROM_INTERNAL_SIZE
9144 #undef INVLIST_VERSION_ID
9145
9146 /* End of inversion list object */
9147
9148 STATIC void
9149 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9150 {
9151  /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9152  * constructs, and updates RExC_flags with them.  On input, RExC_parse
9153  * should point to the first flag; it is updated on output to point to the
9154  * final ')' or ':'.  There needs to be at least one flag, or this will
9155  * abort */
9156
9157  /* for (?g), (?gc), and (?o) warnings; warning
9158  about (?c) will warn about (?g) -- japhy    */
9159
9160 #define WASTED_O  0x01
9161 #define WASTED_G  0x02
9162 #define WASTED_C  0x04
9163 #define WASTED_GC (WASTED_G|WASTED_C)
9164  I32 wastedflags = 0x00;
9165  U32 posflags = 0, negflags = 0;
9166  U32 *flagsp = &posflags;
9167  char has_charset_modifier = '\0';
9168  regex_charset cs;
9169  bool has_use_defaults = FALSE;
9170  const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9171
9172  PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9173
9174  /* '^' as an initial flag sets certain defaults */
9175  if (UCHARAT(RExC_parse) == '^') {
9176   RExC_parse++;
9177   has_use_defaults = TRUE;
9178   STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9179   set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9180           ? REGEX_UNICODE_CHARSET
9181           : REGEX_DEPENDS_CHARSET);
9182  }
9183
9184  cs = get_regex_charset(RExC_flags);
9185  if (cs == REGEX_DEPENDS_CHARSET
9186   && (RExC_utf8 || RExC_uni_semantics))
9187  {
9188   cs = REGEX_UNICODE_CHARSET;
9189  }
9190
9191  while (*RExC_parse) {
9192   /* && strchr("iogcmsx", *RExC_parse) */
9193   /* (?g), (?gc) and (?o) are useless here
9194   and must be globally applied -- japhy */
9195   switch (*RExC_parse) {
9196
9197    /* Code for the imsx flags */
9198    CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9199
9200    case LOCALE_PAT_MOD:
9201     if (has_charset_modifier) {
9202      goto excess_modifier;
9203     }
9204     else if (flagsp == &negflags) {
9205      goto neg_modifier;
9206     }
9207     cs = REGEX_LOCALE_CHARSET;
9208     has_charset_modifier = LOCALE_PAT_MOD;
9209     break;
9210    case UNICODE_PAT_MOD:
9211     if (has_charset_modifier) {
9212      goto excess_modifier;
9213     }
9214     else if (flagsp == &negflags) {
9215      goto neg_modifier;
9216     }
9217     cs = REGEX_UNICODE_CHARSET;
9218     has_charset_modifier = UNICODE_PAT_MOD;
9219     break;
9220    case ASCII_RESTRICT_PAT_MOD:
9221     if (flagsp == &negflags) {
9222      goto neg_modifier;
9223     }
9224     if (has_charset_modifier) {
9225      if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9226       goto excess_modifier;
9227      }
9228      /* Doubled modifier implies more restricted */
9229      cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9230     }
9231     else {
9232      cs = REGEX_ASCII_RESTRICTED_CHARSET;
9233     }
9234     has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9235     break;
9236    case DEPENDS_PAT_MOD:
9237     if (has_use_defaults) {
9238      goto fail_modifiers;
9239     }
9240     else if (flagsp == &negflags) {
9241      goto neg_modifier;
9242     }
9243     else if (has_charset_modifier) {
9244      goto excess_modifier;
9245     }
9246
9247     /* The dual charset means unicode semantics if the
9248     * pattern (or target, not known until runtime) are
9249     * utf8, or something in the pattern indicates unicode
9250     * semantics */
9251     cs = (RExC_utf8 || RExC_uni_semantics)
9252      ? REGEX_UNICODE_CHARSET
9253      : REGEX_DEPENDS_CHARSET;
9254     has_charset_modifier = DEPENDS_PAT_MOD;
9255     break;
9256    excess_modifier:
9257     RExC_parse++;
9258     if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9259      vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9260     }
9261     else if (has_charset_modifier == *(RExC_parse - 1)) {
9262      vFAIL2("Regexp modifier \"%c\" may not appear twice",
9263           *(RExC_parse - 1));
9264     }
9265     else {
9266      vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9267     }
9268     /*NOTREACHED*/
9269    neg_modifier:
9270     RExC_parse++;
9271     vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9272          *(RExC_parse - 1));
9273     /*NOTREACHED*/
9274    case ONCE_PAT_MOD: /* 'o' */
9275    case GLOBAL_PAT_MOD: /* 'g' */
9276     if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9277      const I32 wflagbit = *RExC_parse == 'o'
9278           ? WASTED_O
9279           : WASTED_G;
9280      if (! (wastedflags & wflagbit) ) {
9281       wastedflags |= wflagbit;
9282       /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9283       vWARN5(
9284        RExC_parse + 1,
9285        "Useless (%s%c) - %suse /%c modifier",
9286        flagsp == &negflags ? "?-" : "?",
9287        *RExC_parse,
9288        flagsp == &negflags ? "don't " : "",
9289        *RExC_parse
9290       );
9291      }
9292     }
9293     break;
9294
9295    case CONTINUE_PAT_MOD: /* 'c' */
9296     if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9297      if (! (wastedflags & WASTED_C) ) {
9298       wastedflags |= WASTED_GC;
9299       /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9300       vWARN3(
9301        RExC_parse + 1,
9302        "Useless (%sc) - %suse /gc modifier",
9303        flagsp == &negflags ? "?-" : "?",
9304        flagsp == &negflags ? "don't " : ""
9305       );
9306      }
9307     }
9308     break;
9309    case KEEPCOPY_PAT_MOD: /* 'p' */
9310     if (flagsp == &negflags) {
9311      if (SIZE_ONLY)
9312       ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9313     } else {
9314      *flagsp |= RXf_PMf_KEEPCOPY;
9315     }
9316     break;
9317    case '-':
9318     /* A flag is a default iff it is following a minus, so
9319     * if there is a minus, it means will be trying to
9320     * re-specify a default which is an error */
9321     if (has_use_defaults || flagsp == &negflags) {
9322      goto fail_modifiers;
9323     }
9324     flagsp = &negflags;
9325     wastedflags = 0;  /* reset so (?g-c) warns twice */
9326     break;
9327    case ':':
9328    case ')':
9329     RExC_flags |= posflags;
9330     RExC_flags &= ~negflags;
9331     set_regex_charset(&RExC_flags, cs);
9332     if (RExC_flags & RXf_PMf_FOLD) {
9333      RExC_contains_i = 1;
9334     }
9335     return;
9336     /*NOTREACHED*/
9337    default:
9338    fail_modifiers:
9339     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9340     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9341     vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9342      UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9343     /*NOTREACHED*/
9344   }
9345
9346   ++RExC_parse;
9347  }
9348 }
9349
9350 /*
9351  - reg - regular expression, i.e. main body or parenthesized thing
9352  *
9353  * Caller must absorb opening parenthesis.
9354  *
9355  * Combining parenthesis handling with the base level of regular expression
9356  * is a trifle forced, but the need to tie the tails of the branches to what
9357  * follows makes it hard to avoid.
9358  */
9359 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9360 #ifdef DEBUGGING
9361 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9362 #else
9363 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9364 #endif
9365
9366 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9367    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9368    needs to be restarted.
9369    Otherwise would only return NULL if regbranch() returns NULL, which
9370    cannot happen.  */
9371 STATIC regnode *
9372 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9373  /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9374  * 2 is like 1, but indicates that nextchar() has been called to advance
9375  * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
9376  * this flag alerts us to the need to check for that */
9377 {
9378  dVAR;
9379  regnode *ret;  /* Will be the head of the group. */
9380  regnode *br;
9381  regnode *lastbr;
9382  regnode *ender = NULL;
9383  I32 parno = 0;
9384  I32 flags;
9385  U32 oregflags = RExC_flags;
9386  bool have_branch = 0;
9387  bool is_open = 0;
9388  I32 freeze_paren = 0;
9389  I32 after_freeze = 0;
9390
9391  char * parse_start = RExC_parse; /* MJD */
9392  char * const oregcomp_parse = RExC_parse;
9393
9394  GET_RE_DEBUG_FLAGS_DECL;
9395
9396  PERL_ARGS_ASSERT_REG;
9397  DEBUG_PARSE("reg ");
9398
9399  *flagp = 0;    /* Tentatively. */
9400
9401
9402  /* Make an OPEN node, if parenthesized. */
9403  if (paren) {
9404
9405   /* Under /x, space and comments can be gobbled up between the '(' and
9406   * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
9407   * intervening space, as the sequence is a token, and a token should be
9408   * indivisible */
9409   bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9410
9411   if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9412    char *start_verb = RExC_parse;
9413    STRLEN verb_len = 0;
9414    char *start_arg = NULL;
9415    unsigned char op = 0;
9416    int argok = 1;
9417    int internal_argval = 0; /* internal_argval is only useful if
9418           !argok */
9419
9420    if (has_intervening_patws && SIZE_ONLY) {
9421     ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
9422    }
9423    while ( *RExC_parse && *RExC_parse != ')' ) {
9424     if ( *RExC_parse == ':' ) {
9425      start_arg = RExC_parse + 1;
9426      break;
9427     }
9428     RExC_parse++;
9429    }
9430    ++start_verb;
9431    verb_len = RExC_parse - start_verb;
9432    if ( start_arg ) {
9433     RExC_parse++;
9434     while ( *RExC_parse && *RExC_parse != ')' )
9435      RExC_parse++;
9436     if ( *RExC_parse != ')' )
9437      vFAIL("Unterminated verb pattern argument");
9438     if ( RExC_parse == start_arg )
9439      start_arg = NULL;
9440    } else {
9441     if ( *RExC_parse != ')' )
9442      vFAIL("Unterminated verb pattern");
9443    }
9444
9445    switch ( *start_verb ) {
9446    case 'A':  /* (*ACCEPT) */
9447     if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9448      op = ACCEPT;
9449      internal_argval = RExC_nestroot;
9450     }
9451     break;
9452    case 'C':  /* (*COMMIT) */
9453     if ( memEQs(start_verb,verb_len,"COMMIT") )
9454      op = COMMIT;
9455     break;
9456    case 'F':  /* (*FAIL) */
9457     if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9458      op = OPFAIL;
9459      argok = 0;
9460     }
9461     break;
9462    case ':':  /* (*:NAME) */
9463    case 'M':  /* (*MARK:NAME) */
9464     if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9465      op = MARKPOINT;
9466      argok = -1;
9467     }
9468     break;
9469    case 'P':  /* (*PRUNE) */
9470     if ( memEQs(start_verb,verb_len,"PRUNE") )
9471      op = PRUNE;
9472     break;
9473    case 'S':   /* (*SKIP) */
9474     if ( memEQs(start_verb,verb_len,"SKIP") )
9475      op = SKIP;
9476     break;
9477    case 'T':  /* (*THEN) */
9478     /* [19:06] <TimToady> :: is then */
9479     if ( memEQs(start_verb,verb_len,"THEN") ) {
9480      op = CUTGROUP;
9481      RExC_seen |= REG_CUTGROUP_SEEN;
9482     }
9483     break;
9484    }
9485    if ( ! op ) {
9486     RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9487     vFAIL2utf8f(
9488      "Unknown verb pattern '%"UTF8f"'",
9489      UTF8fARG(UTF, verb_len, start_verb));
9490    }
9491    if ( argok ) {
9492     if ( start_arg && internal_argval ) {
9493      vFAIL3("Verb pattern '%.*s' may not have an argument",
9494       verb_len, start_verb);
9495     } else if ( argok < 0 && !start_arg ) {
9496      vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9497       verb_len, start_verb);
9498     } else {
9499      ret = reganode(pRExC_state, op, internal_argval);
9500      if ( ! internal_argval && ! SIZE_ONLY ) {
9501       if (start_arg) {
9502        SV *sv = newSVpvn( start_arg,
9503            RExC_parse - start_arg);
9504        ARG(ret) = add_data( pRExC_state,
9505             STR_WITH_LEN("S"));
9506        RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9507        ret->flags = 0;
9508       } else {
9509        ret->flags = 1;
9510       }
9511      }
9512     }
9513     if (!internal_argval)
9514      RExC_seen |= REG_VERBARG_SEEN;
9515    } else if ( start_arg ) {
9516     vFAIL3("Verb pattern '%.*s' may not have an argument",
9517       verb_len, start_verb);
9518    } else {
9519     ret = reg_node(pRExC_state, op);
9520    }
9521    nextchar(pRExC_state);
9522    return ret;
9523   }
9524   else if (*RExC_parse == '?') { /* (?...) */
9525    bool is_logical = 0;
9526    const char * const seqstart = RExC_parse;
9527    if (has_intervening_patws && SIZE_ONLY) {
9528     ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
9529    }
9530
9531    RExC_parse++;
9532    paren = *RExC_parse++;
9533    ret = NULL;   /* For look-ahead/behind. */
9534    switch (paren) {
9535
9536    case 'P': /* (?P...) variants for those used to PCRE/Python */
9537     paren = *RExC_parse++;
9538     if ( paren == '<')         /* (?P<...>) named capture */
9539      goto named_capture;
9540     else if (paren == '>') {   /* (?P>name) named recursion */
9541      goto named_recursion;
9542     }
9543     else if (paren == '=') {   /* (?P=...)  named backref */
9544      /* this pretty much dupes the code for \k<NAME> in
9545      * regatom(), if you change this make sure you change that
9546      * */
9547      char* name_start = RExC_parse;
9548      U32 num = 0;
9549      SV *sv_dat = reg_scan_name(pRExC_state,
9550       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9551      if (RExC_parse == name_start || *RExC_parse != ')')
9552       /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9553       vFAIL2("Sequence %.3s... not terminated",parse_start);
9554
9555      if (!SIZE_ONLY) {
9556       num = add_data( pRExC_state, STR_WITH_LEN("S"));
9557       RExC_rxi->data->data[num]=(void*)sv_dat;
9558       SvREFCNT_inc_simple_void(sv_dat);
9559      }
9560      RExC_sawback = 1;
9561      ret = reganode(pRExC_state,
9562         ((! FOLD)
9563          ? NREF
9564          : (ASCII_FOLD_RESTRICTED)
9565          ? NREFFA
9566          : (AT_LEAST_UNI_SEMANTICS)
9567           ? NREFFU
9568           : (LOC)
9569           ? NREFFL
9570           : NREFF),
9571          num);
9572      *flagp |= HASWIDTH;
9573
9574      Set_Node_Offset(ret, parse_start+1);
9575      Set_Node_Cur_Length(ret, parse_start);
9576
9577      nextchar(pRExC_state);
9578      return ret;
9579     }
9580     RExC_parse++;
9581     /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9582     vFAIL3("Sequence (%.*s...) not recognized",
9583         RExC_parse-seqstart, seqstart);
9584     /*NOTREACHED*/
9585    case '<':           /* (?<...) */
9586     if (*RExC_parse == '!')
9587      paren = ',';
9588     else if (*RExC_parse != '=')
9589    named_capture:
9590     {               /* (?<...>) */
9591      char *name_start;
9592      SV *svname;
9593      paren= '>';
9594    case '\'':          /* (?'...') */
9595       name_start= RExC_parse;
9596       svname = reg_scan_name(pRExC_state,
9597       SIZE_ONLY    /* reverse test from the others */
9598       ? REG_RSN_RETURN_NAME
9599       : REG_RSN_RETURN_NULL);
9600      if (RExC_parse == name_start || *RExC_parse != paren)
9601       vFAIL2("Sequence (?%c... not terminated",
9602        paren=='>' ? '<' : paren);
9603      if (SIZE_ONLY) {
9604       HE *he_str;
9605       SV *sv_dat = NULL;
9606       if (!svname) /* shouldn't happen */
9607        Perl_croak(aTHX_
9608         "panic: reg_scan_name returned NULL");
9609       if (!RExC_paren_names) {
9610        RExC_paren_names= newHV();
9611        sv_2mortal(MUTABLE_SV(RExC_paren_names));
9612 #ifdef DEBUGGING
9613        RExC_paren_name_list= newAV();
9614        sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9615 #endif
9616       }
9617       he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9618       if ( he_str )
9619        sv_dat = HeVAL(he_str);
9620       if ( ! sv_dat ) {
9621        /* croak baby croak */
9622        Perl_croak(aTHX_
9623         "panic: paren_name hash element allocation failed");
9624       } else if ( SvPOK(sv_dat) ) {
9625        /* (?|...) can mean we have dupes so scan to check
9626        its already been stored. Maybe a flag indicating
9627        we are inside such a construct would be useful,
9628        but the arrays are likely to be quite small, so
9629        for now we punt -- dmq */
9630        IV count = SvIV(sv_dat);
9631        I32 *pv = (I32*)SvPVX(sv_dat);
9632        IV i;
9633        for ( i = 0 ; i < count ; i++ ) {
9634         if ( pv[i] == RExC_npar ) {
9635          count = 0;
9636          break;
9637         }
9638        }
9639        if ( count ) {
9640         pv = (I32*)SvGROW(sv_dat,
9641             SvCUR(sv_dat) + sizeof(I32)+1);
9642         SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9643         pv[count] = RExC_npar;
9644         SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9645        }
9646       } else {
9647        (void)SvUPGRADE(sv_dat,SVt_PVNV);
9648        sv_setpvn(sv_dat, (char *)&(RExC_npar),
9649                 sizeof(I32));
9650        SvIOK_on(sv_dat);
9651        SvIV_set(sv_dat, 1);
9652       }
9653 #ifdef DEBUGGING
9654       /* Yes this does cause a memory leak in debugging Perls
9655       * */
9656       if (!av_store(RExC_paren_name_list,
9657          RExC_npar, SvREFCNT_inc(svname)))
9658        SvREFCNT_dec_NN(svname);
9659 #endif
9660
9661       /*sv_dump(sv_dat);*/
9662      }
9663      nextchar(pRExC_state);
9664      paren = 1;
9665      goto capturing_parens;
9666     }
9667     RExC_seen |= REG_LOOKBEHIND_SEEN;
9668     RExC_in_lookbehind++;
9669     RExC_parse++;
9670    case '=':           /* (?=...) */
9671     RExC_seen_zerolen++;
9672     break;
9673    case '!':           /* (?!...) */
9674     RExC_seen_zerolen++;
9675     if (*RExC_parse == ')') {
9676      ret=reg_node(pRExC_state, OPFAIL);
9677      nextchar(pRExC_state);
9678      return ret;
9679     }
9680     break;
9681    case '|':           /* (?|...) */
9682     /* branch reset, behave like a (?:...) except that
9683     buffers in alternations share the same numbers */
9684     paren = ':';
9685     after_freeze = freeze_paren = RExC_npar;
9686     break;
9687    case ':':           /* (?:...) */
9688    case '>':           /* (?>...) */
9689     break;
9690    case '$':           /* (?$...) */
9691    case '@':           /* (?@...) */
9692     vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9693     break;
9694    case '#':           /* (?#...) */
9695     /* XXX As soon as we disallow separating the '?' and '*' (by
9696     * spaces or (?#...) comment), it is believed that this case
9697     * will be unreachable and can be removed.  See
9698     * [perl #117327] */
9699     while (*RExC_parse && *RExC_parse != ')')
9700      RExC_parse++;
9701     if (*RExC_parse != ')')
9702      FAIL("Sequence (?#... not terminated");
9703     nextchar(pRExC_state);
9704     *flagp = TRYAGAIN;
9705     return NULL;
9706    case '0' :           /* (?0) */
9707    case 'R' :           /* (?R) */
9708     if (*RExC_parse != ')')
9709      FAIL("Sequence (?R) not terminated");
9710     ret = reg_node(pRExC_state, GOSTART);
9711      RExC_seen |= REG_GOSTART_SEEN;
9712     *flagp |= POSTPONED;
9713     nextchar(pRExC_state);
9714     return ret;
9715     /*notreached*/
9716    { /* named and numeric backreferences */
9717     I32 num;
9718    case '&':            /* (?&NAME) */
9719     parse_start = RExC_parse - 1;
9720    named_recursion:
9721     {
9722       SV *sv_dat = reg_scan_name(pRExC_state,
9723        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9724       num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9725     }
9726     if (RExC_parse == RExC_end || *RExC_parse != ')')
9727      vFAIL("Sequence (?&... not terminated");
9728     goto gen_recurse_regop;
9729     assert(0); /* NOT REACHED */
9730    case '+':
9731     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9732      RExC_parse++;
9733      vFAIL("Illegal pattern");
9734     }
9735     goto parse_recursion;
9736     /* NOT REACHED*/
9737    case '-': /* (?-1) */
9738     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9739      RExC_parse--; /* rewind to let it be handled later */
9740      goto parse_flags;
9741     }
9742     /*FALLTHROUGH */
9743    case '1': case '2': case '3': case '4': /* (?1) */
9744    case '5': case '6': case '7': case '8': case '9':
9745     RExC_parse--;
9746    parse_recursion:
9747     num = atoi(RExC_parse);
9748     parse_start = RExC_parse - 1; /* MJD */
9749     if (*RExC_parse == '-')
9750      RExC_parse++;
9751     while (isDIGIT(*RExC_parse))
9752       RExC_parse++;
9753     if (*RExC_parse!=')')
9754      vFAIL("Expecting close bracket");
9755
9756    gen_recurse_regop:
9757     if ( paren == '-' ) {
9758      /*
9759      Diagram of capture buffer numbering.
9760      Top line is the normal capture buffer numbers
9761      Bottom line is the negative indexing as from
9762      the X (the (?-2))
9763
9764      +   1 2    3 4 5 X          6 7
9765      /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9766      -   5 4    3 2 1 X          x x
9767
9768      */
9769      num = RExC_npar + num;
9770      if (num < 1)  {
9771       RExC_parse++;
9772       vFAIL("Reference to nonexistent group");
9773      }
9774     } else if ( paren == '+' ) {
9775      num = RExC_npar + num - 1;
9776     }
9777
9778     ret = reganode(pRExC_state, GOSUB, num);
9779     if (!SIZE_ONLY) {
9780      if (num > (I32)RExC_rx->nparens) {
9781       RExC_parse++;
9782       vFAIL("Reference to nonexistent group");
9783      }
9784      ARG2L_SET( ret, RExC_recurse_count++);
9785      RExC_emit++;
9786      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9787       "Recurse #%"UVuf" to %"IVdf"\n",
9788        (UV)ARG(ret), (IV)ARG2L(ret)));
9789     } else {
9790      RExC_size++;
9791      }
9792      RExC_seen |= REG_RECURSE_SEEN;
9793     Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9794     Set_Node_Offset(ret, parse_start); /* MJD */
9795
9796     *flagp |= POSTPONED;
9797     nextchar(pRExC_state);
9798     return ret;
9799    } /* named and numeric backreferences */
9800    assert(0); /* NOT REACHED */
9801
9802    case '?':           /* (??...) */
9803     is_logical = 1;
9804     if (*RExC_parse != '{') {
9805      RExC_parse++;
9806      /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9807      vFAIL2utf8f(
9808       "Sequence (%"UTF8f"...) not recognized",
9809       UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9810      /*NOTREACHED*/
9811     }
9812     *flagp |= POSTPONED;
9813     paren = *RExC_parse++;
9814     /* FALL THROUGH */
9815    case '{':           /* (?{...}) */
9816    {
9817     U32 n = 0;
9818     struct reg_code_block *cb;
9819
9820     RExC_seen_zerolen++;
9821
9822     if (   !pRExC_state->num_code_blocks
9823      || pRExC_state->code_index >= pRExC_state->num_code_blocks
9824      || pRExC_state->code_blocks[pRExC_state->code_index].start
9825       != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9826        - RExC_start)
9827     ) {
9828      if (RExC_pm_flags & PMf_USE_RE_EVAL)
9829       FAIL("panic: Sequence (?{...}): no code block found\n");
9830      FAIL("Eval-group not allowed at runtime, use re 'eval'");
9831     }
9832     /* this is a pre-compiled code block (?{...}) */
9833     cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9834     RExC_parse = RExC_start + cb->end;
9835     if (!SIZE_ONLY) {
9836      OP *o = cb->block;
9837      if (cb->src_regex) {
9838       n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9839       RExC_rxi->data->data[n] =
9840        (void*)SvREFCNT_inc((SV*)cb->src_regex);
9841       RExC_rxi->data->data[n+1] = (void*)o;
9842      }
9843      else {
9844       n = add_data(pRExC_state,
9845        (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9846       RExC_rxi->data->data[n] = (void*)o;
9847      }
9848     }
9849     pRExC_state->code_index++;
9850     nextchar(pRExC_state);
9851
9852     if (is_logical) {
9853      regnode *eval;
9854      ret = reg_node(pRExC_state, LOGICAL);
9855      eval = reganode(pRExC_state, EVAL, n);
9856      if (!SIZE_ONLY) {
9857       ret->flags = 2;
9858       /* for later propagation into (??{}) return value */
9859       eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9860      }
9861      REGTAIL(pRExC_state, ret, eval);
9862      /* deal with the length of this later - MJD */
9863      return ret;
9864     }
9865     ret = reganode(pRExC_state, EVAL, n);
9866     Set_Node_Length(ret, RExC_parse - parse_start + 1);
9867     Set_Node_Offset(ret, parse_start);
9868     return ret;
9869    }
9870    case '(':           /* (?(?{...})...) and (?(?=...)...) */
9871    {
9872     int is_define= 0;
9873     if (RExC_parse[0] == '?') {        /* (?(?...)) */
9874      if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9875       || RExC_parse[1] == '<'
9876       || RExC_parse[1] == '{') { /* Lookahead or eval. */
9877       I32 flag;
9878       regnode *tail;
9879
9880       ret = reg_node(pRExC_state, LOGICAL);
9881       if (!SIZE_ONLY)
9882        ret->flags = 1;
9883
9884       tail = reg(pRExC_state, 1, &flag, depth+1);
9885       if (flag & RESTART_UTF8) {
9886        *flagp = RESTART_UTF8;
9887        return NULL;
9888       }
9889       REGTAIL(pRExC_state, ret, tail);
9890       goto insert_if;
9891      }
9892     }
9893     else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9894       || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9895     {
9896      char ch = RExC_parse[0] == '<' ? '>' : '\'';
9897      char *name_start= RExC_parse++;
9898      U32 num = 0;
9899      SV *sv_dat=reg_scan_name(pRExC_state,
9900       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9901      if (RExC_parse == name_start || *RExC_parse != ch)
9902       vFAIL2("Sequence (?(%c... not terminated",
9903        (ch == '>' ? '<' : ch));
9904      RExC_parse++;
9905      if (!SIZE_ONLY) {
9906       num = add_data( pRExC_state, STR_WITH_LEN("S"));
9907       RExC_rxi->data->data[num]=(void*)sv_dat;
9908       SvREFCNT_inc_simple_void(sv_dat);
9909      }
9910      ret = reganode(pRExC_state,NGROUPP,num);
9911      goto insert_if_check_paren;
9912     }
9913     else if (RExC_parse[0] == 'D' &&
9914       RExC_parse[1] == 'E' &&
9915       RExC_parse[2] == 'F' &&
9916       RExC_parse[3] == 'I' &&
9917       RExC_parse[4] == 'N' &&
9918       RExC_parse[5] == 'E')
9919     {
9920      ret = reganode(pRExC_state,DEFINEP,0);
9921      RExC_parse +=6 ;
9922      is_define = 1;
9923      goto insert_if_check_paren;
9924     }
9925     else if (RExC_parse[0] == 'R') {
9926      RExC_parse++;
9927      parno = 0;
9928      if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9929       parno = atoi(RExC_parse++);
9930       while (isDIGIT(*RExC_parse))
9931        RExC_parse++;
9932      } else if (RExC_parse[0] == '&') {
9933       SV *sv_dat;
9934       RExC_parse++;
9935       sv_dat = reg_scan_name(pRExC_state,
9936        SIZE_ONLY
9937        ? REG_RSN_RETURN_NULL
9938        : REG_RSN_RETURN_DATA);
9939        parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9940      }
9941      ret = reganode(pRExC_state,INSUBP,parno);
9942      goto insert_if_check_paren;
9943     }
9944     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9945      /* (?(1)...) */
9946      char c;
9947      char *tmp;
9948      parno = atoi(RExC_parse++);
9949
9950      while (isDIGIT(*RExC_parse))
9951       RExC_parse++;
9952      ret = reganode(pRExC_state, GROUPP, parno);
9953
9954     insert_if_check_paren:
9955      if (*(tmp = nextchar(pRExC_state)) != ')') {
9956       /* nextchar also skips comments, so undo its work
9957       * and skip over the the next character.
9958       */
9959       RExC_parse = tmp;
9960       RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9961       vFAIL("Switch condition not recognized");
9962      }
9963     insert_if:
9964      REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9965      br = regbranch(pRExC_state, &flags, 1,depth+1);
9966      if (br == NULL) {
9967       if (flags & RESTART_UTF8) {
9968        *flagp = RESTART_UTF8;
9969        return NULL;
9970       }
9971       FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9972        (UV) flags);
9973      } else
9974       REGTAIL(pRExC_state, br, reganode(pRExC_state,
9975               LONGJMP, 0));
9976      c = *nextchar(pRExC_state);
9977      if (flags&HASWIDTH)
9978       *flagp |= HASWIDTH;
9979      if (c == '|') {
9980       if (is_define)
9981        vFAIL("(?(DEFINE)....) does not allow branches");
9982
9983       /* Fake one for optimizer.  */
9984       lastbr = reganode(pRExC_state, IFTHEN, 0);
9985
9986       if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9987        if (flags & RESTART_UTF8) {
9988         *flagp = RESTART_UTF8;
9989         return NULL;
9990        }
9991        FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9992         (UV) flags);
9993       }
9994       REGTAIL(pRExC_state, ret, lastbr);
9995       if (flags&HASWIDTH)
9996        *flagp |= HASWIDTH;
9997       c = *nextchar(pRExC_state);
9998      }
9999      else
10000       lastbr = NULL;
10001      if (c != ')')
10002       vFAIL("Switch (?(condition)... contains too many branches");
10003      ender = reg_node(pRExC_state, TAIL);
10004      REGTAIL(pRExC_state, br, ender);
10005      if (lastbr) {
10006       REGTAIL(pRExC_state, lastbr, ender);
10007       REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10008      }
10009      else
10010       REGTAIL(pRExC_state, ret, ender);
10011      RExC_size++; /* XXX WHY do we need this?!!
10012          For large programs it seems to be required
10013          but I can't figure out why. -- dmq*/
10014      return ret;
10015     }
10016     else {
10017      RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10018      vFAIL("Unknown switch condition (?(...))");
10019     }
10020    }
10021    case '[':           /* (?[ ... ]) */
10022     return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10023           oregcomp_parse);
10024    case 0:
10025     RExC_parse--; /* for vFAIL to print correctly */
10026     vFAIL("Sequence (? incomplete");
10027     break;
10028    default: /* e.g., (?i) */
10029     --RExC_parse;
10030    parse_flags:
10031     parse_lparen_question_flags(pRExC_state);
10032     if (UCHARAT(RExC_parse) != ':') {
10033      nextchar(pRExC_state);
10034      *flagp = TRYAGAIN;
10035      return NULL;
10036     }
10037     paren = ':';
10038     nextchar(pRExC_state);
10039     ret = NULL;
10040     goto parse_rest;
10041    } /* end switch */
10042   }
10043   else {                  /* (...) */
10044   capturing_parens:
10045    parno = RExC_npar;
10046    RExC_npar++;
10047
10048    ret = reganode(pRExC_state, OPEN, parno);
10049    if (!SIZE_ONLY ){
10050     if (!RExC_nestroot)
10051      RExC_nestroot = parno;
10052     if (RExC_seen & REG_RECURSE_SEEN
10053      && !RExC_open_parens[parno-1])
10054     {
10055      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10056       "Setting open paren #%"IVdf" to %d\n",
10057       (IV)parno, REG_NODE_NUM(ret)));
10058      RExC_open_parens[parno-1]= ret;
10059     }
10060    }
10061    Set_Node_Length(ret, 1); /* MJD */
10062    Set_Node_Offset(ret, RExC_parse); /* MJD */
10063    is_open = 1;
10064   }
10065  }
10066  else                        /* ! paren */
10067   ret = NULL;
10068
10069    parse_rest:
10070  /* Pick up the branches, linking them together. */
10071  parse_start = RExC_parse;   /* MJD */
10072  br = regbranch(pRExC_state, &flags, 1,depth+1);
10073
10074  /*     branch_len = (paren != 0); */
10075
10076  if (br == NULL) {
10077   if (flags & RESTART_UTF8) {
10078    *flagp = RESTART_UTF8;
10079    return NULL;
10080   }
10081   FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10082  }
10083  if (*RExC_parse == '|') {
10084   if (!SIZE_ONLY && RExC_extralen) {
10085    reginsert(pRExC_state, BRANCHJ, br, depth+1);
10086   }
10087   else {                  /* MJD */
10088    reginsert(pRExC_state, BRANCH, br, depth+1);
10089    Set_Node_Length(br, paren != 0);
10090    Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10091   }
10092   have_branch = 1;
10093   if (SIZE_ONLY)
10094    RExC_extralen += 1;  /* For BRANCHJ-BRANCH. */
10095  }
10096  else if (paren == ':') {
10097   *flagp |= flags&SIMPLE;
10098  }
10099  if (is_open) {    /* Starts with OPEN. */
10100   REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
10101  }
10102  else if (paren != '?')  /* Not Conditional */
10103   ret = br;
10104  *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10105  lastbr = br;
10106  while (*RExC_parse == '|') {
10107   if (!SIZE_ONLY && RExC_extralen) {
10108    ender = reganode(pRExC_state, LONGJMP,0);
10109
10110    /* Append to the previous. */
10111    REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10112   }
10113   if (SIZE_ONLY)
10114    RExC_extralen += 2;  /* Account for LONGJMP. */
10115   nextchar(pRExC_state);
10116   if (freeze_paren) {
10117    if (RExC_npar > after_freeze)
10118     after_freeze = RExC_npar;
10119    RExC_npar = freeze_paren;
10120   }
10121   br = regbranch(pRExC_state, &flags, 0, depth+1);
10122
10123   if (br == NULL) {
10124    if (flags & RESTART_UTF8) {
10125     *flagp = RESTART_UTF8;
10126     return NULL;
10127    }
10128    FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10129   }
10130   REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
10131   lastbr = br;
10132   *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10133  }
10134
10135  if (have_branch || paren != ':') {
10136   /* Make a closing node, and hook it on the end. */
10137   switch (paren) {
10138   case ':':
10139    ender = reg_node(pRExC_state, TAIL);
10140    break;
10141   case 1: case 2:
10142    ender = reganode(pRExC_state, CLOSE, parno);
10143    if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10144     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10145       "Setting close paren #%"IVdf" to %d\n",
10146       (IV)parno, REG_NODE_NUM(ender)));
10147     RExC_close_parens[parno-1]= ender;
10148     if (RExC_nestroot == parno)
10149      RExC_nestroot = 0;
10150    }
10151    Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10152    Set_Node_Length(ender,1); /* MJD */
10153    break;
10154   case '<':
10155   case ',':
10156   case '=':
10157   case '!':
10158    *flagp &= ~HASWIDTH;
10159    /* FALL THROUGH */
10160   case '>':
10161    ender = reg_node(pRExC_state, SUCCEED);
10162    break;
10163   case 0:
10164    ender = reg_node(pRExC_state, END);
10165    if (!SIZE_ONLY) {
10166     assert(!RExC_opend); /* there can only be one! */
10167     RExC_opend = ender;
10168    }
10169    break;
10170   }
10171   DEBUG_PARSE_r(if (!SIZE_ONLY) {
10172    SV * const mysv_val1=sv_newmortal();
10173    SV * const mysv_val2=sv_newmortal();
10174    DEBUG_PARSE_MSG("lsbr");
10175    regprop(RExC_rx, mysv_val1, lastbr, NULL);
10176    regprop(RExC_rx, mysv_val2, ender, NULL);
10177    PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10178       SvPV_nolen_const(mysv_val1),
10179       (IV)REG_NODE_NUM(lastbr),
10180       SvPV_nolen_const(mysv_val2),
10181       (IV)REG_NODE_NUM(ender),
10182       (IV)(ender - lastbr)
10183    );
10184   });
10185   REGTAIL(pRExC_state, lastbr, ender);
10186
10187   if (have_branch && !SIZE_ONLY) {
10188    char is_nothing= 1;
10189    if (depth==1)
10190     RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10191
10192    /* Hook the tails of the branches to the closing node. */
10193    for (br = ret; br; br = regnext(br)) {
10194     const U8 op = PL_regkind[OP(br)];
10195     if (op == BRANCH) {
10196      REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10197      if ( OP(NEXTOPER(br)) != NOTHING
10198       || regnext(NEXTOPER(br)) != ender)
10199       is_nothing= 0;
10200     }
10201     else if (op == BRANCHJ) {
10202      REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10203      /* for now we always disable this optimisation * /
10204      if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10205       || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10206      */
10207       is_nothing= 0;
10208     }
10209    }
10210    if (is_nothing) {
10211     br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10212     DEBUG_PARSE_r(if (!SIZE_ONLY) {
10213      SV * const mysv_val1=sv_newmortal();
10214      SV * const mysv_val2=sv_newmortal();
10215      DEBUG_PARSE_MSG("NADA");
10216      regprop(RExC_rx, mysv_val1, ret, NULL);
10217      regprop(RExC_rx, mysv_val2, ender, NULL);
10218      PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10219         SvPV_nolen_const(mysv_val1),
10220         (IV)REG_NODE_NUM(ret),
10221         SvPV_nolen_const(mysv_val2),
10222         (IV)REG_NODE_NUM(ender),
10223         (IV)(ender - ret)
10224      );
10225     });
10226     OP(br)= NOTHING;
10227     if (OP(ender) == TAIL) {
10228      NEXT_OFF(br)= 0;
10229      RExC_emit= br + 1;
10230     } else {
10231      regnode *opt;
10232      for ( opt= br + 1; opt < ender ; opt++ )
10233       OP(opt)= OPTIMIZED;
10234      NEXT_OFF(br)= ender - br;
10235     }
10236    }
10237   }
10238  }
10239
10240  {
10241   const char *p;
10242   static const char parens[] = "=!<,>";
10243
10244   if (paren && (p = strchr(parens, paren))) {
10245    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10246    int flag = (p - parens) > 1;
10247
10248    if (paren == '>')
10249     node = SUSPEND, flag = 0;
10250    reginsert(pRExC_state, node,ret, depth+1);
10251    Set_Node_Cur_Length(ret, parse_start);
10252    Set_Node_Offset(ret, parse_start + 1);
10253    ret->flags = flag;
10254    REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10255   }
10256  }
10257
10258  /* Check for proper termination. */
10259  if (paren) {
10260   /* restore original flags, but keep (?p) */
10261   RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10262   if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10263    RExC_parse = oregcomp_parse;
10264    vFAIL("Unmatched (");
10265   }
10266  }
10267  else if (!paren && RExC_parse < RExC_end) {
10268   if (*RExC_parse == ')') {
10269    RExC_parse++;
10270    vFAIL("Unmatched )");
10271   }
10272   else
10273    FAIL("Junk on end of regexp"); /* "Can't happen". */
10274   assert(0); /* NOTREACHED */
10275  }
10276
10277  if (RExC_in_lookbehind) {
10278   RExC_in_lookbehind--;
10279  }
10280  if (after_freeze > RExC_npar)
10281   RExC_npar = after_freeze;
10282  return(ret);
10283 }
10284
10285 /*
10286  - regbranch - one alternative of an | operator
10287  *
10288  * Implements the concatenation operator.
10289  *
10290  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10291  * restarted.
10292  */
10293 STATIC regnode *
10294 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10295 {
10296  dVAR;
10297  regnode *ret;
10298  regnode *chain = NULL;
10299  regnode *latest;
10300  I32 flags = 0, c = 0;
10301  GET_RE_DEBUG_FLAGS_DECL;
10302
10303  PERL_ARGS_ASSERT_REGBRANCH;
10304
10305  DEBUG_PARSE("brnc");
10306
10307  if (first)
10308   ret = NULL;
10309  else {
10310   if (!SIZE_ONLY && RExC_extralen)
10311    ret = reganode(pRExC_state, BRANCHJ,0);
10312   else {
10313    ret = reg_node(pRExC_state, BRANCH);
10314    Set_Node_Length(ret, 1);
10315   }
10316  }
10317
10318  if (!first && SIZE_ONLY)
10319   RExC_extralen += 1;   /* BRANCHJ */
10320
10321  *flagp = WORST;   /* Tentatively. */
10322
10323  RExC_parse--;
10324  nextchar(pRExC_state);
10325  while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10326   flags &= ~TRYAGAIN;
10327   latest = regpiece(pRExC_state, &flags,depth+1);
10328   if (latest == NULL) {
10329    if (flags & TRYAGAIN)
10330     continue;
10331    if (flags & RESTART_UTF8) {
10332     *flagp = RESTART_UTF8;
10333     return NULL;
10334    }
10335    FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10336   }
10337   else if (ret == NULL)
10338    ret = latest;
10339   *flagp |= flags&(HASWIDTH|POSTPONED);
10340   if (chain == NULL)  /* First piece. */
10341    *flagp |= flags&SPSTART;
10342   else {
10343    RExC_naughty++;
10344    REGTAIL(pRExC_state, chain, latest);
10345   }
10346   chain = latest;
10347   c++;
10348  }
10349  if (chain == NULL) { /* Loop ran zero times. */
10350   chain = reg_node(pRExC_state, NOTHING);
10351   if (ret == NULL)
10352    ret = chain;
10353  }
10354  if (c == 1) {
10355   *flagp |= flags&SIMPLE;
10356  }
10357
10358  return ret;
10359 }
10360
10361 /*
10362  - regpiece - something followed by possible [*+?]
10363  *
10364  * Note that the branching code sequences used for ? and the general cases
10365  * of * and + are somewhat optimized:  they use the same NOTHING node as
10366  * both the endmarker for their branch list and the body of the last branch.
10367  * It might seem that this node could be dispensed with entirely, but the
10368  * endmarker role is not redundant.
10369  *
10370  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10371  * TRYAGAIN.
10372  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10373  * restarted.
10374  */
10375 STATIC regnode *
10376 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10377 {
10378  dVAR;
10379  regnode *ret;
10380  char op;
10381  char *next;
10382  I32 flags;
10383  const char * const origparse = RExC_parse;
10384  I32 min;
10385  I32 max = REG_INFTY;
10386 #ifdef RE_TRACK_PATTERN_OFFSETS
10387  char *parse_start;
10388 #endif
10389  const char *maxpos = NULL;
10390
10391  /* Save the original in case we change the emitted regop to a FAIL. */
10392  regnode * const orig_emit = RExC_emit;
10393
10394  GET_RE_DEBUG_FLAGS_DECL;
10395
10396  PERL_ARGS_ASSERT_REGPIECE;
10397
10398  DEBUG_PARSE("piec");
10399
10400  ret = regatom(pRExC_state, &flags,depth+1);
10401  if (ret == NULL) {
10402   if (flags & (TRYAGAIN|RESTART_UTF8))
10403    *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10404   else
10405    FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10406   return(NULL);
10407  }
10408
10409  op = *RExC_parse;
10410
10411  if (op == '{' && regcurly(RExC_parse, FALSE)) {
10412   maxpos = NULL;
10413 #ifdef RE_TRACK_PATTERN_OFFSETS
10414   parse_start = RExC_parse; /* MJD */
10415 #endif
10416   next = RExC_parse + 1;
10417   while (isDIGIT(*next) || *next == ',') {
10418    if (*next == ',') {
10419     if (maxpos)
10420      break;
10421     else
10422      maxpos = next;
10423    }
10424    next++;
10425   }
10426   if (*next == '}') {  /* got one */
10427    if (!maxpos)
10428     maxpos = next;
10429    RExC_parse++;
10430    min = atoi(RExC_parse);
10431    if (*maxpos == ',')
10432     maxpos++;
10433    else
10434     maxpos = RExC_parse;
10435    max = atoi(maxpos);
10436    if (!max && *maxpos != '0')
10437     max = REG_INFTY;  /* meaning "infinity" */
10438    else if (max >= REG_INFTY)
10439     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10440    RExC_parse = next;
10441    nextchar(pRExC_state);
10442    if (max < min) {    /* If can't match, warn and optimize to fail
10443         unconditionally */
10444     if (SIZE_ONLY) {
10445      ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10446
10447      /* We can't back off the size because we have to reserve
10448      * enough space for all the things we are about to throw
10449      * away, but we can shrink it by the ammount we are about
10450      * to re-use here */
10451      RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10452     }
10453     else {
10454      RExC_emit = orig_emit;
10455     }
10456     ret = reg_node(pRExC_state, OPFAIL);
10457     return ret;
10458    }
10459    else if (min == max
10460      && RExC_parse < RExC_end
10461      && (*RExC_parse == '?' || *RExC_parse == '+'))
10462    {
10463     if (SIZE_ONLY) {
10464      ckWARN2reg(RExC_parse + 1,
10465        "Useless use of greediness modifier '%c'",
10466        *RExC_parse);
10467     }
10468     /* Absorb the modifier, so later code doesn't see nor use
10469      * it */
10470     nextchar(pRExC_state);
10471    }
10472
10473   do_curly:
10474    if ((flags&SIMPLE)) {
10475     RExC_naughty += 2 + RExC_naughty / 2;
10476     reginsert(pRExC_state, CURLY, ret, depth+1);
10477     Set_Node_Offset(ret, parse_start+1); /* MJD */
10478     Set_Node_Cur_Length(ret, parse_start);
10479    }
10480    else {
10481     regnode * const w = reg_node(pRExC_state, WHILEM);
10482
10483     w->flags = 0;
10484     REGTAIL(pRExC_state, ret, w);
10485     if (!SIZE_ONLY && RExC_extralen) {
10486      reginsert(pRExC_state, LONGJMP,ret, depth+1);
10487      reginsert(pRExC_state, NOTHING,ret, depth+1);
10488      NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10489     }
10490     reginsert(pRExC_state, CURLYX,ret, depth+1);
10491         /* MJD hk */
10492     Set_Node_Offset(ret, parse_start+1);
10493     Set_Node_Length(ret,
10494         op == '{' ? (RExC_parse - parse_start) : 1);
10495
10496     if (!SIZE_ONLY && RExC_extralen)
10497      NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10498     REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10499     if (SIZE_ONLY)
10500      RExC_whilem_seen++, RExC_extralen += 3;
10501     RExC_naughty += 4 + RExC_naughty; /* compound interest */
10502    }
10503    ret->flags = 0;
10504
10505    if (min > 0)
10506     *flagp = WORST;
10507    if (max > 0)
10508     *flagp |= HASWIDTH;
10509    if (!SIZE_ONLY) {
10510     ARG1_SET(ret, (U16)min);
10511     ARG2_SET(ret, (U16)max);
10512    }
10513    if (max == REG_INFTY)
10514     RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10515
10516    goto nest_check;
10517   }
10518  }
10519
10520  if (!ISMULT1(op)) {
10521   *flagp = flags;
10522   return(ret);
10523  }
10524
10525 #if 0    /* Now runtime fix should be reliable. */
10526
10527  /* if this is reinstated, don't forget to put this back into perldiag:
10528
10529    =item Regexp *+ operand could be empty at {#} in regex m/%s/
10530
10531   (F) The part of the regexp subject to either the * or + quantifier
10532   could match an empty string. The {#} shows in the regular
10533   expression about where the problem was discovered.
10534
10535  */
10536
10537  if (!(flags&HASWIDTH) && op != '?')
10538  vFAIL("Regexp *+ operand could be empty");
10539 #endif
10540
10541 #ifdef RE_TRACK_PATTERN_OFFSETS
10542  parse_start = RExC_parse;
10543 #endif
10544  nextchar(pRExC_state);
10545
10546  *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10547
10548  if (op == '*' && (flags&SIMPLE)) {
10549   reginsert(pRExC_state, STAR, ret, depth+1);
10550   ret->flags = 0;
10551   RExC_naughty += 4;
10552   RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10553  }
10554  else if (op == '*') {
10555   min = 0;
10556   goto do_curly;
10557  }
10558  else if (op == '+' && (flags&SIMPLE)) {
10559   reginsert(pRExC_state, PLUS, ret, depth+1);
10560   ret->flags = 0;
10561   RExC_naughty += 3;
10562   RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10563  }
10564  else if (op == '+') {
10565   min = 1;
10566   goto do_curly;
10567  }
10568  else if (op == '?') {
10569   min = 0; max = 1;
10570   goto do_curly;
10571  }
10572   nest_check:
10573  if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10574   SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10575   ckWARN2reg(RExC_parse,
10576     "%"UTF8f" matches null string many times",
10577     UTF8fARG(UTF, (RExC_parse >= origparse
10578         ? RExC_parse - origparse
10579         : 0),
10580     origparse));
10581   (void)ReREFCNT_inc(RExC_rx_sv);
10582  }
10583
10584  if (RExC_parse < RExC_end && *RExC_parse == '?') {
10585   nextchar(pRExC_state);
10586   reginsert(pRExC_state, MINMOD, ret, depth+1);
10587   REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10588  }
10589  else
10590  if (RExC_parse < RExC_end && *RExC_parse == '+') {
10591   regnode *ender;
10592   nextchar(pRExC_state);
10593   ender = reg_node(pRExC_state, SUCCEED);
10594   REGTAIL(pRExC_state, ret, ender);
10595   reginsert(pRExC_state, SUSPEND, ret, depth+1);
10596   ret->flags = 0;
10597   ender = reg_node(pRExC_state, TAIL);
10598   REGTAIL(pRExC_state, ret, ender);
10599  }
10600
10601  if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10602   RExC_parse++;
10603   vFAIL("Nested quantifiers");
10604  }
10605
10606  return(ret);
10607 }
10608
10609 STATIC bool
10610 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10611      UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10612      const bool strict   /* Apply stricter parsing rules? */
10613  )
10614 {
10615
10616  /* This is expected to be called by a parser routine that has recognized '\N'
10617    and needs to handle the rest. RExC_parse is expected to point at the first
10618    char following the N at the time of the call.  On successful return,
10619    RExC_parse has been updated to point to just after the sequence identified
10620    by this routine, and <*flagp> has been updated.
10621
10622    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10623    character class.
10624
10625    \N may begin either a named sequence, or if outside a character class, mean
10626    to match a non-newline.  For non single-quoted regexes, the tokenizer has
10627    attempted to decide which, and in the case of a named sequence, converted it
10628    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10629    where c1... are the characters in the sequence.  For single-quoted regexes,
10630    the tokenizer passes the \N sequence through unchanged; this code will not
10631    attempt to determine this nor expand those, instead raising a syntax error.
10632    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10633    or there is no '}', it signals that this \N occurrence means to match a
10634    non-newline.
10635
10636    Only the \N{U+...} form should occur in a character class, for the same
10637    reason that '.' inside a character class means to just match a period: it
10638    just doesn't make sense.
10639
10640    The function raises an error (via vFAIL), and doesn't return for various
10641    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
10642    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10643    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10644    only possible if node_p is non-NULL.
10645
10646
10647    If <valuep> is non-null, it means the caller can accept an input sequence
10648    consisting of a just a single code point; <*valuep> is set to that value
10649    if the input is such.
10650
10651    If <node_p> is non-null it signifies that the caller can accept any other
10652    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
10653    is set as follows:
10654  1) \N means not-a-NL: points to a newly created REG_ANY node;
10655  2) \N{}:              points to a new NOTHING node;
10656  3) otherwise:         points to a new EXACT node containing the resolved
10657       string.
10658    Note that FALSE is returned for single code point sequences if <valuep> is
10659    null.
10660  */
10661
10662  char * endbrace;    /* '}' following the name */
10663  char* p;
10664  char *endchar; /* Points to '.' or '}' ending cur char in the input
10665       stream */
10666  bool has_multiple_chars; /* true if the input stream contains a sequence of
10667         more than one character */
10668
10669  GET_RE_DEBUG_FLAGS_DECL;
10670
10671  PERL_ARGS_ASSERT_GROK_BSLASH_N;
10672
10673  GET_RE_DEBUG_FLAGS;
10674
10675  assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
10676
10677  /* The [^\n] meaning of \N ignores spaces and comments under the /x
10678  * modifier.  The other meaning does not, so use a temporary until we find
10679  * out which we are being called with */
10680  p = (RExC_flags & RXf_PMf_EXTENDED)
10681   ? regwhite( pRExC_state, RExC_parse )
10682   : RExC_parse;
10683
10684  /* Disambiguate between \N meaning a named character versus \N meaning
10685  * [^\n].  The former is assumed when it can't be the latter. */
10686  if (*p != '{' || regcurly(p, FALSE)) {
10687   RExC_parse = p;
10688   if (! node_p) {
10689    /* no bare \N allowed in a charclass */
10690    if (in_char_class) {
10691     vFAIL("\\N in a character class must be a named character: \\N{...}");
10692    }
10693    return FALSE;
10694   }
10695   RExC_parse--;   /* Need to back off so nextchar() doesn't skip the
10696       current char */
10697   nextchar(pRExC_state);
10698   *node_p = reg_node(pRExC_state, REG_ANY);
10699   *flagp |= HASWIDTH|SIMPLE;
10700   RExC_naughty++;
10701   Set_Node_Length(*node_p, 1); /* MJD */
10702   return TRUE;
10703  }
10704
10705  /* Here, we have decided it should be a named character or sequence */
10706
10707  /* The test above made sure that the next real character is a '{', but
10708  * under the /x modifier, it could be separated by space (or a comment and
10709  * \n) and this is not allowed (for consistency with \x{...} and the
10710  * tokenizer handling of \N{NAME}). */
10711  if (*RExC_parse != '{') {
10712   vFAIL("Missing braces on \\N{}");
10713  }
10714
10715  RExC_parse++; /* Skip past the '{' */
10716
10717  if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10718   || ! (endbrace == RExC_parse  /* nothing between the {} */
10719    || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below
10720             */
10721     && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10722              */
10723  {
10724   if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
10725   vFAIL("\\N{NAME} must be resolved by the lexer");
10726  }
10727
10728  if (endbrace == RExC_parse) {   /* empty: \N{} */
10729   bool ret = TRUE;
10730   if (node_p) {
10731    *node_p = reg_node(pRExC_state,NOTHING);
10732   }
10733   else if (in_char_class) {
10734    if (SIZE_ONLY && in_char_class) {
10735     if (strict) {
10736      RExC_parse++;   /* Position after the "}" */
10737      vFAIL("Zero length \\N{}");
10738     }
10739     else {
10740      ckWARNreg(RExC_parse,
10741        "Ignoring zero length \\N{} in character class");
10742     }
10743    }
10744    ret = FALSE;
10745   }
10746   else {
10747    return FALSE;
10748   }
10749   nextchar(pRExC_state);
10750   return ret;
10751  }
10752
10753  RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10754  RExC_parse += 2; /* Skip past the 'U+' */
10755
10756  endchar = RExC_parse + strcspn(RExC_parse, ".}");
10757
10758  /* Code points are separated by dots.  If none, there is only one code
10759  * point, and is terminated by the brace */
10760  has_multiple_chars = (endchar < endbrace);
10761
10762  if (valuep && (! has_multiple_chars || in_char_class)) {
10763   /* We only pay attention to the first char of
10764   multichar strings being returned in char classes. I kinda wonder
10765   if this makes sense as it does change the behaviour
10766   from earlier versions, OTOH that behaviour was broken
10767   as well. XXX Solution is to recharacterize as
10768   [rest-of-class]|multi1|multi2... */
10769
10770   STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10771   I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10772    | PERL_SCAN_DISALLOW_PREFIX
10773    | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10774
10775   *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10776
10777   /* The tokenizer should have guaranteed validity, but it's possible to
10778   * bypass it by using single quoting, so check */
10779   if (length_of_hex == 0
10780    || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10781   {
10782    RExC_parse += length_of_hex; /* Includes all the valid */
10783    RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
10784        ? UTF8SKIP(RExC_parse)
10785        : 1;
10786    /* Guard against malformed utf8 */
10787    if (RExC_parse >= endchar) {
10788     RExC_parse = endchar;
10789    }
10790    vFAIL("Invalid hexadecimal number in \\N{U+...}");
10791   }
10792
10793   if (in_char_class && has_multiple_chars) {
10794    if (strict) {
10795     RExC_parse = endbrace;
10796     vFAIL("\\N{} in character class restricted to one character");
10797    }
10798    else {
10799     ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10800    }
10801   }
10802
10803   RExC_parse = endbrace + 1;
10804  }
10805  else if (! node_p || ! has_multiple_chars) {
10806
10807   /* Here, the input is legal, but not according to the caller's
10808   * options.  We fail without advancing the parse, so that the
10809   * caller can try again */
10810   RExC_parse = p;
10811   return FALSE;
10812  }
10813  else {
10814
10815   /* What is done here is to convert this to a sub-pattern of the form
10816   * (?:\x{char1}\x{char2}...)
10817   * and then call reg recursively.  That way, it retains its atomicness,
10818   * while not having to worry about special handling that some code
10819   * points may have.  toke.c has converted the original Unicode values
10820   * to native, so that we can just pass on the hex values unchanged.  We
10821   * do have to set a flag to keep recoding from happening in the
10822   * recursion */
10823
10824   SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10825   STRLEN len;
10826   char *orig_end = RExC_end;
10827   I32 flags;
10828
10829   while (RExC_parse < endbrace) {
10830
10831    /* Convert to notation the rest of the code understands */
10832    sv_catpv(substitute_parse, "\\x{");
10833    sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10834    sv_catpv(substitute_parse, "}");
10835
10836    /* Point to the beginning of the next character in the sequence. */
10837    RExC_parse = endchar + 1;
10838    endchar = RExC_parse + strcspn(RExC_parse, ".}");
10839   }
10840   sv_catpv(substitute_parse, ")");
10841
10842   RExC_parse = SvPV(substitute_parse, len);
10843
10844   /* Don't allow empty number */
10845   if (len < 8) {
10846    vFAIL("Invalid hexadecimal number in \\N{U+...}");
10847   }
10848   RExC_end = RExC_parse + len;
10849
10850   /* The values are Unicode, and therefore not subject to recoding */
10851   RExC_override_recoding = 1;
10852
10853   if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10854    if (flags & RESTART_UTF8) {
10855     *flagp = RESTART_UTF8;
10856     return FALSE;
10857    }
10858    FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10859     (UV) flags);
10860   }
10861   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10862
10863   RExC_parse = endbrace;
10864   RExC_end = orig_end;
10865   RExC_override_recoding = 0;
10866
10867   nextchar(pRExC_state);
10868  }
10869
10870  return TRUE;
10871 }
10872
10873
10874 /*
10875  * reg_recode
10876  *
10877  * It returns the code point in utf8 for the value in *encp.
10878  *    value: a code value in the source encoding
10879  *    encp:  a pointer to an Encode object
10880  *
10881  * If the result from Encode is not a single character,
10882  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10883  */
10884 STATIC UV
10885 S_reg_recode(pTHX_ const char value, SV **encp)
10886 {
10887  STRLEN numlen = 1;
10888  SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10889  const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10890  const STRLEN newlen = SvCUR(sv);
10891  UV uv = UNICODE_REPLACEMENT;
10892
10893  PERL_ARGS_ASSERT_REG_RECODE;
10894
10895  if (newlen)
10896   uv = SvUTF8(sv)
10897    ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10898    : *(U8*)s;
10899
10900  if (!newlen || numlen != newlen) {
10901   uv = UNICODE_REPLACEMENT;
10902   *encp = NULL;
10903  }
10904  return uv;
10905 }
10906
10907 PERL_STATIC_INLINE U8
10908 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10909 {
10910  U8 op;
10911
10912  PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10913
10914  if (! FOLD) {
10915   return EXACT;
10916  }
10917
10918  op = get_regex_charset(RExC_flags);
10919  if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10920   op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10921     been, so there is no hole */
10922  }
10923
10924  return op + EXACTF;
10925 }
10926
10927 PERL_STATIC_INLINE void
10928 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
10929       regnode *node, I32* flagp, STRLEN len, UV code_point,
10930       bool downgradable)
10931 {
10932  /* This knows the details about sizing an EXACTish node, setting flags for
10933  * it (by setting <*flagp>, and potentially populating it with a single
10934  * character.
10935  *
10936  * If <len> (the length in bytes) is non-zero, this function assumes that
10937  * the node has already been populated, and just does the sizing.  In this
10938  * case <code_point> should be the final code point that has already been
10939  * placed into the node.  This value will be ignored except that under some
10940  * circumstances <*flagp> is set based on it.
10941  *
10942  * If <len> is zero, the function assumes that the node is to contain only
10943  * the single character given by <code_point> and calculates what <len>
10944  * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10945  * additionally will populate the node's STRING with <code_point> or its
10946  * fold if folding.
10947  *
10948  * In both cases <*flagp> is appropriately set
10949  *
10950  * It knows that under FOLD, the Latin Sharp S and UTF characters above
10951  * 255, must be folded (the former only when the rules indicate it can
10952  * match 'ss')
10953  *
10954  * When it does the populating, it looks at the flag 'downgradable'.  If
10955  * true with a node that folds, it checks if the single code point
10956  * participates in a fold, and if not downgrades the node to an EXACT.
10957  * This helps the optimizer */
10958
10959  bool len_passed_in = cBOOL(len != 0);
10960  U8 character[UTF8_MAXBYTES_CASE+1];
10961
10962  PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10963
10964  /* Don't bother to check for downgrading in PASS1, as it doesn't make any
10965  * sizing difference, and is extra work that is thrown away */
10966  if (downgradable && ! PASS2) {
10967   downgradable = FALSE;
10968  }
10969
10970  if (! len_passed_in) {
10971   if (UTF) {
10972    if (UNI_IS_INVARIANT(code_point)) {
10973     if (LOC || ! FOLD) {    /* /l defers folding until runtime */
10974      *character = (U8) code_point;
10975     }
10976     else { /* Here is /i and not /l (toFOLD() is defined on just
10977       ASCII, which isn't the same thing as INVARIANT on
10978       EBCDIC, but it works there, as the extra invariants
10979       fold to themselves) */
10980      *character = toFOLD((U8) code_point);
10981
10982      /* We can downgrade to an EXACT node if this character
10983      * isn't a folding one.  Note that this assumes that
10984      * nothing above Latin1 folds to some other invariant than
10985      * one of these alphabetics; otherwise we would also have
10986      * to check:
10987      *  && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
10988      *      || ASCII_FOLD_RESTRICTED))
10989      */
10990      if (downgradable && PL_fold[code_point] == code_point) {
10991       OP(node) = EXACT;
10992      }
10993     }
10994     len = 1;
10995    }
10996    else if (FOLD && (! LOC
10997        || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
10998    {   /* Folding, and ok to do so now */
10999     UV folded = _to_uni_fold_flags(
11000         code_point,
11001         character,
11002         &len,
11003         FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11004              ? FOLD_FLAGS_NOMIX_ASCII
11005              : 0));
11006     if (downgradable
11007      && folded == code_point
11008      && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11009     {
11010      OP(node) = EXACT;
11011     }
11012    }
11013    else if (code_point <= MAX_UTF8_TWO_BYTE) {
11014
11015     /* Not folding this cp, and can output it directly */
11016     *character = UTF8_TWO_BYTE_HI(code_point);
11017     *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11018     len = 2;
11019    }
11020    else {
11021     uvchr_to_utf8( character, code_point);
11022     len = UTF8SKIP(character);
11023    }
11024   } /* Else pattern isn't UTF8.  */
11025   else if (! FOLD) {
11026    *character = (U8) code_point;
11027    len = 1;
11028   } /* Else is folded non-UTF8 */
11029   else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11030
11031    /* We don't fold any non-UTF8 except possibly the Sharp s  (see
11032    * comments at join_exact()); */
11033    *character = (U8) code_point;
11034    len = 1;
11035
11036    /* Can turn into an EXACT node if we know the fold at compile time,
11037    * and it folds to itself and doesn't particpate in other folds */
11038    if (downgradable
11039     && ! LOC
11040     && PL_fold_latin1[code_point] == code_point
11041     && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11042      || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11043    {
11044     OP(node) = EXACT;
11045    }
11046   } /* else is Sharp s.  May need to fold it */
11047   else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11048    *character = 's';
11049    *(character + 1) = 's';
11050    len = 2;
11051   }
11052   else {
11053    *character = LATIN_SMALL_LETTER_SHARP_S;
11054    len = 1;
11055   }
11056  }
11057
11058  if (SIZE_ONLY) {
11059   RExC_size += STR_SZ(len);
11060  }
11061  else {
11062   RExC_emit += STR_SZ(len);
11063   STR_LEN(node) = len;
11064   if (! len_passed_in) {
11065    Copy((char *) character, STRING(node), len, char);
11066   }
11067  }
11068
11069  *flagp |= HASWIDTH;
11070
11071  /* A single character node is SIMPLE, except for the special-cased SHARP S
11072  * under /di. */
11073  if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11074   && (code_point != LATIN_SMALL_LETTER_SHARP_S
11075    || ! FOLD || ! DEPENDS_SEMANTICS))
11076  {
11077   *flagp |= SIMPLE;
11078  }
11079
11080  /* The OP may not be well defined in PASS1 */
11081  if (PASS2 && OP(node) == EXACTFL) {
11082   RExC_contains_locale = 1;
11083  }
11084 }
11085
11086
11087 /* return atoi(p), unless it's too big to sensibly be a backref,
11088  * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11089
11090 static I32
11091 S_backref_value(char *p)
11092 {
11093  char *q = p;
11094
11095  for (;isDIGIT(*q); q++); /* calculate length of num */
11096  if (q - p == 0 || q - p > 9)
11097   return I32_MAX;
11098  return atoi(p);
11099 }
11100
11101
11102 /*
11103  - regatom - the lowest level
11104
11105    Try to identify anything special at the start of the pattern. If there
11106    is, then handle it as required. This may involve generating a single regop,
11107    such as for an assertion; or it may involve recursing, such as to
11108    handle a () structure.
11109
11110    If the string doesn't start with something special then we gobble up
11111    as much literal text as we can.
11112
11113    Once we have been able to handle whatever type of thing started the
11114    sequence, we return.
11115
11116    Note: we have to be careful with escapes, as they can be both literal
11117    and special, and in the case of \10 and friends, context determines which.
11118
11119    A summary of the code structure is:
11120
11121    switch (first_byte) {
11122   cases for each special:
11123    handle this special;
11124    break;
11125   case '\\':
11126    switch (2nd byte) {
11127     cases for each unambiguous special:
11128      handle this special;
11129      break;
11130     cases for each ambigous special/literal:
11131      disambiguate;
11132      if (special)  handle here
11133      else goto defchar;
11134     default: // unambiguously literal:
11135      goto defchar;
11136    }
11137   default:  // is a literal char
11138    // FALL THROUGH
11139   defchar:
11140    create EXACTish node for literal;
11141    while (more input and node isn't full) {
11142     switch (input_byte) {
11143     cases for each special;
11144      make sure parse pointer is set so that the next call to
11145       regatom will see this special first
11146      goto loopdone; // EXACTish node terminated by prev. char
11147     default:
11148      append char to EXACTISH node;
11149     }
11150     get next input byte;
11151    }
11152   loopdone:
11153    }
11154    return the generated node;
11155
11156    Specifically there are two separate switches for handling
11157    escape sequences, with the one for handling literal escapes requiring
11158    a dummy entry for all of the special escapes that are actually handled
11159    by the other.
11160
11161    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11162    TRYAGAIN.
11163    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11164    restarted.
11165    Otherwise does not return NULL.
11166 */
11167
11168 STATIC regnode *
11169 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11170 {
11171  dVAR;
11172  regnode *ret = NULL;
11173  I32 flags = 0;
11174  char *parse_start = RExC_parse;
11175  U8 op;
11176  int invert = 0;
11177
11178  GET_RE_DEBUG_FLAGS_DECL;
11179
11180  *flagp = WORST;  /* Tentatively. */
11181
11182  DEBUG_PARSE("atom");
11183
11184  PERL_ARGS_ASSERT_REGATOM;
11185
11186 tryagain:
11187  switch ((U8)*RExC_parse) {
11188  case '^':
11189   RExC_seen_zerolen++;
11190   nextchar(pRExC_state);
11191   if (RExC_flags & RXf_PMf_MULTILINE)
11192    ret = reg_node(pRExC_state, MBOL);
11193   else if (RExC_flags & RXf_PMf_SINGLELINE)
11194    ret = reg_node(pRExC_state, SBOL);
11195   else
11196    ret = reg_node(pRExC_state, BOL);
11197   Set_Node_Length(ret, 1); /* MJD */
11198   break;
11199  case '$':
11200   nextchar(pRExC_state);
11201   if (*RExC_parse)
11202    RExC_seen_zerolen++;
11203   if (RExC_flags & RXf_PMf_MULTILINE)
11204    ret = reg_node(pRExC_state, MEOL);
11205   else if (RExC_flags & RXf_PMf_SINGLELINE)
11206    ret = reg_node(pRExC_state, SEOL);
11207   else
11208    ret = reg_node(pRExC_state, EOL);
11209   Set_Node_Length(ret, 1); /* MJD */
11210   break;
11211  case '.':
11212   nextchar(pRExC_state);
11213   if (RExC_flags & RXf_PMf_SINGLELINE)
11214    ret = reg_node(pRExC_state, SANY);
11215   else
11216    ret = reg_node(pRExC_state, REG_ANY);
11217   *flagp |= HASWIDTH|SIMPLE;
11218   RExC_naughty++;
11219   Set_Node_Length(ret, 1); /* MJD */
11220   break;
11221  case '[':
11222  {
11223   char * const oregcomp_parse = ++RExC_parse;
11224   ret = regclass(pRExC_state, flagp,depth+1,
11225      FALSE, /* means parse the whole char class */
11226      TRUE, /* allow multi-char folds */
11227      FALSE, /* don't silence non-portable warnings. */
11228      NULL);
11229   if (*RExC_parse != ']') {
11230    RExC_parse = oregcomp_parse;
11231    vFAIL("Unmatched [");
11232   }
11233   if (ret == NULL) {
11234    if (*flagp & RESTART_UTF8)
11235     return NULL;
11236    FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11237     (UV) *flagp);
11238   }
11239   nextchar(pRExC_state);
11240   Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11241   break;
11242  }
11243  case '(':
11244   nextchar(pRExC_state);
11245   ret = reg(pRExC_state, 2, &flags,depth+1);
11246   if (ret == NULL) {
11247     if (flags & TRYAGAIN) {
11248      if (RExC_parse == RExC_end) {
11249       /* Make parent create an empty node if needed. */
11250       *flagp |= TRYAGAIN;
11251       return(NULL);
11252      }
11253      goto tryagain;
11254     }
11255     if (flags & RESTART_UTF8) {
11256      *flagp = RESTART_UTF8;
11257      return NULL;
11258     }
11259     FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11260                 (UV) flags);
11261   }
11262   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11263   break;
11264  case '|':
11265  case ')':
11266   if (flags & TRYAGAIN) {
11267    *flagp |= TRYAGAIN;
11268    return NULL;
11269   }
11270   vFAIL("Internal urp");
11271         /* Supposed to be caught earlier. */
11272   break;
11273  case '{':
11274   if (!regcurly(RExC_parse, FALSE)) {
11275    RExC_parse++;
11276    goto defchar;
11277   }
11278   /* FALL THROUGH */
11279  case '?':
11280  case '+':
11281  case '*':
11282   RExC_parse++;
11283   vFAIL("Quantifier follows nothing");
11284   break;
11285  case '\\':
11286   /* Special Escapes
11287
11288   This switch handles escape sequences that resolve to some kind
11289   of special regop and not to literal text. Escape sequnces that
11290   resolve to literal text are handled below in the switch marked
11291   "Literal Escapes".
11292
11293   Every entry in this switch *must* have a corresponding entry
11294   in the literal escape switch. However, the opposite is not
11295   required, as the default for this switch is to jump to the
11296   literal text handling code.
11297   */
11298   switch ((U8)*++RExC_parse) {
11299    U8 arg;
11300   /* Special Escapes */
11301   case 'A':
11302    RExC_seen_zerolen++;
11303    ret = reg_node(pRExC_state, SBOL);
11304    *flagp |= SIMPLE;
11305    goto finish_meta_pat;
11306   case 'G':
11307    ret = reg_node(pRExC_state, GPOS);
11308    RExC_seen |= REG_GPOS_SEEN;
11309    *flagp |= SIMPLE;
11310    goto finish_meta_pat;
11311   case 'K':
11312    RExC_seen_zerolen++;
11313    ret = reg_node(pRExC_state, KEEPS);
11314    *flagp |= SIMPLE;
11315    /* XXX:dmq : disabling in-place substitution seems to
11316    * be necessary here to avoid cases of memory corruption, as
11317    * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11318    */
11319    RExC_seen |= REG_LOOKBEHIND_SEEN;
11320    goto finish_meta_pat;
11321   case 'Z':
11322    ret = reg_node(pRExC_state, SEOL);
11323    *flagp |= SIMPLE;
11324    RExC_seen_zerolen++;  /* Do not optimize RE away */
11325    goto finish_meta_pat;
11326   case 'z':
11327    ret = reg_node(pRExC_state, EOS);
11328    *flagp |= SIMPLE;
11329    RExC_seen_zerolen++;  /* Do not optimize RE away */
11330    goto finish_meta_pat;
11331   case 'C':
11332    ret = reg_node(pRExC_state, CANY);
11333    RExC_seen |= REG_CANY_SEEN;
11334    *flagp |= HASWIDTH|SIMPLE;
11335    goto finish_meta_pat;
11336   case 'X':
11337    ret = reg_node(pRExC_state, CLUMP);
11338    *flagp |= HASWIDTH;
11339    goto finish_meta_pat;
11340
11341   case 'W':
11342    invert = 1;
11343    /* FALLTHROUGH */
11344   case 'w':
11345    arg = ANYOF_WORDCHAR;
11346    goto join_posix;
11347
11348   case 'b':
11349    RExC_seen_zerolen++;
11350    RExC_seen |= REG_LOOKBEHIND_SEEN;
11351    op = BOUND + get_regex_charset(RExC_flags);
11352    if (op > BOUNDA) {  /* /aa is same as /a */
11353     op = BOUNDA;
11354    }
11355    else if (op == BOUNDL) {
11356     RExC_contains_locale = 1;
11357    }
11358    ret = reg_node(pRExC_state, op);
11359    FLAGS(ret) = get_regex_charset(RExC_flags);
11360    *flagp |= SIMPLE;
11361    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11362     /* diag_listed_as: Use "%s" instead of "%s" */
11363     vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11364    }
11365    goto finish_meta_pat;
11366   case 'B':
11367    RExC_seen_zerolen++;
11368    RExC_seen |= REG_LOOKBEHIND_SEEN;
11369    op = NBOUND + get_regex_charset(RExC_flags);
11370    if (op > NBOUNDA) { /* /aa is same as /a */
11371     op = NBOUNDA;
11372    }
11373    else if (op == NBOUNDL) {
11374     RExC_contains_locale = 1;
11375    }
11376    ret = reg_node(pRExC_state, op);
11377    FLAGS(ret) = get_regex_charset(RExC_flags);
11378    *flagp |= SIMPLE;
11379    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11380     /* diag_listed_as: Use "%s" instead of "%s" */
11381     vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11382    }
11383    goto finish_meta_pat;
11384
11385   case 'D':
11386    invert = 1;
11387    /* FALLTHROUGH */
11388   case 'd':
11389    arg = ANYOF_DIGIT;
11390    goto join_posix;
11391
11392   case 'R':
11393    ret = reg_node(pRExC_state, LNBREAK);
11394    *flagp |= HASWIDTH|SIMPLE;
11395    goto finish_meta_pat;
11396
11397   case 'H':
11398    invert = 1;
11399    /* FALLTHROUGH */
11400   case 'h':
11401    arg = ANYOF_BLANK;
11402    op = POSIXU;
11403    goto join_posix_op_known;
11404
11405   case 'V':
11406    invert = 1;
11407    /* FALLTHROUGH */
11408   case 'v':
11409    arg = ANYOF_VERTWS;
11410    op = POSIXU;
11411    goto join_posix_op_known;
11412
11413   case 'S':
11414    invert = 1;
11415    /* FALLTHROUGH */
11416   case 's':
11417    arg = ANYOF_SPACE;
11418
11419   join_posix:
11420
11421    op = POSIXD + get_regex_charset(RExC_flags);
11422    if (op > POSIXA) {  /* /aa is same as /a */
11423     op = POSIXA;
11424    }
11425    else if (op == POSIXL) {
11426     RExC_contains_locale = 1;
11427    }
11428
11429   join_posix_op_known:
11430
11431    if (invert) {
11432     op += NPOSIXD - POSIXD;
11433    }
11434
11435    ret = reg_node(pRExC_state, op);
11436    if (! SIZE_ONLY) {
11437     FLAGS(ret) = namedclass_to_classnum(arg);
11438    }
11439
11440    *flagp |= HASWIDTH|SIMPLE;
11441    /* FALL THROUGH */
11442
11443   finish_meta_pat:
11444    nextchar(pRExC_state);
11445    Set_Node_Length(ret, 2); /* MJD */
11446    break;
11447   case 'p':
11448   case 'P':
11449    {
11450 #ifdef DEBUGGING
11451     char* parse_start = RExC_parse - 2;
11452 #endif
11453
11454     RExC_parse--;
11455
11456     ret = regclass(pRExC_state, flagp,depth+1,
11457        TRUE, /* means just parse this element */
11458        FALSE, /* don't allow multi-char folds */
11459        FALSE, /* don't silence non-portable warnings.
11460           It would be a bug if these returned
11461           non-portables */
11462        NULL);
11463     /* regclass() can only return RESTART_UTF8 if multi-char folds
11464     are allowed.  */
11465     if (!ret)
11466      FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11467       (UV) *flagp);
11468
11469     RExC_parse--;
11470
11471     Set_Node_Offset(ret, parse_start + 2);
11472     Set_Node_Cur_Length(ret, parse_start);
11473     nextchar(pRExC_state);
11474    }
11475    break;
11476   case 'N':
11477    /* Handle \N and \N{NAME} with multiple code points here and not
11478    * below because it can be multicharacter. join_exact() will join
11479    * them up later on.  Also this makes sure that things like
11480    * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11481    * The options to the grok function call causes it to fail if the
11482    * sequence is just a single code point.  We then go treat it as
11483    * just another character in the current EXACT node, and hence it
11484    * gets uniform treatment with all the other characters.  The
11485    * special treatment for quantifiers is not needed for such single
11486    * character sequences */
11487    ++RExC_parse;
11488    if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11489         FALSE /* not strict */ )) {
11490     if (*flagp & RESTART_UTF8)
11491      return NULL;
11492     RExC_parse--;
11493     goto defchar;
11494    }
11495    break;
11496   case 'k':    /* Handle \k<NAME> and \k'NAME' */
11497   parse_named_seq:
11498   {
11499    char ch= RExC_parse[1];
11500    if (ch != '<' && ch != '\'' && ch != '{') {
11501     RExC_parse++;
11502     /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11503     vFAIL2("Sequence %.2s... not terminated",parse_start);
11504    } else {
11505     /* this pretty much dupes the code for (?P=...) in reg(), if
11506     you change this make sure you change that */
11507     char* name_start = (RExC_parse += 2);
11508     U32 num = 0;
11509     SV *sv_dat = reg_scan_name(pRExC_state,
11510      SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11511     ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11512     if (RExC_parse == name_start || *RExC_parse != ch)
11513      /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11514      vFAIL2("Sequence %.3s... not terminated",parse_start);
11515
11516     if (!SIZE_ONLY) {
11517      num = add_data( pRExC_state, STR_WITH_LEN("S"));
11518      RExC_rxi->data->data[num]=(void*)sv_dat;
11519      SvREFCNT_inc_simple_void(sv_dat);
11520     }
11521
11522     RExC_sawback = 1;
11523     ret = reganode(pRExC_state,
11524        ((! FOLD)
11525         ? NREF
11526         : (ASCII_FOLD_RESTRICTED)
11527         ? NREFFA
11528         : (AT_LEAST_UNI_SEMANTICS)
11529          ? NREFFU
11530          : (LOC)
11531          ? NREFFL
11532          : NREFF),
11533         num);
11534     *flagp |= HASWIDTH;
11535
11536     /* override incorrect value set in reganode MJD */
11537     Set_Node_Offset(ret, parse_start+1);
11538     Set_Node_Cur_Length(ret, parse_start);
11539     nextchar(pRExC_state);
11540
11541    }
11542    break;
11543   }
11544   case 'g':
11545   case '1': case '2': case '3': case '4':
11546   case '5': case '6': case '7': case '8': case '9':
11547    {
11548     I32 num;
11549     bool hasbrace = 0;
11550
11551     if (*RExC_parse == 'g') {
11552      bool isrel = 0;
11553
11554      RExC_parse++;
11555      if (*RExC_parse == '{') {
11556       RExC_parse++;
11557       hasbrace = 1;
11558      }
11559      if (*RExC_parse == '-') {
11560       RExC_parse++;
11561       isrel = 1;
11562      }
11563      if (hasbrace && !isDIGIT(*RExC_parse)) {
11564       if (isrel) RExC_parse--;
11565       RExC_parse -= 2;
11566       goto parse_named_seq;
11567      }
11568
11569      num = S_backref_value(RExC_parse);
11570      if (num == 0)
11571       vFAIL("Reference to invalid group 0");
11572      else if (num == I32_MAX) {
11573       if (isDIGIT(*RExC_parse))
11574        vFAIL("Reference to nonexistent group");
11575       else
11576        vFAIL("Unterminated \\g... pattern");
11577      }
11578
11579      if (isrel) {
11580       num = RExC_npar - num;
11581       if (num < 1)
11582        vFAIL("Reference to nonexistent or unclosed group");
11583      }
11584     }
11585     else {
11586      num = S_backref_value(RExC_parse);
11587      /* bare \NNN might be backref or octal - if it is larger than or equal
11588      * RExC_npar then it is assumed to be and octal escape.
11589      * Note RExC_npar is +1 from the actual number of parens*/
11590      if (num == I32_MAX || (num > 9 && num >= RExC_npar
11591        && *RExC_parse != '8' && *RExC_parse != '9'))
11592      {
11593       /* Probably a character specified in octal, e.g. \35 */
11594       goto defchar;
11595      }
11596     }
11597
11598     /* at this point RExC_parse definitely points to a backref
11599     * number */
11600     {
11601 #ifdef RE_TRACK_PATTERN_OFFSETS
11602      char * const parse_start = RExC_parse - 1; /* MJD */
11603 #endif
11604      while (isDIGIT(*RExC_parse))
11605       RExC_parse++;
11606      if (hasbrace) {
11607       if (*RExC_parse != '}')
11608        vFAIL("Unterminated \\g{...} pattern");
11609       RExC_parse++;
11610      }
11611      if (!SIZE_ONLY) {
11612       if (num > (I32)RExC_rx->nparens)
11613        vFAIL("Reference to nonexistent group");
11614      }
11615      RExC_sawback = 1;
11616      ret = reganode(pRExC_state,
11617         ((! FOLD)
11618          ? REF
11619          : (ASCII_FOLD_RESTRICTED)
11620          ? REFFA
11621          : (AT_LEAST_UNI_SEMANTICS)
11622           ? REFFU
11623           : (LOC)
11624           ? REFFL
11625           : REFF),
11626          num);
11627      *flagp |= HASWIDTH;
11628
11629      /* override incorrect value set in reganode MJD */
11630      Set_Node_Offset(ret, parse_start+1);
11631      Set_Node_Cur_Length(ret, parse_start);
11632      RExC_parse--;
11633      nextchar(pRExC_state);
11634     }
11635    }
11636    break;
11637   case '\0':
11638    if (RExC_parse >= RExC_end)
11639     FAIL("Trailing \\");
11640    /* FALL THROUGH */
11641   default:
11642    /* Do not generate "unrecognized" warnings here, we fall
11643    back into the quick-grab loop below */
11644    parse_start--;
11645    goto defchar;
11646   }
11647   break;
11648
11649  case '#':
11650   if (RExC_flags & RXf_PMf_EXTENDED) {
11651    if ( reg_skipcomment( pRExC_state ) )
11652     goto tryagain;
11653   }
11654   /* FALL THROUGH */
11655
11656  default:
11657
11658    parse_start = RExC_parse - 1;
11659
11660    RExC_parse++;
11661
11662   defchar: {
11663    STRLEN len = 0;
11664    UV ender = 0;
11665    char *p;
11666    char *s;
11667 #define MAX_NODE_STRING_SIZE 127
11668    char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11669    char *s0;
11670    U8 upper_parse = MAX_NODE_STRING_SIZE;
11671    U8 node_type = compute_EXACTish(pRExC_state);
11672    bool next_is_quantifier;
11673    char * oldp = NULL;
11674
11675    /* We can convert EXACTF nodes to EXACTFU if they contain only
11676    * characters that match identically regardless of the target
11677    * string's UTF8ness.  The reason to do this is that EXACTF is not
11678    * trie-able, EXACTFU is.
11679    *
11680    * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11681    * contain only above-Latin1 characters (hence must be in UTF8),
11682    * which don't participate in folds with Latin1-range characters,
11683    * as the latter's folds aren't known until runtime.  (We don't
11684    * need to figure this out until pass 2) */
11685    bool maybe_exactfu = PASS2
11686        && (node_type == EXACTF || node_type == EXACTFL);
11687
11688    /* If a folding node contains only code points that don't
11689    * participate in folds, it can be changed into an EXACT node,
11690    * which allows the optimizer more things to look for */
11691    bool maybe_exact;
11692
11693    ret = reg_node(pRExC_state, node_type);
11694
11695    /* In pass1, folded, we use a temporary buffer instead of the
11696    * actual node, as the node doesn't exist yet */
11697    s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11698
11699    s0 = s;
11700
11701   reparse:
11702
11703    /* We do the EXACTFish to EXACT node only if folding.  (And we
11704    * don't need to figure this out until pass 2) */
11705    maybe_exact = FOLD && PASS2;
11706
11707    /* XXX The node can hold up to 255 bytes, yet this only goes to
11708    * 127.  I (khw) do not know why.  Keeping it somewhat less than
11709    * 255 allows us to not have to worry about overflow due to
11710    * converting to utf8 and fold expansion, but that value is
11711    * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
11712    * split up by this limit into a single one using the real max of
11713    * 255.  Even at 127, this breaks under rare circumstances.  If
11714    * folding, we do not want to split a node at a character that is a
11715    * non-final in a multi-char fold, as an input string could just
11716    * happen to want to match across the node boundary.  The join
11717    * would solve that problem if the join actually happens.  But a
11718    * series of more than two nodes in a row each of 127 would cause
11719    * the first join to succeed to get to 254, but then there wouldn't
11720    * be room for the next one, which could at be one of those split
11721    * multi-char folds.  I don't know of any fool-proof solution.  One
11722    * could back off to end with only a code point that isn't such a
11723    * non-final, but it is possible for there not to be any in the
11724    * entire node. */
11725    for (p = RExC_parse - 1;
11726     len < upper_parse && p < RExC_end;
11727     len++)
11728    {
11729     oldp = p;
11730
11731     if (RExC_flags & RXf_PMf_EXTENDED)
11732      p = regwhite( pRExC_state, p );
11733     switch ((U8)*p) {
11734     case '^':
11735     case '$':
11736     case '.':
11737     case '[':
11738     case '(':
11739     case ')':
11740     case '|':
11741      goto loopdone;
11742     case '\\':
11743      /* Literal Escapes Switch
11744
11745      This switch is meant to handle escape sequences that
11746      resolve to a literal character.
11747
11748      Every escape sequence that represents something
11749      else, like an assertion or a char class, is handled
11750      in the switch marked 'Special Escapes' above in this
11751      routine, but also has an entry here as anything that
11752      isn't explicitly mentioned here will be treated as
11753      an unescaped equivalent literal.
11754      */
11755
11756      switch ((U8)*++p) {
11757      /* These are all the special escapes. */
11758      case 'A':             /* Start assertion */
11759      case 'b': case 'B':   /* Word-boundary assertion*/
11760      case 'C':             /* Single char !DANGEROUS! */
11761      case 'd': case 'D':   /* digit class */
11762      case 'g': case 'G':   /* generic-backref, pos assertion */
11763      case 'h': case 'H':   /* HORIZWS */
11764      case 'k': case 'K':   /* named backref, keep marker */
11765      case 'p': case 'P':   /* Unicode property */
11766        case 'R':   /* LNBREAK */
11767      case 's': case 'S':   /* space class */
11768      case 'v': case 'V':   /* VERTWS */
11769      case 'w': case 'W':   /* word class */
11770      case 'X':             /* eXtended Unicode "combining
11771            character sequence" */
11772      case 'z': case 'Z':   /* End of line/string assertion */
11773       --p;
11774       goto loopdone;
11775
11776      /* Anything after here is an escape that resolves to a
11777      literal. (Except digits, which may or may not)
11778      */
11779      case 'n':
11780       ender = '\n';
11781       p++;
11782       break;
11783      case 'N': /* Handle a single-code point named character. */
11784       /* The options cause it to fail if a multiple code
11785       * point sequence.  Handle those in the switch() above
11786       * */
11787       RExC_parse = p + 1;
11788       if (! grok_bslash_N(pRExC_state, NULL, &ender,
11789            flagp, depth, FALSE,
11790            FALSE /* not strict */ ))
11791       {
11792        if (*flagp & RESTART_UTF8)
11793         FAIL("panic: grok_bslash_N set RESTART_UTF8");
11794        RExC_parse = p = oldp;
11795        goto loopdone;
11796       }
11797       p = RExC_parse;
11798       if (ender > 0xff) {
11799        REQUIRE_UTF8;
11800       }
11801       break;
11802      case 'r':
11803       ender = '\r';
11804       p++;
11805       break;
11806      case 't':
11807       ender = '\t';
11808       p++;
11809       break;
11810      case 'f':
11811       ender = '\f';
11812       p++;
11813       break;
11814      case 'e':
11815       ender = ASCII_TO_NATIVE('\033');
11816       p++;
11817       break;
11818      case 'a':
11819       ender = '\a';
11820       p++;
11821       break;
11822      case 'o':
11823       {
11824        UV result;
11825        const char* error_msg;
11826
11827        bool valid = grok_bslash_o(&p,
11828              &result,
11829              &error_msg,
11830              TRUE, /* out warnings */
11831              FALSE, /* not strict */
11832              TRUE, /* Output warnings
11833                 for non-
11834                 portables */
11835              UTF);
11836        if (! valid) {
11837         RExC_parse = p; /* going to die anyway; point
11838             to exact spot of failure */
11839         vFAIL(error_msg);
11840        }
11841        ender = result;
11842        if (PL_encoding && ender < 0x100) {
11843         goto recode_encoding;
11844        }
11845        if (ender > 0xff) {
11846         REQUIRE_UTF8;
11847        }
11848        break;
11849       }
11850      case 'x':
11851       {
11852        UV result = UV_MAX; /* initialize to erroneous
11853             value */
11854        const char* error_msg;
11855
11856        bool valid = grok_bslash_x(&p,
11857              &result,
11858              &error_msg,
11859              TRUE, /* out warnings */
11860              FALSE, /* not strict */
11861              TRUE, /* Output warnings
11862                 for non-
11863                 portables */
11864              UTF);
11865        if (! valid) {
11866         RExC_parse = p; /* going to die anyway; point
11867             to exact spot of failure */
11868         vFAIL(error_msg);
11869        }
11870        ender = result;
11871
11872        if (PL_encoding && ender < 0x100) {
11873         goto recode_encoding;
11874        }
11875        if (ender > 0xff) {
11876         REQUIRE_UTF8;
11877        }
11878        break;
11879       }
11880      case 'c':
11881       p++;
11882       ender = grok_bslash_c(*p++, SIZE_ONLY);
11883       break;
11884      case '8': case '9': /* must be a backreference */
11885       --p;
11886       goto loopdone;
11887      case '1': case '2': case '3':case '4':
11888      case '5': case '6': case '7':
11889       /* When we parse backslash escapes there is ambiguity
11890       * between backreferences and octal escapes. Any escape
11891       * from \1 - \9 is a backreference, any multi-digit
11892       * escape which does not start with 0 and which when
11893       * evaluated as decimal could refer to an already
11894       * parsed capture buffer is a backslash. Anything else
11895       * is octal.
11896       *
11897       * Note this implies that \118 could be interpreted as
11898       * 118 OR as "\11" . "8" depending on whether there
11899       * were 118 capture buffers defined already in the
11900       * pattern.  */
11901
11902       /* NOTE, RExC_npar is 1 more than the actual number of
11903       * parens we have seen so far, hence the < RExC_npar below. */
11904
11905       if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
11906       {  /* Not to be treated as an octal constant, go
11907         find backref */
11908        --p;
11909        goto loopdone;
11910       }
11911      case '0':
11912       {
11913        I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11914        STRLEN numlen = 3;
11915        ender = grok_oct(p, &numlen, &flags, NULL);
11916        if (ender > 0xff) {
11917         REQUIRE_UTF8;
11918        }
11919        p += numlen;
11920        if (SIZE_ONLY   /* like \08, \178 */
11921         && numlen < 3
11922         && p < RExC_end
11923         && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11924        {
11925         reg_warn_non_literal_string(
11926           p + 1,
11927           form_short_octal_warning(p, numlen));
11928        }
11929       }
11930       if (PL_encoding && ender < 0x100)
11931        goto recode_encoding;
11932       break;
11933      recode_encoding:
11934       if (! RExC_override_recoding) {
11935        SV* enc = PL_encoding;
11936        ender = reg_recode((const char)(U8)ender, &enc);
11937        if (!enc && SIZE_ONLY)
11938         ckWARNreg(p, "Invalid escape in the specified encoding");
11939        REQUIRE_UTF8;
11940       }
11941       break;
11942      case '\0':
11943       if (p >= RExC_end)
11944        FAIL("Trailing \\");
11945       /* FALL THROUGH */
11946      default:
11947       if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11948        /* Include any { following the alpha to emphasize
11949        * that it could be part of an escape at some point
11950        * in the future */
11951        int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11952        ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11953       }
11954       goto normal_default;
11955      } /* End of switch on '\' */
11956      break;
11957     default:    /* A literal character */
11958
11959      if (! SIZE_ONLY
11960       && RExC_flags & RXf_PMf_EXTENDED
11961       && ckWARN_d(WARN_DEPRECATED)
11962       && is_PATWS_non_low_safe(p, RExC_end, UTF))
11963      {
11964       vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11965         "Escape literal pattern white space under /x");
11966      }
11967
11968     normal_default:
11969      if (UTF8_IS_START(*p) && UTF) {
11970       STRLEN numlen;
11971       ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11972            &numlen, UTF8_ALLOW_DEFAULT);
11973       p += numlen;
11974      }
11975      else
11976       ender = (U8) *p++;
11977      break;
11978     } /* End of switch on the literal */
11979
11980     /* Here, have looked at the literal character and <ender>
11981     * contains its ordinal, <p> points to the character after it
11982     */
11983
11984     if ( RExC_flags & RXf_PMf_EXTENDED)
11985      p = regwhite( pRExC_state, p );
11986
11987     /* If the next thing is a quantifier, it applies to this
11988     * character only, which means that this character has to be in
11989     * its own node and can't just be appended to the string in an
11990     * existing node, so if there are already other characters in
11991     * the node, close the node with just them, and set up to do
11992     * this character again next time through, when it will be the
11993     * only thing in its new node */
11994     if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11995     {
11996      p = oldp;
11997      goto loopdone;
11998     }
11999
12000     if (! FOLD   /* The simple case, just append the literal */
12001      || (LOC  /* Also don't fold for tricky chars under /l */
12002       && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12003     {
12004      if (UTF) {
12005       const STRLEN unilen = reguni(pRExC_state, ender, s);
12006       if (unilen > 0) {
12007       s   += unilen;
12008       len += unilen;
12009       }
12010
12011       /* The loop increments <len> each time, as all but this
12012       * path (and one other) through it add a single byte to
12013       * the EXACTish node.  But this one has changed len to
12014       * be the correct final value, so subtract one to
12015       * cancel out the increment that follows */
12016       len--;
12017      }
12018      else {
12019       REGC((char)ender, s++);
12020      }
12021
12022      /* Can get here if folding only if is one of the /l
12023      * characters whose fold depends on the locale.  The
12024      * occurrence of any of these indicate that we can't
12025      * simplify things */
12026      if (FOLD) {
12027       maybe_exact = FALSE;
12028       maybe_exactfu = FALSE;
12029      }
12030     }
12031     else             /* FOLD */
12032      if (! ( UTF
12033       /* See comments for join_exact() as to why we fold this
12034       * non-UTF at compile time */
12035       || (node_type == EXACTFU
12036        && ender == LATIN_SMALL_LETTER_SHARP_S)))
12037     {
12038      /* Here, are folding and are not UTF-8 encoded; therefore
12039      * the character must be in the range 0-255, and is not /l
12040      * (Not /l because we already handled these under /l in
12041      * is_PROBLEMATIC_LOCALE_FOLD_cp */
12042      if (IS_IN_SOME_FOLD_L1(ender)) {
12043       maybe_exact = FALSE;
12044
12045       /* See if the character's fold differs between /d and
12046       * /u.  This includes the multi-char fold SHARP S to
12047       * 'ss' */
12048       if (maybe_exactfu
12049        && (PL_fold[ender] != PL_fold_latin1[ender]
12050         || ender == LATIN_SMALL_LETTER_SHARP_S
12051         || (len > 0
12052         && isARG2_lower_or_UPPER_ARG1('s', ender)
12053         && isARG2_lower_or_UPPER_ARG1('s',
12054                 *(s-1)))))
12055       {
12056        maybe_exactfu = FALSE;
12057       }
12058      }
12059
12060      /* Even when folding, we store just the input character, as
12061      * we have an array that finds its fold quickly */
12062      *(s++) = (char) ender;
12063     }
12064     else {  /* FOLD and UTF */
12065      /* Unlike the non-fold case, we do actually have to
12066      * calculate the results here in pass 1.  This is for two
12067      * reasons, the folded length may be longer than the
12068      * unfolded, and we have to calculate how many EXACTish
12069      * nodes it will take; and we may run out of room in a node
12070      * in the middle of a potential multi-char fold, and have
12071      * to back off accordingly.  (Hence we can't use REGC for
12072      * the simple case just below.) */
12073
12074      UV folded;
12075      if (isASCII(ender)) {
12076       folded = toFOLD(ender);
12077       *(s)++ = (U8) folded;
12078      }
12079      else {
12080       STRLEN foldlen;
12081
12082       folded = _to_uni_fold_flags(
12083          ender,
12084          (U8 *) s,
12085          &foldlen,
12086          FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12087               ? FOLD_FLAGS_NOMIX_ASCII
12088               : 0));
12089       s += foldlen;
12090
12091       /* The loop increments <len> each time, as all but this
12092       * path (and one other) through it add a single byte to
12093       * the EXACTish node.  But this one has changed len to
12094       * be the correct final value, so subtract one to
12095       * cancel out the increment that follows */
12096       len += foldlen - 1;
12097      }
12098      /* If this node only contains non-folding code points so
12099      * far, see if this new one is also non-folding */
12100      if (maybe_exact) {
12101       if (folded != ender) {
12102        maybe_exact = FALSE;
12103       }
12104       else {
12105        /* Here the fold is the original; we have to check
12106        * further to see if anything folds to it */
12107        if (_invlist_contains_cp(PL_utf8_foldable,
12108               ender))
12109        {
12110         maybe_exact = FALSE;
12111        }
12112       }
12113      }
12114      ender = folded;
12115     }
12116
12117     if (next_is_quantifier) {
12118
12119      /* Here, the next input is a quantifier, and to get here,
12120      * the current character is the only one in the node.
12121      * Also, here <len> doesn't include the final byte for this
12122      * character */
12123      len++;
12124      goto loopdone;
12125     }
12126
12127    } /* End of loop through literal characters */
12128
12129    /* Here we have either exhausted the input or ran out of room in
12130    * the node.  (If we encountered a character that can't be in the
12131    * node, transfer is made directly to <loopdone>, and so we
12132    * wouldn't have fallen off the end of the loop.)  In the latter
12133    * case, we artificially have to split the node into two, because
12134    * we just don't have enough space to hold everything.  This
12135    * creates a problem if the final character participates in a
12136    * multi-character fold in the non-final position, as a match that
12137    * should have occurred won't, due to the way nodes are matched,
12138    * and our artificial boundary.  So back off until we find a non-
12139    * problematic character -- one that isn't at the beginning or
12140    * middle of such a fold.  (Either it doesn't participate in any
12141    * folds, or appears only in the final position of all the folds it
12142    * does participate in.)  A better solution with far fewer false
12143    * positives, and that would fill the nodes more completely, would
12144    * be to actually have available all the multi-character folds to
12145    * test against, and to back-off only far enough to be sure that
12146    * this node isn't ending with a partial one.  <upper_parse> is set
12147    * further below (if we need to reparse the node) to include just
12148    * up through that final non-problematic character that this code
12149    * identifies, so when it is set to less than the full node, we can
12150    * skip the rest of this */
12151    if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12152
12153     const STRLEN full_len = len;
12154
12155     assert(len >= MAX_NODE_STRING_SIZE);
12156
12157     /* Here, <s> points to the final byte of the final character.
12158     * Look backwards through the string until find a non-
12159     * problematic character */
12160
12161     if (! UTF) {
12162
12163      /* This has no multi-char folds to non-UTF characters */
12164      if (ASCII_FOLD_RESTRICTED) {
12165       goto loopdone;
12166      }
12167
12168      while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12169      len = s - s0 + 1;
12170     }
12171     else {
12172      if (!  PL_NonL1NonFinalFold) {
12173       PL_NonL1NonFinalFold = _new_invlist_C_array(
12174           NonL1_Perl_Non_Final_Folds_invlist);
12175      }
12176
12177      /* Point to the first byte of the final character */
12178      s = (char *) utf8_hop((U8 *) s, -1);
12179
12180      while (s >= s0) {   /* Search backwards until find
12181           non-problematic char */
12182       if (UTF8_IS_INVARIANT(*s)) {
12183
12184        /* There are no ascii characters that participate
12185        * in multi-char folds under /aa.  In EBCDIC, the
12186        * non-ascii invariants are all control characters,
12187        * so don't ever participate in any folds. */
12188        if (ASCII_FOLD_RESTRICTED
12189         || ! IS_NON_FINAL_FOLD(*s))
12190        {
12191         break;
12192        }
12193       }
12194       else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12195        if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12196                 *s, *(s+1))))
12197        {
12198         break;
12199        }
12200       }
12201       else if (! _invlist_contains_cp(
12202           PL_NonL1NonFinalFold,
12203           valid_utf8_to_uvchr((U8 *) s, NULL)))
12204       {
12205        break;
12206       }
12207
12208       /* Here, the current character is problematic in that
12209       * it does occur in the non-final position of some
12210       * fold, so try the character before it, but have to
12211       * special case the very first byte in the string, so
12212       * we don't read outside the string */
12213       s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12214      } /* End of loop backwards through the string */
12215
12216      /* If there were only problematic characters in the string,
12217      * <s> will point to before s0, in which case the length
12218      * should be 0, otherwise include the length of the
12219      * non-problematic character just found */
12220      len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12221     }
12222
12223     /* Here, have found the final character, if any, that is
12224     * non-problematic as far as ending the node without splitting
12225     * it across a potential multi-char fold.  <len> contains the
12226     * number of bytes in the node up-to and including that
12227     * character, or is 0 if there is no such character, meaning
12228     * the whole node contains only problematic characters.  In
12229     * this case, give up and just take the node as-is.  We can't
12230     * do any better */
12231     if (len == 0) {
12232      len = full_len;
12233
12234      /* If the node ends in an 's' we make sure it stays EXACTF,
12235      * as if it turns into an EXACTFU, it could later get
12236      * joined with another 's' that would then wrongly match
12237      * the sharp s */
12238      if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
12239      {
12240       maybe_exactfu = FALSE;
12241      }
12242     } else {
12243
12244      /* Here, the node does contain some characters that aren't
12245      * problematic.  If one such is the final character in the
12246      * node, we are done */
12247      if (len == full_len) {
12248       goto loopdone;
12249      }
12250      else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12251
12252       /* If the final character is problematic, but the
12253       * penultimate is not, back-off that last character to
12254       * later start a new node with it */
12255       p = oldp;
12256       goto loopdone;
12257      }
12258
12259      /* Here, the final non-problematic character is earlier
12260      * in the input than the penultimate character.  What we do
12261      * is reparse from the beginning, going up only as far as
12262      * this final ok one, thus guaranteeing that the node ends
12263      * in an acceptable character.  The reason we reparse is
12264      * that we know how far in the character is, but we don't
12265      * know how to correlate its position with the input parse.
12266      * An alternate implementation would be to build that
12267      * correlation as we go along during the original parse,
12268      * but that would entail extra work for every node, whereas
12269      * this code gets executed only when the string is too
12270      * large for the node, and the final two characters are
12271      * problematic, an infrequent occurrence.  Yet another
12272      * possible strategy would be to save the tail of the
12273      * string, and the next time regatom is called, initialize
12274      * with that.  The problem with this is that unless you
12275      * back off one more character, you won't be guaranteed
12276      * regatom will get called again, unless regbranch,
12277      * regpiece ... are also changed.  If you do back off that
12278      * extra character, so that there is input guaranteed to
12279      * force calling regatom, you can't handle the case where
12280      * just the first character in the node is acceptable.  I
12281      * (khw) decided to try this method which doesn't have that
12282      * pitfall; if performance issues are found, we can do a
12283      * combination of the current approach plus that one */
12284      upper_parse = len;
12285      len = 0;
12286      s = s0;
12287      goto reparse;
12288     }
12289    }   /* End of verifying node ends with an appropriate char */
12290
12291   loopdone:   /* Jumped to when encounters something that shouldn't be in
12292      the node */
12293
12294    /* I (khw) don't know if you can get here with zero length, but the
12295    * old code handled this situation by creating a zero-length EXACT
12296    * node.  Might as well be NOTHING instead */
12297    if (len == 0) {
12298     OP(ret) = NOTHING;
12299    }
12300    else {
12301     if (FOLD) {
12302      /* If 'maybe_exact' is still set here, means there are no
12303      * code points in the node that participate in folds;
12304      * similarly for 'maybe_exactfu' and code points that match
12305      * differently depending on UTF8ness of the target string
12306      * (for /u), or depending on locale for /l */
12307      if (maybe_exact) {
12308       OP(ret) = EXACT;
12309      }
12310      else if (maybe_exactfu) {
12311       OP(ret) = EXACTFU;
12312      }
12313     }
12314     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12315           FALSE /* Don't look to see if could
12316              be turned into an EXACT
12317              node, as we have already
12318              computed that */
12319           );
12320    }
12321
12322    RExC_parse = p - 1;
12323    Set_Node_Cur_Length(ret, parse_start);
12324    nextchar(pRExC_state);
12325    {
12326     /* len is STRLEN which is unsigned, need to copy to signed */
12327     IV iv = len;
12328     if (iv < 0)
12329      vFAIL("Internal disaster");
12330    }
12331
12332   } /* End of label 'defchar:' */
12333   break;
12334  } /* End of giant switch on input character */
12335
12336  return(ret);
12337 }
12338
12339 STATIC char *
12340 S_regwhite( RExC_state_t *pRExC_state, char *p )
12341 {
12342  const char *e = RExC_end;
12343
12344  PERL_ARGS_ASSERT_REGWHITE;
12345
12346  while (p < e) {
12347   if (isSPACE(*p))
12348    ++p;
12349   else if (*p == '#') {
12350    bool ended = 0;
12351    do {
12352     if (*p++ == '\n') {
12353      ended = 1;
12354      break;
12355     }
12356    } while (p < e);
12357    if (!ended)
12358     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12359   }
12360   else
12361    break;
12362  }
12363  return p;
12364 }
12365
12366 STATIC char *
12367 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12368 {
12369  /* Returns the next non-pattern-white space, non-comment character (the
12370  * latter only if 'recognize_comment is true) in the string p, which is
12371  * ended by RExC_end.  If there is no line break ending a comment,
12372  * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */
12373  const char *e = RExC_end;
12374
12375  PERL_ARGS_ASSERT_REGPATWS;
12376
12377  while (p < e) {
12378   STRLEN len;
12379   if ((len = is_PATWS_safe(p, e, UTF))) {
12380    p += len;
12381   }
12382   else if (recognize_comment && *p == '#') {
12383    bool ended = 0;
12384    do {
12385     p++;
12386     if (is_LNBREAK_safe(p, e, UTF)) {
12387      ended = 1;
12388      break;
12389     }
12390    } while (p < e);
12391    if (!ended)
12392     RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12393   }
12394   else
12395    break;
12396  }
12397  return p;
12398 }
12399
12400 STATIC void
12401 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12402 {
12403  /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'.  It
12404  * sets up the bitmap and any flags, removing those code points from the
12405  * inversion list, setting it to NULL should it become completely empty */
12406
12407  PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12408  assert(PL_regkind[OP(node)] == ANYOF);
12409
12410  ANYOF_BITMAP_ZERO(node);
12411  if (*invlist_ptr) {
12412
12413   /* This gets set if we actually need to modify things */
12414   bool change_invlist = FALSE;
12415
12416   UV start, end;
12417
12418   /* Start looking through *invlist_ptr */
12419   invlist_iterinit(*invlist_ptr);
12420   while (invlist_iternext(*invlist_ptr, &start, &end)) {
12421    UV high;
12422    int i;
12423
12424    if (end == UV_MAX && start <= 256) {
12425     ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12426    }
12427    else if (end >= 256) {
12428     ANYOF_FLAGS(node) |= ANYOF_UTF8;
12429    }
12430
12431    /* Quit if are above what we should change */
12432    if (start > 255) {
12433     break;
12434    }
12435
12436    change_invlist = TRUE;
12437
12438    /* Set all the bits in the range, up to the max that we are doing */
12439    high = (end < 255) ? end : 255;
12440    for (i = start; i <= (int) high; i++) {
12441     if (! ANYOF_BITMAP_TEST(node, i)) {
12442      ANYOF_BITMAP_SET(node, i);
12443     }
12444    }
12445   }
12446   invlist_iterfinish(*invlist_ptr);
12447
12448   /* Done with loop; remove any code points that are in the bitmap from
12449   * *invlist_ptr; similarly for code points above latin1 if we have a
12450   * flag to match all of them anyways */
12451   if (change_invlist) {
12452    _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12453   }
12454   if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12455    _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12456   }
12457
12458   /* If have completely emptied it, remove it completely */
12459   if (_invlist_len(*invlist_ptr) == 0) {
12460    SvREFCNT_dec_NN(*invlist_ptr);
12461    *invlist_ptr = NULL;
12462   }
12463  }
12464 }
12465
12466 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12467    Character classes ([:foo:]) can also be negated ([:^foo:]).
12468    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12469    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12470    but trigger failures because they are currently unimplemented. */
12471
12472 #define POSIXCC_DONE(c)   ((c) == ':')
12473 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12474 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12475
12476 PERL_STATIC_INLINE I32
12477 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12478 {
12479  dVAR;
12480  I32 namedclass = OOB_NAMEDCLASS;
12481
12482  PERL_ARGS_ASSERT_REGPPOSIXCC;
12483
12484  if (value == '[' && RExC_parse + 1 < RExC_end &&
12485   /* I smell either [: or [= or [. -- POSIX has been here, right? */
12486   POSIXCC(UCHARAT(RExC_parse)))
12487  {
12488   const char c = UCHARAT(RExC_parse);
12489   char* const s = RExC_parse++;
12490
12491   while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12492    RExC_parse++;
12493   if (RExC_parse == RExC_end) {
12494    if (strict) {
12495
12496     /* Try to give a better location for the error (than the end of
12497     * the string) by looking for the matching ']' */
12498     RExC_parse = s;
12499     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12500      RExC_parse++;
12501     }
12502     vFAIL2("Unmatched '%c' in POSIX class", c);
12503    }
12504    /* Grandfather lone [:, [=, [. */
12505    RExC_parse = s;
12506   }
12507   else {
12508    const char* const t = RExC_parse++; /* skip over the c */
12509    assert(*t == c);
12510
12511    if (UCHARAT(RExC_parse) == ']') {
12512     const char *posixcc = s + 1;
12513     RExC_parse++; /* skip over the ending ] */
12514
12515     if (*s == ':') {
12516      const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12517      const I32 skip = t - posixcc;
12518
12519      /* Initially switch on the length of the name.  */
12520      switch (skip) {
12521      case 4:
12522       if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12523               this is the Perl \w
12524               */
12525        namedclass = ANYOF_WORDCHAR;
12526       break;
12527      case 5:
12528       /* Names all of length 5.  */
12529       /* alnum alpha ascii blank cntrl digit graph lower
12530       print punct space upper  */
12531       /* Offset 4 gives the best switch position.  */
12532       switch (posixcc[4]) {
12533       case 'a':
12534        if (memEQ(posixcc, "alph", 4)) /* alpha */
12535         namedclass = ANYOF_ALPHA;
12536        break;
12537       case 'e':
12538        if (memEQ(posixcc, "spac", 4)) /* space */
12539         namedclass = ANYOF_PSXSPC;
12540        break;
12541       case 'h':
12542        if (memEQ(posixcc, "grap", 4)) /* graph */
12543         namedclass = ANYOF_GRAPH;
12544        break;
12545       case 'i':
12546        if (memEQ(posixcc, "asci", 4)) /* ascii */
12547         namedclass = ANYOF_ASCII;
12548        break;
12549       case 'k':
12550        if (memEQ(posixcc, "blan", 4)) /* blank */
12551         namedclass = ANYOF_BLANK;
12552        break;
12553       case 'l':
12554        if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12555         namedclass = ANYOF_CNTRL;
12556        break;
12557       case 'm':
12558        if (memEQ(posixcc, "alnu", 4)) /* alnum */
12559         namedclass = ANYOF_ALPHANUMERIC;
12560        break;
12561       case 'r':
12562        if (memEQ(posixcc, "lowe", 4)) /* lower */
12563         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12564        else if (memEQ(posixcc, "uppe", 4)) /* upper */
12565         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12566        break;
12567       case 't':
12568        if (memEQ(posixcc, "digi", 4)) /* digit */
12569         namedclass = ANYOF_DIGIT;
12570        else if (memEQ(posixcc, "prin", 4)) /* print */
12571         namedclass = ANYOF_PRINT;
12572        else if (memEQ(posixcc, "punc", 4)) /* punct */
12573         namedclass = ANYOF_PUNCT;
12574        break;
12575       }
12576       break;
12577      case 6:
12578       if (memEQ(posixcc, "xdigit", 6))
12579        namedclass = ANYOF_XDIGIT;
12580       break;
12581      }
12582
12583      if (namedclass == OOB_NAMEDCLASS)
12584       vFAIL2utf8f(
12585        "POSIX class [:%"UTF8f":] unknown",
12586        UTF8fARG(UTF, t - s - 1, s + 1));
12587
12588      /* The #defines are structured so each complement is +1 to
12589      * the normal one */
12590      if (complement) {
12591       namedclass++;
12592      }
12593      assert (posixcc[skip] == ':');
12594      assert (posixcc[skip+1] == ']');
12595     } else if (!SIZE_ONLY) {
12596      /* [[=foo=]] and [[.foo.]] are still future. */
12597
12598      /* adjust RExC_parse so the warning shows after
12599      the class closes */
12600      while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12601       RExC_parse++;
12602      vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12603     }
12604    } else {
12605     /* Maternal grandfather:
12606     * "[:" ending in ":" but not in ":]" */
12607     if (strict) {
12608      vFAIL("Unmatched '[' in POSIX class");
12609     }
12610
12611     /* Grandfather lone [:, [=, [. */
12612     RExC_parse = s;
12613    }
12614   }
12615  }
12616
12617  return namedclass;
12618 }
12619
12620 STATIC bool
12621 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
12622 {
12623  /* This applies some heuristics at the current parse position (which should
12624  * be at a '[') to see if what follows might be intended to be a [:posix:]
12625  * class.  It returns true if it really is a posix class, of course, but it
12626  * also can return true if it thinks that what was intended was a posix
12627  * class that didn't quite make it.
12628  *
12629  * It will return true for
12630  *      [:alphanumerics:
12631  *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
12632  *                         ')' indicating the end of the (?[
12633  *      [:any garbage including %^&$ punctuation:]
12634  *
12635  * This is designed to be called only from S_handle_regex_sets; it could be
12636  * easily adapted to be called from the spot at the beginning of regclass()
12637  * that checks to see in a normal bracketed class if the surrounding []
12638  * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
12639  * change long-standing behavior, so I (khw) didn't do that */
12640  char* p = RExC_parse + 1;
12641  char first_char = *p;
12642
12643  PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12644
12645  assert(*(p - 1) == '[');
12646
12647  if (! POSIXCC(first_char)) {
12648   return FALSE;
12649  }
12650
12651  p++;
12652  while (p < RExC_end && isWORDCHAR(*p)) p++;
12653
12654  if (p >= RExC_end) {
12655   return FALSE;
12656  }
12657
12658  if (p - RExC_parse > 2    /* Got at least 1 word character */
12659   && (*p == first_char
12660    || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12661  {
12662   return TRUE;
12663  }
12664
12665  p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12666
12667  return (p
12668    && p - RExC_parse > 2 /* [:] evaluates to colon;
12669          [::] is a bad posix class. */
12670    && first_char == *(p - 1));
12671 }
12672
12673 STATIC regnode *
12674 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12675      I32 *flagp, U32 depth,
12676      char * const oregcomp_parse)
12677 {
12678  /* Handle the (?[...]) construct to do set operations */
12679
12680  U8 curchar;
12681  UV start, end; /* End points of code point ranges */
12682  SV* result_string;
12683  char *save_end, *save_parse;
12684  SV* final;
12685  STRLEN len;
12686  regnode* node;
12687  AV* stack;
12688  const bool save_fold = FOLD;
12689
12690  GET_RE_DEBUG_FLAGS_DECL;
12691
12692  PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12693
12694  if (LOC) {
12695   vFAIL("(?[...]) not valid in locale");
12696  }
12697  RExC_uni_semantics = 1;
12698
12699  /* This will return only an ANYOF regnode, or (unlikely) something smaller
12700  * (such as EXACT).  Thus we can skip most everything if just sizing.  We
12701  * call regclass to handle '[]' so as to not have to reinvent its parsing
12702  * rules here (throwing away the size it computes each time).  And, we exit
12703  * upon an unescaped ']' that isn't one ending a regclass.  To do both
12704  * these things, we need to realize that something preceded by a backslash
12705  * is escaped, so we have to keep track of backslashes */
12706  if (SIZE_ONLY) {
12707   UV depth = 0; /* how many nested (?[...]) constructs */
12708
12709   Perl_ck_warner_d(aTHX_
12710    packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12711    "The regex_sets feature is experimental" REPORT_LOCATION,
12712     UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12713     UTF8fARG(UTF,
12714       RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12715       RExC_precomp + (RExC_parse - RExC_precomp)));
12716
12717   while (RExC_parse < RExC_end) {
12718    SV* current = NULL;
12719    RExC_parse = regpatws(pRExC_state, RExC_parse,
12720         TRUE); /* means recognize comments */
12721    switch (*RExC_parse) {
12722     case '?':
12723      if (RExC_parse[1] == '[') depth++, RExC_parse++;
12724      /* FALL THROUGH */
12725     default:
12726      break;
12727     case '\\':
12728      /* Skip the next byte (which could cause us to end up in
12729      * the middle of a UTF-8 character, but since none of those
12730      * are confusable with anything we currently handle in this
12731      * switch (invariants all), it's safe.  We'll just hit the
12732      * default: case next time and keep on incrementing until
12733      * we find one of the invariants we do handle. */
12734      RExC_parse++;
12735      break;
12736     case '[':
12737     {
12738      /* If this looks like it is a [:posix:] class, leave the
12739      * parse pointer at the '[' to fool regclass() into
12740      * thinking it is part of a '[[:posix:]]'.  That function
12741      * will use strict checking to force a syntax error if it
12742      * doesn't work out to a legitimate class */
12743      bool is_posix_class
12744          = could_it_be_a_POSIX_class(pRExC_state);
12745      if (! is_posix_class) {
12746       RExC_parse++;
12747      }
12748
12749      /* regclass() can only return RESTART_UTF8 if multi-char
12750      folds are allowed.  */
12751      if (!regclass(pRExC_state, flagp,depth+1,
12752         is_posix_class, /* parse the whole char
12753              class only if not a
12754              posix class */
12755         FALSE, /* don't allow multi-char folds */
12756         TRUE, /* silence non-portable warnings. */
12757         &current))
12758       FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12759        (UV) *flagp);
12760
12761      /* function call leaves parse pointing to the ']', except
12762      * if we faked it */
12763      if (is_posix_class) {
12764       RExC_parse--;
12765      }
12766
12767      SvREFCNT_dec(current);   /* In case it returned something */
12768      break;
12769     }
12770
12771     case ']':
12772      if (depth--) break;
12773      RExC_parse++;
12774      if (RExC_parse < RExC_end
12775       && *RExC_parse == ')')
12776      {
12777       node = reganode(pRExC_state, ANYOF, 0);
12778       RExC_size += ANYOF_SKIP;
12779       nextchar(pRExC_state);
12780       Set_Node_Length(node,
12781         RExC_parse - oregcomp_parse + 1); /* MJD */
12782       return node;
12783      }
12784      goto no_close;
12785    }
12786    RExC_parse++;
12787   }
12788
12789   no_close:
12790   FAIL("Syntax error in (?[...])");
12791  }
12792
12793  /* Pass 2 only after this.  Everything in this construct is a
12794  * metacharacter.  Operands begin with either a '\' (for an escape
12795  * sequence), or a '[' for a bracketed character class.  Any other
12796  * character should be an operator, or parenthesis for grouping.  Both
12797  * types of operands are handled by calling regclass() to parse them.  It
12798  * is called with a parameter to indicate to return the computed inversion
12799  * list.  The parsing here is implemented via a stack.  Each entry on the
12800  * stack is a single character representing one of the operators, or the
12801  * '('; or else a pointer to an operand inversion list. */
12802
12803 #define IS_OPERAND(a)  (! SvIOK(a))
12804
12805  /* The stack starts empty.  It is a syntax error if the first thing parsed
12806  * is a binary operator; everything else is pushed on the stack.  When an
12807  * operand is parsed, the top of the stack is examined.  If it is a binary
12808  * operator, the item before it should be an operand, and both are replaced
12809  * by the result of doing that operation on the new operand and the one on
12810  * the stack.   Thus a sequence of binary operands is reduced to a single
12811  * one before the next one is parsed.
12812  *
12813  * A unary operator may immediately follow a binary in the input, for
12814  * example
12815  *      [a] + ! [b]
12816  * When an operand is parsed and the top of the stack is a unary operator,
12817  * the operation is performed, and then the stack is rechecked to see if
12818  * this new operand is part of a binary operation; if so, it is handled as
12819  * above.
12820  *
12821  * A '(' is simply pushed on the stack; it is valid only if the stack is
12822  * empty, or the top element of the stack is an operator or another '('
12823  * (for which the parenthesized expression will become an operand).  By the
12824  * time the corresponding ')' is parsed everything in between should have
12825  * been parsed and evaluated to a single operand (or else is a syntax
12826  * error), and is handled as a regular operand */
12827
12828  sv_2mortal((SV *)(stack = newAV()));
12829
12830  while (RExC_parse < RExC_end) {
12831   I32 top_index = av_tindex(stack);
12832   SV** top_ptr;
12833   SV* current = NULL;
12834
12835   /* Skip white space */
12836   RExC_parse = regpatws(pRExC_state, RExC_parse,
12837         TRUE); /* means recognize comments */
12838   if (RExC_parse >= RExC_end) {
12839    Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12840   }
12841   if ((curchar = UCHARAT(RExC_parse)) == ']') {
12842    break;
12843   }
12844
12845   switch (curchar) {
12846
12847    case '?':
12848     if (av_tindex(stack) >= 0   /* This makes sure that we can
12849            safely subtract 1 from
12850            RExC_parse in the next clause.
12851            If we have something on the
12852            stack, we have parsed something
12853            */
12854      && UCHARAT(RExC_parse - 1) == '('
12855      && RExC_parse < RExC_end)
12856     {
12857      /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12858      * This happens when we have some thing like
12859      *
12860      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12861      *   ...
12862      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
12863      *
12864      * Here we would be handling the interpolated
12865      * '$thai_or_lao'.  We handle this by a recursive call to
12866      * ourselves which returns the inversion list the
12867      * interpolated expression evaluates to.  We use the flags
12868      * from the interpolated pattern. */
12869      U32 save_flags = RExC_flags;
12870      const char * const save_parse = ++RExC_parse;
12871
12872      parse_lparen_question_flags(pRExC_state);
12873
12874      if (RExC_parse == save_parse  /* Makes sure there was at
12875              least one flag (or this
12876              embedding wasn't compiled)
12877             */
12878       || RExC_parse >= RExC_end - 4
12879       || UCHARAT(RExC_parse) != ':'
12880       || UCHARAT(++RExC_parse) != '('
12881       || UCHARAT(++RExC_parse) != '?'
12882       || UCHARAT(++RExC_parse) != '[')
12883      {
12884
12885       /* In combination with the above, this moves the
12886       * pointer to the point just after the first erroneous
12887       * character (or if there are no flags, to where they
12888       * should have been) */
12889       if (RExC_parse >= RExC_end - 4) {
12890        RExC_parse = RExC_end;
12891       }
12892       else if (RExC_parse != save_parse) {
12893        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12894       }
12895       vFAIL("Expecting '(?flags:(?[...'");
12896      }
12897      RExC_parse++;
12898      (void) handle_regex_sets(pRExC_state, &current, flagp,
12899              depth+1, oregcomp_parse);
12900
12901      /* Here, 'current' contains the embedded expression's
12902      * inversion list, and RExC_parse points to the trailing
12903      * ']'; the next character should be the ')' which will be
12904      * paired with the '(' that has been put on the stack, so
12905      * the whole embedded expression reduces to '(operand)' */
12906      RExC_parse++;
12907
12908      RExC_flags = save_flags;
12909      goto handle_operand;
12910     }
12911     /* FALL THROUGH */
12912
12913    default:
12914     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12915     vFAIL("Unexpected character");
12916
12917    case '\\':
12918     /* regclass() can only return RESTART_UTF8 if multi-char
12919     folds are allowed.  */
12920     if (!regclass(pRExC_state, flagp,depth+1,
12921        TRUE, /* means parse just the next thing */
12922        FALSE, /* don't allow multi-char folds */
12923        FALSE, /* don't silence non-portable warnings.  */
12924        &current))
12925      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12926       (UV) *flagp);
12927     /* regclass() will return with parsing just the \ sequence,
12928     * leaving the parse pointer at the next thing to parse */
12929     RExC_parse--;
12930     goto handle_operand;
12931
12932    case '[':   /* Is a bracketed character class */
12933    {
12934     bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12935
12936     if (! is_posix_class) {
12937      RExC_parse++;
12938     }
12939
12940     /* regclass() can only return RESTART_UTF8 if multi-char
12941     folds are allowed.  */
12942     if(!regclass(pRExC_state, flagp,depth+1,
12943        is_posix_class, /* parse the whole char class
12944             only if not a posix class */
12945        FALSE, /* don't allow multi-char folds */
12946        FALSE, /* don't silence non-portable warnings.  */
12947        &current))
12948      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12949       (UV) *flagp);
12950     /* function call leaves parse pointing to the ']', except if we
12951     * faked it */
12952     if (is_posix_class) {
12953      RExC_parse--;
12954     }
12955
12956     goto handle_operand;
12957    }
12958
12959    case '&':
12960    case '|':
12961    case '+':
12962    case '-':
12963    case '^':
12964     if (top_index < 0
12965      || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12966      || ! IS_OPERAND(*top_ptr))
12967     {
12968      RExC_parse++;
12969      vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12970     }
12971     av_push(stack, newSVuv(curchar));
12972     break;
12973
12974    case '!':
12975     av_push(stack, newSVuv(curchar));
12976     break;
12977
12978    case '(':
12979     if (top_index >= 0) {
12980      top_ptr = av_fetch(stack, top_index, FALSE);
12981      assert(top_ptr);
12982      if (IS_OPERAND(*top_ptr)) {
12983       RExC_parse++;
12984       vFAIL("Unexpected '(' with no preceding operator");
12985      }
12986     }
12987     av_push(stack, newSVuv(curchar));
12988     break;
12989
12990    case ')':
12991    {
12992     SV* lparen;
12993     if (top_index < 1
12994      || ! (current = av_pop(stack))
12995      || ! IS_OPERAND(current)
12996      || ! (lparen = av_pop(stack))
12997      || IS_OPERAND(lparen)
12998      || SvUV(lparen) != '(')
12999     {
13000      SvREFCNT_dec(current);
13001      RExC_parse++;
13002      vFAIL("Unexpected ')'");
13003     }
13004     top_index -= 2;
13005     SvREFCNT_dec_NN(lparen);
13006
13007     /* FALL THROUGH */
13008    }
13009
13010    handle_operand:
13011
13012     /* Here, we have an operand to process, in 'current' */
13013
13014     if (top_index < 0) {    /* Just push if stack is empty */
13015      av_push(stack, current);
13016     }
13017     else {
13018      SV* top = av_pop(stack);
13019      SV *prev = NULL;
13020      char current_operator;
13021
13022      if (IS_OPERAND(top)) {
13023       SvREFCNT_dec_NN(top);
13024       SvREFCNT_dec_NN(current);
13025       vFAIL("Operand with no preceding operator");
13026      }
13027      current_operator = (char) SvUV(top);
13028      switch (current_operator) {
13029       case '(':   /* Push the '(' back on followed by the new
13030          operand */
13031        av_push(stack, top);
13032        av_push(stack, current);
13033        SvREFCNT_inc(top);  /* Counters the '_dec' done
13034             just after the 'break', so
13035             it doesn't get wrongly freed
13036             */
13037        break;
13038
13039       case '!':
13040        _invlist_invert(current);
13041
13042        /* Unlike binary operators, the top of the stack,
13043        * now that this unary one has been popped off, may
13044        * legally be an operator, and we now have operand
13045        * for it. */
13046        top_index--;
13047        SvREFCNT_dec_NN(top);
13048        goto handle_operand;
13049
13050       case '&':
13051        prev = av_pop(stack);
13052        _invlist_intersection(prev,
13053             current,
13054             &current);
13055        av_push(stack, current);
13056        break;
13057
13058       case '|':
13059       case '+':
13060        prev = av_pop(stack);
13061        _invlist_union(prev, current, &current);
13062        av_push(stack, current);
13063        break;
13064
13065       case '-':
13066        prev = av_pop(stack);;
13067        _invlist_subtract(prev, current, &current);
13068        av_push(stack, current);
13069        break;
13070
13071       case '^':   /* The union minus the intersection */
13072       {
13073        SV* i = NULL;
13074        SV* u = NULL;
13075        SV* element;
13076
13077        prev = av_pop(stack);
13078        _invlist_union(prev, current, &u);
13079        _invlist_intersection(prev, current, &i);
13080        /* _invlist_subtract will overwrite current
13081         without freeing what it already contains */
13082        element = current;
13083        _invlist_subtract(u, i, &current);
13084        av_push(stack, current);
13085        SvREFCNT_dec_NN(i);
13086        SvREFCNT_dec_NN(u);
13087        SvREFCNT_dec_NN(element);
13088        break;
13089       }
13090
13091       default:
13092        Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13093     }
13094     SvREFCNT_dec_NN(top);
13095     SvREFCNT_dec(prev);
13096    }
13097   }
13098
13099   RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13100  }
13101
13102  if (av_tindex(stack) < 0   /* Was empty */
13103   || ((final = av_pop(stack)) == NULL)
13104   || ! IS_OPERAND(final)
13105   || av_tindex(stack) >= 0)  /* More left on stack */
13106  {
13107   vFAIL("Incomplete expression within '(?[ ])'");
13108  }
13109
13110  /* Here, 'final' is the resultant inversion list from evaluating the
13111  * expression.  Return it if so requested */
13112  if (return_invlist) {
13113   *return_invlist = final;
13114   return END;
13115  }
13116
13117  /* Otherwise generate a resultant node, based on 'final'.  regclass() is
13118  * expecting a string of ranges and individual code points */
13119  invlist_iterinit(final);
13120  result_string = newSVpvs("");
13121  while (invlist_iternext(final, &start, &end)) {
13122   if (start == end) {
13123    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13124   }
13125   else {
13126    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13127              start,          end);
13128   }
13129  }
13130
13131  save_parse = RExC_parse;
13132  RExC_parse = SvPV(result_string, len);
13133  save_end = RExC_end;
13134  RExC_end = RExC_parse + len;
13135
13136  /* We turn off folding around the call, as the class we have constructed
13137  * already has all folding taken into consideration, and we don't want
13138  * regclass() to add to that */
13139  RExC_flags &= ~RXf_PMf_FOLD;
13140  /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13141  */
13142  node = regclass(pRExC_state, flagp,depth+1,
13143      FALSE, /* means parse the whole char class */
13144      FALSE, /* don't allow multi-char folds */
13145      TRUE, /* silence non-portable warnings.  The above may very
13146        well have generated non-portable code points, but
13147        they're valid on this machine */
13148      NULL);
13149  if (!node)
13150   FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13151      PTR2UV(flagp));
13152  if (save_fold) {
13153   RExC_flags |= RXf_PMf_FOLD;
13154  }
13155  RExC_parse = save_parse + 1;
13156  RExC_end = save_end;
13157  SvREFCNT_dec_NN(final);
13158  SvREFCNT_dec_NN(result_string);
13159
13160  nextchar(pRExC_state);
13161  Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13162  return node;
13163 }
13164 #undef IS_OPERAND
13165
13166 /* The names of properties whose definitions are not known at compile time are
13167  * stored in this SV, after a constant heading.  So if the length has been
13168  * changed since initialization, then there is a run-time definition. */
13169 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION                            \
13170           (SvCUR(listsv) != initial_listsv_len)
13171
13172 STATIC regnode *
13173 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13174     const bool stop_at_1,  /* Just parse the next thing, don't
13175           look for a full character class */
13176     bool allow_multi_folds,
13177     const bool silence_non_portable,   /* Don't output warnings
13178              about too large
13179              characters */
13180     SV** ret_invlist)  /* Return an inversion list, not a node */
13181 {
13182  /* parse a bracketed class specification.  Most of these will produce an
13183  * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13184  * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
13185  * under /i with multi-character folds: it will be rewritten following the
13186  * paradigm of this example, where the <multi-fold>s are characters which
13187  * fold to multiple character sequences:
13188  *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13189  * gets effectively rewritten as:
13190  *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13191  * reg() gets called (recursively) on the rewritten version, and this
13192  * function will return what it constructs.  (Actually the <multi-fold>s
13193  * aren't physically removed from the [abcdefghi], it's just that they are
13194  * ignored in the recursion by means of a flag:
13195  * <RExC_in_multi_char_class>.)
13196  *
13197  * ANYOF nodes contain a bit map for the first 256 characters, with the
13198  * corresponding bit set if that character is in the list.  For characters
13199  * above 255, a range list or swash is used.  There are extra bits for \w,
13200  * etc. in locale ANYOFs, as what these match is not determinable at
13201  * compile time
13202  *
13203  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13204  * to be restarted.  This can only happen if ret_invlist is non-NULL.
13205  */
13206
13207  dVAR;
13208  UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13209  IV range = 0;
13210  UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13211  regnode *ret;
13212  STRLEN numlen;
13213  IV namedclass = OOB_NAMEDCLASS;
13214  char *rangebegin = NULL;
13215  bool need_class = 0;
13216  SV *listsv = NULL;
13217  STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13218          than just initialized.  */
13219  SV* properties = NULL;    /* Code points that match \p{} \P{} */
13220  SV* posixes = NULL;     /* Code points that match classes like [:word:],
13221        extended beyond the Latin1 range.  These have to
13222        be kept separate from other code points for much
13223        of this function because their handling  is
13224        different under /i, and for most classes under
13225        /d as well */
13226  SV* nposixes = NULL;    /* Similarly for [:^word:].  These are kept
13227        separate for a while from the non-complemented
13228        versions because of complications with /d
13229        matching */
13230  UV element_count = 0;   /* Number of distinct elements in the class.
13231        Optimizations may be possible if this is tiny */
13232  AV * multi_char_matches = NULL; /* Code points that fold to more than one
13233          character; used under /i */
13234  UV n;
13235  char * stop_ptr = RExC_end;    /* where to stop parsing */
13236  const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13237             space? */
13238  const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13239
13240  /* Unicode properties are stored in a swash; this holds the current one
13241  * being parsed.  If this swash is the only above-latin1 component of the
13242  * character class, an optimization is to pass it directly on to the
13243  * execution engine.  Otherwise, it is set to NULL to indicate that there
13244  * are other things in the class that have to be dealt with at execution
13245  * time */
13246  SV* swash = NULL;  /* Code points that match \p{} \P{} */
13247
13248  /* Set if a component of this character class is user-defined; just passed
13249  * on to the engine */
13250  bool has_user_defined_property = FALSE;
13251
13252  /* inversion list of code points this node matches only when the target
13253  * string is in UTF-8.  (Because is under /d) */
13254  SV* depends_list = NULL;
13255
13256  /* Inversion list of code points this node matches regardless of things
13257  * like locale, folding, utf8ness of the target string */
13258  SV* cp_list = NULL;
13259
13260  /* Like cp_list, but code points on this list need to be checked for things
13261  * that fold to/from them under /i */
13262  SV* cp_foldable_list = NULL;
13263
13264  /* Like cp_list, but code points on this list are valid only when the
13265  * runtime locale is UTF-8 */
13266  SV* only_utf8_locale_list = NULL;
13267
13268 #ifdef EBCDIC
13269  /* In a range, counts how many 0-2 of the ends of it came from literals,
13270  * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
13271  UV literal_endpoint = 0;
13272 #endif
13273  bool invert = FALSE;    /* Is this class to be complemented */
13274
13275  bool warn_super = ALWAYS_WARN_SUPER;
13276
13277  regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13278   case we need to change the emitted regop to an EXACT. */
13279  const char * orig_parse = RExC_parse;
13280  const SSize_t orig_size = RExC_size;
13281  bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13282  GET_RE_DEBUG_FLAGS_DECL;
13283
13284  PERL_ARGS_ASSERT_REGCLASS;
13285 #ifndef DEBUGGING
13286  PERL_UNUSED_ARG(depth);
13287 #endif
13288
13289  DEBUG_PARSE("clas");
13290
13291  /* Assume we are going to generate an ANYOF node. */
13292  ret = reganode(pRExC_state, ANYOF, 0);
13293
13294  if (SIZE_ONLY) {
13295   RExC_size += ANYOF_SKIP;
13296   listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13297  }
13298  else {
13299   ANYOF_FLAGS(ret) = 0;
13300
13301   RExC_emit += ANYOF_SKIP;
13302   listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13303   initial_listsv_len = SvCUR(listsv);
13304   SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
13305  }
13306
13307  if (skip_white) {
13308   RExC_parse = regpatws(pRExC_state, RExC_parse,
13309        FALSE /* means don't recognize comments */);
13310  }
13311
13312  if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
13313   RExC_parse++;
13314   invert = TRUE;
13315   allow_multi_folds = FALSE;
13316   RExC_naughty++;
13317   if (skip_white) {
13318    RExC_parse = regpatws(pRExC_state, RExC_parse,
13319         FALSE /* means don't recognize comments */);
13320   }
13321  }
13322
13323  /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13324  if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13325   const char *s = RExC_parse;
13326   const char  c = *s++;
13327
13328   while (isWORDCHAR(*s))
13329    s++;
13330   if (*s && c == *s && s[1] == ']') {
13331    SAVEFREESV(RExC_rx_sv);
13332    ckWARN3reg(s+2,
13333      "POSIX syntax [%c %c] belongs inside character classes",
13334      c, c);
13335    (void)ReREFCNT_inc(RExC_rx_sv);
13336   }
13337  }
13338
13339  /* If the caller wants us to just parse a single element, accomplish this
13340  * by faking the loop ending condition */
13341  if (stop_at_1 && RExC_end > RExC_parse) {
13342   stop_ptr = RExC_parse + 1;
13343  }
13344
13345  /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13346  if (UCHARAT(RExC_parse) == ']')
13347   goto charclassloop;
13348
13349 parseit:
13350  while (1) {
13351   if  (RExC_parse >= stop_ptr) {
13352    break;
13353   }
13354
13355   if (skip_white) {
13356    RExC_parse = regpatws(pRExC_state, RExC_parse,
13357         FALSE /* means don't recognize comments */);
13358   }
13359
13360   if  (UCHARAT(RExC_parse) == ']') {
13361    break;
13362   }
13363
13364  charclassloop:
13365
13366   namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13367   save_value = value;
13368   save_prevvalue = prevvalue;
13369
13370   if (!range) {
13371    rangebegin = RExC_parse;
13372    element_count++;
13373   }
13374   if (UTF) {
13375    value = utf8n_to_uvchr((U8*)RExC_parse,
13376         RExC_end - RExC_parse,
13377         &numlen, UTF8_ALLOW_DEFAULT);
13378    RExC_parse += numlen;
13379   }
13380   else
13381    value = UCHARAT(RExC_parse++);
13382
13383   if (value == '['
13384    && RExC_parse < RExC_end
13385    && POSIXCC(UCHARAT(RExC_parse)))
13386   {
13387    namedclass = regpposixcc(pRExC_state, value, strict);
13388   }
13389   else if (value == '\\') {
13390    if (UTF) {
13391     value = utf8n_to_uvchr((U8*)RExC_parse,
13392         RExC_end - RExC_parse,
13393         &numlen, UTF8_ALLOW_DEFAULT);
13394     RExC_parse += numlen;
13395    }
13396    else
13397     value = UCHARAT(RExC_parse++);
13398
13399    /* Some compilers cannot handle switching on 64-bit integer
13400    * values, therefore value cannot be an UV.  Yes, this will
13401    * be a problem later if we want switch on Unicode.
13402    * A similar issue a little bit later when switching on
13403    * namedclass. --jhi */
13404
13405    /* If the \ is escaping white space when white space is being
13406    * skipped, it means that that white space is wanted literally, and
13407    * is already in 'value'.  Otherwise, need to translate the escape
13408    * into what it signifies. */
13409    if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13410
13411    case 'w': namedclass = ANYOF_WORDCHAR; break;
13412    case 'W': namedclass = ANYOF_NWORDCHAR; break;
13413    case 's': namedclass = ANYOF_SPACE; break;
13414    case 'S': namedclass = ANYOF_NSPACE; break;
13415    case 'd': namedclass = ANYOF_DIGIT; break;
13416    case 'D': namedclass = ANYOF_NDIGIT; break;
13417    case 'v': namedclass = ANYOF_VERTWS; break;
13418    case 'V': namedclass = ANYOF_NVERTWS; break;
13419    case 'h': namedclass = ANYOF_HORIZWS; break;
13420    case 'H': namedclass = ANYOF_NHORIZWS; break;
13421    case 'N':  /* Handle \N{NAME} in class */
13422     {
13423      /* We only pay attention to the first char of
13424      multichar strings being returned. I kinda wonder
13425      if this makes sense as it does change the behaviour
13426      from earlier versions, OTOH that behaviour was broken
13427      as well. */
13428      if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13429          TRUE, /* => charclass */
13430          strict))
13431      {
13432       if (*flagp & RESTART_UTF8)
13433        FAIL("panic: grok_bslash_N set RESTART_UTF8");
13434       goto parseit;
13435      }
13436     }
13437     break;
13438    case 'p':
13439    case 'P':
13440     {
13441     char *e;
13442
13443     /* We will handle any undefined properties ourselves */
13444     U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13445          /* And we actually would prefer to get
13446           * the straight inversion list of the
13447           * swash, since we will be accessing it
13448           * anyway, to save a little time */
13449          |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13450
13451     if (RExC_parse >= RExC_end)
13452      vFAIL2("Empty \\%c{}", (U8)value);
13453     if (*RExC_parse == '{') {
13454      const U8 c = (U8)value;
13455      e = strchr(RExC_parse++, '}');
13456      if (!e)
13457       vFAIL2("Missing right brace on \\%c{}", c);
13458      while (isSPACE(UCHARAT(RExC_parse)))
13459       RExC_parse++;
13460      if (e == RExC_parse)
13461       vFAIL2("Empty \\%c{}", c);
13462      n = e - RExC_parse;
13463      while (isSPACE(UCHARAT(RExC_parse + n - 1)))
13464       n--;
13465     }
13466     else {
13467      e = RExC_parse;
13468      n = 1;
13469     }
13470     if (!SIZE_ONLY) {
13471      SV* invlist;
13472      char* formatted;
13473      char* name;
13474
13475      if (UCHARAT(RExC_parse) == '^') {
13476       RExC_parse++;
13477       n--;
13478       /* toggle.  (The rhs xor gets the single bit that
13479       * differs between P and p; the other xor inverts just
13480       * that bit) */
13481       value ^= 'P' ^ 'p';
13482
13483       while (isSPACE(UCHARAT(RExC_parse))) {
13484        RExC_parse++;
13485        n--;
13486       }
13487      }
13488      /* Try to get the definition of the property into
13489      * <invlist>.  If /i is in effect, the effective property
13490      * will have its name be <__NAME_i>.  The design is
13491      * discussed in commit
13492      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13493      formatted = Perl_form(aTHX_
13494           "%s%.*s%s\n",
13495           (FOLD) ? "__" : "",
13496           (int)n,
13497           RExC_parse,
13498           (FOLD) ? "_i" : ""
13499         );
13500      name = savepvn(formatted, strlen(formatted));
13501
13502      /* Look up the property name, and get its swash and
13503      * inversion list, if the property is found  */
13504      if (swash) {
13505       SvREFCNT_dec_NN(swash);
13506      }
13507      swash = _core_swash_init("utf8", name, &PL_sv_undef,
13508            1, /* binary */
13509            0, /* not tr/// */
13510            NULL, /* No inversion list */
13511            &swash_init_flags
13512            );
13513      if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13514       if (swash) {
13515        SvREFCNT_dec_NN(swash);
13516        swash = NULL;
13517       }
13518
13519       /* Here didn't find it.  It could be a user-defined
13520       * property that will be available at run-time.  If we
13521       * accept only compile-time properties, is an error;
13522       * otherwise add it to the list for run-time look up */
13523       if (ret_invlist) {
13524        RExC_parse = e + 1;
13525        vFAIL2utf8f(
13526         "Property '%"UTF8f"' is unknown",
13527         UTF8fARG(UTF, n, name));
13528       }
13529       Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13530           (value == 'p' ? '+' : '!'),
13531           UTF8fARG(UTF, n, name));
13532       has_user_defined_property = TRUE;
13533
13534       /* We don't know yet, so have to assume that the
13535       * property could match something in the Latin1 range,
13536       * hence something that isn't utf8.  Note that this
13537       * would cause things in <depends_list> to match
13538       * inappropriately, except that any \p{}, including
13539       * this one forces Unicode semantics, which means there
13540       * is no <depends_list> */
13541       ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13542      }
13543      else {
13544
13545       /* Here, did get the swash and its inversion list.  If
13546       * the swash is from a user-defined property, then this
13547       * whole character class should be regarded as such */
13548       if (swash_init_flags
13549        & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13550       {
13551        has_user_defined_property = TRUE;
13552       }
13553       else if
13554        /* We warn on matching an above-Unicode code point
13555        * if the match would return true, except don't
13556        * warn for \p{All}, which has exactly one element
13557        * = 0 */
13558        (_invlist_contains_cp(invlist, 0x110000)
13559         && (! (_invlist_len(invlist) == 1
13560          && *invlist_array(invlist) == 0)))
13561       {
13562        warn_super = TRUE;
13563       }
13564
13565
13566       /* Invert if asking for the complement */
13567       if (value == 'P') {
13568        _invlist_union_complement_2nd(properties,
13569               invlist,
13570               &properties);
13571
13572        /* The swash can't be used as-is, because we've
13573        * inverted things; delay removing it to here after
13574        * have copied its invlist above */
13575        SvREFCNT_dec_NN(swash);
13576        swash = NULL;
13577       }
13578       else {
13579        _invlist_union(properties, invlist, &properties);
13580       }
13581      }
13582      Safefree(name);
13583     }
13584     RExC_parse = e + 1;
13585     namedclass = ANYOF_UNIPROP;  /* no official name, but it's
13586             named */
13587
13588     /* \p means they want Unicode semantics */
13589     RExC_uni_semantics = 1;
13590     }
13591     break;
13592    case 'n': value = '\n';   break;
13593    case 'r': value = '\r';   break;
13594    case 't': value = '\t';   break;
13595    case 'f': value = '\f';   break;
13596    case 'b': value = '\b';   break;
13597    case 'e': value = ASCII_TO_NATIVE('\033');break;
13598    case 'a': value = '\a';                   break;
13599    case 'o':
13600     RExC_parse--; /* function expects to be pointed at the 'o' */
13601     {
13602      const char* error_msg;
13603      bool valid = grok_bslash_o(&RExC_parse,
13604            &value,
13605            &error_msg,
13606            SIZE_ONLY,   /* warnings in pass
13607                1 only */
13608            strict,
13609            silence_non_portable,
13610            UTF);
13611      if (! valid) {
13612       vFAIL(error_msg);
13613      }
13614     }
13615     if (PL_encoding && value < 0x100) {
13616      goto recode_encoding;
13617     }
13618     break;
13619    case 'x':
13620     RExC_parse--; /* function expects to be pointed at the 'x' */
13621     {
13622      const char* error_msg;
13623      bool valid = grok_bslash_x(&RExC_parse,
13624            &value,
13625            &error_msg,
13626            TRUE, /* Output warnings */
13627            strict,
13628            silence_non_portable,
13629            UTF);
13630      if (! valid) {
13631       vFAIL(error_msg);
13632      }
13633     }
13634     if (PL_encoding && value < 0x100)
13635      goto recode_encoding;
13636     break;
13637    case 'c':
13638     value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
13639     break;
13640    case '0': case '1': case '2': case '3': case '4':
13641    case '5': case '6': case '7':
13642     {
13643      /* Take 1-3 octal digits */
13644      I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13645      numlen = (strict) ? 4 : 3;
13646      value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13647      RExC_parse += numlen;
13648      if (numlen != 3) {
13649       if (strict) {
13650        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13651        vFAIL("Need exactly 3 octal digits");
13652       }
13653       else if (! SIZE_ONLY /* like \08, \178 */
13654         && numlen < 3
13655         && RExC_parse < RExC_end
13656         && isDIGIT(*RExC_parse)
13657         && ckWARN(WARN_REGEXP))
13658       {
13659        SAVEFREESV(RExC_rx_sv);
13660        reg_warn_non_literal_string(
13661         RExC_parse + 1,
13662         form_short_octal_warning(RExC_parse, numlen));
13663        (void)ReREFCNT_inc(RExC_rx_sv);
13664       }
13665      }
13666      if (PL_encoding && value < 0x100)
13667       goto recode_encoding;
13668      break;
13669     }
13670    recode_encoding:
13671     if (! RExC_override_recoding) {
13672      SV* enc = PL_encoding;
13673      value = reg_recode((const char)(U8)value, &enc);
13674      if (!enc) {
13675       if (strict) {
13676        vFAIL("Invalid escape in the specified encoding");
13677       }
13678       else if (SIZE_ONLY) {
13679        ckWARNreg(RExC_parse,
13680         "Invalid escape in the specified encoding");
13681       }
13682      }
13683      break;
13684     }
13685    default:
13686     /* Allow \_ to not give an error */
13687     if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13688      if (strict) {
13689       vFAIL2("Unrecognized escape \\%c in character class",
13690        (int)value);
13691      }
13692      else {
13693       SAVEFREESV(RExC_rx_sv);
13694       ckWARN2reg(RExC_parse,
13695        "Unrecognized escape \\%c in character class passed through",
13696        (int)value);
13697       (void)ReREFCNT_inc(RExC_rx_sv);
13698      }
13699     }
13700     break;
13701    }   /* End of switch on char following backslash */
13702   } /* end of handling backslash escape sequences */
13703 #ifdef EBCDIC
13704   else
13705    literal_endpoint++;
13706 #endif
13707
13708   /* Here, we have the current token in 'value' */
13709
13710   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13711    U8 classnum;
13712
13713    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
13714    * literal, as is the character that began the false range, i.e.
13715    * the 'a' in the examples */
13716    if (range) {
13717     if (!SIZE_ONLY) {
13718      const int w = (RExC_parse >= rangebegin)
13719         ? RExC_parse - rangebegin
13720         : 0;
13721      if (strict) {
13722       vFAIL2utf8f(
13723        "False [] range \"%"UTF8f"\"",
13724        UTF8fARG(UTF, w, rangebegin));
13725      }
13726      else {
13727       SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13728       ckWARN2reg(RExC_parse,
13729        "False [] range \"%"UTF8f"\"",
13730        UTF8fARG(UTF, w, rangebegin));
13731       (void)ReREFCNT_inc(RExC_rx_sv);
13732       cp_list = add_cp_to_invlist(cp_list, '-');
13733       cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
13734                prevvalue);
13735      }
13736     }
13737
13738     range = 0; /* this was not a true range */
13739     element_count += 2; /* So counts for three values */
13740    }
13741
13742    classnum = namedclass_to_classnum(namedclass);
13743
13744    if (LOC && namedclass < ANYOF_POSIXL_MAX
13745 #ifndef HAS_ISASCII
13746     && classnum != _CC_ASCII
13747 #endif
13748    ) {
13749     /* What the Posix classes (like \w, [:space:]) match in locale
13750     * isn't knowable under locale until actual match time.  Room
13751     * must be reserved (one time per outer bracketed class) to
13752     * store such classes.  The space will contain a bit for each
13753     * named class that is to be matched against.  This isn't
13754     * needed for \p{} and pseudo-classes, as they are not affected
13755     * by locale, and hence are dealt with separately */
13756     if (! need_class) {
13757      need_class = 1;
13758      if (SIZE_ONLY) {
13759       RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13760      }
13761      else {
13762       RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13763      }
13764      ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13765      ANYOF_POSIXL_ZERO(ret);
13766     }
13767
13768     /* See if it already matches the complement of this POSIX
13769     * class */
13770     if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13771      && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13772                ? -1
13773                : 1)))
13774     {
13775      posixl_matches_all = TRUE;
13776      break;  /* No need to continue.  Since it matches both
13777        e.g., \w and \W, it matches everything, and the
13778        bracketed class can be optimized into qr/./s */
13779     }
13780
13781     /* Add this class to those that should be checked at runtime */
13782     ANYOF_POSIXL_SET(ret, namedclass);
13783
13784     /* The above-Latin1 characters are not subject to locale rules.
13785     * Just add them, in the second pass, to the
13786     * unconditionally-matched list */
13787     if (! SIZE_ONLY) {
13788      SV* scratch_list = NULL;
13789
13790      /* Get the list of the above-Latin1 code points this
13791      * matches */
13792      _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
13793           PL_XPosix_ptrs[classnum],
13794
13795           /* Odd numbers are complements, like
13796           * NDIGIT, NASCII, ... */
13797           namedclass % 2 != 0,
13798           &scratch_list);
13799      /* Checking if 'cp_list' is NULL first saves an extra
13800      * clone.  Its reference count will be decremented at the
13801      * next union, etc, or if this is the only instance, at the
13802      * end of the routine */
13803      if (! cp_list) {
13804       cp_list = scratch_list;
13805      }
13806      else {
13807       _invlist_union(cp_list, scratch_list, &cp_list);
13808       SvREFCNT_dec_NN(scratch_list);
13809      }
13810      continue;   /* Go get next character */
13811     }
13812    }
13813    else if (! SIZE_ONLY) {
13814
13815     /* Here, not in pass1 (in that pass we skip calculating the
13816     * contents of this class), and is /l, or is a POSIX class for
13817     * which /l doesn't matter (or is a Unicode property, which is
13818     * skipped here). */
13819     if (namedclass >= ANYOF_POSIXL_MAX) {  /* If a special class */
13820      if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13821
13822       /* Here, should be \h, \H, \v, or \V.  None of /d, /i
13823       * nor /l make a difference in what these match,
13824       * therefore we just add what they match to cp_list. */
13825       if (classnum != _CC_VERTSPACE) {
13826        assert(   namedclass == ANYOF_HORIZWS
13827         || namedclass == ANYOF_NHORIZWS);
13828
13829        /* It turns out that \h is just a synonym for
13830        * XPosixBlank */
13831        classnum = _CC_BLANK;
13832       }
13833
13834       _invlist_union_maybe_complement_2nd(
13835         cp_list,
13836         PL_XPosix_ptrs[classnum],
13837         namedclass % 2 != 0,    /* Complement if odd
13838               (NHORIZWS, NVERTWS)
13839               */
13840         &cp_list);
13841      }
13842     }
13843     else {  /* Garden variety class.  If is NASCII, NDIGIT, ...
13844       complement and use nposixes */
13845      SV** posixes_ptr = namedclass % 2 == 0
13846          ? &posixes
13847          : &nposixes;
13848      SV** source_ptr = &PL_XPosix_ptrs[classnum];
13849      _invlist_union_maybe_complement_2nd(
13850              *posixes_ptr,
13851              *source_ptr,
13852              namedclass % 2 != 0,
13853              posixes_ptr);
13854     }
13855     continue;   /* Go get next character */
13856    }
13857   } /* end of namedclass \blah */
13858
13859   /* Here, we have a single value.  If 'range' is set, it is the ending
13860   * of a range--check its validity.  Later, we will handle each
13861   * individual code point in the range.  If 'range' isn't set, this
13862   * could be the beginning of a range, so check for that by looking
13863   * ahead to see if the next real character to be processed is the range
13864   * indicator--the minus sign */
13865
13866   if (skip_white) {
13867    RExC_parse = regpatws(pRExC_state, RExC_parse,
13868         FALSE /* means don't recognize comments */);
13869   }
13870
13871   if (range) {
13872    if (prevvalue > value) /* b-a */ {
13873     const int w = RExC_parse - rangebegin;
13874     vFAIL2utf8f(
13875      "Invalid [] range \"%"UTF8f"\"",
13876      UTF8fARG(UTF, w, rangebegin));
13877     range = 0; /* not a valid range */
13878    }
13879   }
13880   else {
13881    prevvalue = value; /* save the beginning of the potential range */
13882    if (! stop_at_1     /* Can't be a range if parsing just one thing */
13883     && *RExC_parse == '-')
13884    {
13885     char* next_char_ptr = RExC_parse + 1;
13886     if (skip_white) {   /* Get the next real char after the '-' */
13887      next_char_ptr = regpatws(pRExC_state,
13888            RExC_parse + 1,
13889            FALSE); /* means don't recognize
13890               comments */
13891     }
13892
13893     /* If the '-' is at the end of the class (just before the ']',
13894     * it is a literal minus; otherwise it is a range */
13895     if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13896      RExC_parse = next_char_ptr;
13897
13898      /* a bad range like \w-, [:word:]- ? */
13899      if (namedclass > OOB_NAMEDCLASS) {
13900       if (strict || ckWARN(WARN_REGEXP)) {
13901        const int w =
13902         RExC_parse >= rangebegin ?
13903         RExC_parse - rangebegin : 0;
13904        if (strict) {
13905         vFAIL4("False [] range \"%*.*s\"",
13906          w, w, rangebegin);
13907        }
13908        else {
13909         vWARN4(RExC_parse,
13910          "False [] range \"%*.*s\"",
13911          w, w, rangebegin);
13912        }
13913       }
13914       if (!SIZE_ONLY) {
13915        cp_list = add_cp_to_invlist(cp_list, '-');
13916       }
13917       element_count++;
13918      } else
13919       range = 1; /* yeah, it's a range! */
13920      continue; /* but do it the next time */
13921     }
13922    }
13923   }
13924
13925   /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13926   * if not */
13927
13928   /* non-Latin1 code point implies unicode semantics.  Must be set in
13929   * pass1 so is there for the whole of pass 2 */
13930   if (value > 255) {
13931    RExC_uni_semantics = 1;
13932   }
13933
13934   /* Ready to process either the single value, or the completed range.
13935   * For single-valued non-inverted ranges, we consider the possibility
13936   * of multi-char folds.  (We made a conscious decision to not do this
13937   * for the other cases because it can often lead to non-intuitive
13938   * results.  For example, you have the peculiar case that:
13939   *  "s s" =~ /^[^\xDF]+$/i => Y
13940   *  "ss"  =~ /^[^\xDF]+$/i => N
13941   *
13942   * See [perl #89750] */
13943   if (FOLD && allow_multi_folds && value == prevvalue) {
13944    if (value == LATIN_SMALL_LETTER_SHARP_S
13945     || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13946               value)))
13947    {
13948     /* Here <value> is indeed a multi-char fold.  Get what it is */
13949
13950     U8 foldbuf[UTF8_MAXBYTES_CASE];
13951     STRLEN foldlen;
13952
13953     UV folded = _to_uni_fold_flags(
13954         value,
13955         foldbuf,
13956         &foldlen,
13957         FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
13958             ? FOLD_FLAGS_NOMIX_ASCII
13959             : 0)
13960         );
13961
13962     /* Here, <folded> should be the first character of the
13963     * multi-char fold of <value>, with <foldbuf> containing the
13964     * whole thing.  But, if this fold is not allowed (because of
13965     * the flags), <fold> will be the same as <value>, and should
13966     * be processed like any other character, so skip the special
13967     * handling */
13968     if (folded != value) {
13969
13970      /* Skip if we are recursed, currently parsing the class
13971      * again.  Otherwise add this character to the list of
13972      * multi-char folds. */
13973      if (! RExC_in_multi_char_class) {
13974       AV** this_array_ptr;
13975       AV* this_array;
13976       STRLEN cp_count = utf8_length(foldbuf,
13977              foldbuf + foldlen);
13978       SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13979
13980       Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13981
13982
13983       if (! multi_char_matches) {
13984        multi_char_matches = newAV();
13985       }
13986
13987       /* <multi_char_matches> is actually an array of arrays.
13988       * There will be one or two top-level elements: [2],
13989       * and/or [3].  The [2] element is an array, each
13990       * element thereof is a character which folds to TWO
13991       * characters; [3] is for folds to THREE characters.
13992       * (Unicode guarantees a maximum of 3 characters in any
13993       * fold.)  When we rewrite the character class below,
13994       * we will do so such that the longest folds are
13995       * written first, so that it prefers the longest
13996       * matching strings first.  This is done even if it
13997       * turns out that any quantifier is non-greedy, out of
13998       * programmer laziness.  Tom Christiansen has agreed
13999       * that this is ok.  This makes the test for the
14000       * ligature 'ffi' come before the test for 'ff' */
14001       if (av_exists(multi_char_matches, cp_count)) {
14002        this_array_ptr = (AV**) av_fetch(multi_char_matches,
14003                cp_count, FALSE);
14004        this_array = *this_array_ptr;
14005       }
14006       else {
14007        this_array = newAV();
14008        av_store(multi_char_matches, cp_count,
14009          (SV*) this_array);
14010       }
14011       av_push(this_array, multi_fold);
14012      }
14013
14014      /* This element should not be processed further in this
14015      * class */
14016      element_count--;
14017      value = save_value;
14018      prevvalue = save_prevvalue;
14019      continue;
14020     }
14021    }
14022   }
14023
14024   /* Deal with this element of the class */
14025   if (! SIZE_ONLY) {
14026 #ifndef EBCDIC
14027    cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14028              prevvalue, value);
14029 #else
14030    SV* this_range = _new_invlist(1);
14031    _append_range_to_invlist(this_range, prevvalue, value);
14032
14033    /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14034    * If this range was specified using something like 'i-j', we want
14035    * to include only the 'i' and the 'j', and not anything in
14036    * between, so exclude non-ASCII, non-alphabetics from it.
14037    * However, if the range was specified with something like
14038    * [\x89-\x91] or [\x89-j], all code points within it should be
14039    * included.  literal_endpoint==2 means both ends of the range used
14040    * a literal character, not \x{foo} */
14041    if (literal_endpoint == 2
14042     && ((prevvalue >= 'a' && value <= 'z')
14043      || (prevvalue >= 'A' && value <= 'Z')))
14044    {
14045     _invlist_intersection(this_range, PL_ASCII,
14046          &this_range);
14047
14048     /* Since this above only contains ascii, the intersection of it
14049     * with anything will still yield only ascii */
14050     _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14051          &this_range);
14052    }
14053    _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14054    literal_endpoint = 0;
14055 #endif
14056   }
14057
14058   range = 0; /* this range (if it was one) is done now */
14059  } /* End of loop through all the text within the brackets */
14060
14061  /* If anything in the class expands to more than one character, we have to
14062  * deal with them by building up a substitute parse string, and recursively
14063  * calling reg() on it, instead of proceeding */
14064  if (multi_char_matches) {
14065   SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14066   I32 cp_count;
14067   STRLEN len;
14068   char *save_end = RExC_end;
14069   char *save_parse = RExC_parse;
14070   bool first_time = TRUE;     /* First multi-char occurrence doesn't get
14071          a "|" */
14072   I32 reg_flags;
14073
14074   assert(! invert);
14075 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
14076   because too confusing */
14077   if (invert) {
14078    sv_catpv(substitute_parse, "(?:");
14079   }
14080 #endif
14081
14082   /* Look at the longest folds first */
14083   for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14084
14085    if (av_exists(multi_char_matches, cp_count)) {
14086     AV** this_array_ptr;
14087     SV* this_sequence;
14088
14089     this_array_ptr = (AV**) av_fetch(multi_char_matches,
14090             cp_count, FALSE);
14091     while ((this_sequence = av_pop(*this_array_ptr)) !=
14092                 &PL_sv_undef)
14093     {
14094      if (! first_time) {
14095       sv_catpv(substitute_parse, "|");
14096      }
14097      first_time = FALSE;
14098
14099      sv_catpv(substitute_parse, SvPVX(this_sequence));
14100     }
14101    }
14102   }
14103
14104   /* If the character class contains anything else besides these
14105   * multi-character folds, have to include it in recursive parsing */
14106   if (element_count) {
14107    sv_catpv(substitute_parse, "|[");
14108    sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14109    sv_catpv(substitute_parse, "]");
14110   }
14111
14112   sv_catpv(substitute_parse, ")");
14113 #if 0
14114   if (invert) {
14115    /* This is a way to get the parse to skip forward a whole named
14116    * sequence instead of matching the 2nd character when it fails the
14117    * first */
14118    sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14119   }
14120 #endif
14121
14122   RExC_parse = SvPV(substitute_parse, len);
14123   RExC_end = RExC_parse + len;
14124   RExC_in_multi_char_class = 1;
14125   RExC_emit = (regnode *)orig_emit;
14126
14127   ret = reg(pRExC_state, 1, &reg_flags, depth+1);
14128
14129   *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14130
14131   RExC_parse = save_parse;
14132   RExC_end = save_end;
14133   RExC_in_multi_char_class = 0;
14134   SvREFCNT_dec_NN(multi_char_matches);
14135   return ret;
14136  }
14137
14138  /* Here, we've gone through the entire class and dealt with multi-char
14139  * folds.  We are now in a position that we can do some checks to see if we
14140  * can optimize this ANYOF node into a simpler one, even in Pass 1.
14141  * Currently we only do two checks:
14142  * 1) is in the unlikely event that the user has specified both, eg. \w and
14143  *    \W under /l, then the class matches everything.  (This optimization
14144  *    is done only to make the optimizer code run later work.)
14145  * 2) if the character class contains only a single element (including a
14146  *    single range), we see if there is an equivalent node for it.
14147  * Other checks are possible */
14148  if (! ret_invlist   /* Can't optimize if returning the constructed
14149       inversion list */
14150   && (UNLIKELY(posixl_matches_all) || element_count == 1))
14151  {
14152   U8 op = END;
14153   U8 arg = 0;
14154
14155   if (UNLIKELY(posixl_matches_all)) {
14156    op = SANY;
14157   }
14158   else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14159             \w or [:digit:] or \p{foo}
14160             */
14161
14162    /* All named classes are mapped into POSIXish nodes, with its FLAG
14163    * argument giving which class it is */
14164    switch ((I32)namedclass) {
14165     case ANYOF_UNIPROP:
14166      break;
14167
14168     /* These don't depend on the charset modifiers.  They always
14169     * match under /u rules */
14170     case ANYOF_NHORIZWS:
14171     case ANYOF_HORIZWS:
14172      namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14173      /* FALLTHROUGH */
14174
14175     case ANYOF_NVERTWS:
14176     case ANYOF_VERTWS:
14177      op = POSIXU;
14178      goto join_posix;
14179
14180     /* The actual POSIXish node for all the rest depends on the
14181     * charset modifier.  The ones in the first set depend only on
14182     * ASCII or, if available on this platform, locale */
14183     case ANYOF_ASCII:
14184     case ANYOF_NASCII:
14185 #ifdef HAS_ISASCII
14186      op = (LOC) ? POSIXL : POSIXA;
14187 #else
14188      op = POSIXA;
14189 #endif
14190      goto join_posix;
14191
14192     case ANYOF_NCASED:
14193     case ANYOF_LOWER:
14194     case ANYOF_NLOWER:
14195     case ANYOF_UPPER:
14196     case ANYOF_NUPPER:
14197      /* under /a could be alpha */
14198      if (FOLD) {
14199       if (ASCII_RESTRICTED) {
14200        namedclass = ANYOF_ALPHA + (namedclass % 2);
14201       }
14202       else if (! LOC) {
14203        break;
14204       }
14205      }
14206      /* FALLTHROUGH */
14207
14208     /* The rest have more possibilities depending on the charset.
14209     * We take advantage of the enum ordering of the charset
14210     * modifiers to get the exact node type, */
14211     default:
14212      op = POSIXD + get_regex_charset(RExC_flags);
14213      if (op > POSIXA) { /* /aa is same as /a */
14214       op = POSIXA;
14215      }
14216
14217     join_posix:
14218      /* The odd numbered ones are the complements of the
14219      * next-lower even number one */
14220      if (namedclass % 2 == 1) {
14221       invert = ! invert;
14222       namedclass--;
14223      }
14224      arg = namedclass_to_classnum(namedclass);
14225      break;
14226    }
14227   }
14228   else if (value == prevvalue) {
14229
14230    /* Here, the class consists of just a single code point */
14231
14232    if (invert) {
14233     if (! LOC && value == '\n') {
14234      op = REG_ANY; /* Optimize [^\n] */
14235      *flagp |= HASWIDTH|SIMPLE;
14236      RExC_naughty++;
14237     }
14238    }
14239    else if (value < 256 || UTF) {
14240
14241     /* Optimize a single value into an EXACTish node, but not if it
14242     * would require converting the pattern to UTF-8. */
14243     op = compute_EXACTish(pRExC_state);
14244    }
14245   } /* Otherwise is a range */
14246   else if (! LOC) {   /* locale could vary these */
14247    if (prevvalue == '0') {
14248     if (value == '9') {
14249      arg = _CC_DIGIT;
14250      op = POSIXA;
14251     }
14252    }
14253   }
14254
14255   /* Here, we have changed <op> away from its initial value iff we found
14256   * an optimization */
14257   if (op != END) {
14258
14259    /* Throw away this ANYOF regnode, and emit the calculated one,
14260    * which should correspond to the beginning, not current, state of
14261    * the parse */
14262    const char * cur_parse = RExC_parse;
14263    RExC_parse = (char *)orig_parse;
14264    if ( SIZE_ONLY) {
14265     if (! LOC) {
14266
14267      /* To get locale nodes to not use the full ANYOF size would
14268      * require moving the code above that writes the portions
14269      * of it that aren't in other nodes to after this point.
14270      * e.g.  ANYOF_POSIXL_SET */
14271      RExC_size = orig_size;
14272     }
14273    }
14274    else {
14275     RExC_emit = (regnode *)orig_emit;
14276     if (PL_regkind[op] == POSIXD) {
14277      if (op == POSIXL) {
14278       RExC_contains_locale = 1;
14279      }
14280      if (invert) {
14281       op += NPOSIXD - POSIXD;
14282      }
14283     }
14284    }
14285
14286    ret = reg_node(pRExC_state, op);
14287
14288    if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14289     if (! SIZE_ONLY) {
14290      FLAGS(ret) = arg;
14291     }
14292     *flagp |= HASWIDTH|SIMPLE;
14293    }
14294    else if (PL_regkind[op] == EXACT) {
14295     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14296           TRUE /* downgradable to EXACT */
14297           );
14298    }
14299
14300    RExC_parse = (char *) cur_parse;
14301
14302    SvREFCNT_dec(posixes);
14303    SvREFCNT_dec(nposixes);
14304    SvREFCNT_dec(cp_list);
14305    SvREFCNT_dec(cp_foldable_list);
14306    return ret;
14307   }
14308  }
14309
14310  if (SIZE_ONLY)
14311   return ret;
14312  /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14313
14314  /* If folding, we calculate all characters that could fold to or from the
14315  * ones already on the list */
14316  if (cp_foldable_list) {
14317   if (FOLD) {
14318    UV start, end; /* End points of code point ranges */
14319
14320    SV* fold_intersection = NULL;
14321    SV** use_list;
14322
14323    /* Our calculated list will be for Unicode rules.  For locale
14324    * matching, we have to keep a separate list that is consulted at
14325    * runtime only when the locale indicates Unicode rules.  For
14326    * non-locale, we just use to the general list */
14327    if (LOC) {
14328     use_list = &only_utf8_locale_list;
14329    }
14330    else {
14331     use_list = &cp_list;
14332    }
14333
14334    /* Only the characters in this class that participate in folds need
14335    * be checked.  Get the intersection of this class and all the
14336    * possible characters that are foldable.  This can quickly narrow
14337    * down a large class */
14338    _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14339         &fold_intersection);
14340
14341    /* The folds for all the Latin1 characters are hard-coded into this
14342    * program, but we have to go out to disk to get the others. */
14343    if (invlist_highest(cp_foldable_list) >= 256) {
14344
14345     /* This is a hash that for a particular fold gives all
14346     * characters that are involved in it */
14347     if (! PL_utf8_foldclosures) {
14348
14349      /* If the folds haven't been read in, call a fold function
14350      * to force that */
14351      if (! PL_utf8_tofold) {
14352       U8 dummy[UTF8_MAXBYTES_CASE+1];
14353
14354       /* This string is just a short named one above \xff */
14355       to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
14356       assert(PL_utf8_tofold); /* Verify that worked */
14357      }
14358      PL_utf8_foldclosures
14359          = _swash_inversion_hash(PL_utf8_tofold);
14360     }
14361    }
14362
14363    /* Now look at the foldable characters in this class individually */
14364    invlist_iterinit(fold_intersection);
14365    while (invlist_iternext(fold_intersection, &start, &end)) {
14366     UV j;
14367
14368     /* Look at every character in the range */
14369     for (j = start; j <= end; j++) {
14370      U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14371      STRLEN foldlen;
14372      SV** listp;
14373
14374      if (j < 256) {
14375
14376       /* We have the latin1 folding rules hard-coded here so
14377       * that an innocent-looking character class, like
14378       * /[ks]/i won't have to go out to disk to find the
14379       * possible matches.  XXX It would be better to
14380       * generate these via regen, in case a new version of
14381       * the Unicode standard adds new mappings, though that
14382       * is not really likely, and may be caught by the
14383       * default: case of the switch below. */
14384
14385       if (IS_IN_SOME_FOLD_L1(j)) {
14386
14387        /* ASCII is always matched; non-ASCII is matched
14388        * only under Unicode rules (which could happen
14389        * under /l if the locale is a UTF-8 one */
14390        if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14391         *use_list = add_cp_to_invlist(*use_list,
14392                PL_fold_latin1[j]);
14393        }
14394        else {
14395         depends_list =
14396         add_cp_to_invlist(depends_list,
14397             PL_fold_latin1[j]);
14398        }
14399       }
14400
14401       if (HAS_NONLATIN1_FOLD_CLOSURE(j)
14402        && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14403       {
14404        /* Certain Latin1 characters have matches outside
14405        * Latin1.  To get here, <j> is one of those
14406        * characters.   None of these matches is valid for
14407        * ASCII characters under /aa, which is why the 'if'
14408        * just above excludes those.  These matches only
14409        * happen when the target string is utf8.  The code
14410        * below adds the single fold closures for <j> to the
14411        * inversion list. */
14412
14413        switch (j) {
14414         case 'k':
14415         case 'K':
14416         *use_list =
14417          add_cp_to_invlist(*use_list, KELVIN_SIGN);
14418          break;
14419         case 's':
14420         case 'S':
14421         *use_list = add_cp_to_invlist(*use_list,
14422              LATIN_SMALL_LETTER_LONG_S);
14423          break;
14424         case MICRO_SIGN:
14425         *use_list = add_cp_to_invlist(*use_list,
14426              GREEK_CAPITAL_LETTER_MU);
14427         *use_list = add_cp_to_invlist(*use_list,
14428               GREEK_SMALL_LETTER_MU);
14429          break;
14430         case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14431         case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14432         *use_list =
14433         add_cp_to_invlist(*use_list, ANGSTROM_SIGN);
14434          break;
14435         case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14436         *use_list = add_cp_to_invlist(*use_list,
14437           LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14438          break;
14439         case LATIN_SMALL_LETTER_SHARP_S:
14440         *use_list = add_cp_to_invlist(*use_list,
14441             LATIN_CAPITAL_LETTER_SHARP_S);
14442          break;
14443         case 'F': case 'f':
14444         case 'I': case 'i':
14445         case 'L': case 'l':
14446         case 'T': case 't':
14447         case 'A': case 'a':
14448         case 'H': case 'h':
14449         case 'J': case 'j':
14450         case 'N': case 'n':
14451         case 'W': case 'w':
14452         case 'Y': case 'y':
14453          /* These all are targets of multi-character
14454          * folds from code points that require UTF8
14455          * to express, so they can't match unless
14456          * the target string is in UTF-8, so no
14457          * action here is necessary, as regexec.c
14458          * properly handles the general case for
14459          * UTF-8 matching and multi-char folds */
14460          break;
14461         default:
14462          /* Use deprecated warning to increase the
14463          * chances of this being output */
14464          ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
14465          break;
14466        }
14467       }
14468       continue;
14469      }
14470
14471      /* Here is an above Latin1 character.  We don't have the
14472      * rules hard-coded for it.  First, get its fold.  This is
14473      * the simple fold, as the multi-character folds have been
14474      * handled earlier and separated out */
14475      _to_uni_fold_flags(j, foldbuf, &foldlen,
14476               (ASCII_FOLD_RESTRICTED)
14477               ? FOLD_FLAGS_NOMIX_ASCII
14478               : 0);
14479
14480      /* Single character fold of above Latin1.  Add everything in
14481      * its fold closure to the list that this node should match.
14482      * The fold closures data structure is a hash with the keys
14483      * being the UTF-8 of every character that is folded to, like
14484      * 'k', and the values each an array of all code points that
14485      * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
14486      * Multi-character folds are not included */
14487      if ((listp = hv_fetch(PL_utf8_foldclosures,
14488           (char *) foldbuf, foldlen, FALSE)))
14489      {
14490       AV* list = (AV*) *listp;
14491       IV k;
14492       for (k = 0; k <= av_tindex(list); k++) {
14493        SV** c_p = av_fetch(list, k, FALSE);
14494        UV c;
14495        if (c_p == NULL) {
14496         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
14497        }
14498        c = SvUV(*c_p);
14499
14500        /* /aa doesn't allow folds between ASCII and non- */
14501        if ((ASCII_FOLD_RESTRICTED
14502         && (isASCII(c) != isASCII(j))))
14503        {
14504         continue;
14505        }
14506
14507        /* Folds under /l which cross the 255/256 boundary
14508        * are added to a separate list.  (These are valid
14509        * only when the locale is UTF-8.) */
14510        if (c < 256 && LOC) {
14511         *use_list = add_cp_to_invlist(*use_list, c);
14512         continue;
14513        }
14514
14515        if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14516        {
14517         cp_list = add_cp_to_invlist(cp_list, c);
14518        }
14519        else {
14520         /* Similarly folds involving non-ascii Latin1
14521         * characters under /d are added to their list */
14522         depends_list = add_cp_to_invlist(depends_list,
14523                 c);
14524        }
14525       }
14526      }
14527     }
14528    }
14529    SvREFCNT_dec_NN(fold_intersection);
14530   }
14531
14532   /* Now that we have finished adding all the folds, there is no reason
14533   * to keep the foldable list separate */
14534   _invlist_union(cp_list, cp_foldable_list, &cp_list);
14535   SvREFCNT_dec_NN(cp_foldable_list);
14536  }
14537
14538  /* And combine the result (if any) with any inversion list from posix
14539  * classes.  The lists are kept separate up to now because we don't want to
14540  * fold the classes (folding of those is automatically handled by the swash
14541  * fetching code) */
14542  if (posixes || nposixes) {
14543   if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14544    /* Under /a and /aa, nothing above ASCII matches these */
14545    _invlist_intersection(posixes,
14546         PL_XPosix_ptrs[_CC_ASCII],
14547         &posixes);
14548   }
14549   if (nposixes) {
14550    if (DEPENDS_SEMANTICS) {
14551     /* Under /d, everything in the upper half of the Latin1 range
14552     * matches these complements */
14553     ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
14554    }
14555    else if (AT_LEAST_ASCII_RESTRICTED) {
14556     /* Under /a and /aa, everything above ASCII matches these
14557     * complements */
14558     _invlist_union_complement_2nd(nposixes,
14559            PL_XPosix_ptrs[_CC_ASCII],
14560            &nposixes);
14561    }
14562    if (posixes) {
14563     _invlist_union(posixes, nposixes, &posixes);
14564     SvREFCNT_dec_NN(nposixes);
14565    }
14566    else {
14567     posixes = nposixes;
14568    }
14569   }
14570   if (! DEPENDS_SEMANTICS) {
14571    if (cp_list) {
14572     _invlist_union(cp_list, posixes, &cp_list);
14573     SvREFCNT_dec_NN(posixes);
14574    }
14575    else {
14576     cp_list = posixes;
14577    }
14578   }
14579   else {
14580    /* Under /d, we put into a separate list the Latin1 things that
14581    * match only when the target string is utf8 */
14582    SV* nonascii_but_latin1_properties = NULL;
14583    _invlist_intersection(posixes, PL_UpperLatin1,
14584         &nonascii_but_latin1_properties);
14585    _invlist_subtract(posixes, nonascii_but_latin1_properties,
14586        &posixes);
14587    if (cp_list) {
14588     _invlist_union(cp_list, posixes, &cp_list);
14589     SvREFCNT_dec_NN(posixes);
14590    }
14591    else {
14592     cp_list = posixes;
14593    }
14594
14595    if (depends_list) {
14596     _invlist_union(depends_list, nonascii_but_latin1_properties,
14597        &depends_list);
14598     SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14599    }
14600    else {
14601     depends_list = nonascii_but_latin1_properties;
14602    }
14603   }
14604  }
14605
14606  /* And combine the result (if any) with any inversion list from properties.
14607  * The lists are kept separate up to now so that we can distinguish the two
14608  * in regards to matching above-Unicode.  A run-time warning is generated
14609  * if a Unicode property is matched against a non-Unicode code point. But,
14610  * we allow user-defined properties to match anything, without any warning,
14611  * and we also suppress the warning if there is a portion of the character
14612  * class that isn't a Unicode property, and which matches above Unicode, \W
14613  * or [\x{110000}] for example.
14614  * (Note that in this case, unlike the Posix one above, there is no
14615  * <depends_list>, because having a Unicode property forces Unicode
14616  * semantics */
14617  if (properties) {
14618   if (cp_list) {
14619
14620    /* If it matters to the final outcome, see if a non-property
14621    * component of the class matches above Unicode.  If so, the
14622    * warning gets suppressed.  This is true even if just a single
14623    * such code point is specified, as though not strictly correct if
14624    * another such code point is matched against, the fact that they
14625    * are using above-Unicode code points indicates they should know
14626    * the issues involved */
14627    if (warn_super) {
14628     warn_super = ! (invert
14629        ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14630    }
14631
14632    _invlist_union(properties, cp_list, &cp_list);
14633    SvREFCNT_dec_NN(properties);
14634   }
14635   else {
14636    cp_list = properties;
14637   }
14638
14639   if (warn_super) {
14640    ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14641   }
14642  }
14643
14644  /* Here, we have calculated what code points should be in the character
14645  * class.
14646  *
14647  * Now we can see about various optimizations.  Fold calculation (which we
14648  * did above) needs to take place before inversion.  Otherwise /[^k]/i
14649  * would invert to include K, which under /i would match k, which it
14650  * shouldn't.  Therefore we can't invert folded locale now, as it won't be
14651  * folded until runtime */
14652
14653  /* If we didn't do folding, it's because some information isn't available
14654  * until runtime; set the run-time fold flag for these.  (We don't have to
14655  * worry about properties folding, as that is taken care of by the swash
14656  * fetching).  We know to set the flag if we have a non-NULL list for UTF-8
14657  * locales, or the class matches at least one 0-255 range code point */
14658  if (LOC && FOLD) {
14659   if (only_utf8_locale_list) {
14660    ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14661   }
14662   else if (cp_list) { /* Look to see if there a 0-255 code point is in
14663        the list */
14664    UV start, end;
14665    invlist_iterinit(cp_list);
14666    if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14667     ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14668    }
14669    invlist_iterfinish(cp_list);
14670   }
14671  }
14672
14673  /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14674  * at compile time.  Besides not inverting folded locale now, we can't
14675  * invert if there are things such as \w, which aren't known until runtime
14676  * */
14677  if (cp_list
14678   && invert
14679   && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14680   && ! depends_list
14681   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14682  {
14683   _invlist_invert(cp_list);
14684
14685   /* Any swash can't be used as-is, because we've inverted things */
14686   if (swash) {
14687    SvREFCNT_dec_NN(swash);
14688    swash = NULL;
14689   }
14690
14691   /* Clear the invert flag since have just done it here */
14692   invert = FALSE;
14693  }
14694
14695  if (ret_invlist) {
14696   *ret_invlist = cp_list;
14697   SvREFCNT_dec(swash);
14698
14699   /* Discard the generated node */
14700   if (SIZE_ONLY) {
14701    RExC_size = orig_size;
14702   }
14703   else {
14704    RExC_emit = orig_emit;
14705   }
14706   return orig_emit;
14707  }
14708
14709  /* Some character classes are equivalent to other nodes.  Such nodes take
14710  * up less room and generally fewer operations to execute than ANYOF nodes.
14711  * Above, we checked for and optimized into some such equivalents for
14712  * certain common classes that are easy to test.  Getting to this point in
14713  * the code means that the class didn't get optimized there.  Since this
14714  * code is only executed in Pass 2, it is too late to save space--it has
14715  * been allocated in Pass 1, and currently isn't given back.  But turning
14716  * things into an EXACTish node can allow the optimizer to join it to any
14717  * adjacent such nodes.  And if the class is equivalent to things like /./,
14718  * expensive run-time swashes can be avoided.  Now that we have more
14719  * complete information, we can find things necessarily missed by the
14720  * earlier code.  I (khw) am not sure how much to look for here.  It would
14721  * be easy, but perhaps too slow, to check any candidates against all the
14722  * node types they could possibly match using _invlistEQ(). */
14723
14724  if (cp_list
14725   && ! invert
14726   && ! depends_list
14727   && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14728   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14729
14730   /* We don't optimize if we are supposed to make sure all non-Unicode
14731    * code points raise a warning, as only ANYOF nodes have this check.
14732    * */
14733   && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14734  {
14735   UV start, end;
14736   U8 op = END;  /* The optimzation node-type */
14737   const char * cur_parse= RExC_parse;
14738
14739   invlist_iterinit(cp_list);
14740   if (! invlist_iternext(cp_list, &start, &end)) {
14741
14742    /* Here, the list is empty.  This happens, for example, when a
14743    * Unicode property is the only thing in the character class, and
14744    * it doesn't match anything.  (perluniprops.pod notes such
14745    * properties) */
14746    op = OPFAIL;
14747    *flagp |= HASWIDTH|SIMPLE;
14748   }
14749   else if (start == end) {    /* The range is a single code point */
14750    if (! invlist_iternext(cp_list, &start, &end)
14751
14752      /* Don't do this optimization if it would require changing
14753      * the pattern to UTF-8 */
14754     && (start < 256 || UTF))
14755    {
14756     /* Here, the list contains a single code point.  Can optimize
14757     * into an EXACTish node */
14758
14759     value = start;
14760
14761     if (! FOLD) {
14762      op = EXACT;
14763     }
14764     else if (LOC) {
14765
14766      /* A locale node under folding with one code point can be
14767      * an EXACTFL, as its fold won't be calculated until
14768      * runtime */
14769      op = EXACTFL;
14770     }
14771     else {
14772
14773      /* Here, we are generally folding, but there is only one
14774      * code point to match.  If we have to, we use an EXACT
14775      * node, but it would be better for joining with adjacent
14776      * nodes in the optimization pass if we used the same
14777      * EXACTFish node that any such are likely to be.  We can
14778      * do this iff the code point doesn't participate in any
14779      * folds.  For example, an EXACTF of a colon is the same as
14780      * an EXACT one, since nothing folds to or from a colon. */
14781      if (value < 256) {
14782       if (IS_IN_SOME_FOLD_L1(value)) {
14783        op = EXACT;
14784       }
14785      }
14786      else {
14787       if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14788        op = EXACT;
14789       }
14790      }
14791
14792      /* If we haven't found the node type, above, it means we
14793      * can use the prevailing one */
14794      if (op == END) {
14795       op = compute_EXACTish(pRExC_state);
14796      }
14797     }
14798    }
14799   }
14800   else if (start == 0) {
14801    if (end == UV_MAX) {
14802     op = SANY;
14803     *flagp |= HASWIDTH|SIMPLE;
14804     RExC_naughty++;
14805    }
14806    else if (end == '\n' - 1
14807      && invlist_iternext(cp_list, &start, &end)
14808      && start == '\n' + 1 && end == UV_MAX)
14809    {
14810     op = REG_ANY;
14811     *flagp |= HASWIDTH|SIMPLE;
14812     RExC_naughty++;
14813    }
14814   }
14815   invlist_iterfinish(cp_list);
14816
14817   if (op != END) {
14818    RExC_parse = (char *)orig_parse;
14819    RExC_emit = (regnode *)orig_emit;
14820
14821    ret = reg_node(pRExC_state, op);
14822
14823    RExC_parse = (char *)cur_parse;
14824
14825    if (PL_regkind[op] == EXACT) {
14826     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14827           TRUE /* downgradable to EXACT */
14828           );
14829    }
14830
14831    SvREFCNT_dec_NN(cp_list);
14832    return ret;
14833   }
14834  }
14835
14836  /* Here, <cp_list> contains all the code points we can determine at
14837  * compile time that match under all conditions.  Go through it, and
14838  * for things that belong in the bitmap, put them there, and delete from
14839  * <cp_list>.  While we are at it, see if everything above 255 is in the
14840  * list, and if so, set a flag to speed up execution */
14841
14842  populate_ANYOF_from_invlist(ret, &cp_list);
14843
14844  if (invert) {
14845   ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14846  }
14847
14848  /* Here, the bitmap has been populated with all the Latin1 code points that
14849  * always match.  Can now add to the overall list those that match only
14850  * when the target string is UTF-8 (<depends_list>). */
14851  if (depends_list) {
14852   if (cp_list) {
14853    _invlist_union(cp_list, depends_list, &cp_list);
14854    SvREFCNT_dec_NN(depends_list);
14855   }
14856   else {
14857    cp_list = depends_list;
14858   }
14859   ANYOF_FLAGS(ret) |= ANYOF_UTF8;
14860  }
14861
14862  /* If there is a swash and more than one element, we can't use the swash in
14863  * the optimization below. */
14864  if (swash && element_count > 1) {
14865   SvREFCNT_dec_NN(swash);
14866   swash = NULL;
14867  }
14868
14869  set_ANYOF_arg(pRExC_state, ret, cp_list,
14870     (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14871     ? listsv : NULL,
14872     only_utf8_locale_list,
14873     swash, has_user_defined_property);
14874
14875  *flagp |= HASWIDTH|SIMPLE;
14876
14877  if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
14878   RExC_contains_locale = 1;
14879  }
14880
14881  return ret;
14882 }
14883
14884 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14885
14886 STATIC void
14887 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14888     regnode* const node,
14889     SV* const cp_list,
14890     SV* const runtime_defns,
14891     SV* const only_utf8_locale_list,
14892     SV* const swash,
14893     const bool has_user_defined_property)
14894 {
14895  /* Sets the arg field of an ANYOF-type node 'node', using information about
14896  * the node passed-in.  If there is nothing outside the node's bitmap, the
14897  * arg is set to ANYOF_NONBITMAP_EMPTY.  Otherwise, it sets the argument to
14898  * the count returned by add_data(), having allocated and stored an array,
14899  * av, that that count references, as follows:
14900  *  av[0] stores the character class description in its textual form.
14901  *        This is used later (regexec.c:Perl_regclass_swash()) to
14902  *        initialize the appropriate swash, and is also useful for dumping
14903  *        the regnode.  This is set to &PL_sv_undef if the textual
14904  *        description is not needed at run-time (as happens if the other
14905  *        elements completely define the class)
14906  *  av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14907  *        computed from av[0].  But if no further computation need be done,
14908  *        the swash is stored here now (and av[0] is &PL_sv_undef).
14909  *  av[2] stores the inversion list of code points that match only if the
14910  *        current locale is UTF-8
14911  *  av[3] stores the cp_list inversion list for use in addition or instead
14912  *        of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14913  *        (Otherwise everything needed is already in av[0] and av[1])
14914  *  av[4] is set if any component of the class is from a user-defined
14915  *        property; used only if av[3] exists */
14916
14917  UV n;
14918
14919  PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14920
14921  if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
14922   assert(! (ANYOF_FLAGS(node)
14923      & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
14924   ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14925  }
14926  else {
14927   AV * const av = newAV();
14928   SV *rv;
14929
14930   assert(ANYOF_FLAGS(node)
14931      & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
14932
14933   av_store(av, 0, (runtime_defns)
14934       ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14935   if (swash) {
14936    av_store(av, 1, swash);
14937    SvREFCNT_dec_NN(cp_list);
14938   }
14939   else {
14940    av_store(av, 1, &PL_sv_undef);
14941    if (cp_list) {
14942     av_store(av, 3, cp_list);
14943     av_store(av, 4, newSVuv(has_user_defined_property));
14944    }
14945   }
14946
14947   if (only_utf8_locale_list) {
14948    av_store(av, 2, only_utf8_locale_list);
14949   }
14950   else {
14951    av_store(av, 2, &PL_sv_undef);
14952   }
14953
14954   rv = newRV_noinc(MUTABLE_SV(av));
14955   n = add_data(pRExC_state, STR_WITH_LEN("s"));
14956   RExC_rxi->data->data[n] = (void*)rv;
14957   ARG_SET(node, n);
14958  }
14959 }
14960
14961
14962 /* reg_skipcomment()
14963
14964    Absorbs an /x style # comments from the input stream.
14965    Returns true if there is more text remaining in the stream.
14966    Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment
14967    terminates the pattern without including a newline.
14968
14969    Note its the callers responsibility to ensure that we are
14970    actually in /x mode
14971
14972 */
14973
14974 STATIC bool
14975 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14976 {
14977  bool ended = 0;
14978
14979  PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14980
14981  while (RExC_parse < RExC_end)
14982   if (*RExC_parse++ == '\n') {
14983    ended = 1;
14984    break;
14985   }
14986  if (!ended) {
14987   /* we ran off the end of the pattern without ending
14988   the comment, so we have to add an \n when wrapping */
14989   RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
14990   return 0;
14991  } else
14992   return 1;
14993 }
14994
14995 /* nextchar()
14996
14997    Advances the parse position, and optionally absorbs
14998    "whitespace" from the inputstream.
14999
15000    Without /x "whitespace" means (?#...) style comments only,
15001    with /x this means (?#...) and # comments and whitespace proper.
15002
15003    Returns the RExC_parse point from BEFORE the scan occurs.
15004
15005    This is the /x friendly way of saying RExC_parse++.
15006 */
15007
15008 STATIC char*
15009 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15010 {
15011  char* const retval = RExC_parse++;
15012
15013  PERL_ARGS_ASSERT_NEXTCHAR;
15014
15015  for (;;) {
15016   if (RExC_end - RExC_parse >= 3
15017    && *RExC_parse == '('
15018    && RExC_parse[1] == '?'
15019    && RExC_parse[2] == '#')
15020   {
15021    while (*RExC_parse != ')') {
15022     if (RExC_parse == RExC_end)
15023      FAIL("Sequence (?#... not terminated");
15024     RExC_parse++;
15025    }
15026    RExC_parse++;
15027    continue;
15028   }
15029   if (RExC_flags & RXf_PMf_EXTENDED) {
15030    if (isSPACE(*RExC_parse)) {
15031     RExC_parse++;
15032     continue;
15033    }
15034    else if (*RExC_parse == '#') {
15035     if ( reg_skipcomment( pRExC_state ) )
15036      continue;
15037    }
15038   }
15039   return retval;
15040  }
15041 }
15042
15043 /*
15044 - reg_node - emit a node
15045 */
15046 STATIC regnode *   /* Location. */
15047 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15048 {
15049  dVAR;
15050  regnode *ptr;
15051  regnode * const ret = RExC_emit;
15052  GET_RE_DEBUG_FLAGS_DECL;
15053
15054  PERL_ARGS_ASSERT_REG_NODE;
15055
15056  if (SIZE_ONLY) {
15057   SIZE_ALIGN(RExC_size);
15058   RExC_size += 1;
15059   return(ret);
15060  }
15061  if (RExC_emit >= RExC_emit_bound)
15062   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15063     op, RExC_emit, RExC_emit_bound);
15064
15065  NODE_ALIGN_FILL(ret);
15066  ptr = ret;
15067  FILL_ADVANCE_NODE(ptr, op);
15068  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
15069 #ifdef RE_TRACK_PATTERN_OFFSETS
15070  if (RExC_offsets) {         /* MJD */
15071   MJD_OFFSET_DEBUG(
15072    ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15073    "reg_node", __LINE__,
15074    PL_reg_name[op],
15075    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15076     ? "Overwriting end of array!\n" : "OK",
15077    (UV)(RExC_emit - RExC_emit_start),
15078    (UV)(RExC_parse - RExC_start),
15079    (UV)RExC_offsets[0]));
15080   Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15081  }
15082 #endif
15083  RExC_emit = ptr;
15084  return(ret);
15085 }
15086
15087 /*
15088 - reganode - emit a node with an argument
15089 */
15090 STATIC regnode *   /* Location. */
15091 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15092 {
15093  dVAR;
15094  regnode *ptr;
15095  regnode * const ret = RExC_emit;
15096  GET_RE_DEBUG_FLAGS_DECL;
15097
15098  PERL_ARGS_ASSERT_REGANODE;
15099
15100  if (SIZE_ONLY) {
15101   SIZE_ALIGN(RExC_size);
15102   RExC_size += 2;
15103   /*
15104   We can't do this:
15105
15106   assert(2==regarglen[op]+1);
15107
15108   Anything larger than this has to allocate the extra amount.
15109   If we changed this to be:
15110
15111   RExC_size += (1 + regarglen[op]);
15112
15113   then it wouldn't matter. Its not clear what side effect
15114   might come from that so its not done so far.
15115   -- dmq
15116   */
15117   return(ret);
15118  }
15119  if (RExC_emit >= RExC_emit_bound)
15120   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15121     op, RExC_emit, RExC_emit_bound);
15122
15123  NODE_ALIGN_FILL(ret);
15124  ptr = ret;
15125  FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15126  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
15127 #ifdef RE_TRACK_PATTERN_OFFSETS
15128  if (RExC_offsets) {         /* MJD */
15129   MJD_OFFSET_DEBUG(
15130    ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15131    "reganode",
15132    __LINE__,
15133    PL_reg_name[op],
15134    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15135    "Overwriting end of array!\n" : "OK",
15136    (UV)(RExC_emit - RExC_emit_start),
15137    (UV)(RExC_parse - RExC_start),
15138    (UV)RExC_offsets[0]));
15139   Set_Cur_Node_Offset;
15140  }
15141 #endif
15142  RExC_emit = ptr;
15143  return(ret);
15144 }
15145
15146 /*
15147 - reguni - emit (if appropriate) a Unicode character
15148 */
15149 PERL_STATIC_INLINE STRLEN
15150 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15151 {
15152  dVAR;
15153
15154  PERL_ARGS_ASSERT_REGUNI;
15155
15156  return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15157 }
15158
15159 /*
15160 - reginsert - insert an operator in front of already-emitted operand
15161 *
15162 * Means relocating the operand.
15163 */
15164 STATIC void
15165 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15166 {
15167  dVAR;
15168  regnode *src;
15169  regnode *dst;
15170  regnode *place;
15171  const int offset = regarglen[(U8)op];
15172  const int size = NODE_STEP_REGNODE + offset;
15173  GET_RE_DEBUG_FLAGS_DECL;
15174
15175  PERL_ARGS_ASSERT_REGINSERT;
15176  PERL_UNUSED_ARG(depth);
15177 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15178  DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15179  if (SIZE_ONLY) {
15180   RExC_size += size;
15181   return;
15182  }
15183
15184  src = RExC_emit;
15185  RExC_emit += size;
15186  dst = RExC_emit;
15187  if (RExC_open_parens) {
15188   int paren;
15189   /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15190   for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15191    if ( RExC_open_parens[paren] >= opnd ) {
15192     /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15193     RExC_open_parens[paren] += size;
15194    } else {
15195     /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15196    }
15197    if ( RExC_close_parens[paren] >= opnd ) {
15198     /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15199     RExC_close_parens[paren] += size;
15200    } else {
15201     /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15202    }
15203   }
15204  }
15205
15206  while (src > opnd) {
15207   StructCopy(--src, --dst, regnode);
15208 #ifdef RE_TRACK_PATTERN_OFFSETS
15209   if (RExC_offsets) {     /* MJD 20010112 */
15210    MJD_OFFSET_DEBUG(
15211     ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15212     "reg_insert",
15213     __LINE__,
15214     PL_reg_name[op],
15215     (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15216      ? "Overwriting end of array!\n" : "OK",
15217     (UV)(src - RExC_emit_start),
15218     (UV)(dst - RExC_emit_start),
15219     (UV)RExC_offsets[0]));
15220    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15221    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15222   }
15223 #endif
15224  }
15225
15226
15227  place = opnd;  /* Op node, where operand used to be. */
15228 #ifdef RE_TRACK_PATTERN_OFFSETS
15229  if (RExC_offsets) {         /* MJD */
15230   MJD_OFFSET_DEBUG(
15231    ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15232    "reginsert",
15233    __LINE__,
15234    PL_reg_name[op],
15235    (UV)(place - RExC_emit_start) > RExC_offsets[0]
15236    ? "Overwriting end of array!\n" : "OK",
15237    (UV)(place - RExC_emit_start),
15238    (UV)(RExC_parse - RExC_start),
15239    (UV)RExC_offsets[0]));
15240   Set_Node_Offset(place, RExC_parse);
15241   Set_Node_Length(place, 1);
15242  }
15243 #endif
15244  src = NEXTOPER(place);
15245  FILL_ADVANCE_NODE(place, op);
15246  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
15247  Zero(src, offset, regnode);
15248 }
15249
15250 /*
15251 - regtail - set the next-pointer at the end of a node chain of p to val.
15252 - SEE ALSO: regtail_study
15253 */
15254 /* TODO: All three parms should be const */
15255 STATIC void
15256 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15257     const regnode *val,U32 depth)
15258 {
15259  dVAR;
15260  regnode *scan;
15261  GET_RE_DEBUG_FLAGS_DECL;
15262
15263  PERL_ARGS_ASSERT_REGTAIL;
15264 #ifndef DEBUGGING
15265  PERL_UNUSED_ARG(depth);
15266 #endif
15267
15268  if (SIZE_ONLY)
15269   return;
15270
15271  /* Find last node. */
15272  scan = p;
15273  for (;;) {
15274   regnode * const temp = regnext(scan);
15275   DEBUG_PARSE_r({
15276    SV * const mysv=sv_newmortal();
15277    DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15278    regprop(RExC_rx, mysv, scan, NULL);
15279    PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15280     SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15281      (temp == NULL ? "->" : ""),
15282      (temp == NULL ? PL_reg_name[OP(val)] : "")
15283    );
15284   });
15285   if (temp == NULL)
15286    break;
15287   scan = temp;
15288  }
15289
15290  if (reg_off_by_arg[OP(scan)]) {
15291   ARG_SET(scan, val - scan);
15292  }
15293  else {
15294   NEXT_OFF(scan) = val - scan;
15295  }
15296 }
15297
15298 #ifdef DEBUGGING
15299 /*
15300 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15301 - Look for optimizable sequences at the same time.
15302 - currently only looks for EXACT chains.
15303
15304 This is experimental code. The idea is to use this routine to perform
15305 in place optimizations on branches and groups as they are constructed,
15306 with the long term intention of removing optimization from study_chunk so
15307 that it is purely analytical.
15308
15309 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15310 to control which is which.
15311
15312 */
15313 /* TODO: All four parms should be const */
15314
15315 STATIC U8
15316 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15317      const regnode *val,U32 depth)
15318 {
15319  dVAR;
15320  regnode *scan;
15321  U8 exact = PSEUDO;
15322 #ifdef EXPERIMENTAL_INPLACESCAN
15323  I32 min = 0;
15324 #endif
15325  GET_RE_DEBUG_FLAGS_DECL;
15326
15327  PERL_ARGS_ASSERT_REGTAIL_STUDY;
15328
15329
15330  if (SIZE_ONLY)
15331   return exact;
15332
15333  /* Find last node. */
15334
15335  scan = p;
15336  for (;;) {
15337   regnode * const temp = regnext(scan);
15338 #ifdef EXPERIMENTAL_INPLACESCAN
15339   if (PL_regkind[OP(scan)] == EXACT) {
15340    bool unfolded_multi_char; /* Unexamined in this routine */
15341    if (join_exact(pRExC_state, scan, &min,
15342       &unfolded_multi_char, 1, val, depth+1))
15343     return EXACT;
15344   }
15345 #endif
15346   if ( exact ) {
15347    switch (OP(scan)) {
15348     case EXACT:
15349     case EXACTF:
15350     case EXACTFA_NO_TRIE:
15351     case EXACTFA:
15352     case EXACTFU:
15353     case EXACTFU_SS:
15354     case EXACTFL:
15355       if( exact == PSEUDO )
15356        exact= OP(scan);
15357       else if ( exact != OP(scan) )
15358        exact= 0;
15359     case NOTHING:
15360      break;
15361     default:
15362      exact= 0;
15363    }
15364   }
15365   DEBUG_PARSE_r({
15366    SV * const mysv=sv_newmortal();
15367    DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15368    regprop(RExC_rx, mysv, scan, NULL);
15369    PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15370     SvPV_nolen_const(mysv),
15371     REG_NODE_NUM(scan),
15372     PL_reg_name[exact]);
15373   });
15374   if (temp == NULL)
15375    break;
15376   scan = temp;
15377  }
15378  DEBUG_PARSE_r({
15379   SV * const mysv_val=sv_newmortal();
15380   DEBUG_PARSE_MSG("");
15381   regprop(RExC_rx, mysv_val, val, NULL);
15382   PerlIO_printf(Perl_debug_log,
15383      "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15384      SvPV_nolen_const(mysv_val),
15385      (IV)REG_NODE_NUM(val),
15386      (IV)(val - scan)
15387   );
15388  });
15389  if (reg_off_by_arg[OP(scan)]) {
15390   ARG_SET(scan, val - scan);
15391  }
15392  else {
15393   NEXT_OFF(scan) = val - scan;
15394  }
15395
15396  return exact;
15397 }
15398 #endif
15399
15400 /*
15401  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15402  */
15403 #ifdef DEBUGGING
15404
15405 static void
15406 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15407 {
15408  int bit;
15409  int set=0;
15410
15411  ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15412
15413  for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15414   if (flags & (1<<bit)) {
15415    if (!set++ && lead)
15416     PerlIO_printf(Perl_debug_log, "%s",lead);
15417    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15418   }
15419  }
15420  if (lead)  {
15421   if (set)
15422    PerlIO_printf(Perl_debug_log, "\n");
15423   else
15424    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15425  }
15426 }
15427
15428 static void
15429 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15430 {
15431  int bit;
15432  int set=0;
15433  regex_charset cs;
15434
15435  ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15436
15437  for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15438   if (flags & (1<<bit)) {
15439    if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
15440     continue;
15441    }
15442    if (!set++ && lead)
15443     PerlIO_printf(Perl_debug_log, "%s",lead);
15444    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15445   }
15446  }
15447  if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15448    if (!set++ && lead) {
15449     PerlIO_printf(Perl_debug_log, "%s",lead);
15450    }
15451    switch (cs) {
15452     case REGEX_UNICODE_CHARSET:
15453      PerlIO_printf(Perl_debug_log, "UNICODE");
15454      break;
15455     case REGEX_LOCALE_CHARSET:
15456      PerlIO_printf(Perl_debug_log, "LOCALE");
15457      break;
15458     case REGEX_ASCII_RESTRICTED_CHARSET:
15459      PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15460      break;
15461     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15462      PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15463      break;
15464     default:
15465      PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15466      break;
15467    }
15468  }
15469  if (lead)  {
15470   if (set)
15471    PerlIO_printf(Perl_debug_log, "\n");
15472   else
15473    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15474  }
15475 }
15476 #endif
15477
15478 void
15479 Perl_regdump(pTHX_ const regexp *r)
15480 {
15481 #ifdef DEBUGGING
15482  dVAR;
15483  SV * const sv = sv_newmortal();
15484  SV *dsv= sv_newmortal();
15485  RXi_GET_DECL(r,ri);
15486  GET_RE_DEBUG_FLAGS_DECL;
15487
15488  PERL_ARGS_ASSERT_REGDUMP;
15489
15490  (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15491
15492  /* Header fields of interest. */
15493  if (r->anchored_substr) {
15494   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15495    RE_SV_DUMPLEN(r->anchored_substr), 30);
15496   PerlIO_printf(Perl_debug_log,
15497      "anchored %s%s at %"IVdf" ",
15498      s, RE_SV_TAIL(r->anchored_substr),
15499      (IV)r->anchored_offset);
15500  } else if (r->anchored_utf8) {
15501   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15502    RE_SV_DUMPLEN(r->anchored_utf8), 30);
15503   PerlIO_printf(Perl_debug_log,
15504      "anchored utf8 %s%s at %"IVdf" ",
15505      s, RE_SV_TAIL(r->anchored_utf8),
15506      (IV)r->anchored_offset);
15507  }
15508  if (r->float_substr) {
15509   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15510    RE_SV_DUMPLEN(r->float_substr), 30);
15511   PerlIO_printf(Perl_debug_log,
15512      "floating %s%s at %"IVdf"..%"UVuf" ",
15513      s, RE_SV_TAIL(r->float_substr),
15514      (IV)r->float_min_offset, (UV)r->float_max_offset);
15515  } else if (r->float_utf8) {
15516   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15517    RE_SV_DUMPLEN(r->float_utf8), 30);
15518   PerlIO_printf(Perl_debug_log,
15519      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15520      s, RE_SV_TAIL(r->float_utf8),
15521      (IV)r->float_min_offset, (UV)r->float_max_offset);
15522  }
15523  if (r->check_substr || r->check_utf8)
15524   PerlIO_printf(Perl_debug_log,
15525      (const char *)
15526      (r->check_substr == r->float_substr
15527      && r->check_utf8 == r->float_utf8
15528      ? "(checking floating" : "(checking anchored"));
15529  if (r->intflags & PREGf_NOSCAN)
15530   PerlIO_printf(Perl_debug_log, " noscan");
15531  if (r->extflags & RXf_CHECK_ALL)
15532   PerlIO_printf(Perl_debug_log, " isall");
15533  if (r->check_substr || r->check_utf8)
15534   PerlIO_printf(Perl_debug_log, ") ");
15535
15536  if (ri->regstclass) {
15537   regprop(r, sv, ri->regstclass, NULL);
15538   PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15539  }
15540  if (r->intflags & PREGf_ANCH) {
15541   PerlIO_printf(Perl_debug_log, "anchored");
15542   if (r->intflags & PREGf_ANCH_BOL)
15543    PerlIO_printf(Perl_debug_log, "(BOL)");
15544   if (r->intflags & PREGf_ANCH_MBOL)
15545    PerlIO_printf(Perl_debug_log, "(MBOL)");
15546   if (r->intflags & PREGf_ANCH_SBOL)
15547    PerlIO_printf(Perl_debug_log, "(SBOL)");
15548   if (r->intflags & PREGf_ANCH_GPOS)
15549    PerlIO_printf(Perl_debug_log, "(GPOS)");
15550   PerlIO_putc(Perl_debug_log, ' ');
15551  }
15552  if (r->intflags & PREGf_GPOS_SEEN)
15553   PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15554  if (r->intflags & PREGf_SKIP)
15555   PerlIO_printf(Perl_debug_log, "plus ");
15556  if (r->intflags & PREGf_IMPLICIT)
15557   PerlIO_printf(Perl_debug_log, "implicit ");
15558  PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15559  if (r->extflags & RXf_EVAL_SEEN)
15560   PerlIO_printf(Perl_debug_log, "with eval ");
15561  PerlIO_printf(Perl_debug_log, "\n");
15562  DEBUG_FLAGS_r({
15563   regdump_extflags("r->extflags: ",r->extflags);
15564   regdump_intflags("r->intflags: ",r->intflags);
15565  });
15566 #else
15567  PERL_ARGS_ASSERT_REGDUMP;
15568  PERL_UNUSED_CONTEXT;
15569  PERL_UNUSED_ARG(r);
15570 #endif /* DEBUGGING */
15571 }
15572
15573 /*
15574 - regprop - printable representation of opcode, with run time support
15575 */
15576
15577 void
15578 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
15579 {
15580 #ifdef DEBUGGING
15581  dVAR;
15582  int k;
15583
15584  /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15585  static const char * const anyofs[] = {
15586 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15587  || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
15588  || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
15589  || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
15590  || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
15591  || _CC_VERTSPACE != 16
15592   #error Need to adjust order of anyofs[]
15593 #endif
15594   "\\w",
15595   "\\W",
15596   "\\d",
15597   "\\D",
15598   "[:alpha:]",
15599   "[:^alpha:]",
15600   "[:lower:]",
15601   "[:^lower:]",
15602   "[:upper:]",
15603   "[:^upper:]",
15604   "[:punct:]",
15605   "[:^punct:]",
15606   "[:print:]",
15607   "[:^print:]",
15608   "[:alnum:]",
15609   "[:^alnum:]",
15610   "[:graph:]",
15611   "[:^graph:]",
15612   "[:cased:]",
15613   "[:^cased:]",
15614   "\\s",
15615   "\\S",
15616   "[:blank:]",
15617   "[:^blank:]",
15618   "[:xdigit:]",
15619   "[:^xdigit:]",
15620   "[:space:]",
15621   "[:^space:]",
15622   "[:cntrl:]",
15623   "[:^cntrl:]",
15624   "[:ascii:]",
15625   "[:^ascii:]",
15626   "\\v",
15627   "\\V"
15628  };
15629  RXi_GET_DECL(prog,progi);
15630  GET_RE_DEBUG_FLAGS_DECL;
15631
15632  PERL_ARGS_ASSERT_REGPROP;
15633
15634  sv_setpvs(sv, "");
15635
15636  if (OP(o) > REGNODE_MAX)  /* regnode.type is unsigned */
15637   /* It would be nice to FAIL() here, but this may be called from
15638   regexec.c, and it would be hard to supply pRExC_state. */
15639   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15640            (int)OP(o), (int)REGNODE_MAX);
15641  sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15642
15643  k = PL_regkind[OP(o)];
15644
15645  if (k == EXACT) {
15646   sv_catpvs(sv, " ");
15647   /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15648   * is a crude hack but it may be the best for now since
15649   * we have no flag "this EXACTish node was UTF-8"
15650   * --jhi */
15651   pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15652     PERL_PV_ESCAPE_UNI_DETECT |
15653     PERL_PV_ESCAPE_NONASCII   |
15654     PERL_PV_PRETTY_ELLIPSES   |
15655     PERL_PV_PRETTY_LTGT       |
15656     PERL_PV_PRETTY_NOCLEAR
15657     );
15658  } else if (k == TRIE) {
15659   /* print the details of the trie in dumpuntil instead, as
15660   * progi->data isn't available here */
15661   const char op = OP(o);
15662   const U32 n = ARG(o);
15663   const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15664    (reg_ac_data *)progi->data->data[n] :
15665    NULL;
15666   const reg_trie_data * const trie
15667    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15668
15669   Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15670   DEBUG_TRIE_COMPILE_r(
15671   Perl_sv_catpvf(aTHX_ sv,
15672    "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15673    (UV)trie->startstate,
15674    (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15675    (UV)trie->wordcount,
15676    (UV)trie->minlen,
15677    (UV)trie->maxlen,
15678    (UV)TRIE_CHARCOUNT(trie),
15679    (UV)trie->uniquecharcount
15680   );
15681   );
15682   if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15683    sv_catpvs(sv, "[");
15684    (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15685             ? ANYOF_BITMAP(o)
15686             : TRIE_BITMAP(trie));
15687    sv_catpvs(sv, "]");
15688   }
15689
15690  } else if (k == CURLY) {
15691   if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15692    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15693   Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15694  }
15695  else if (k == WHILEM && o->flags)   /* Ordinal/of */
15696   Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15697  else if (k == REF || k == OPEN || k == CLOSE
15698    || k == GROUPP || OP(o)==ACCEPT)
15699  {
15700   Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
15701   if ( RXp_PAREN_NAMES(prog) ) {
15702    if ( k != REF || (OP(o) < NREF)) {
15703     AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15704     SV **name= av_fetch(list, ARG(o), 0 );
15705     if (name)
15706      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15707    }
15708    else {
15709     AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15710     SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15711     I32 *nums=(I32*)SvPVX(sv_dat);
15712     SV **name= av_fetch(list, nums[0], 0 );
15713     I32 n;
15714     if (name) {
15715      for ( n=0; n<SvIVX(sv_dat); n++ ) {
15716       Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15717          (n ? "," : ""), (IV)nums[n]);
15718      }
15719      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15720     }
15721    }
15722   }
15723   if ( k == REF && reginfo) {
15724    U32 n = ARG(o);  /* which paren pair */
15725    I32 ln = prog->offs[n].start;
15726    if (prog->lastparen < n || ln == -1)
15727     Perl_sv_catpvf(aTHX_ sv, ": FAIL");
15728    else if (ln == prog->offs[n].end)
15729     Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
15730    else {
15731     const char *s = reginfo->strbeg + ln;
15732     Perl_sv_catpvf(aTHX_ sv, ": ");
15733     Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
15734      PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
15735    }
15736   }
15737  } else if (k == GOSUB)
15738   /* Paren and offset */
15739   Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
15740  else if (k == VERB) {
15741   if (!o->flags)
15742    Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15743       SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15744  } else if (k == LOGICAL)
15745   /* 2: embedded, otherwise 1 */
15746   Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
15747  else if (k == ANYOF) {
15748   const U8 flags = ANYOF_FLAGS(o);
15749   int do_sep = 0;
15750
15751
15752   if (flags & ANYOF_LOCALE_FLAGS)
15753    sv_catpvs(sv, "{loc}");
15754   if (flags & ANYOF_LOC_FOLD)
15755    sv_catpvs(sv, "{i}");
15756   Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15757   if (flags & ANYOF_INVERT)
15758    sv_catpvs(sv, "^");
15759
15760   /* output what the standard cp 0-255 bitmap matches */
15761   do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15762
15763   /* output any special charclass tests (used entirely under use
15764   * locale) * */
15765   if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15766    int i;
15767    for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15768     if (ANYOF_POSIXL_TEST(o,i)) {
15769      sv_catpv(sv, anyofs[i]);
15770      do_sep = 1;
15771     }
15772    }
15773   }
15774
15775   if ((flags & (ANYOF_ABOVE_LATIN1_ALL
15776      |ANYOF_UTF8
15777      |ANYOF_NONBITMAP_NON_UTF8
15778      |ANYOF_LOC_FOLD)))
15779   {
15780    if (do_sep) {
15781     Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15782     if (flags & ANYOF_INVERT)
15783      /*make sure the invert info is in each */
15784      sv_catpvs(sv, "^");
15785    }
15786
15787    if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
15788     sv_catpvs(sv, "{non-utf8-latin1-all}");
15789    }
15790
15791    /* output information about the unicode matching */
15792    if (flags & ANYOF_ABOVE_LATIN1_ALL)
15793     sv_catpvs(sv, "{unicode_all}");
15794    else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
15795     SV *lv; /* Set if there is something outside the bit map. */
15796     bool byte_output = FALSE;   /* If something in the bitmap has
15797            been output */
15798     SV *only_utf8_locale;
15799
15800     /* Get the stuff that wasn't in the bitmap */
15801     (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
15802              &lv, &only_utf8_locale);
15803     if (lv && lv != &PL_sv_undef) {
15804      char *s = savesvpv(lv);
15805      char * const origs = s;
15806
15807      while (*s && *s != '\n')
15808       s++;
15809
15810      if (*s == '\n') {
15811       const char * const t = ++s;
15812
15813       if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15814        sv_catpvs(sv, "{outside bitmap}");
15815       }
15816       else {
15817        sv_catpvs(sv, "{utf8}");
15818       }
15819
15820       if (byte_output) {
15821        sv_catpvs(sv, " ");
15822       }
15823
15824       while (*s) {
15825        if (*s == '\n') {
15826
15827         /* Truncate very long output */
15828         if (s - origs > 256) {
15829          Perl_sv_catpvf(aTHX_ sv,
15830             "%.*s...",
15831             (int) (s - origs - 1),
15832             t);
15833          goto out_dump;
15834         }
15835         *s = ' ';
15836        }
15837        else if (*s == '\t') {
15838         *s = '-';
15839        }
15840        s++;
15841       }
15842       if (s[-1] == ' ')
15843        s[-1] = 0;
15844
15845       sv_catpv(sv, t);
15846      }
15847
15848     out_dump:
15849
15850      Safefree(origs);
15851      SvREFCNT_dec_NN(lv);
15852     }
15853
15854     if ((flags & ANYOF_LOC_FOLD)
15855      && only_utf8_locale
15856      && only_utf8_locale != &PL_sv_undef)
15857     {
15858      UV start, end;
15859      int max_entries = 256;
15860
15861      sv_catpvs(sv, "{utf8 locale}");
15862      invlist_iterinit(only_utf8_locale);
15863      while (invlist_iternext(only_utf8_locale,
15864            &start, &end)) {
15865       put_range(sv, start, end);
15866       max_entries --;
15867       if (max_entries < 0) {
15868        sv_catpvs(sv, "...");
15869        break;
15870       }
15871      }
15872      invlist_iterfinish(only_utf8_locale);
15873     }
15874    }
15875   }
15876
15877   Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15878  }
15879  else if (k == POSIXD || k == NPOSIXD) {
15880   U8 index = FLAGS(o) * 2;
15881   if (index < C_ARRAY_LENGTH(anyofs)) {
15882    if (*anyofs[index] != '[')  {
15883     sv_catpv(sv, "[");
15884    }
15885    sv_catpv(sv, anyofs[index]);
15886    if (*anyofs[index] != '[')  {
15887     sv_catpv(sv, "]");
15888    }
15889   }
15890   else {
15891    Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15892   }
15893  }
15894  else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15895   Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15896 #else
15897  PERL_UNUSED_CONTEXT;
15898  PERL_UNUSED_ARG(sv);
15899  PERL_UNUSED_ARG(o);
15900  PERL_UNUSED_ARG(prog);
15901  PERL_UNUSED_ARG(reginfo);
15902 #endif /* DEBUGGING */
15903 }
15904
15905
15906
15907 SV *
15908 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15909 {    /* Assume that RE_INTUIT is set */
15910  dVAR;
15911  struct regexp *const prog = ReANY(r);
15912  GET_RE_DEBUG_FLAGS_DECL;
15913
15914  PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15915  PERL_UNUSED_CONTEXT;
15916
15917  DEBUG_COMPILE_r(
15918   {
15919    const char * const s = SvPV_nolen_const(prog->check_substr
15920      ? prog->check_substr : prog->check_utf8);
15921
15922    if (!PL_colorset) reginitcolors();
15923    PerlIO_printf(Perl_debug_log,
15924      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15925      PL_colors[4],
15926      prog->check_substr ? "" : "utf8 ",
15927      PL_colors[5],PL_colors[0],
15928      s,
15929      PL_colors[1],
15930      (strlen(s) > 60 ? "..." : ""));
15931   } );
15932
15933  return prog->check_substr ? prog->check_substr : prog->check_utf8;
15934 }
15935
15936 /*
15937    pregfree()
15938
15939    handles refcounting and freeing the perl core regexp structure. When
15940    it is necessary to actually free the structure the first thing it
15941    does is call the 'free' method of the regexp_engine associated to
15942    the regexp, allowing the handling of the void *pprivate; member
15943    first. (This routine is not overridable by extensions, which is why
15944    the extensions free is called first.)
15945
15946    See regdupe and regdupe_internal if you change anything here.
15947 */
15948 #ifndef PERL_IN_XSUB_RE
15949 void
15950 Perl_pregfree(pTHX_ REGEXP *r)
15951 {
15952  SvREFCNT_dec(r);
15953 }
15954
15955 void
15956 Perl_pregfree2(pTHX_ REGEXP *rx)
15957 {
15958  dVAR;
15959  struct regexp *const r = ReANY(rx);
15960  GET_RE_DEBUG_FLAGS_DECL;
15961
15962  PERL_ARGS_ASSERT_PREGFREE2;
15963
15964  if (r->mother_re) {
15965   ReREFCNT_dec(r->mother_re);
15966  } else {
15967   CALLREGFREE_PVT(rx); /* free the private data */
15968   SvREFCNT_dec(RXp_PAREN_NAMES(r));
15969   Safefree(r->xpv_len_u.xpvlenu_pv);
15970  }
15971  if (r->substrs) {
15972   SvREFCNT_dec(r->anchored_substr);
15973   SvREFCNT_dec(r->anchored_utf8);
15974   SvREFCNT_dec(r->float_substr);
15975   SvREFCNT_dec(r->float_utf8);
15976   Safefree(r->substrs);
15977  }
15978  RX_MATCH_COPY_FREE(rx);
15979 #ifdef PERL_ANY_COW
15980  SvREFCNT_dec(r->saved_copy);
15981 #endif
15982  Safefree(r->offs);
15983  SvREFCNT_dec(r->qr_anoncv);
15984  rx->sv_u.svu_rx = 0;
15985 }
15986
15987 /*  reg_temp_copy()
15988
15989  This is a hacky workaround to the structural issue of match results
15990  being stored in the regexp structure which is in turn stored in
15991  PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15992  could be PL_curpm in multiple contexts, and could require multiple
15993  result sets being associated with the pattern simultaneously, such
15994  as when doing a recursive match with (??{$qr})
15995
15996  The solution is to make a lightweight copy of the regexp structure
15997  when a qr// is returned from the code executed by (??{$qr}) this
15998  lightweight copy doesn't actually own any of its data except for
15999  the starp/end and the actual regexp structure itself.
16000
16001 */
16002
16003
16004 REGEXP *
16005 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16006 {
16007  struct regexp *ret;
16008  struct regexp *const r = ReANY(rx);
16009  const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16010
16011  PERL_ARGS_ASSERT_REG_TEMP_COPY;
16012
16013  if (!ret_x)
16014   ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16015  else {
16016   SvOK_off((SV *)ret_x);
16017   if (islv) {
16018    /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16019    to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
16020    made both spots point to the same regexp body.) */
16021    REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16022    assert(!SvPVX(ret_x));
16023    ret_x->sv_u.svu_rx = temp->sv_any;
16024    temp->sv_any = NULL;
16025    SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16026    SvREFCNT_dec_NN(temp);
16027    /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16028    ing below will not set it. */
16029    SvCUR_set(ret_x, SvCUR(rx));
16030   }
16031  }
16032  /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16033  sv_force_normal(sv) is called.  */
16034  SvFAKE_on(ret_x);
16035  ret = ReANY(ret_x);
16036
16037  SvFLAGS(ret_x) |= SvUTF8(rx);
16038  /* We share the same string buffer as the original regexp, on which we
16039  hold a reference count, incremented when mother_re is set below.
16040  The string pointer is copied here, being part of the regexp struct.
16041  */
16042  memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16043   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16044  if (r->offs) {
16045   const I32 npar = r->nparens+1;
16046   Newx(ret->offs, npar, regexp_paren_pair);
16047   Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16048  }
16049  if (r->substrs) {
16050   Newx(ret->substrs, 1, struct reg_substr_data);
16051   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16052
16053   SvREFCNT_inc_void(ret->anchored_substr);
16054   SvREFCNT_inc_void(ret->anchored_utf8);
16055   SvREFCNT_inc_void(ret->float_substr);
16056   SvREFCNT_inc_void(ret->float_utf8);
16057
16058   /* check_substr and check_utf8, if non-NULL, point to either their
16059   anchored or float namesakes, and don't hold a second reference.  */
16060  }
16061  RX_MATCH_COPIED_off(ret_x);
16062 #ifdef PERL_ANY_COW
16063  ret->saved_copy = NULL;
16064 #endif
16065  ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16066  SvREFCNT_inc_void(ret->qr_anoncv);
16067
16068  return ret_x;
16069 }
16070 #endif
16071
16072 /* regfree_internal()
16073
16074    Free the private data in a regexp. This is overloadable by
16075    extensions. Perl takes care of the regexp structure in pregfree(),
16076    this covers the *pprivate pointer which technically perl doesn't
16077    know about, however of course we have to handle the
16078    regexp_internal structure when no extension is in use.
16079
16080    Note this is called before freeing anything in the regexp
16081    structure.
16082  */
16083
16084 void
16085 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16086 {
16087  dVAR;
16088  struct regexp *const r = ReANY(rx);
16089  RXi_GET_DECL(r,ri);
16090  GET_RE_DEBUG_FLAGS_DECL;
16091
16092  PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16093
16094  DEBUG_COMPILE_r({
16095   if (!PL_colorset)
16096    reginitcolors();
16097   {
16098    SV *dsv= sv_newmortal();
16099    RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16100     dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16101    PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16102     PL_colors[4],PL_colors[5],s);
16103   }
16104  });
16105 #ifdef RE_TRACK_PATTERN_OFFSETS
16106  if (ri->u.offsets)
16107   Safefree(ri->u.offsets);             /* 20010421 MJD */
16108 #endif
16109  if (ri->code_blocks) {
16110   int n;
16111   for (n = 0; n < ri->num_code_blocks; n++)
16112    SvREFCNT_dec(ri->code_blocks[n].src_regex);
16113   Safefree(ri->code_blocks);
16114  }
16115
16116  if (ri->data) {
16117   int n = ri->data->count;
16118
16119   while (--n >= 0) {
16120   /* If you add a ->what type here, update the comment in regcomp.h */
16121    switch (ri->data->what[n]) {
16122    case 'a':
16123    case 'r':
16124    case 's':
16125    case 'S':
16126    case 'u':
16127     SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16128     break;
16129    case 'f':
16130     Safefree(ri->data->data[n]);
16131     break;
16132    case 'l':
16133    case 'L':
16134     break;
16135    case 'T':
16136     { /* Aho Corasick add-on structure for a trie node.
16137      Used in stclass optimization only */
16138      U32 refcount;
16139      reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16140      OP_REFCNT_LOCK;
16141      refcount = --aho->refcount;
16142      OP_REFCNT_UNLOCK;
16143      if ( !refcount ) {
16144       PerlMemShared_free(aho->states);
16145       PerlMemShared_free(aho->fail);
16146       /* do this last!!!! */
16147       PerlMemShared_free(ri->data->data[n]);
16148       PerlMemShared_free(ri->regstclass);
16149      }
16150     }
16151     break;
16152    case 't':
16153     {
16154      /* trie structure. */
16155      U32 refcount;
16156      reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16157      OP_REFCNT_LOCK;
16158      refcount = --trie->refcount;
16159      OP_REFCNT_UNLOCK;
16160      if ( !refcount ) {
16161       PerlMemShared_free(trie->charmap);
16162       PerlMemShared_free(trie->states);
16163       PerlMemShared_free(trie->trans);
16164       if (trie->bitmap)
16165        PerlMemShared_free(trie->bitmap);
16166       if (trie->jump)
16167        PerlMemShared_free(trie->jump);
16168       PerlMemShared_free(trie->wordinfo);
16169       /* do this last!!!! */
16170       PerlMemShared_free(ri->data->data[n]);
16171      }
16172     }
16173     break;
16174    default:
16175     Perl_croak(aTHX_ "panic: regfree data code '%c'",
16176              ri->data->what[n]);
16177    }
16178   }
16179   Safefree(ri->data->what);
16180   Safefree(ri->data);
16181  }
16182
16183  Safefree(ri);
16184 }
16185
16186 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16187 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16188 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
16189
16190 /*
16191    re_dup - duplicate a regexp.
16192
16193    This routine is expected to clone a given regexp structure. It is only
16194    compiled under USE_ITHREADS.
16195
16196    After all of the core data stored in struct regexp is duplicated
16197    the regexp_engine.dupe method is used to copy any private data
16198    stored in the *pprivate pointer. This allows extensions to handle
16199    any duplication it needs to do.
16200
16201    See pregfree() and regfree_internal() if you change anything here.
16202 */
16203 #if defined(USE_ITHREADS)
16204 #ifndef PERL_IN_XSUB_RE
16205 void
16206 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16207 {
16208  dVAR;
16209  I32 npar;
16210  const struct regexp *r = ReANY(sstr);
16211  struct regexp *ret = ReANY(dstr);
16212
16213  PERL_ARGS_ASSERT_RE_DUP_GUTS;
16214
16215  npar = r->nparens+1;
16216  Newx(ret->offs, npar, regexp_paren_pair);
16217  Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16218
16219  if (ret->substrs) {
16220   /* Do it this way to avoid reading from *r after the StructCopy().
16221   That way, if any of the sv_dup_inc()s dislodge *r from the L1
16222   cache, it doesn't matter.  */
16223   const bool anchored = r->check_substr
16224    ? r->check_substr == r->anchored_substr
16225    : r->check_utf8 == r->anchored_utf8;
16226   Newx(ret->substrs, 1, struct reg_substr_data);
16227   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16228
16229   ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16230   ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16231   ret->float_substr = sv_dup_inc(ret->float_substr, param);
16232   ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16233
16234   /* check_substr and check_utf8, if non-NULL, point to either their
16235   anchored or float namesakes, and don't hold a second reference.  */
16236
16237   if (ret->check_substr) {
16238    if (anchored) {
16239     assert(r->check_utf8 == r->anchored_utf8);
16240     ret->check_substr = ret->anchored_substr;
16241     ret->check_utf8 = ret->anchored_utf8;
16242    } else {
16243     assert(r->check_substr == r->float_substr);
16244     assert(r->check_utf8 == r->float_utf8);
16245     ret->check_substr = ret->float_substr;
16246     ret->check_utf8 = ret->float_utf8;
16247    }
16248   } else if (ret->check_utf8) {
16249    if (anchored) {
16250     ret->check_utf8 = ret->anchored_utf8;
16251    } else {
16252     ret->check_utf8 = ret->float_utf8;
16253    }
16254   }
16255  }
16256
16257  RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16258  ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16259
16260  if (ret->pprivate)
16261   RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16262
16263  if (RX_MATCH_COPIED(dstr))
16264   ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
16265  else
16266   ret->subbeg = NULL;
16267 #ifdef PERL_ANY_COW
16268  ret->saved_copy = NULL;
16269 #endif
16270
16271  /* Whether mother_re be set or no, we need to copy the string.  We
16272  cannot refrain from copying it when the storage points directly to
16273  our mother regexp, because that's
16274    1: a buffer in a different thread
16275    2: something we no longer hold a reference on
16276    so we need to copy it locally.  */
16277  RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16278  ret->mother_re   = NULL;
16279 }
16280 #endif /* PERL_IN_XSUB_RE */
16281
16282 /*
16283    regdupe_internal()
16284
16285    This is the internal complement to regdupe() which is used to copy
16286    the structure pointed to by the *pprivate pointer in the regexp.
16287    This is the core version of the extension overridable cloning hook.
16288    The regexp structure being duplicated will be copied by perl prior
16289    to this and will be provided as the regexp *r argument, however
16290    with the /old/ structures pprivate pointer value. Thus this routine
16291    may override any copying normally done by perl.
16292
16293    It returns a pointer to the new regexp_internal structure.
16294 */
16295
16296 void *
16297 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16298 {
16299  dVAR;
16300  struct regexp *const r = ReANY(rx);
16301  regexp_internal *reti;
16302  int len;
16303  RXi_GET_DECL(r,ri);
16304
16305  PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16306
16307  len = ProgLen(ri);
16308
16309  Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16310   char, regexp_internal);
16311  Copy(ri->program, reti->program, len+1, regnode);
16312
16313  reti->num_code_blocks = ri->num_code_blocks;
16314  if (ri->code_blocks) {
16315   int n;
16316   Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16317     struct reg_code_block);
16318   Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16319     struct reg_code_block);
16320   for (n = 0; n < ri->num_code_blocks; n++)
16321    reti->code_blocks[n].src_regex = (REGEXP*)
16322      sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16323  }
16324  else
16325   reti->code_blocks = NULL;
16326
16327  reti->regstclass = NULL;
16328
16329  if (ri->data) {
16330   struct reg_data *d;
16331   const int count = ri->data->count;
16332   int i;
16333
16334   Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16335     char, struct reg_data);
16336   Newx(d->what, count, U8);
16337
16338   d->count = count;
16339   for (i = 0; i < count; i++) {
16340    d->what[i] = ri->data->what[i];
16341    switch (d->what[i]) {
16342     /* see also regcomp.h and regfree_internal() */
16343    case 'a': /* actually an AV, but the dup function is identical.  */
16344    case 'r':
16345    case 's':
16346    case 'S':
16347    case 'u': /* actually an HV, but the dup function is identical.  */
16348     d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16349     break;
16350    case 'f':
16351     /* This is cheating. */
16352     Newx(d->data[i], 1, regnode_ssc);
16353     StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16354     reti->regstclass = (regnode*)d->data[i];
16355     break;
16356    case 'T':
16357     /* Trie stclasses are readonly and can thus be shared
16358     * without duplication. We free the stclass in pregfree
16359     * when the corresponding reg_ac_data struct is freed.
16360     */
16361     reti->regstclass= ri->regstclass;
16362     /* Fall through */
16363    case 't':
16364     OP_REFCNT_LOCK;
16365     ((reg_trie_data*)ri->data->data[i])->refcount++;
16366     OP_REFCNT_UNLOCK;
16367     /* Fall through */
16368    case 'l':
16369    case 'L':
16370     d->data[i] = ri->data->data[i];
16371     break;
16372    default:
16373     Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16374               ri->data->what[i]);
16375    }
16376   }
16377
16378   reti->data = d;
16379  }
16380  else
16381   reti->data = NULL;
16382
16383  reti->name_list_idx = ri->name_list_idx;
16384
16385 #ifdef RE_TRACK_PATTERN_OFFSETS
16386  if (ri->u.offsets) {
16387   Newx(reti->u.offsets, 2*len+1, U32);
16388   Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16389  }
16390 #else
16391  SetProgLen(reti,len);
16392 #endif
16393
16394  return (void*)reti;
16395 }
16396
16397 #endif    /* USE_ITHREADS */
16398
16399 #ifndef PERL_IN_XSUB_RE
16400
16401 /*
16402  - regnext - dig the "next" pointer out of a node
16403  */
16404 regnode *
16405 Perl_regnext(pTHX_ regnode *p)
16406 {
16407  dVAR;
16408  I32 offset;
16409
16410  if (!p)
16411   return(NULL);
16412
16413  if (OP(p) > REGNODE_MAX) {  /* regnode.type is unsigned */
16414   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16415             (int)OP(p), (int)REGNODE_MAX);
16416  }
16417
16418  offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16419  if (offset == 0)
16420   return(NULL);
16421
16422  return(p+offset);
16423 }
16424 #endif
16425
16426 STATIC void
16427 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16428 {
16429  va_list args;
16430  STRLEN l1 = strlen(pat1);
16431  STRLEN l2 = strlen(pat2);
16432  char buf[512];
16433  SV *msv;
16434  const char *message;
16435
16436  PERL_ARGS_ASSERT_RE_CROAK2;
16437
16438  if (l1 > 510)
16439   l1 = 510;
16440  if (l1 + l2 > 510)
16441   l2 = 510 - l1;
16442  Copy(pat1, buf, l1 , char);
16443  Copy(pat2, buf + l1, l2 , char);
16444  buf[l1 + l2] = '\n';
16445  buf[l1 + l2 + 1] = '\0';
16446  va_start(args, pat2);
16447  msv = vmess(buf, &args);
16448  va_end(args);
16449  message = SvPV_const(msv,l1);
16450  if (l1 > 512)
16451   l1 = 512;
16452  Copy(message, buf, l1 , char);
16453  /* l1-1 to avoid \n */
16454  Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16455 }
16456
16457 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
16458
16459 #ifndef PERL_IN_XSUB_RE
16460 void
16461 Perl_save_re_context(pTHX)
16462 {
16463  dVAR;
16464
16465  /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16466  if (PL_curpm) {
16467   const REGEXP * const rx = PM_GETRE(PL_curpm);
16468   if (rx) {
16469    U32 i;
16470    for (i = 1; i <= RX_NPARENS(rx); i++) {
16471     char digits[TYPE_CHARS(long)];
16472     const STRLEN len = my_snprintf(digits, sizeof(digits),
16473            "%lu", (long)i);
16474     GV *const *const gvp
16475      = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16476
16477     if (gvp) {
16478      GV * const gv = *gvp;
16479      if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16480       save_scalar(gv);
16481     }
16482    }
16483   }
16484  }
16485 }
16486 #endif
16487
16488 #ifdef DEBUGGING
16489
16490 STATIC void
16491 S_put_byte(pTHX_ SV *sv, int c)
16492 {
16493  PERL_ARGS_ASSERT_PUT_BYTE;
16494
16495  if (!isPRINT(c)) {
16496   switch (c) {
16497    case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16498    case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16499    case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16500    case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16501    case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16502
16503    default:
16504     Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16505     break;
16506   }
16507  }
16508  else {
16509   const char string = c;
16510   if (c == '-' || c == ']' || c == '\\' || c == '^')
16511    sv_catpvs(sv, "\\");
16512   sv_catpvn(sv, &string, 1);
16513  }
16514 }
16515
16516 STATIC void
16517 S_put_range(pTHX_ SV *sv, UV start, UV end)
16518 {
16519
16520  /* Appends to 'sv' a displayable version of the range of code points from
16521  * 'start' to 'end' */
16522
16523  assert(start <= end);
16524
16525  PERL_ARGS_ASSERT_PUT_RANGE;
16526
16527  if (end - start < 3) {  /* Individual chars in short ranges */
16528   for (; start <= end; start++)
16529    put_byte(sv, start);
16530  }
16531  else if (   end > 255
16532    || ! isALPHANUMERIC(start)
16533    || ! isALPHANUMERIC(end)
16534    || isDIGIT(start) != isDIGIT(end)
16535    || isUPPER(start) != isUPPER(end)
16536    || isLOWER(start) != isLOWER(end)
16537
16538     /* This final test should get optimized out except on EBCDIC
16539     * platforms, where it causes ranges that cross discontinuities
16540     * like i/j to be shown as hex instead of the misleading,
16541     * e.g. H-K (since that range includes more than H, I, J, K).
16542     * */
16543    || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start))
16544  {
16545   Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
16546      start,
16547      (end < 256) ? end : 255);
16548  }
16549  else { /* Here, the ends of the range are both digits, or both uppercase,
16550    or both lowercase; and there's no discontinuity in the range
16551    (which could happen on EBCDIC platforms) */
16552   put_byte(sv, start);
16553   sv_catpvs(sv, "-");
16554   put_byte(sv, end);
16555  }
16556 }
16557
16558 STATIC bool
16559 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16560 {
16561  /* Appends to 'sv' a displayable version of the innards of the bracketed
16562  * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
16563  * output anything */
16564
16565  int i;
16566  bool has_output_anything = FALSE;
16567
16568  PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16569
16570  for (i = 0; i < 256; i++) {
16571   if (BITMAP_TEST((U8 *) bitmap,i)) {
16572
16573    /* The character at index i should be output.  Find the next
16574    * character that should NOT be output */
16575    int j;
16576    for (j = i + 1; j < 256; j++) {
16577     if (! BITMAP_TEST((U8 *) bitmap, j)) {
16578      break;
16579     }
16580    }
16581
16582    /* Everything between them is a single range that should be output
16583    * */
16584    put_range(sv, i, j - 1);
16585    has_output_anything = TRUE;
16586    i = j;
16587   }
16588  }
16589
16590  return has_output_anything;
16591 }
16592
16593 #define CLEAR_OPTSTART \
16594  if (optstart) STMT_START {                                               \
16595   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,                       \
16596        " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16597   optstart=NULL;                                                       \
16598  } STMT_END
16599
16600 #define DUMPUNTIL(b,e)                                                       \
16601      CLEAR_OPTSTART;                                          \
16602      node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16603
16604 STATIC const regnode *
16605 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16606    const regnode *last, const regnode *plast,
16607    SV* sv, I32 indent, U32 depth)
16608 {
16609  dVAR;
16610  U8 op = PSEUDO; /* Arbitrary non-END op. */
16611  const regnode *next;
16612  const regnode *optstart= NULL;
16613
16614  RXi_GET_DECL(r,ri);
16615  GET_RE_DEBUG_FLAGS_DECL;
16616
16617  PERL_ARGS_ASSERT_DUMPUNTIL;
16618
16619 #ifdef DEBUG_DUMPUNTIL
16620  PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16621   last ? last-start : 0,plast ? plast-start : 0);
16622 #endif
16623
16624  if (plast && plast < last)
16625   last= plast;
16626
16627  while (PL_regkind[op] != END && (!last || node < last)) {
16628   /* While that wasn't END last time... */
16629   NODE_ALIGN(node);
16630   op = OP(node);
16631   if (op == CLOSE || op == WHILEM)
16632    indent--;
16633   next = regnext((regnode *)node);
16634
16635   /* Where, what. */
16636   if (OP(node) == OPTIMIZED) {
16637    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16638     optstart = node;
16639    else
16640     goto after_print;
16641   } else
16642    CLEAR_OPTSTART;
16643
16644   regprop(r, sv, node, NULL);
16645   PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16646      (int)(2*indent + 1), "", SvPVX_const(sv));
16647
16648   if (OP(node) != OPTIMIZED) {
16649    if (next == NULL)  /* Next ptr. */
16650     PerlIO_printf(Perl_debug_log, " (0)");
16651    else if (PL_regkind[(U8)op] == BRANCH
16652      && PL_regkind[OP(next)] != BRANCH )
16653     PerlIO_printf(Perl_debug_log, " (FAIL)");
16654    else
16655     PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16656    (void)PerlIO_putc(Perl_debug_log, '\n');
16657   }
16658
16659  after_print:
16660   if (PL_regkind[(U8)op] == BRANCHJ) {
16661    assert(next);
16662    {
16663     const regnode *nnode = (OP(next) == LONGJMP
16664          ? regnext((regnode *)next)
16665          : next);
16666     if (last && nnode > last)
16667      nnode = last;
16668     DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16669    }
16670   }
16671   else if (PL_regkind[(U8)op] == BRANCH) {
16672    assert(next);
16673    DUMPUNTIL(NEXTOPER(node), next);
16674   }
16675   else if ( PL_regkind[(U8)op]  == TRIE ) {
16676    const regnode *this_trie = node;
16677    const char op = OP(node);
16678    const U32 n = ARG(node);
16679    const reg_ac_data * const ac = op>=AHOCORASICK ?
16680    (reg_ac_data *)ri->data->data[n] :
16681    NULL;
16682    const reg_trie_data * const trie =
16683     (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16684 #ifdef DEBUGGING
16685    AV *const trie_words
16686       = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16687 #endif
16688    const regnode *nextbranch= NULL;
16689    I32 word_idx;
16690    sv_setpvs(sv, "");
16691    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16692     SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16693
16694     PerlIO_printf(Perl_debug_log, "%*s%s ",
16695     (int)(2*(indent+3)), "",
16696      elem_ptr
16697      ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
16698         SvCUR(*elem_ptr), 60,
16699         PL_colors[0], PL_colors[1],
16700         (SvUTF8(*elem_ptr)
16701         ? PERL_PV_ESCAPE_UNI
16702         : 0)
16703         | PERL_PV_PRETTY_ELLIPSES
16704         | PERL_PV_PRETTY_LTGT
16705        )
16706      : "???"
16707     );
16708     if (trie->jump) {
16709      U16 dist= trie->jump[word_idx+1];
16710      PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16711        (UV)((dist ? this_trie + dist : next) - start));
16712      if (dist) {
16713       if (!nextbranch)
16714        nextbranch= this_trie + trie->jump[0];
16715       DUMPUNTIL(this_trie + dist, nextbranch);
16716      }
16717      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16718       nextbranch= regnext((regnode *)nextbranch);
16719     } else {
16720      PerlIO_printf(Perl_debug_log, "\n");
16721     }
16722    }
16723    if (last && next > last)
16724     node= last;
16725    else
16726     node= next;
16727   }
16728   else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
16729    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16730      NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16731   }
16732   else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16733    assert(next);
16734    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16735   }
16736   else if ( op == PLUS || op == STAR) {
16737    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16738   }
16739   else if (PL_regkind[(U8)op] == ANYOF) {
16740    /* arglen 1 + class block */
16741    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16742       ? ANYOF_POSIXL_SKIP
16743       : ANYOF_SKIP);
16744    node = NEXTOPER(node);
16745   }
16746   else if (PL_regkind[(U8)op] == EXACT) {
16747    /* Literal string, where present. */
16748    node += NODE_SZ_STR(node) - 1;
16749    node = NEXTOPER(node);
16750   }
16751   else {
16752    node = NEXTOPER(node);
16753    node += regarglen[(U8)op];
16754   }
16755   if (op == CURLYX || op == OPEN)
16756    indent++;
16757  }
16758  CLEAR_OPTSTART;
16759 #ifdef DEBUG_DUMPUNTIL
16760  PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16761 #endif
16762  return node;
16763 }
16764
16765 #endif /* DEBUGGING */
16766
16767 /*
16768  * Local variables:
16769  * c-indentation-style: bsd
16770  * c-basic-offset: 4
16771  * indent-tabs-mode: nil
16772  * End:
16773  *
16774  * ex: set ts=8 sts=4 sw=4 et:
16775  */