src/5019003/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  * Copyright (c) 1986 by University of Toronto.
  42  * Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  * Permission is granted to anyone to use this software for any
  45  * purpose on any computer system, and to redistribute it freely,
  46  * subject to the following restrictions:
  47  *
  48  * 1. The author is not responsible for the consequences of use of
  49  *  this software, no matter how awful, even if they arise
  50  *  from defects in it.
  51  *
  52  * 2. The origin of this software must not be misrepresented, either
  53  *  by explicit claim or by omission.
  54  *
  55  * 3. Altered versions must be plainly marked as such, and must not
  56  *  be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 extern const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97
  98 #ifdef op
  99 #undef op
 100 #endif /* op */
 101
 102 #ifdef MSDOS
 103 #  if defined(BUGGY_MSC6)
 104  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 105 #    pragma optimize("a",off)
 106  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 107 #    pragma optimize("w",on )
 108 #  endif /* BUGGY_MSC6 */
 109 #endif /* MSDOS */
 110
 111 #ifndef STATIC
 112 #define STATIC static
 113 #endif
 114
 115
 116 typedef struct RExC_state_t {
 117  U32  flags;   /* RXf_* are we folding, multilining? */
 118  U32  pm_flags;  /* PMf_* stuff from the calling PMOP */
 119  char *precomp;  /* uncompiled string. */
 120  REGEXP *rx_sv;   /* The SV that is the regexp. */
 121  regexp *rx;                    /* perl core regexp structure */
 122  regexp_internal *rxi;           /* internal data for regexp object pprivate field */
 123  char *start;   /* Start of input for compile */
 124  char *end;   /* End of input for compile */
 125  char *parse;   /* Input-scan pointer. */
 126  I32  whilem_seen;  /* number of WHILEM in this expr */
 127  regnode *emit_start;  /* Start of emitted-code area */
 128  regnode *emit_bound;  /* First regnode outside of the allocated space */
 129  regnode *emit;   /* Code-emit pointer; if = &emit_dummy,
 130           implies compiling, so don't emit */
 131  regnode emit_dummy;  /* placeholder for emit to point to */
 132  I32  naughty;  /* How bad is this pattern? */
 133  I32  sawback;  /* Did we see \1, ...? */
 134  U32  seen;
 135  I32  size;   /* Code size. */
 136  I32  npar;   /* Capture buffer count, (OPEN). */
 137  I32  cpar;   /* Capture buffer count, (CLOSE). */
 138  I32  nestroot;  /* root parens we are in - used by accept */
 139  I32  extralen;
 140  I32  seen_zerolen;
 141  regnode **open_parens;  /* pointers to open parens */
 142  regnode **close_parens;  /* pointers to close parens */
 143  regnode *opend;   /* END node in program */
 144  I32  utf8;  /* whether the pattern is utf8 or not */
 145  I32  orig_utf8; /* whether the pattern was originally in utf8 */
 146         /* XXX use this for future optimisation of case
 147         * where pattern must be upgraded to utf8. */
 148  I32  uni_semantics; /* If a d charset modifier should use unicode
 149         rules, even if the pattern is not in
 150         utf8 */
 151  HV  *paren_names;  /* Paren names */
 152
 153  regnode **recurse;  /* Recurse regops */
 154  I32  recurse_count;  /* Number of recurse regops */
 155  I32  in_lookbehind;
 156  I32  contains_locale;
 157  I32  override_recoding;
 158  I32  in_multi_char_class;
 159  struct reg_code_block *code_blocks; /* positions of literal (?{})
 160            within pattern */
 161  int  num_code_blocks; /* size of code_blocks[] */
 162  int  code_index;  /* next code_blocks[] slot */
 163 #if ADD_TO_REGEXEC
 164  char  *starttry;  /* -Dr: where regtry was called. */
 165 #define RExC_starttry (pRExC_state->starttry)
 166 #endif
 167  SV  *runtime_code_qr; /* qr with the runtime code blocks */
 168 #ifdef DEBUGGING
 169  const char  *lastparse;
 170  I32         lastnum;
 171  AV          *paren_name_list;       /* idx -> name */
 172 #define RExC_lastparse (pRExC_state->lastparse)
 173 #define RExC_lastnum (pRExC_state->lastnum)
 174 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 175 #endif
 176 } RExC_state_t;
 177
 178 #define RExC_flags (pRExC_state->flags)
 179 #define RExC_pm_flags (pRExC_state->pm_flags)
 180 #define RExC_precomp (pRExC_state->precomp)
 181 #define RExC_rx_sv (pRExC_state->rx_sv)
 182 #define RExC_rx  (pRExC_state->rx)
 183 #define RExC_rxi (pRExC_state->rxi)
 184 #define RExC_start (pRExC_state->start)
 185 #define RExC_end (pRExC_state->end)
 186 #define RExC_parse (pRExC_state->parse)
 187 #define RExC_whilem_seen (pRExC_state->whilem_seen)
 188 #ifdef RE_TRACK_PATTERN_OFFSETS
 189 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
 190 #endif
 191 #define RExC_emit (pRExC_state->emit)
 192 #define RExC_emit_dummy (pRExC_state->emit_dummy)
 193 #define RExC_emit_start (pRExC_state->emit_start)
 194 #define RExC_emit_bound (pRExC_state->emit_bound)
 195 #define RExC_naughty (pRExC_state->naughty)
 196 #define RExC_sawback (pRExC_state->sawback)
 197 #define RExC_seen (pRExC_state->seen)
 198 #define RExC_size (pRExC_state->size)
 199 #define RExC_npar (pRExC_state->npar)
 200 #define RExC_nestroot   (pRExC_state->nestroot)
 201 #define RExC_extralen (pRExC_state->extralen)
 202 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
 203 #define RExC_utf8 (pRExC_state->utf8)
 204 #define RExC_uni_semantics (pRExC_state->uni_semantics)
 205 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
 206 #define RExC_open_parens (pRExC_state->open_parens)
 207 #define RExC_close_parens (pRExC_state->close_parens)
 208 #define RExC_opend (pRExC_state->opend)
 209 #define RExC_paren_names (pRExC_state->paren_names)
 210 #define RExC_recurse (pRExC_state->recurse)
 211 #define RExC_recurse_count (pRExC_state->recurse_count)
 212 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
 213 #define RExC_contains_locale (pRExC_state->contains_locale)
 214 #define RExC_override_recoding (pRExC_state->override_recoding)
 215 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 216
 217
 218 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
 219 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 220   ((*s) == '{' && regcurly(s, FALSE)))
 221
 222 #ifdef SPSTART
 223 #undef SPSTART  /* dratted cpp namespace... */
 224 #endif
 225 /*
 226  * Flags to be passed up and down.
 227  */
 228 #define WORST  0 /* Worst case. */
 229 #define HASWIDTH 0x01 /* Known to match non-null strings. */
 230
 231 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 232  * character.  (There needs to be a case: in the switch statement in regexec.c
 233  * for any node marked SIMPLE.)  Note that this is not the same thing as
 234  * REGNODE_SIMPLE */
 235 #define SIMPLE  0x02
 236 #define SPSTART  0x04 /* Starts with * or + */
 237 #define POSTPONED 0x08    /* (?1),(?&name), (??{...}) or similar */
 238 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
 239 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 240
 241 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 242
 243 /* whether trie related optimizations are enabled */
 244 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 245 #define TRIE_STUDY_OPT
 246 #define FULL_TRIE_STUDY
 247 #define TRIE_STCLASS
 248 #endif
 249
 250
 251
 252 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 253 #define PBITVAL(paren) (1 << ((paren) & 7))
 254 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 255 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 256 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 257
 258 #define REQUIRE_UTF8 STMT_START {                                       \
 259          if (!UTF) {                           \
 260           *flagp = RESTART_UTF8;            \
 261           return NULL;                      \
 262          }                                     \
 263       } STMT_END
 264
 265 /* This converts the named class defined in regcomp.h to its equivalent class
 266  * number defined in handy.h. */
 267 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 268 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 269
 270 /* About scan_data_t.
 271
 272   During optimisation we recurse through the regexp program performing
 273   various inplace (keyhole style) optimisations. In addition study_chunk
 274   and scan_commit populate this data structure with information about
 275   what strings MUST appear in the pattern. We look for the longest
 276   string that must appear at a fixed location, and we look for the
 277   longest string that may appear at a floating location. So for instance
 278   in the pattern:
 279
 280  /FOO[xX]A.*B[xX]BAR/
 281
 282   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 283   strings (because they follow a .* construct). study_chunk will identify
 284   both FOO and BAR as being the longest fixed and floating strings respectively.
 285
 286   The strings can be composites, for instance
 287
 288  /(f)(o)(o)/
 289
 290   will result in a composite fixed substring 'foo'.
 291
 292   For each string some basic information is maintained:
 293
 294   - offset or min_offset
 295  This is the position the string must appear at, or not before.
 296  It also implicitly (when combined with minlenp) tells us how many
 297  characters must match before the string we are searching for.
 298  Likewise when combined with minlenp and the length of the string it
 299  tells us how many characters must appear after the string we have
 300  found.
 301
 302   - max_offset
 303  Only used for floating strings. This is the rightmost point that
 304  the string can appear at. If set to I32 max it indicates that the
 305  string can occur infinitely far to the right.
 306
 307   - minlenp
 308  A pointer to the minimum number of characters of the pattern that the
 309  string was found inside. This is important as in the case of positive
 310  lookahead or positive lookbehind we can have multiple patterns
 311  involved. Consider
 312
 313  /(?=FOO).*F/
 314
 315  The minimum length of the pattern overall is 3, the minimum length
 316  of the lookahead part is 3, but the minimum length of the part that
 317  will actually match is 1. So 'FOO's minimum length is 3, but the
 318  minimum length for the F is 1. This is important as the minimum length
 319  is used to determine offsets in front of and behind the string being
 320  looked for.  Since strings can be composites this is the length of the
 321  pattern at the time it was committed with a scan_commit. Note that
 322  the length is calculated by study_chunk, so that the minimum lengths
 323  are not known until the full pattern has been compiled, thus the
 324  pointer to the value.
 325
 326   - lookbehind
 327
 328  In the case of lookbehind the string being searched for can be
 329  offset past the start point of the final matching string.
 330  If this value was just blithely removed from the min_offset it would
 331  invalidate some of the calculations for how many chars must match
 332  before or after (as they are derived from min_offset and minlen and
 333  the length of the string being searched for).
 334  When the final pattern is compiled and the data is moved from the
 335  scan_data_t structure into the regexp structure the information
 336  about lookbehind is factored in, with the information that would
 337  have been lost precalculated in the end_shift field for the
 338  associated string.
 339
 340   The fields pos_min and pos_delta are used to store the minimum offset
 341   and the delta to the maximum offset at the current point in the pattern.
 342
 343 */
 344
 345 typedef struct scan_data_t {
 346  /*I32 len_min;      unused */
 347  /*I32 len_delta;    unused */
 348  I32 pos_min;
 349  I32 pos_delta;
 350  SV *last_found;
 351  I32 last_end;     /* min value, <0 unless valid. */
 352  I32 last_start_min;
 353  I32 last_start_max;
 354  SV **longest;     /* Either &l_fixed, or &l_float. */
 355  SV *longest_fixed;      /* longest fixed string found in pattern */
 356  I32 offset_fixed;       /* offset where it starts */
 357  I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 358  I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 359  SV *longest_float;      /* longest floating string found in pattern */
 360  I32 offset_float_min;   /* earliest point in string it can appear */
 361  I32 offset_float_max;   /* latest point in string it can appear */
 362  I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 363  I32 lookbehind_float;   /* is the position of the string modified by LB */
 364  I32 flags;
 365  I32 whilem_c;
 366  I32 *last_closep;
 367  struct regnode_charclass_class *start_class;
 368 } scan_data_t;
 369
 370 /* The below is perhaps overboard, but this allows us to save a test at the
 371  * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
 372  * and 'a' differ by a single bit; the same with the upper and lower case of
 373  * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
 374  * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
 375  * then inverts it to form a mask, with just a single 0, in the bit position
 376  * where the upper- and lowercase differ.  XXX There are about 40 other
 377  * instances in the Perl core where this micro-optimization could be used.
 378  * Should decide if maintenance cost is worse, before changing those
 379  *
 380  * Returns a boolean as to whether or not 'v' is either a lowercase or
 381  * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
 382  * compile-time constant, the generated code is better than some optimizing
 383  * compilers figure out, amounting to a mask and test.  The results are
 384  * meaningless if 'c' is not one of [A-Za-z] */
 385 #define isARG2_lower_or_UPPER_ARG1(c, v) \
 386        (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
 387
 388 /*
 389  * Forward declarations for pregcomp()'s friends.
 390  */
 391
 392 static const scan_data_t zero_scan_data =
 393   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 394
 395 #define SF_BEFORE_EOL  (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 396 #define SF_BEFORE_SEOL  0x0001
 397 #define SF_BEFORE_MEOL  0x0002
 398 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 399 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 400
 401 #ifdef NO_UNARY_PLUS
 402 #  define SF_FIX_SHIFT_EOL (0+2)
 403 #  define SF_FL_SHIFT_EOL  (0+4)
 404 #else
 405 #  define SF_FIX_SHIFT_EOL (+2)
 406 #  define SF_FL_SHIFT_EOL  (+4)
 407 #endif
 408
 409 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 410 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 411
 412 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 413 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 414 #define SF_IS_INF  0x0040
 415 #define SF_HAS_PAR  0x0080
 416 #define SF_IN_PAR  0x0100
 417 #define SF_HAS_EVAL  0x0200
 418 #define SCF_DO_SUBSTR  0x0400
 419 #define SCF_DO_STCLASS_AND 0x0800
 420 #define SCF_DO_STCLASS_OR 0x1000
 421 #define SCF_DO_STCLASS  (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 422 #define SCF_WHILEM_VISITED_POS 0x2000
 423
 424 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 425 #define SCF_SEEN_ACCEPT         0x8000
 426 #define SCF_TRIE_DOING_RESTUDY 0x10000
 427
 428 #define UTF cBOOL(RExC_utf8)
 429
 430 /* The enums for all these are ordered so things work out correctly */
 431 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 432 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 433 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 434 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 435 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 436 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 437 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 438
 439 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 440
 441 #define OOB_NAMEDCLASS  -1
 442
 443 /* There is no code point that is out-of-bounds, so this is problematic.  But
 444  * its only current use is to initialize a variable that is always set before
 445  * looked at. */
 446 #define OOB_UNICODE  0xDEADBEEF
 447
 448 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 449 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 450
 451
 452 /* length of regex to show in messages that don't mark a position within */
 453 #define RegexLengthToShowInErrorMessages 127
 454
 455 /*
 456  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 457  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 458  * op/pragma/warn/regcomp.
 459  */
 460 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 461 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 462
 463 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 464
 465 /*
 466  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 467  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 468  * "...".
 469  */
 470 #define _FAIL(code) STMT_START {     \
 471  const char *ellipses = "";      \
 472  IV len = RExC_end - RExC_precomp;     \
 473                   \
 474  if (!SIZE_ONLY)       \
 475   SAVEFREESV(RExC_rx_sv);      \
 476  if (len > RegexLengthToShowInErrorMessages) {   \
 477   /* chop 10 shorter than the max, to ensure meaning of "..." */ \
 478   len = RegexLengthToShowInErrorMessages - 10;   \
 479   ellipses = "...";      \
 480  }         \
 481  code;                                                               \
 482 } STMT_END
 483
 484 #define FAIL(msg) _FAIL(       \
 485  Perl_croak(aTHX_ "%s in regex m/%.*s%s/",     \
 486    msg, (int)len, RExC_precomp, ellipses))
 487
 488 #define FAIL2(msg,arg) _FAIL(       \
 489  Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 490    arg, (int)len, RExC_precomp, ellipses))
 491
 492 /*
 493  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 494  */
 495 #define Simple_vFAIL(m) STMT_START {     \
 496  const IV offset = RExC_parse - RExC_precomp;   \
 497  Perl_croak(aTHX_ "%s" REPORT_LOCATION,    \
 498    m, (int)offset, RExC_precomp, RExC_precomp + offset); \
 499 } STMT_END
 500
 501 /*
 502  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 503  */
 504 #define vFAIL(m) STMT_START {    \
 505  if (!SIZE_ONLY)     \
 506   SAVEFREESV(RExC_rx_sv);    \
 507  Simple_vFAIL(m);     \
 508 } STMT_END
 509
 510 /*
 511  * Like Simple_vFAIL(), but accepts two arguments.
 512  */
 513 #define Simple_vFAIL2(m,a1) STMT_START {   \
 514  const IV offset = RExC_parse - RExC_precomp;   \
 515  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,   \
 516    (int)offset, RExC_precomp, RExC_precomp + offset); \
 517 } STMT_END
 518
 519 /*
 520  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 521  */
 522 #define vFAIL2(m,a1) STMT_START {   \
 523  if (!SIZE_ONLY)     \
 524   SAVEFREESV(RExC_rx_sv);    \
 525  Simple_vFAIL2(m, a1);    \
 526 } STMT_END
 527
 528
 529 /*
 530  * Like Simple_vFAIL(), but accepts three arguments.
 531  */
 532 #define Simple_vFAIL3(m, a1, a2) STMT_START {   \
 533  const IV offset = RExC_parse - RExC_precomp;  \
 534  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,  \
 535    (int)offset, RExC_precomp, RExC_precomp + offset); \
 536 } STMT_END
 537
 538 /*
 539  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 540  */
 541 #define vFAIL3(m,a1,a2) STMT_START {   \
 542  if (!SIZE_ONLY)     \
 543   SAVEFREESV(RExC_rx_sv);    \
 544  Simple_vFAIL3(m, a1, a2);    \
 545 } STMT_END
 546
 547 /*
 548  * Like Simple_vFAIL(), but accepts four arguments.
 549  */
 550 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {  \
 551  const IV offset = RExC_parse - RExC_precomp;  \
 552  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,  \
 553    (int)offset, RExC_precomp, RExC_precomp + offset); \
 554 } STMT_END
 555
 556 #define vFAIL4(m,a1,a2,a3) STMT_START {   \
 557  if (!SIZE_ONLY)     \
 558   SAVEFREESV(RExC_rx_sv);    \
 559  Simple_vFAIL4(m, a1, a2, a3);   \
 560 } STMT_END
 561
 562 /* m is not necessarily a "literal string", in this macro */
 563 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 564  const IV offset = loc - RExC_precomp;                               \
 565  Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 566    m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 567 } STMT_END
 568
 569 #define ckWARNreg(loc,m) STMT_START {     \
 570  const IV offset = loc - RExC_precomp;    \
 571  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 572    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 573 } STMT_END
 574
 575 #define vWARN_dep(loc, m) STMT_START {            \
 576  const IV offset = loc - RExC_precomp;    \
 577  Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
 578    (int)offset, RExC_precomp, RExC_precomp + offset);         \
 579 } STMT_END
 580
 581 #define ckWARNdep(loc,m) STMT_START {            \
 582  const IV offset = loc - RExC_precomp;    \
 583  Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                 \
 584    m REPORT_LOCATION,      \
 585    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 586 } STMT_END
 587
 588 #define ckWARNregdep(loc,m) STMT_START {    \
 589  const IV offset = loc - RExC_precomp;    \
 590  Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 591    m REPORT_LOCATION,      \
 592    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 593 } STMT_END
 594
 595 #define ckWARN2reg_d(loc,m, a1) STMT_START {    \
 596  const IV offset = loc - RExC_precomp;    \
 597  Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP),   \
 598    m REPORT_LOCATION,      \
 599    a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
 600 } STMT_END
 601
 602 #define ckWARN2reg(loc, m, a1) STMT_START {    \
 603  const IV offset = loc - RExC_precomp;    \
 604  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 605    a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
 606 } STMT_END
 607
 608 #define vWARN3(loc, m, a1, a2) STMT_START {    \
 609  const IV offset = loc - RExC_precomp;    \
 610  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 611    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
 612 } STMT_END
 613
 614 #define ckWARN3reg(loc, m, a1, a2) STMT_START {    \
 615  const IV offset = loc - RExC_precomp;    \
 616  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 617    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
 618 } STMT_END
 619
 620 #define vWARN4(loc, m, a1, a2, a3) STMT_START {    \
 621  const IV offset = loc - RExC_precomp;    \
 622  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 623    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 624 } STMT_END
 625
 626 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {   \
 627  const IV offset = loc - RExC_precomp;    \
 628  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 629    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 630 } STMT_END
 631
 632 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {   \
 633  const IV offset = loc - RExC_precomp;    \
 634  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 635    a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 636 } STMT_END
 637
 638
 639 /* Allow for side effects in s */
 640 #define REGC(c,s) STMT_START {   \
 641  if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 642 } STMT_END
 643
 644 /* Macros for recording node offsets.   20001227 mjd@plover.com
 645  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 646  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 647  * Element 0 holds the number n.
 648  * Position is 1 indexed.
 649  */
 650 #ifndef RE_TRACK_PATTERN_OFFSETS
 651 #define Set_Node_Offset_To_R(node,byte)
 652 #define Set_Node_Offset(node,byte)
 653 #define Set_Cur_Node_Offset
 654 #define Set_Node_Length_To_R(node,len)
 655 #define Set_Node_Length(node,len)
 656 #define Set_Node_Cur_Length(node,start)
 657 #define Node_Offset(n)
 658 #define Node_Length(n)
 659 #define Set_Node_Offset_Length(node,offset,len)
 660 #define ProgLen(ri) ri->u.proglen
 661 #define SetProgLen(ri,x) ri->u.proglen = x
 662 #else
 663 #define ProgLen(ri) ri->u.offsets[0]
 664 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 665 #define Set_Node_Offset_To_R(node,byte) STMT_START {   \
 666  if (! SIZE_ONLY) {       \
 667   MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",  \
 668      __LINE__, (int)(node), (int)(byte)));  \
 669   if((node) < 0) {      \
 670    Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 671   } else {       \
 672    RExC_offsets[2*(node)-1] = (byte);    \
 673   }        \
 674  }         \
 675 } STMT_END
 676
 677 #define Set_Node_Offset(node,byte) \
 678  Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 679 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 680
 681 #define Set_Node_Length_To_R(node,len) STMT_START {   \
 682  if (! SIZE_ONLY) {       \
 683   MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",  \
 684     __LINE__, (int)(node), (int)(len)));   \
 685   if((node) < 0) {      \
 686    Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 687   } else {       \
 688    RExC_offsets[2*(node)] = (len);    \
 689   }        \
 690  }         \
 691 } STMT_END
 692
 693 #define Set_Node_Length(node,len) \
 694  Set_Node_Length_To_R((node)-RExC_emit_start, len)
 695 #define Set_Node_Cur_Length(node, start)                \
 696  Set_Node_Length(node, RExC_parse - start)
 697
 698 /* Get offsets and lengths */
 699 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 700 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 701
 702 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
 703  Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
 704  Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
 705 } STMT_END
 706 #endif
 707
 708 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 709 #define EXPERIMENTAL_INPLACESCAN
 710 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 711
 712 #define DEBUG_STUDYDATA(str,data,depth)                              \
 713 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 714  PerlIO_printf(Perl_debug_log,                                    \
 715   "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 716   " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 717   (int)(depth)*2, "",                                          \
 718   (IV)((data)->pos_min),                                       \
 719   (IV)((data)->pos_delta),                                     \
 720   (UV)((data)->flags),                                         \
 721   (IV)((data)->whilem_c),                                      \
 722   (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 723   is_inf ? "INF " : ""                                         \
 724  );                                                               \
 725  if ((data)->last_found)                                          \
 726   PerlIO_printf(Perl_debug_log,                                \
 727    "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 728    " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 729    SvPVX_const((data)->last_found),                         \
 730    (IV)((data)->last_end),                                  \
 731    (IV)((data)->last_start_min),                            \
 732    (IV)((data)->last_start_max),                            \
 733    ((data)->longest &&                                      \
 734    (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 735    SvPVX_const((data)->longest_fixed),                      \
 736    (IV)((data)->offset_fixed),                              \
 737    ((data)->longest &&                                      \
 738    (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 739    SvPVX_const((data)->longest_float),                      \
 740    (IV)((data)->offset_float_min),                          \
 741    (IV)((data)->offset_float_max)                           \
 742   );                                                           \
 743  PerlIO_printf(Perl_debug_log,"\n");                              \
 744 });
 745
 746 /* Mark that we cannot extend a found fixed substring at this point.
 747    Update the longest found anchored substring and the longest found
 748    floating substrings if needed. */
 749
 750 STATIC void
 751 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 752 {
 753  const STRLEN l = CHR_SVLEN(data->last_found);
 754  const STRLEN old_l = CHR_SVLEN(*data->longest);
 755  GET_RE_DEBUG_FLAGS_DECL;
 756
 757  PERL_ARGS_ASSERT_SCAN_COMMIT;
 758
 759  if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 760   SvSetMagicSV(*data->longest, data->last_found);
 761   if (*data->longest == data->longest_fixed) {
 762    data->offset_fixed = l ? data->last_start_min : data->pos_min;
 763    if (data->flags & SF_BEFORE_EOL)
 764     data->flags
 765      |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 766    else
 767     data->flags &= ~SF_FIX_BEFORE_EOL;
 768    data->minlen_fixed=minlenp;
 769    data->lookbehind_fixed=0;
 770   }
 771   else { /* *data->longest == data->longest_float */
 772    data->offset_float_min = l ? data->last_start_min : data->pos_min;
 773    data->offset_float_max = (l
 774          ? data->last_start_max
 775          : (data->pos_delta == I32_MAX ? I32_MAX : data->pos_min + data->pos_delta));
 776    if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 777     data->offset_float_max = I32_MAX;
 778    if (data->flags & SF_BEFORE_EOL)
 779     data->flags
 780      |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 781    else
 782     data->flags &= ~SF_FL_BEFORE_EOL;
 783    data->minlen_float=minlenp;
 784    data->lookbehind_float=0;
 785   }
 786  }
 787  SvCUR_set(data->last_found, 0);
 788  {
 789   SV * const sv = data->last_found;
 790   if (SvUTF8(sv) && SvMAGICAL(sv)) {
 791    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 792    if (mg)
 793     mg->mg_len = 0;
 794   }
 795  }
 796  data->last_end = -1;
 797  data->flags &= ~SF_BEFORE_EOL;
 798  DEBUG_STUDYDATA("commit: ",data,0);
 799 }
 800
 801 /* These macros set, clear and test whether the synthetic start class ('ssc',
 802  * given by the parameter) matches an empty string (EOS).  This uses the
 803  * 'next_off' field in the node, to save a bit in the flags field.  The ssc
 804  * stands alone, so there is never a next_off, so this field is otherwise
 805  * unused.  The EOS information is used only for compilation, but theoretically
 806  * it could be passed on to the execution code.  This could be used to store
 807  * more than one bit of information, but only this one is currently used. */
 808 #define SET_SSC_EOS(node)   STMT_START { (node)->next_off = TRUE; } STMT_END
 809 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
 810 #define TEST_SSC_EOS(node)  cBOOL((node)->next_off)
 811
 812 /* Can match anything (initialization) */
 813 STATIC void
 814 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 815 {
 816  PERL_ARGS_ASSERT_CL_ANYTHING;
 817
 818  ANYOF_BITMAP_SETALL(cl);
 819  cl->flags = ANYOF_UNICODE_ALL;
 820  SET_SSC_EOS(cl);
 821
 822  /* If any portion of the regex is to operate under locale rules,
 823  * initialization includes it.  The reason this isn't done for all regexes
 824  * is that the optimizer was written under the assumption that locale was
 825  * all-or-nothing.  Given the complexity and lack of documentation in the
 826  * optimizer, and that there are inadequate test cases for locale, so many
 827  * parts of it may not work properly, it is safest to avoid locale unless
 828  * necessary. */
 829  if (RExC_contains_locale) {
 830   ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 831   cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
 832  }
 833  else {
 834   ANYOF_CLASS_ZERO(cl);     /* Only /l uses class now */
 835  }
 836 }
 837
 838 /* Can match anything (initialization) */
 839 STATIC int
 840 S_cl_is_anything(const struct regnode_charclass_class *cl)
 841 {
 842  int value;
 843
 844  PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 845
 846  for (value = 0; value < ANYOF_MAX; value += 2)
 847   if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 848    return 1;
 849  if (!(cl->flags & ANYOF_UNICODE_ALL))
 850   return 0;
 851  if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 852   return 0;
 853  return 1;
 854 }
 855
 856 /* Can match anything (initialization) */
 857 STATIC void
 858 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 859 {
 860  PERL_ARGS_ASSERT_CL_INIT;
 861
 862  Zero(cl, 1, struct regnode_charclass_class);
 863  cl->type = ANYOF;
 864  cl_anything(pRExC_state, cl);
 865  ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 866 }
 867
 868 /* These two functions currently do the exact same thing */
 869 #define cl_init_zero  cl_init
 870
 871 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 872  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 873  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 874 STATIC void
 875 S_cl_and(struct regnode_charclass_class *cl,
 876   const struct regnode_charclass_class *and_with)
 877 {
 878  PERL_ARGS_ASSERT_CL_AND;
 879
 880  assert(PL_regkind[and_with->type] == ANYOF);
 881
 882  /* I (khw) am not sure all these restrictions are necessary XXX */
 883  if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 884   && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 885   && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 886   && !(and_with->flags & ANYOF_LOC_FOLD)
 887   && !(cl->flags & ANYOF_LOC_FOLD)) {
 888   int i;
 889
 890   if (and_with->flags & ANYOF_INVERT)
 891    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 892     cl->bitmap[i] &= ~and_with->bitmap[i];
 893   else
 894    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 895     cl->bitmap[i] &= and_with->bitmap[i];
 896  } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 897
 898  if (and_with->flags & ANYOF_INVERT) {
 899
 900   /* Here, the and'ed node is inverted.  Get the AND of the flags that
 901   * aren't affected by the inversion.  Those that are affected are
 902   * handled individually below */
 903   U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 904   cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 905   cl->flags |= affected_flags;
 906
 907   /* We currently don't know how to deal with things that aren't in the
 908   * bitmap, but we know that the intersection is no greater than what
 909   * is already in cl, so let there be false positives that get sorted
 910   * out after the synthetic start class succeeds, and the node is
 911   * matched for real. */
 912
 913   /* The inversion of these two flags indicate that the resulting
 914   * intersection doesn't have them */
 915   if (and_with->flags & ANYOF_UNICODE_ALL) {
 916    cl->flags &= ~ANYOF_UNICODE_ALL;
 917   }
 918   if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 919    cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 920   }
 921  }
 922  else {   /* and'd node is not inverted */
 923   U8 outside_bitmap_but_not_utf8; /* Temp variable */
 924
 925   if (! ANYOF_NONBITMAP(and_with)) {
 926
 927    /* Here 'and_with' doesn't match anything outside the bitmap
 928    * (except possibly ANYOF_UNICODE_ALL), which means the
 929    * intersection can't either, except for ANYOF_UNICODE_ALL, in
 930    * which case we don't know what the intersection is, but it's no
 931    * greater than what cl already has, so can just leave it alone,
 932    * with possible false positives */
 933    if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 934     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 935     cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 936    }
 937   }
 938   else if (! ANYOF_NONBITMAP(cl)) {
 939
 940    /* Here, 'and_with' does match something outside the bitmap, and cl
 941    * doesn't have a list of things to match outside the bitmap.  If
 942    * cl can match all code points above 255, the intersection will
 943    * be those above-255 code points that 'and_with' matches.  If cl
 944    * can't match all Unicode code points, it means that it can't
 945    * match anything outside the bitmap (since the 'if' that got us
 946    * into this block tested for that), so we leave the bitmap empty.
 947    */
 948    if (cl->flags & ANYOF_UNICODE_ALL) {
 949     ARG_SET(cl, ARG(and_with));
 950
 951     /* and_with's ARG may match things that don't require UTF8.
 952     * And now cl's will too, in spite of this being an 'and'.  See
 953     * the comments below about the kludge */
 954     cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 955    }
 956   }
 957   else {
 958    /* Here, both 'and_with' and cl match something outside the
 959    * bitmap.  Currently we do not do the intersection, so just match
 960    * whatever cl had at the beginning.  */
 961   }
 962
 963
 964   /* Take the intersection of the two sets of flags.  However, the
 965   * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 966   * kludge around the fact that this flag is not treated like the others
 967   * which are initialized in cl_anything().  The way the optimizer works
 968   * is that the synthetic start class (SSC) is initialized to match
 969   * anything, and then the first time a real node is encountered, its
 970   * values are AND'd with the SSC's with the result being the values of
 971   * the real node.  However, there are paths through the optimizer where
 972   * the AND never gets called, so those initialized bits are set
 973   * inappropriately, which is not usually a big deal, as they just cause
 974   * false positives in the SSC, which will just mean a probably
 975   * imperceptible slow down in execution.  However this bit has a
 976   * higher false positive consequence in that it can cause utf8.pm,
 977   * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 978   * bigger slowdown and also causes significant extra memory to be used.
 979   * In order to prevent this, the code now takes a different tack.  The
 980   * bit isn't set unless some part of the regular expression needs it,
 981   * but once set it won't get cleared.  This means that these extra
 982   * modules won't get loaded unless there was some path through the
 983   * pattern that would have required them anyway, and  so any false
 984   * positives that occur by not ANDing them out when they could be
 985   * aren't as severe as they would be if we treated this bit like all
 986   * the others */
 987   outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 988          & ANYOF_NONBITMAP_NON_UTF8;
 989   cl->flags &= and_with->flags;
 990   cl->flags |= outside_bitmap_but_not_utf8;
 991  }
 992 }
 993
 994 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 995  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 996  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 997 STATIC void
 998 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 999 {
1000  PERL_ARGS_ASSERT_CL_OR;
1001
1002  if (or_with->flags & ANYOF_INVERT) {
1003
1004   /* Here, the or'd node is to be inverted.  This means we take the
1005   * complement of everything not in the bitmap, but currently we don't
1006   * know what that is, so give up and match anything */
1007   if (ANYOF_NONBITMAP(or_with)) {
1008    cl_anything(pRExC_state, cl);
1009   }
1010   /* We do not use
1011   * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
1012   *   <= (B1 | !B2) | (CL1 | !CL2)
1013   * which is wasteful if CL2 is small, but we ignore CL2:
1014   *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
1015   * XXXX Can we handle case-fold?  Unclear:
1016   *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
1017   *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
1018   */
1019   else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1020    && !(or_with->flags & ANYOF_LOC_FOLD)
1021    && !(cl->flags & ANYOF_LOC_FOLD) ) {
1022    int i;
1023
1024    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1025     cl->bitmap[i] |= ~or_with->bitmap[i];
1026   } /* XXXX: logic is complicated otherwise */
1027   else {
1028    cl_anything(pRExC_state, cl);
1029   }
1030
1031   /* And, we can just take the union of the flags that aren't affected
1032   * by the inversion */
1033   cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
1034
1035   /* For the remaining flags:
1036    ANYOF_UNICODE_ALL and inverted means to not match anything above
1037      255, which means that the union with cl should just be
1038      what cl has in it, so can ignore this flag
1039    ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1040      is 127-255 to match them, but then invert that, so the
1041      union with cl should just be what cl has in it, so can
1042      ignore this flag
1043   */
1044  } else {    /* 'or_with' is not inverted */
1045   /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1046   if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1047    && (!(or_with->flags & ANYOF_LOC_FOLD)
1048     || (cl->flags & ANYOF_LOC_FOLD)) ) {
1049    int i;
1050
1051    /* OR char bitmap and class bitmap separately */
1052    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1053     cl->bitmap[i] |= or_with->bitmap[i];
1054    if (or_with->flags & ANYOF_CLASS) {
1055     ANYOF_CLASS_OR(or_with, cl);
1056    }
1057   }
1058   else { /* XXXX: logic is complicated, leave it along for a moment. */
1059    cl_anything(pRExC_state, cl);
1060   }
1061
1062   if (ANYOF_NONBITMAP(or_with)) {
1063
1064    /* Use the added node's outside-the-bit-map match if there isn't a
1065    * conflict.  If there is a conflict (both nodes match something
1066    * outside the bitmap, but what they match outside is not the same
1067    * pointer, and hence not easily compared until XXX we extend
1068    * inversion lists this far), give up and allow the start class to
1069    * match everything outside the bitmap.  If that stuff is all above
1070    * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1071    if (! ANYOF_NONBITMAP(cl)) {
1072     ARG_SET(cl, ARG(or_with));
1073    }
1074    else if (ARG(cl) != ARG(or_with)) {
1075
1076     if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1077      cl_anything(pRExC_state, cl);
1078     }
1079     else {
1080      cl->flags |= ANYOF_UNICODE_ALL;
1081     }
1082    }
1083   }
1084
1085   /* Take the union */
1086   cl->flags |= or_with->flags;
1087  }
1088 }
1089
1090 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1091 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1092 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1093 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1094
1095
1096 #ifdef DEBUGGING
1097 /*
1098    dump_trie(trie,widecharmap,revcharmap)
1099    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1100    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1101
1102    These routines dump out a trie in a somewhat readable format.
1103    The _interim_ variants are used for debugging the interim
1104    tables that are used to generate the final compressed
1105    representation which is what dump_trie expects.
1106
1107    Part of the reason for their existence is to provide a form
1108    of documentation as to how the different representations function.
1109
1110 */
1111
1112 /*
1113   Dumps the final compressed table form of the trie to Perl_debug_log.
1114   Used for debugging make_trie().
1115 */
1116
1117 STATIC void
1118 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1119    AV *revcharmap, U32 depth)
1120 {
1121  U32 state;
1122  SV *sv=sv_newmortal();
1123  int colwidth= widecharmap ? 6 : 4;
1124  U16 word;
1125  GET_RE_DEBUG_FLAGS_DECL;
1126
1127  PERL_ARGS_ASSERT_DUMP_TRIE;
1128
1129  PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1130   (int)depth * 2 + 2,"",
1131   "Match","Base","Ofs" );
1132
1133  for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1134   SV ** const tmp = av_fetch( revcharmap, state, 0);
1135   if ( tmp ) {
1136    PerlIO_printf( Perl_debug_log, "%*s",
1137     colwidth,
1138     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1139        PL_colors[0], PL_colors[1],
1140        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1141        PERL_PV_ESCAPE_FIRSTCHAR
1142     )
1143    );
1144   }
1145  }
1146  PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1147   (int)depth * 2 + 2,"");
1148
1149  for( state = 0 ; state < trie->uniquecharcount ; state++ )
1150   PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1151  PerlIO_printf( Perl_debug_log, "\n");
1152
1153  for( state = 1 ; state < trie->statecount ; state++ ) {
1154   const U32 base = trie->states[ state ].trans.base;
1155
1156   PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1157
1158   if ( trie->states[ state ].wordnum ) {
1159    PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1160   } else {
1161    PerlIO_printf( Perl_debug_log, "%6s", "" );
1162   }
1163
1164   PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1165
1166   if ( base ) {
1167    U32 ofs = 0;
1168
1169    while( ( base + ofs  < trie->uniquecharcount ) ||
1170     ( base + ofs - trie->uniquecharcount < trie->lasttrans
1171      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1172      ofs++;
1173
1174    PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1175
1176    for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1177     if ( ( base + ofs >= trie->uniquecharcount ) &&
1178      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1179      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1180     {
1181     PerlIO_printf( Perl_debug_log, "%*"UVXf,
1182      colwidth,
1183      (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1184     } else {
1185      PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1186     }
1187    }
1188
1189    PerlIO_printf( Perl_debug_log, "]");
1190
1191   }
1192   PerlIO_printf( Perl_debug_log, "\n" );
1193  }
1194  PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1195  for (word=1; word <= trie->wordcount; word++) {
1196   PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1197    (int)word, (int)(trie->wordinfo[word].prev),
1198    (int)(trie->wordinfo[word].len));
1199  }
1200  PerlIO_printf(Perl_debug_log, "\n" );
1201 }
1202 /*
1203   Dumps a fully constructed but uncompressed trie in list form.
1204   List tries normally only are used for construction when the number of
1205   possible chars (trie->uniquecharcount) is very high.
1206   Used for debugging make_trie().
1207 */
1208 STATIC void
1209 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1210       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1211       U32 depth)
1212 {
1213  U32 state;
1214  SV *sv=sv_newmortal();
1215  int colwidth= widecharmap ? 6 : 4;
1216  GET_RE_DEBUG_FLAGS_DECL;
1217
1218  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1219
1220  /* print out the table precompression.  */
1221  PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1222   (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1223   "------:-----+-----------------\n" );
1224
1225  for( state=1 ; state < next_alloc ; state ++ ) {
1226   U16 charid;
1227
1228   PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1229    (int)depth * 2 + 2,"", (UV)state  );
1230   if ( ! trie->states[ state ].wordnum ) {
1231    PerlIO_printf( Perl_debug_log, "%5s| ","");
1232   } else {
1233    PerlIO_printf( Perl_debug_log, "W%4x| ",
1234     trie->states[ state ].wordnum
1235    );
1236   }
1237   for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1238    SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1239    if ( tmp ) {
1240     PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1241      colwidth,
1242      pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1243        PL_colors[0], PL_colors[1],
1244        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1245        PERL_PV_ESCAPE_FIRSTCHAR
1246      ) ,
1247      TRIE_LIST_ITEM(state,charid).forid,
1248      (UV)TRIE_LIST_ITEM(state,charid).newstate
1249     );
1250     if (!(charid % 10))
1251      PerlIO_printf(Perl_debug_log, "\n%*s| ",
1252       (int)((depth * 2) + 14), "");
1253    }
1254   }
1255   PerlIO_printf( Perl_debug_log, "\n");
1256  }
1257 }
1258
1259 /*
1260   Dumps a fully constructed but uncompressed trie in table form.
1261   This is the normal DFA style state transition table, with a few
1262   twists to facilitate compression later.
1263   Used for debugging make_trie().
1264 */
1265 STATIC void
1266 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1267       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1268       U32 depth)
1269 {
1270  U32 state;
1271  U16 charid;
1272  SV *sv=sv_newmortal();
1273  int colwidth= widecharmap ? 6 : 4;
1274  GET_RE_DEBUG_FLAGS_DECL;
1275
1276  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1277
1278  /*
1279  print out the table precompression so that we can do a visual check
1280  that they are identical.
1281  */
1282
1283  PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1284
1285  for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1286   SV ** const tmp = av_fetch( revcharmap, charid, 0);
1287   if ( tmp ) {
1288    PerlIO_printf( Perl_debug_log, "%*s",
1289     colwidth,
1290     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1291        PL_colors[0], PL_colors[1],
1292        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1293        PERL_PV_ESCAPE_FIRSTCHAR
1294     )
1295    );
1296   }
1297  }
1298
1299  PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1300
1301  for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1302   PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1303  }
1304
1305  PerlIO_printf( Perl_debug_log, "\n" );
1306
1307  for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1308
1309   PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1310    (int)depth * 2 + 2,"",
1311    (UV)TRIE_NODENUM( state ) );
1312
1313   for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1314    UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1315    if (v)
1316     PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1317    else
1318     PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1319   }
1320   if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1321    PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1322   } else {
1323    PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1324    trie->states[ TRIE_NODENUM( state ) ].wordnum );
1325   }
1326  }
1327 }
1328
1329 #endif
1330
1331
1332 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1333   startbranch: the first branch in the whole branch sequence
1334   first      : start branch of sequence of branch-exact nodes.
1335    May be the same as startbranch
1336   last       : Thing following the last branch.
1337    May be the same as tail.
1338   tail       : item following the branch sequence
1339   count      : words in the sequence
1340   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1341   depth      : indent depth
1342
1343 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1344
1345 A trie is an N'ary tree where the branches are determined by digital
1346 decomposition of the key. IE, at the root node you look up the 1st character and
1347 follow that branch repeat until you find the end of the branches. Nodes can be
1348 marked as "accepting" meaning they represent a complete word. Eg:
1349
1350   /he|she|his|hers/
1351
1352 would convert into the following structure. Numbers represent states, letters
1353 following numbers represent valid transitions on the letter from that state, if
1354 the number is in square brackets it represents an accepting state, otherwise it
1355 will be in parenthesis.
1356
1357  +-h->+-e->[3]-+-r->(8)-+-s->[9]
1358  |    |
1359  |   (2)
1360  |    |
1361  (1)   +-i->(6)-+-s->[7]
1362  |
1363  +-s->(3)-+-h->(4)-+-e->[5]
1364
1365  Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1366
1367 This shows that when matching against the string 'hers' we will begin at state 1
1368 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1369 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1370 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1371 single traverse. We store a mapping from accepting to state to which word was
1372 matched, and then when we have multiple possibilities we try to complete the
1373 rest of the regex in the order in which they occured in the alternation.
1374
1375 The only prior NFA like behaviour that would be changed by the TRIE support is
1376 the silent ignoring of duplicate alternations which are of the form:
1377
1378  / (DUPE|DUPE) X? (?{ ... }) Y /x
1379
1380 Thus EVAL blocks following a trie may be called a different number of times with
1381 and without the optimisation. With the optimisations dupes will be silently
1382 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1383 the following demonstrates:
1384
1385  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1386
1387 which prints out 'word' three times, but
1388
1389  'words'=~/(word|word|word)(?{ print $1 })S/
1390
1391 which doesnt print it out at all. This is due to other optimisations kicking in.
1392
1393 Example of what happens on a structural level:
1394
1395 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1396
1397    1: CURLYM[1] {1,32767}(18)
1398    5:   BRANCH(8)
1399    6:     EXACT <ac>(16)
1400    8:   BRANCH(11)
1401    9:     EXACT <ad>(16)
1402   11:   BRANCH(14)
1403   12:     EXACT <ab>(16)
1404   16:   SUCCEED(0)
1405   17:   NOTHING(18)
1406   18: END(0)
1407
1408 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1409 and should turn into:
1410
1411    1: CURLYM[1] {1,32767}(18)
1412    5:   TRIE(16)
1413   [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1414   <ac>
1415   <ad>
1416   <ab>
1417   16:   SUCCEED(0)
1418   17:   NOTHING(18)
1419   18: END(0)
1420
1421 Cases where tail != last would be like /(?foo|bar)baz/:
1422
1423    1: BRANCH(4)
1424    2:   EXACT <foo>(8)
1425    4: BRANCH(7)
1426    5:   EXACT <bar>(8)
1427    7: TAIL(8)
1428    8: EXACT <baz>(10)
1429   10: END(0)
1430
1431 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1432 and would end up looking like:
1433
1434  1: TRIE(8)
1435  [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1436   <foo>
1437   <bar>
1438    7: TAIL(8)
1439    8: EXACT <baz>(10)
1440   10: END(0)
1441
1442  d = uvuni_to_utf8_flags(d, uv, 0);
1443
1444 is the recommended Unicode-aware way of saying
1445
1446  *(d++) = uv;
1447 */
1448
1449 #define TRIE_STORE_REVCHAR(val)                                            \
1450  STMT_START {                                                           \
1451   if (UTF) {          \
1452    SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1453    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);    \
1454    unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1455    SvCUR_set(zlopp, kapow - flrbbbbb);       \
1456    SvPOK_on(zlopp);         \
1457    SvUTF8_on(zlopp);         \
1458    av_push(revcharmap, zlopp);        \
1459   } else {          \
1460    char ooooff = (char)val;                                           \
1461    av_push(revcharmap, newSVpvn(&ooooff, 1));      \
1462   }           \
1463   } STMT_END
1464
1465 #define TRIE_READ_CHAR STMT_START {                                                     \
1466  wordlen++;                                                                          \
1467  if ( UTF ) {                                                                        \
1468   /* if it is UTF then it is either already folded, or does not need folding */   \
1469   uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1470  }                                                                                   \
1471  else if (folder == PL_fold_latin1) {                                                \
1472   /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1473   if ( foldlen > 0 ) {                                                            \
1474   uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1475   foldlen -= len;                                                              \
1476   scan += len;                                                                 \
1477   len = 0;                                                                     \
1478   } else {                                                                        \
1479    len = 1;                                                                    \
1480    uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL);       \
1481    skiplen = UNISKIP(uvc);                                                     \
1482    foldlen -= skiplen;                                                         \
1483    scan = foldbuf + skiplen;                                                   \
1484   }                                                                               \
1485  } else {                                                                            \
1486   /* raw data, will be folded later if needed */                                  \
1487   uvc = (U32)*uc;                                                                 \
1488   len = 1;                                                                        \
1489  }                                                                                   \
1490 } STMT_END
1491
1492
1493
1494 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1495  if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1496   U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1497   Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1498  }                                                           \
1499  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1500  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1501  TRIE_LIST_CUR( state )++;                                   \
1502 } STMT_END
1503
1504 #define TRIE_LIST_NEW(state) STMT_START {                       \
1505  Newxz( trie->states[ state ].trans.list,               \
1506   4, reg_trie_trans_le );                                 \
1507  TRIE_LIST_CUR( state ) = 1;                                \
1508  TRIE_LIST_LEN( state ) = 4;                                \
1509 } STMT_END
1510
1511 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1512  U16 dupe= trie->states[ state ].wordnum;                    \
1513  regnode * const noper_next = regnext( noper );              \
1514                 \
1515  DEBUG_r({                                                   \
1516   /* store the word for dumping */                        \
1517   SV* tmp;                                                \
1518   if (OP(noper) != NOTHING)                               \
1519    tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1520   else                                                    \
1521    tmp = newSVpvn_utf8( "", 0, UTF );   \
1522   av_push( trie_words, tmp );                             \
1523  });                                                         \
1524                 \
1525  curword++;                                                  \
1526  trie->wordinfo[curword].prev   = 0;                         \
1527  trie->wordinfo[curword].len    = wordlen;                   \
1528  trie->wordinfo[curword].accept = state;                     \
1529                 \
1530  if ( noper_next < tail ) {                                  \
1531   if (!trie->jump)                                        \
1532    trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1533   trie->jump[curword] = (U16)(noper_next - convert);      \
1534   if (!jumper)                                            \
1535    jumper = noper_next;                                \
1536   if (!nextbranch)                                        \
1537    nextbranch= regnext(cur);                           \
1538  }                                                           \
1539                 \
1540  if ( dupe ) {                                               \
1541   /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1542   /* chain, so that when the bits of chain are later    */\
1543   /* linked together, the dups appear in the chain      */\
1544   trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1545   trie->wordinfo[dupe].prev = curword;                    \
1546  } else {                                                    \
1547   /* we haven't inserted this word yet.                */ \
1548   trie->states[ state ].wordnum = curword;                \
1549  }                                                           \
1550 } STMT_END
1551
1552
1553 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)  \
1554  ( ( base + charid >=  ucharcount     \
1555   && base + charid < ubound     \
1556   && state == trie->trans[ base - ucharcount + charid ].check \
1557   && trie->trans[ base - ucharcount + charid ].next )  \
1558   ? trie->trans[ base - ucharcount + charid ].next  \
1559   : ( state==1 ? special : 0 )     \
1560  )
1561
1562 #define MADE_TRIE       1
1563 #define MADE_JUMP_TRIE  2
1564 #define MADE_EXACT_TRIE 4
1565
1566 STATIC I32
1567 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1568 {
1569  dVAR;
1570  /* first pass, loop through and scan words */
1571  reg_trie_data *trie;
1572  HV *widecharmap = NULL;
1573  AV *revcharmap = newAV();
1574  regnode *cur;
1575  const U32 uniflags = UTF8_ALLOW_DEFAULT;
1576  STRLEN len = 0;
1577  UV uvc = 0;
1578  U16 curword = 0;
1579  U32 next_alloc = 0;
1580  regnode *jumper = NULL;
1581  regnode *nextbranch = NULL;
1582  regnode *convert = NULL;
1583  U32 *prev_states; /* temp array mapping each state to previous one */
1584  /* we just use folder as a flag in utf8 */
1585  const U8 * folder = NULL;
1586
1587 #ifdef DEBUGGING
1588  const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1589  AV *trie_words = NULL;
1590  /* along with revcharmap, this only used during construction but both are
1591  * useful during debugging so we store them in the struct when debugging.
1592  */
1593 #else
1594  const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1595  STRLEN trie_charcount=0;
1596 #endif
1597  SV *re_trie_maxbuff;
1598  GET_RE_DEBUG_FLAGS_DECL;
1599
1600  PERL_ARGS_ASSERT_MAKE_TRIE;
1601 #ifndef DEBUGGING
1602  PERL_UNUSED_ARG(depth);
1603 #endif
1604
1605  switch (flags) {
1606   case EXACT: break;
1607   case EXACTFA:
1608   case EXACTFU_SS:
1609   case EXACTFU_TRICKYFOLD:
1610   case EXACTFU: folder = PL_fold_latin1; break;
1611   case EXACTF:  folder = PL_fold; break;
1612   case EXACTFL: folder = PL_fold_locale; break;
1613   default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1614  }
1615
1616  trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1617  trie->refcount = 1;
1618  trie->startstate = 1;
1619  trie->wordcount = word_count;
1620  RExC_rxi->data->data[ data_slot ] = (void*)trie;
1621  trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1622  if (flags == EXACT)
1623   trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1624  trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1625      trie->wordcount+1, sizeof(reg_trie_wordinfo));
1626
1627  DEBUG_r({
1628   trie_words = newAV();
1629  });
1630
1631  re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1632  if (!SvIOK(re_trie_maxbuff)) {
1633   sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1634  }
1635  DEBUG_TRIE_COMPILE_r({
1636     PerlIO_printf( Perl_debug_log,
1637     "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1638     (int)depth * 2 + 2, "",
1639     REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1640     REG_NODE_NUM(last), REG_NODE_NUM(tail),
1641     (int)depth);
1642  });
1643
1644    /* Find the node we are going to overwrite */
1645  if ( first == startbranch && OP( last ) != BRANCH ) {
1646   /* whole branch chain */
1647   convert = first;
1648  } else {
1649   /* branch sub-chain */
1650   convert = NEXTOPER( first );
1651  }
1652
1653  /*  -- First loop and Setup --
1654
1655  We first traverse the branches and scan each word to determine if it
1656  contains widechars, and how many unique chars there are, this is
1657  important as we have to build a table with at least as many columns as we
1658  have unique chars.
1659
1660  We use an array of integers to represent the character codes 0..255
1661  (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1662  native representation of the character value as the key and IV's for the
1663  coded index.
1664
1665  *TODO* If we keep track of how many times each character is used we can
1666  remap the columns so that the table compression later on is more
1667  efficient in terms of memory by ensuring the most common value is in the
1668  middle and the least common are on the outside.  IMO this would be better
1669  than a most to least common mapping as theres a decent chance the most
1670  common letter will share a node with the least common, meaning the node
1671  will not be compressible. With a middle is most common approach the worst
1672  case is when we have the least common nodes twice.
1673
1674  */
1675
1676  for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1677   regnode *noper = NEXTOPER( cur );
1678   const U8 *uc = (U8*)STRING( noper );
1679   const U8 *e  = uc + STR_LEN( noper );
1680   STRLEN foldlen = 0;
1681   U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1682   STRLEN skiplen = 0;
1683   const U8 *scan = (U8*)NULL;
1684   U32 wordlen      = 0;         /* required init */
1685   STRLEN chars = 0;
1686   bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1687
1688   if (OP(noper) == NOTHING) {
1689    regnode *noper_next= regnext(noper);
1690    if (noper_next != tail && OP(noper_next) == flags) {
1691     noper = noper_next;
1692     uc= (U8*)STRING(noper);
1693     e= uc + STR_LEN(noper);
1694     trie->minlen= STR_LEN(noper);
1695    } else {
1696     trie->minlen= 0;
1697     continue;
1698    }
1699   }
1700
1701   if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1702    TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1703           regardless of encoding */
1704    if (OP( noper ) == EXACTFU_SS) {
1705     /* false positives are ok, so just set this */
1706     TRIE_BITMAP_SET(trie,0xDF);
1707    }
1708   }
1709   for ( ; uc < e ; uc += len ) {
1710    TRIE_CHARCOUNT(trie)++;
1711    TRIE_READ_CHAR;
1712    chars++;
1713    if ( uvc < 256 ) {
1714     if ( folder ) {
1715      U8 folded= folder[ (U8) uvc ];
1716      if ( !trie->charmap[ folded ] ) {
1717       trie->charmap[ folded ]=( ++trie->uniquecharcount );
1718       TRIE_STORE_REVCHAR( folded );
1719      }
1720     }
1721     if ( !trie->charmap[ uvc ] ) {
1722      trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1723      TRIE_STORE_REVCHAR( uvc );
1724     }
1725     if ( set_bit ) {
1726      /* store the codepoint in the bitmap, and its folded
1727      * equivalent. */
1728      TRIE_BITMAP_SET(trie, uvc);
1729
1730      /* store the folded codepoint */
1731      if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1732
1733      if ( !UTF ) {
1734       /* store first byte of utf8 representation of
1735       variant codepoints */
1736       if (! UNI_IS_INVARIANT(uvc)) {
1737        TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1738       }
1739      }
1740      set_bit = 0; /* We've done our bit :-) */
1741     }
1742    } else {
1743     SV** svpp;
1744     if ( !widecharmap )
1745      widecharmap = newHV();
1746
1747     svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1748
1749     if ( !svpp )
1750      Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1751
1752     if ( !SvTRUE( *svpp ) ) {
1753      sv_setiv( *svpp, ++trie->uniquecharcount );
1754      TRIE_STORE_REVCHAR(uvc);
1755     }
1756    }
1757   }
1758   if( cur == first ) {
1759    trie->minlen = chars;
1760    trie->maxlen = chars;
1761   } else if (chars < trie->minlen) {
1762    trie->minlen = chars;
1763   } else if (chars > trie->maxlen) {
1764    trie->maxlen = chars;
1765   }
1766   if (OP( noper ) == EXACTFU_SS) {
1767    /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1768    if (trie->minlen > 1)
1769     trie->minlen= 1;
1770   }
1771   if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1772    /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1773    *        - We assume that any such sequence might match a 2 byte string */
1774    if (trie->minlen > 2 )
1775     trie->minlen= 2;
1776   }
1777
1778  } /* end first pass */
1779  DEBUG_TRIE_COMPILE_r(
1780   PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1781     (int)depth * 2 + 2,"",
1782     ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1783     (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1784     (int)trie->minlen, (int)trie->maxlen )
1785  );
1786
1787  /*
1788   We now know what we are dealing with in terms of unique chars and
1789   string sizes so we can calculate how much memory a naive
1790   representation using a flat table  will take. If it's over a reasonable
1791   limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1792   conservative but potentially much slower representation using an array
1793   of lists.
1794
1795   At the end we convert both representations into the same compressed
1796   form that will be used in regexec.c for matching with. The latter
1797   is a form that cannot be used to construct with but has memory
1798   properties similar to the list form and access properties similar
1799   to the table form making it both suitable for fast searches and
1800   small enough that its feasable to store for the duration of a program.
1801
1802   See the comment in the code where the compressed table is produced
1803   inplace from the flat tabe representation for an explanation of how
1804   the compression works.
1805
1806  */
1807
1808
1809  Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1810  prev_states[1] = 0;
1811
1812  if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1813   /*
1814    Second Pass -- Array Of Lists Representation
1815
1816    Each state will be represented by a list of charid:state records
1817    (reg_trie_trans_le) the first such element holds the CUR and LEN
1818    points of the allocated array. (See defines above).
1819
1820    We build the initial structure using the lists, and then convert
1821    it into the compressed table form which allows faster lookups
1822    (but cant be modified once converted).
1823   */
1824
1825   STRLEN transcount = 1;
1826
1827   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1828    "%*sCompiling trie using list compiler\n",
1829    (int)depth * 2 + 2, ""));
1830
1831   trie->states = (reg_trie_state *)
1832    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1833         sizeof(reg_trie_state) );
1834   TRIE_LIST_NEW(1);
1835   next_alloc = 2;
1836
1837   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1838
1839    regnode *noper   = NEXTOPER( cur );
1840    U8 *uc           = (U8*)STRING( noper );
1841    const U8 *e      = uc + STR_LEN( noper );
1842    U32 state        = 1;         /* required init */
1843    U16 charid       = 0;         /* sanity init */
1844    U8 *scan         = (U8*)NULL; /* sanity init */
1845    STRLEN foldlen   = 0;         /* required init */
1846    U32 wordlen      = 0;         /* required init */
1847    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1848    STRLEN skiplen   = 0;
1849
1850    if (OP(noper) == NOTHING) {
1851     regnode *noper_next= regnext(noper);
1852     if (noper_next != tail && OP(noper_next) == flags) {
1853      noper = noper_next;
1854      uc= (U8*)STRING(noper);
1855      e= uc + STR_LEN(noper);
1856     }
1857    }
1858
1859    if (OP(noper) != NOTHING) {
1860     for ( ; uc < e ; uc += len ) {
1861
1862      TRIE_READ_CHAR;
1863
1864      if ( uvc < 256 ) {
1865       charid = trie->charmap[ uvc ];
1866      } else {
1867       SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1868       if ( !svpp ) {
1869        charid = 0;
1870       } else {
1871        charid=(U16)SvIV( *svpp );
1872       }
1873      }
1874      /* charid is now 0 if we dont know the char read, or nonzero if we do */
1875      if ( charid ) {
1876
1877       U16 check;
1878       U32 newstate = 0;
1879
1880       charid--;
1881       if ( !trie->states[ state ].trans.list ) {
1882        TRIE_LIST_NEW( state );
1883       }
1884       for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1885        if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1886         newstate = TRIE_LIST_ITEM( state, check ).newstate;
1887         break;
1888        }
1889       }
1890       if ( ! newstate ) {
1891        newstate = next_alloc++;
1892        prev_states[newstate] = state;
1893        TRIE_LIST_PUSH( state, charid, newstate );
1894        transcount++;
1895       }
1896       state = newstate;
1897      } else {
1898       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1899      }
1900     }
1901    }
1902    TRIE_HANDLE_WORD(state);
1903
1904   } /* end second pass */
1905
1906   /* next alloc is the NEXT state to be allocated */
1907   trie->statecount = next_alloc;
1908   trie->states = (reg_trie_state *)
1909    PerlMemShared_realloc( trie->states,
1910         next_alloc
1911         * sizeof(reg_trie_state) );
1912
1913   /* and now dump it out before we compress it */
1914   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1915               revcharmap, next_alloc,
1916               depth+1)
1917   );
1918
1919   trie->trans = (reg_trie_trans *)
1920    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1921   {
1922    U32 state;
1923    U32 tp = 0;
1924    U32 zp = 0;
1925
1926
1927    for( state=1 ; state < next_alloc ; state ++ ) {
1928     U32 base=0;
1929
1930     /*
1931     DEBUG_TRIE_COMPILE_MORE_r(
1932      PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1933     );
1934     */
1935
1936     if (trie->states[state].trans.list) {
1937      U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1938      U16 maxid=minid;
1939      U16 idx;
1940
1941      for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1942       const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1943       if ( forid < minid ) {
1944        minid=forid;
1945       } else if ( forid > maxid ) {
1946        maxid=forid;
1947       }
1948      }
1949      if ( transcount < tp + maxid - minid + 1) {
1950       transcount *= 2;
1951       trie->trans = (reg_trie_trans *)
1952        PerlMemShared_realloc( trie->trans,
1953              transcount
1954              * sizeof(reg_trie_trans) );
1955       Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1956      }
1957      base = trie->uniquecharcount + tp - minid;
1958      if ( maxid == minid ) {
1959       U32 set = 0;
1960       for ( ; zp < tp ; zp++ ) {
1961        if ( ! trie->trans[ zp ].next ) {
1962         base = trie->uniquecharcount + zp - minid;
1963         trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1964         trie->trans[ zp ].check = state;
1965         set = 1;
1966         break;
1967        }
1968       }
1969       if ( !set ) {
1970        trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1971        trie->trans[ tp ].check = state;
1972        tp++;
1973        zp = tp;
1974       }
1975      } else {
1976       for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1977        const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1978        trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1979        trie->trans[ tid ].check = state;
1980       }
1981       tp += ( maxid - minid + 1 );
1982      }
1983      Safefree(trie->states[ state ].trans.list);
1984     }
1985     /*
1986     DEBUG_TRIE_COMPILE_MORE_r(
1987      PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1988     );
1989     */
1990     trie->states[ state ].trans.base=base;
1991    }
1992    trie->lasttrans = tp + 1;
1993   }
1994  } else {
1995   /*
1996   Second Pass -- Flat Table Representation.
1997
1998   we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1999   We know that we will need Charcount+1 trans at most to store the data
2000   (one row per char at worst case) So we preallocate both structures
2001   assuming worst case.
2002
2003   We then construct the trie using only the .next slots of the entry
2004   structs.
2005
2006   We use the .check field of the first entry of the node temporarily to
2007   make compression both faster and easier by keeping track of how many non
2008   zero fields are in the node.
2009
2010   Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2011   transition.
2012
2013   There are two terms at use here: state as a TRIE_NODEIDX() which is a
2014   number representing the first entry of the node, and state as a
2015   TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
2016   TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
2017   are 2 entrys per node. eg:
2018
2019    A B       A B
2020   1. 2 4    1. 3 7
2021   2. 0 3    3. 0 5
2022   3. 0 0    5. 0 0
2023   4. 0 0    7. 0 0
2024
2025   The table is internally in the right hand, idx form. However as we also
2026   have to deal with the states array which is indexed by nodenum we have to
2027   use TRIE_NODENUM() to convert.
2028
2029   */
2030   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2031    "%*sCompiling trie using table compiler\n",
2032    (int)depth * 2 + 2, ""));
2033
2034   trie->trans = (reg_trie_trans *)
2035    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2036         * trie->uniquecharcount + 1,
2037         sizeof(reg_trie_trans) );
2038   trie->states = (reg_trie_state *)
2039    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2040         sizeof(reg_trie_state) );
2041   next_alloc = trie->uniquecharcount + 1;
2042
2043
2044   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2045
2046    regnode *noper   = NEXTOPER( cur );
2047    const U8 *uc     = (U8*)STRING( noper );
2048    const U8 *e      = uc + STR_LEN( noper );
2049
2050    U32 state        = 1;         /* required init */
2051
2052    U16 charid       = 0;         /* sanity init */
2053    U32 accept_state = 0;         /* sanity init */
2054    U8 *scan         = (U8*)NULL; /* sanity init */
2055
2056    STRLEN foldlen   = 0;         /* required init */
2057    U32 wordlen      = 0;         /* required init */
2058    STRLEN skiplen   = 0;
2059    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2060
2061    if (OP(noper) == NOTHING) {
2062     regnode *noper_next= regnext(noper);
2063     if (noper_next != tail && OP(noper_next) == flags) {
2064      noper = noper_next;
2065      uc= (U8*)STRING(noper);
2066      e= uc + STR_LEN(noper);
2067     }
2068    }
2069
2070    if ( OP(noper) != NOTHING ) {
2071     for ( ; uc < e ; uc += len ) {
2072
2073      TRIE_READ_CHAR;
2074
2075      if ( uvc < 256 ) {
2076       charid = trie->charmap[ uvc ];
2077      } else {
2078       SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2079       charid = svpp ? (U16)SvIV(*svpp) : 0;
2080      }
2081      if ( charid ) {
2082       charid--;
2083       if ( !trie->trans[ state + charid ].next ) {
2084        trie->trans[ state + charid ].next = next_alloc;
2085        trie->trans[ state ].check++;
2086        prev_states[TRIE_NODENUM(next_alloc)]
2087          = TRIE_NODENUM(state);
2088        next_alloc += trie->uniquecharcount;
2089       }
2090       state = trie->trans[ state + charid ].next;
2091      } else {
2092       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2093      }
2094      /* charid is now 0 if we dont know the char read, or nonzero if we do */
2095     }
2096    }
2097    accept_state = TRIE_NODENUM( state );
2098    TRIE_HANDLE_WORD(accept_state);
2099
2100   } /* end second pass */
2101
2102   /* and now dump it out before we compress it */
2103   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2104               revcharmap,
2105               next_alloc, depth+1));
2106
2107   {
2108   /*
2109   * Inplace compress the table.*
2110
2111   For sparse data sets the table constructed by the trie algorithm will
2112   be mostly 0/FAIL transitions or to put it another way mostly empty.
2113   (Note that leaf nodes will not contain any transitions.)
2114
2115   This algorithm compresses the tables by eliminating most such
2116   transitions, at the cost of a modest bit of extra work during lookup:
2117
2118   - Each states[] entry contains a .base field which indicates the
2119   index in the state[] array wheres its transition data is stored.
2120
2121   - If .base is 0 there are no valid transitions from that node.
2122
2123   - If .base is nonzero then charid is added to it to find an entry in
2124   the trans array.
2125
2126   -If trans[states[state].base+charid].check!=state then the
2127   transition is taken to be a 0/Fail transition. Thus if there are fail
2128   transitions at the front of the node then the .base offset will point
2129   somewhere inside the previous nodes data (or maybe even into a node
2130   even earlier), but the .check field determines if the transition is
2131   valid.
2132
2133   XXX - wrong maybe?
2134   The following process inplace converts the table to the compressed
2135   table: We first do not compress the root node 1,and mark all its
2136   .check pointers as 1 and set its .base pointer as 1 as well. This
2137   allows us to do a DFA construction from the compressed table later,
2138   and ensures that any .base pointers we calculate later are greater
2139   than 0.
2140
2141   - We set 'pos' to indicate the first entry of the second node.
2142
2143   - We then iterate over the columns of the node, finding the first and
2144   last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2145   and set the .check pointers accordingly, and advance pos
2146   appropriately and repreat for the next node. Note that when we copy
2147   the next pointers we have to convert them from the original
2148   NODEIDX form to NODENUM form as the former is not valid post
2149   compression.
2150
2151   - If a node has no transitions used we mark its base as 0 and do not
2152   advance the pos pointer.
2153
2154   - If a node only has one transition we use a second pointer into the
2155   structure to fill in allocated fail transitions from other states.
2156   This pointer is independent of the main pointer and scans forward
2157   looking for null transitions that are allocated to a state. When it
2158   finds one it writes the single transition into the "hole".  If the
2159   pointer doesnt find one the single transition is appended as normal.
2160
2161   - Once compressed we can Renew/realloc the structures to release the
2162   excess space.
2163
2164   See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2165   specifically Fig 3.47 and the associated pseudocode.
2166
2167   demq
2168   */
2169   const U32 laststate = TRIE_NODENUM( next_alloc );
2170   U32 state, charid;
2171   U32 pos = 0, zp=0;
2172   trie->statecount = laststate;
2173
2174   for ( state = 1 ; state < laststate ; state++ ) {
2175    U8 flag = 0;
2176    const U32 stateidx = TRIE_NODEIDX( state );
2177    const U32 o_used = trie->trans[ stateidx ].check;
2178    U32 used = trie->trans[ stateidx ].check;
2179    trie->trans[ stateidx ].check = 0;
2180
2181    for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2182     if ( flag || trie->trans[ stateidx + charid ].next ) {
2183      if ( trie->trans[ stateidx + charid ].next ) {
2184       if (o_used == 1) {
2185        for ( ; zp < pos ; zp++ ) {
2186         if ( ! trie->trans[ zp ].next ) {
2187          break;
2188         }
2189        }
2190        trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2191        trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2192        trie->trans[ zp ].check = state;
2193        if ( ++zp > pos ) pos = zp;
2194        break;
2195       }
2196       used--;
2197      }
2198      if ( !flag ) {
2199       flag = 1;
2200       trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2201      }
2202      trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2203      trie->trans[ pos ].check = state;
2204      pos++;
2205     }
2206    }
2207   }
2208   trie->lasttrans = pos + 1;
2209   trie->states = (reg_trie_state *)
2210    PerlMemShared_realloc( trie->states, laststate
2211         * sizeof(reg_trie_state) );
2212   DEBUG_TRIE_COMPILE_MORE_r(
2213     PerlIO_printf( Perl_debug_log,
2214      "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2215      (int)depth * 2 + 2,"",
2216      (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2217      (IV)next_alloc,
2218      (IV)pos,
2219      ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2220    );
2221
2222   } /* end table compress */
2223  }
2224  DEBUG_TRIE_COMPILE_MORE_r(
2225    PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2226     (int)depth * 2 + 2, "",
2227     (UV)trie->statecount,
2228     (UV)trie->lasttrans)
2229  );
2230  /* resize the trans array to remove unused space */
2231  trie->trans = (reg_trie_trans *)
2232   PerlMemShared_realloc( trie->trans, trie->lasttrans
2233        * sizeof(reg_trie_trans) );
2234
2235  {   /* Modify the program and insert the new TRIE node */
2236   U8 nodetype =(U8)(flags & 0xFF);
2237   char *str=NULL;
2238
2239 #ifdef DEBUGGING
2240   regnode *optimize = NULL;
2241 #ifdef RE_TRACK_PATTERN_OFFSETS
2242
2243   U32 mjd_offset = 0;
2244   U32 mjd_nodelen = 0;
2245 #endif /* RE_TRACK_PATTERN_OFFSETS */
2246 #endif /* DEBUGGING */
2247   /*
2248   This means we convert either the first branch or the first Exact,
2249   depending on whether the thing following (in 'last') is a branch
2250   or not and whther first is the startbranch (ie is it a sub part of
2251   the alternation or is it the whole thing.)
2252   Assuming its a sub part we convert the EXACT otherwise we convert
2253   the whole branch sequence, including the first.
2254   */
2255   /* Find the node we are going to overwrite */
2256   if ( first != startbranch || OP( last ) == BRANCH ) {
2257    /* branch sub-chain */
2258    NEXT_OFF( first ) = (U16)(last - first);
2259 #ifdef RE_TRACK_PATTERN_OFFSETS
2260    DEBUG_r({
2261     mjd_offset= Node_Offset((convert));
2262     mjd_nodelen= Node_Length((convert));
2263    });
2264 #endif
2265    /* whole branch chain */
2266   }
2267 #ifdef RE_TRACK_PATTERN_OFFSETS
2268   else {
2269    DEBUG_r({
2270     const  regnode *nop = NEXTOPER( convert );
2271     mjd_offset= Node_Offset((nop));
2272     mjd_nodelen= Node_Length((nop));
2273    });
2274   }
2275   DEBUG_OPTIMISE_r(
2276    PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2277     (int)depth * 2 + 2, "",
2278     (UV)mjd_offset, (UV)mjd_nodelen)
2279   );
2280 #endif
2281   /* But first we check to see if there is a common prefix we can
2282   split out as an EXACT and put in front of the TRIE node.  */
2283   trie->startstate= 1;
2284   if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2285    U32 state;
2286    for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2287     U32 ofs = 0;
2288     I32 idx = -1;
2289     U32 count = 0;
2290     const U32 base = trie->states[ state ].trans.base;
2291
2292     if ( trie->states[state].wordnum )
2293       count = 1;
2294
2295     for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2296      if ( ( base + ofs >= trie->uniquecharcount ) &&
2297       ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2298       trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2299      {
2300       if ( ++count > 1 ) {
2301        SV **tmp = av_fetch( revcharmap, ofs, 0);
2302        const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2303        if ( state == 1 ) break;
2304        if ( count == 2 ) {
2305         Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2306         DEBUG_OPTIMISE_r(
2307          PerlIO_printf(Perl_debug_log,
2308           "%*sNew Start State=%"UVuf" Class: [",
2309           (int)depth * 2 + 2, "",
2310           (UV)state));
2311         if (idx >= 0) {
2312          SV ** const tmp = av_fetch( revcharmap, idx, 0);
2313          const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2314
2315          TRIE_BITMAP_SET(trie,*ch);
2316          if ( folder )
2317           TRIE_BITMAP_SET(trie, folder[ *ch ]);
2318          DEBUG_OPTIMISE_r(
2319           PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2320          );
2321         }
2322        }
2323        TRIE_BITMAP_SET(trie,*ch);
2324        if ( folder )
2325         TRIE_BITMAP_SET(trie,folder[ *ch ]);
2326        DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2327       }
2328       idx = ofs;
2329      }
2330     }
2331     if ( count == 1 ) {
2332      SV **tmp = av_fetch( revcharmap, idx, 0);
2333      STRLEN len;
2334      char *ch = SvPV( *tmp, len );
2335      DEBUG_OPTIMISE_r({
2336       SV *sv=sv_newmortal();
2337       PerlIO_printf( Perl_debug_log,
2338        "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2339        (int)depth * 2 + 2, "",
2340        (UV)state, (UV)idx,
2341        pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2342         PL_colors[0], PL_colors[1],
2343         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2344         PERL_PV_ESCAPE_FIRSTCHAR
2345        )
2346       );
2347      });
2348      if ( state==1 ) {
2349       OP( convert ) = nodetype;
2350       str=STRING(convert);
2351       STR_LEN(convert)=0;
2352      }
2353      STR_LEN(convert) += len;
2354      while (len--)
2355       *str++ = *ch++;
2356     } else {
2357 #ifdef DEBUGGING
2358      if (state>1)
2359       DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2360 #endif
2361      break;
2362     }
2363    }
2364    trie->prefixlen = (state-1);
2365    if (str) {
2366     regnode *n = convert+NODE_SZ_STR(convert);
2367     NEXT_OFF(convert) = NODE_SZ_STR(convert);
2368     trie->startstate = state;
2369     trie->minlen -= (state - 1);
2370     trie->maxlen -= (state - 1);
2371 #ifdef DEBUGGING
2372    /* At least the UNICOS C compiler choked on this
2373     * being argument to DEBUG_r(), so let's just have
2374     * it right here. */
2375    if (
2376 #ifdef PERL_EXT_RE_BUILD
2377     1
2378 #else
2379     DEBUG_r_TEST
2380 #endif
2381     ) {
2382     regnode *fix = convert;
2383     U32 word = trie->wordcount;
2384     mjd_nodelen++;
2385     Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2386     while( ++fix < n ) {
2387      Set_Node_Offset_Length(fix, 0, 0);
2388     }
2389     while (word--) {
2390      SV ** const tmp = av_fetch( trie_words, word, 0 );
2391      if (tmp) {
2392       if ( STR_LEN(convert) <= SvCUR(*tmp) )
2393        sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2394       else
2395        sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2396      }
2397     }
2398    }
2399 #endif
2400     if (trie->maxlen) {
2401      convert = n;
2402     } else {
2403      NEXT_OFF(convert) = (U16)(tail - convert);
2404      DEBUG_r(optimize= n);
2405     }
2406    }
2407   }
2408   if (!jumper)
2409    jumper = last;
2410   if ( trie->maxlen ) {
2411    NEXT_OFF( convert ) = (U16)(tail - convert);
2412    ARG_SET( convert, data_slot );
2413    /* Store the offset to the first unabsorbed branch in
2414    jump[0], which is otherwise unused by the jump logic.
2415    We use this when dumping a trie and during optimisation. */
2416    if (trie->jump)
2417     trie->jump[0] = (U16)(nextbranch - convert);
2418
2419    /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2420    *   and there is a bitmap
2421    *   and the first "jump target" node we found leaves enough room
2422    * then convert the TRIE node into a TRIEC node, with the bitmap
2423    * embedded inline in the opcode - this is hypothetically faster.
2424    */
2425    if ( !trie->states[trie->startstate].wordnum
2426     && trie->bitmap
2427     && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2428    {
2429     OP( convert ) = TRIEC;
2430     Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2431     PerlMemShared_free(trie->bitmap);
2432     trie->bitmap= NULL;
2433    } else
2434     OP( convert ) = TRIE;
2435
2436    /* store the type in the flags */
2437    convert->flags = nodetype;
2438    DEBUG_r({
2439    optimize = convert
2440      + NODE_STEP_REGNODE
2441      + regarglen[ OP( convert ) ];
2442    });
2443    /* XXX We really should free up the resource in trie now,
2444     as we won't use them - (which resources?) dmq */
2445   }
2446   /* needed for dumping*/
2447   DEBUG_r(if (optimize) {
2448    regnode *opt = convert;
2449
2450    while ( ++opt < optimize) {
2451     Set_Node_Offset_Length(opt,0,0);
2452    }
2453    /*
2454     Try to clean up some of the debris left after the
2455     optimisation.
2456    */
2457    while( optimize < jumper ) {
2458     mjd_nodelen += Node_Length((optimize));
2459     OP( optimize ) = OPTIMIZED;
2460     Set_Node_Offset_Length(optimize,0,0);
2461     optimize++;
2462    }
2463    Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2464   });
2465  } /* end node insert */
2466  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2467
2468  /*  Finish populating the prev field of the wordinfo array.  Walk back
2469  *  from each accept state until we find another accept state, and if
2470  *  so, point the first word's .prev field at the second word. If the
2471  *  second already has a .prev field set, stop now. This will be the
2472  *  case either if we've already processed that word's accept state,
2473  *  or that state had multiple words, and the overspill words were
2474  *  already linked up earlier.
2475  */
2476  {
2477   U16 word;
2478   U32 state;
2479   U16 prev;
2480
2481   for (word=1; word <= trie->wordcount; word++) {
2482    prev = 0;
2483    if (trie->wordinfo[word].prev)
2484     continue;
2485    state = trie->wordinfo[word].accept;
2486    while (state) {
2487     state = prev_states[state];
2488     if (!state)
2489      break;
2490     prev = trie->states[state].wordnum;
2491     if (prev)
2492      break;
2493    }
2494    trie->wordinfo[word].prev = prev;
2495   }
2496   Safefree(prev_states);
2497  }
2498
2499
2500  /* and now dump out the compressed format */
2501  DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2502
2503  RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2504 #ifdef DEBUGGING
2505  RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2506  RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2507 #else
2508  SvREFCNT_dec_NN(revcharmap);
2509 #endif
2510  return trie->jump
2511   ? MADE_JUMP_TRIE
2512   : trie->startstate>1
2513    ? MADE_EXACT_TRIE
2514    : MADE_TRIE;
2515 }
2516
2517 STATIC void
2518 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2519 {
2520 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2521
2522    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2523    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2524    ISBN 0-201-10088-6
2525
2526    We find the fail state for each state in the trie, this state is the longest proper
2527    suffix of the current state's 'word' that is also a proper prefix of another word in our
2528    trie. State 1 represents the word '' and is thus the default fail state. This allows
2529    the DFA not to have to restart after its tried and failed a word at a given point, it
2530    simply continues as though it had been matching the other word in the first place.
2531    Consider
2532  'abcdgu'=~/abcdefg|cdgu/
2533    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2534    fail, which would bring us to the state representing 'd' in the second word where we would
2535    try 'g' and succeed, proceeding to match 'cdgu'.
2536  */
2537  /* add a fail transition */
2538  const U32 trie_offset = ARG(source);
2539  reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2540  U32 *q;
2541  const U32 ucharcount = trie->uniquecharcount;
2542  const U32 numstates = trie->statecount;
2543  const U32 ubound = trie->lasttrans + ucharcount;
2544  U32 q_read = 0;
2545  U32 q_write = 0;
2546  U32 charid;
2547  U32 base = trie->states[ 1 ].trans.base;
2548  U32 *fail;
2549  reg_ac_data *aho;
2550  const U32 data_slot = add_data( pRExC_state, 1, "T" );
2551  GET_RE_DEBUG_FLAGS_DECL;
2552
2553  PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2554 #ifndef DEBUGGING
2555  PERL_UNUSED_ARG(depth);
2556 #endif
2557
2558
2559  ARG_SET( stclass, data_slot );
2560  aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2561  RExC_rxi->data->data[ data_slot ] = (void*)aho;
2562  aho->trie=trie_offset;
2563  aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2564  Copy( trie->states, aho->states, numstates, reg_trie_state );
2565  Newxz( q, numstates, U32);
2566  aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2567  aho->refcount = 1;
2568  fail = aho->fail;
2569  /* initialize fail[0..1] to be 1 so that we always have
2570  a valid final fail state */
2571  fail[ 0 ] = fail[ 1 ] = 1;
2572
2573  for ( charid = 0; charid < ucharcount ; charid++ ) {
2574   const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2575   if ( newstate ) {
2576    q[ q_write ] = newstate;
2577    /* set to point at the root */
2578    fail[ q[ q_write++ ] ]=1;
2579   }
2580  }
2581  while ( q_read < q_write) {
2582   const U32 cur = q[ q_read++ % numstates ];
2583   base = trie->states[ cur ].trans.base;
2584
2585   for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2586    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2587    if (ch_state) {
2588     U32 fail_state = cur;
2589     U32 fail_base;
2590     do {
2591      fail_state = fail[ fail_state ];
2592      fail_base = aho->states[ fail_state ].trans.base;
2593     } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2594
2595     fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2596     fail[ ch_state ] = fail_state;
2597     if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2598     {
2599       aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2600     }
2601     q[ q_write++ % numstates] = ch_state;
2602    }
2603   }
2604  }
2605  /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2606  when we fail in state 1, this allows us to use the
2607  charclass scan to find a valid start char. This is based on the principle
2608  that theres a good chance the string being searched contains lots of stuff
2609  that cant be a start char.
2610  */
2611  fail[ 0 ] = fail[ 1 ] = 0;
2612  DEBUG_TRIE_COMPILE_r({
2613   PerlIO_printf(Perl_debug_log,
2614      "%*sStclass Failtable (%"UVuf" states): 0",
2615      (int)(depth * 2), "", (UV)numstates
2616   );
2617   for( q_read=1; q_read<numstates; q_read++ ) {
2618    PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2619   }
2620   PerlIO_printf(Perl_debug_log, "\n");
2621  });
2622  Safefree(q);
2623  /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2624 }
2625
2626
2627 /*
2628  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2629  * These need to be revisited when a newer toolchain becomes available.
2630  */
2631 #if defined(__sparc64__) && defined(__GNUC__)
2632 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2633 #       undef  SPARC64_GCC_WORKAROUND
2634 #       define SPARC64_GCC_WORKAROUND 1
2635 #   endif
2636 #endif
2637
2638 #define DEBUG_PEEP(str,scan,depth) \
2639  DEBUG_OPTIMISE_r({if (scan){ \
2640  SV * const mysv=sv_newmortal(); \
2641  regnode *Next = regnext(scan); \
2642  regprop(RExC_rx, mysv, scan); \
2643  PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2644  (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2645  Next ? (REG_NODE_NUM(Next)) : 0 ); \
2646    }});
2647
2648
2649 /* The below joins as many adjacent EXACTish nodes as possible into a single
2650  * one.  The regop may be changed if the node(s) contain certain sequences that
2651  * require special handling.  The joining is only done if:
2652  * 1) there is room in the current conglomerated node to entirely contain the
2653  *    next one.
2654  * 2) they are the exact same node type
2655  *
2656  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2657  * these get optimized out
2658  *
2659  * If a node is to match under /i (folded), the number of characters it matches
2660  * can be different than its character length if it contains a multi-character
2661  * fold.  *min_subtract is set to the total delta of the input nodes.
2662  *
2663  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2664  * and contains LATIN SMALL LETTER SHARP S
2665  *
2666  * This is as good a place as any to discuss the design of handling these
2667  * multi-character fold sequences.  It's been wrong in Perl for a very long
2668  * time.  There are three code points in Unicode whose multi-character folds
2669  * were long ago discovered to mess things up.  The previous designs for
2670  * dealing with these involved assigning a special node for them.  This
2671  * approach doesn't work, as evidenced by this example:
2672  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2673  * Both these fold to "sss", but if the pattern is parsed to create a node that
2674  * would match just the \xDF, it won't be able to handle the case where a
2675  * successful match would have to cross the node's boundary.  The new approach
2676  * that hopefully generally solves the problem generates an EXACTFU_SS node
2677  * that is "sss".
2678  *
2679  * It turns out that there are problems with all multi-character folds, and not
2680  * just these three.  Now the code is general, for all such cases, but the
2681  * three still have some special handling.  The approach taken is:
2682  * 1)   This routine examines each EXACTFish node that could contain multi-
2683  *      character fold sequences.  It returns in *min_subtract how much to
2684  *      subtract from the the actual length of the string to get a real minimum
2685  *      match length; it is 0 if there are no multi-char folds.  This delta is
2686  *      used by the caller to adjust the min length of the match, and the delta
2687  *      between min and max, so that the optimizer doesn't reject these
2688  *      possibilities based on size constraints.
2689  * 2)   Certain of these sequences require special handling by the trie code,
2690  *      so, if found, this code changes the joined node type to special ops:
2691  *      EXACTFU_TRICKYFOLD and EXACTFU_SS.
2692  * 3)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2693  *      is used for an EXACTFU node that contains at least one "ss" sequence in
2694  *      it.  For non-UTF-8 patterns and strings, this is the only case where
2695  *      there is a possible fold length change.  That means that a regular
2696  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
2697  *      with length changes, and so can be processed faster.  regexec.c takes
2698  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
2699  *      pre-folded by regcomp.c.  This saves effort in regex matching.
2700  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2701  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2702  *      down by forcing the pattern into UTF8 unless necessary.  Also what
2703  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
2704  *      possibilities for the non-UTF8 patterns are quite simple, except for
2705  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
2706  *      members of a fold-pair, and arrays are set up for all of them so that
2707  *      the other member of the pair can be found quickly.  Code elsewhere in
2708  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2709  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
2710  *      described in the next item.
2711  * 4)   A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
2712  *      pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
2713  *      UTF-8 pattern.)  An assumption that the optimizer part of regexec.c
2714  *      (probably unwittingly, in Perl_regexec_flags()) makes is that a
2715  *      character in the pattern corresponds to at most a single character in
2716  *      the target string.  (And I do mean character, and not byte here, unlike
2717  *      other parts of the documentation that have never been updated to
2718  *      account for multibyte Unicode.)  sharp s in EXACTF nodes can match the
2719  *      two character string 'ss'; in EXACTFA nodes it can match
2720  *      "\x{17F}\x{17F}".  These violate the assumption, and they are the only
2721  *      instances where it is violated.  I'm reluctant to try to change the
2722  *      assumption, as the code involved is impenetrable to me (khw), so
2723  *      instead the code here punts.  This routine examines (when the pattern
2724  *      isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
2725  *      boolean indicating whether or not the node contains a sharp s.  When it
2726  *      is true, the caller sets a flag that later causes the optimizer in this
2727  *      file to not set values for the floating and fixed string lengths, and
2728  *      thus avoids the optimizer code in regexec.c that makes the invalid
2729  *      assumption.  Thus, there is no optimization based on string lengths for
2730  *      non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
2731  *      (The reason the assumption is wrong only in these two cases is that all
2732  *      other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
2733  *      other folds to their expanded versions.  We can't prefold sharp s to
2734  *      'ss' in EXACTF nodes because we don't know at compile time if it
2735  *      actually matches 'ss' or not.  It will match iff the target string is
2736  *      in UTF-8, unlike the EXACTFU nodes, where it always matches; and
2737  *      EXACTFA and EXACTFL where it never does.  In an EXACTFA node in a UTF-8
2738  *      pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
2739  *      but in a non-UTF8 pattern, folding it to that above-Latin1 string would
2740  *      require the pattern to be forced into UTF-8, the overhead of which we
2741  *      want to avoid.)
2742  */
2743
2744 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2745  if (PL_regkind[OP(scan)] == EXACT) \
2746   join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2747
2748 STATIC U32
2749 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2750  /* Merge several consecutive EXACTish nodes into one. */
2751  regnode *n = regnext(scan);
2752  U32 stringok = 1;
2753  regnode *next = scan + NODE_SZ_STR(scan);
2754  U32 merged = 0;
2755  U32 stopnow = 0;
2756 #ifdef DEBUGGING
2757  regnode *stop = scan;
2758  GET_RE_DEBUG_FLAGS_DECL;
2759 #else
2760  PERL_UNUSED_ARG(depth);
2761 #endif
2762
2763  PERL_ARGS_ASSERT_JOIN_EXACT;
2764 #ifndef EXPERIMENTAL_INPLACESCAN
2765  PERL_UNUSED_ARG(flags);
2766  PERL_UNUSED_ARG(val);
2767 #endif
2768  DEBUG_PEEP("join",scan,depth);
2769
2770  /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2771  * EXACT ones that are mergeable to the current one. */
2772  while (n
2773   && (PL_regkind[OP(n)] == NOTHING
2774    || (stringok && OP(n) == OP(scan)))
2775   && NEXT_OFF(n)
2776   && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2777  {
2778
2779   if (OP(n) == TAIL || n > next)
2780    stringok = 0;
2781   if (PL_regkind[OP(n)] == NOTHING) {
2782    DEBUG_PEEP("skip:",n,depth);
2783    NEXT_OFF(scan) += NEXT_OFF(n);
2784    next = n + NODE_STEP_REGNODE;
2785 #ifdef DEBUGGING
2786    if (stringok)
2787     stop = n;
2788 #endif
2789    n = regnext(n);
2790   }
2791   else if (stringok) {
2792    const unsigned int oldl = STR_LEN(scan);
2793    regnode * const nnext = regnext(n);
2794
2795    /* XXX I (khw) kind of doubt that this works on platforms where
2796    * U8_MAX is above 255 because of lots of other assumptions */
2797    /* Don't join if the sum can't fit into a single node */
2798    if (oldl + STR_LEN(n) > U8_MAX)
2799     break;
2800
2801    DEBUG_PEEP("merg",n,depth);
2802    merged++;
2803
2804    NEXT_OFF(scan) += NEXT_OFF(n);
2805    STR_LEN(scan) += STR_LEN(n);
2806    next = n + NODE_SZ_STR(n);
2807    /* Now we can overwrite *n : */
2808    Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2809 #ifdef DEBUGGING
2810    stop = next - 1;
2811 #endif
2812    n = nnext;
2813    if (stopnow) break;
2814   }
2815
2816 #ifdef EXPERIMENTAL_INPLACESCAN
2817   if (flags && !NEXT_OFF(n)) {
2818    DEBUG_PEEP("atch", val, depth);
2819    if (reg_off_by_arg[OP(n)]) {
2820     ARG_SET(n, val - n);
2821    }
2822    else {
2823     NEXT_OFF(n) = val - n;
2824    }
2825    stopnow = 1;
2826   }
2827 #endif
2828  }
2829
2830  *min_subtract = 0;
2831  *has_exactf_sharp_s = FALSE;
2832
2833  /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2834  * can now analyze for sequences of problematic code points.  (Prior to
2835  * this final joining, sequences could have been split over boundaries, and
2836  * hence missed).  The sequences only happen in folding, hence for any
2837  * non-EXACT EXACTish node */
2838  if (OP(scan) != EXACT) {
2839   const U8 * const s0 = (U8*) STRING(scan);
2840   const U8 * s = s0;
2841   const U8 * const s_end = s0 + STR_LEN(scan);
2842
2843   /* One pass is made over the node's string looking for all the
2844   * possibilities.  to avoid some tests in the loop, there are two main
2845   * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2846   * non-UTF-8 */
2847   if (UTF) {
2848
2849    /* Examine the string for a multi-character fold sequence.  UTF-8
2850    * patterns have all characters pre-folded by the time this code is
2851    * executed */
2852    while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2853          length sequence we are looking for is 2 */
2854    {
2855     int count = 0;
2856     int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2857     if (! len) {    /* Not a multi-char fold: get next char */
2858      s += UTF8SKIP(s);
2859      continue;
2860     }
2861
2862     /* Nodes with 'ss' require special handling, except for EXACTFL
2863     * and EXACTFA for which there is no multi-char fold to this */
2864     if (len == 2 && *s == 's' && *(s+1) == 's'
2865      && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2866     {
2867      count = 2;
2868      OP(scan) = EXACTFU_SS;
2869      s += 2;
2870     }
2871     else if (len == 6   /* len is the same in both ASCII and EBCDIC
2872          for these */
2873       && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2874          COMBINING_DIAERESIS_UTF8
2875          COMBINING_ACUTE_ACCENT_UTF8,
2876         6)
2877        || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2878           COMBINING_DIAERESIS_UTF8
2879           COMBINING_ACUTE_ACCENT_UTF8,
2880          6)))
2881     {
2882      count = 3;
2883
2884      /* These two folds require special handling by trie's, so
2885      * change the node type to indicate this.  If EXACTFA and
2886      * EXACTFL were ever to be handled by trie's, this would
2887      * have to be changed.  If this node has already been
2888      * changed to EXACTFU_SS in this loop, leave it as is.  (I
2889      * (khw) think it doesn't matter in regexec.c for UTF
2890      * patterns, but no need to change it */
2891      if (OP(scan) == EXACTFU) {
2892       OP(scan) = EXACTFU_TRICKYFOLD;
2893      }
2894      s += 6;
2895     }
2896     else { /* Here is a generic multi-char fold. */
2897      const U8* multi_end  = s + len;
2898
2899      /* Count how many characters in it.  In the case of /l and
2900      * /aa, no folds which contain ASCII code points are
2901      * allowed, so check for those, and skip if found.  (In
2902      * EXACTFL, no folds are allowed to any Latin1 code point,
2903      * not just ASCII.  But there aren't any of these
2904      * currently, nor ever likely, so don't take the time to
2905      * test for them.  The code that generates the
2906      * is_MULTI_foo() macros croaks should one actually get put
2907      * into Unicode .) */
2908      if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2909       count = utf8_length(s, multi_end);
2910       s = multi_end;
2911      }
2912      else {
2913       while (s < multi_end) {
2914        if (isASCII(*s)) {
2915         s++;
2916         goto next_iteration;
2917        }
2918        else {
2919         s += UTF8SKIP(s);
2920        }
2921        count++;
2922       }
2923      }
2924     }
2925
2926     /* The delta is how long the sequence is minus 1 (1 is how long
2927     * the character that folds to the sequence is) */
2928     *min_subtract += count - 1;
2929    next_iteration: ;
2930    }
2931   }
2932   else if (OP(scan) == EXACTFA) {
2933
2934    /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
2935    * fold to the ASCII range (and there are no existing ones in the
2936    * upper latin1 range).  But, as outlined in the comments preceding
2937    * this function, we need to flag any occurrences of the sharp s */
2938    while (s < s_end) {
2939     if (*s == LATIN_SMALL_LETTER_SHARP_S) {
2940      *has_exactf_sharp_s = TRUE;
2941      break;
2942     }
2943     s++;
2944     continue;
2945    }
2946   }
2947   else if (OP(scan) != EXACTFL) {
2948
2949    /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node.  Look for the
2950    * multi-char folds that are all Latin1.  (This code knows that
2951    * there are no current multi-char folds possible with EXACTFL,
2952    * relying on fold_grind.t to catch any errors if the very unlikely
2953    * event happens that some get added in future Unicode versions.)
2954    * As explained in the comments preceding this function, we look
2955    * also for the sharp s in EXACTF nodes; it can be in the final
2956    * position.  Otherwise we can stop looking 1 byte earlier because
2957    * have to find at least two characters for a multi-fold */
2958    const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2959
2960    while (s < upper) {
2961     int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2962     if (! len) {    /* Not a multi-char fold. */
2963      if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2964      {
2965       *has_exactf_sharp_s = TRUE;
2966      }
2967      s++;
2968      continue;
2969     }
2970
2971     if (len == 2
2972      && isARG2_lower_or_UPPER_ARG1('s', *s)
2973      && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
2974     {
2975
2976      /* EXACTF nodes need to know that the minimum length
2977      * changed so that a sharp s in the string can match this
2978      * ss in the pattern, but they remain EXACTF nodes, as they
2979      * won't match this unless the target string is is UTF-8,
2980      * which we don't know until runtime */
2981      if (OP(scan) != EXACTF) {
2982       OP(scan) = EXACTFU_SS;
2983      }
2984     }
2985
2986     *min_subtract += len - 1;
2987     s += len;
2988    }
2989   }
2990  }
2991
2992 #ifdef DEBUGGING
2993  /* Allow dumping but overwriting the collection of skipped
2994  * ops and/or strings with fake optimized ops */
2995  n = scan + NODE_SZ_STR(scan);
2996  while (n <= stop) {
2997   OP(n) = OPTIMIZED;
2998   FLAGS(n) = 0;
2999   NEXT_OFF(n) = 0;
3000   n++;
3001  }
3002 #endif
3003  DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3004  return stopnow;
3005 }
3006
3007 /* REx optimizer.  Converts nodes into quicker variants "in place".
3008    Finds fixed substrings.  */
3009
3010 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3011    to the position after last scanned or to NULL. */
3012
3013 #define INIT_AND_WITHP \
3014  assert(!and_withp); \
3015  Newx(and_withp,1,struct regnode_charclass_class); \
3016  SAVEFREEPV(and_withp)
3017
3018 /* this is a chain of data about sub patterns we are processing that
3019    need to be handled separately/specially in study_chunk. Its so
3020    we can simulate recursion without losing state.  */
3021 struct scan_frame;
3022 typedef struct scan_frame {
3023  regnode *last;  /* last node to process in this frame */
3024  regnode *next;  /* next node to process when last is reached */
3025  struct scan_frame *prev; /*previous frame*/
3026  I32 stop; /* what stopparen do we use */
3027 } scan_frame;
3028
3029
3030 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3031
3032 STATIC I32
3033 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3034       I32 *minlenp, I32 *deltap,
3035       regnode *last,
3036       scan_data_t *data,
3037       I32 stopparen,
3038       U8* recursed,
3039       struct regnode_charclass_class *and_withp,
3040       U32 flags, U32 depth)
3041       /* scanp: Start here (read-write). */
3042       /* deltap: Write maxlen-minlen here. */
3043       /* last: Stop before this one. */
3044       /* data: string data about the pattern */
3045       /* stopparen: treat close N as END */
3046       /* recursed: which subroutines have we recursed into */
3047       /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3048 {
3049  dVAR;
3050  I32 min = 0;    /* There must be at least this number of characters to match */
3051  I32 pars = 0, code;
3052  regnode *scan = *scanp, *next;
3053  I32 delta = 0;
3054  int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3055  int is_inf_internal = 0;  /* The studied chunk is infinite */
3056  I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3057  scan_data_t data_fake;
3058  SV *re_trie_maxbuff = NULL;
3059  regnode *first_non_open = scan;
3060  I32 stopmin = I32_MAX;
3061  scan_frame *frame = NULL;
3062  GET_RE_DEBUG_FLAGS_DECL;
3063
3064  PERL_ARGS_ASSERT_STUDY_CHUNK;
3065
3066 #ifdef DEBUGGING
3067  StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3068 #endif
3069
3070  if ( depth == 0 ) {
3071   while (first_non_open && OP(first_non_open) == OPEN)
3072    first_non_open=regnext(first_non_open);
3073  }
3074
3075
3076   fake_study_recurse:
3077  while ( scan && OP(scan) != END && scan < last ){
3078   UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3079         node length to get a real minimum (because
3080         the folded version may be shorter) */
3081   bool has_exactf_sharp_s = FALSE;
3082   /* Peephole optimizer: */
3083   DEBUG_STUDYDATA("Peep:", data,depth);
3084   DEBUG_PEEP("Peep",scan,depth);
3085
3086   /* Its not clear to khw or hv why this is done here, and not in the
3087   * clauses that deal with EXACT nodes.  khw's guess is that it's
3088   * because of a previous design */
3089   JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3090
3091   /* Follow the next-chain of the current node and optimize
3092   away all the NOTHINGs from it.  */
3093   if (OP(scan) != CURLYX) {
3094    const int max = (reg_off_by_arg[OP(scan)]
3095      ? I32_MAX
3096      /* I32 may be smaller than U16 on CRAYs! */
3097      : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3098    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3099    int noff;
3100    regnode *n = scan;
3101
3102    /* Skip NOTHING and LONGJMP. */
3103    while ((n = regnext(n))
3104     && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3105      || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3106     && off + noff < max)
3107     off += noff;
3108    if (reg_off_by_arg[OP(scan)])
3109     ARG(scan) = off;
3110    else
3111     NEXT_OFF(scan) = off;
3112   }
3113
3114
3115
3116   /* The principal pseudo-switch.  Cannot be a switch, since we
3117   look into several different things.  */
3118   if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3119     || OP(scan) == IFTHEN) {
3120    next = regnext(scan);
3121    code = OP(scan);
3122    /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3123
3124    if (OP(next) == code || code == IFTHEN) {
3125     /* NOTE - There is similar code to this block below for handling
3126     TRIE nodes on a re-study.  If you change stuff here check there
3127     too. */
3128     I32 max1 = 0, min1 = I32_MAX, num = 0;
3129     struct regnode_charclass_class accum;
3130     regnode * const startbranch=scan;
3131
3132     if (flags & SCF_DO_SUBSTR)
3133      SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3134     if (flags & SCF_DO_STCLASS)
3135      cl_init_zero(pRExC_state, &accum);
3136
3137     while (OP(scan) == code) {
3138      I32 deltanext, minnext, f = 0, fake;
3139      struct regnode_charclass_class this_class;
3140
3141      num++;
3142      data_fake.flags = 0;
3143      if (data) {
3144       data_fake.whilem_c = data->whilem_c;
3145       data_fake.last_closep = data->last_closep;
3146      }
3147      else
3148       data_fake.last_closep = &fake;
3149
3150      data_fake.pos_delta = delta;
3151      next = regnext(scan);
3152      scan = NEXTOPER(scan);
3153      if (code != BRANCH)
3154       scan = NEXTOPER(scan);
3155      if (flags & SCF_DO_STCLASS) {
3156       cl_init(pRExC_state, &this_class);
3157       data_fake.start_class = &this_class;
3158       f = SCF_DO_STCLASS_AND;
3159      }
3160      if (flags & SCF_WHILEM_VISITED_POS)
3161       f |= SCF_WHILEM_VISITED_POS;
3162
3163      /* we suppose the run is continuous, last=next...*/
3164      minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3165           next, &data_fake,
3166           stopparen, recursed, NULL, f,depth+1);
3167      if (min1 > minnext)
3168       min1 = minnext;
3169      if (deltanext == I32_MAX) {
3170       is_inf = is_inf_internal = 1;
3171       max1 = I32_MAX;
3172      } else if (max1 < minnext + deltanext)
3173       max1 = minnext + deltanext;
3174      scan = next;
3175      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3176       pars++;
3177      if (data_fake.flags & SCF_SEEN_ACCEPT) {
3178       if ( stopmin > minnext)
3179        stopmin = min + min1;
3180       flags &= ~SCF_DO_SUBSTR;
3181       if (data)
3182        data->flags |= SCF_SEEN_ACCEPT;
3183      }
3184      if (data) {
3185       if (data_fake.flags & SF_HAS_EVAL)
3186        data->flags |= SF_HAS_EVAL;
3187       data->whilem_c = data_fake.whilem_c;
3188      }
3189      if (flags & SCF_DO_STCLASS)
3190       cl_or(pRExC_state, &accum, &this_class);
3191     }
3192     if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3193      min1 = 0;
3194     if (flags & SCF_DO_SUBSTR) {
3195      data->pos_min += min1;
3196      if (data->pos_delta >= I32_MAX - (max1 - min1))
3197       data->pos_delta = I32_MAX;
3198      else
3199       data->pos_delta += max1 - min1;
3200      if (max1 != min1 || is_inf)
3201       data->longest = &(data->longest_float);
3202     }
3203     min += min1;
3204     if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0)
3205      delta = I32_MAX;
3206     else
3207      delta += max1 - min1;
3208     if (flags & SCF_DO_STCLASS_OR) {
3209      cl_or(pRExC_state, data->start_class, &accum);
3210      if (min1) {
3211       cl_and(data->start_class, and_withp);
3212       flags &= ~SCF_DO_STCLASS;
3213      }
3214     }
3215     else if (flags & SCF_DO_STCLASS_AND) {
3216      if (min1) {
3217       cl_and(data->start_class, &accum);
3218       flags &= ~SCF_DO_STCLASS;
3219      }
3220      else {
3221       /* Switch to OR mode: cache the old value of
3222       * data->start_class */
3223       INIT_AND_WITHP;
3224       StructCopy(data->start_class, and_withp,
3225         struct regnode_charclass_class);
3226       flags &= ~SCF_DO_STCLASS_AND;
3227       StructCopy(&accum, data->start_class,
3228         struct regnode_charclass_class);
3229       flags |= SCF_DO_STCLASS_OR;
3230       SET_SSC_EOS(data->start_class);
3231      }
3232     }
3233
3234     if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3235     /* demq.
3236
3237     Assuming this was/is a branch we are dealing with: 'scan' now
3238     points at the item that follows the branch sequence, whatever
3239     it is. We now start at the beginning of the sequence and look
3240     for subsequences of
3241
3242     BRANCH->EXACT=>x1
3243     BRANCH->EXACT=>x2
3244     tail
3245
3246     which would be constructed from a pattern like /A|LIST|OF|WORDS/
3247
3248     If we can find such a subsequence we need to turn the first
3249     element into a trie and then add the subsequent branch exact
3250     strings to the trie.
3251
3252     We have two cases
3253
3254      1. patterns where the whole set of branches can be converted.
3255
3256      2. patterns where only a subset can be converted.
3257
3258     In case 1 we can replace the whole set with a single regop
3259     for the trie. In case 2 we need to keep the start and end
3260     branches so
3261
3262      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3263      becomes BRANCH TRIE; BRANCH X;
3264
3265     There is an additional case, that being where there is a
3266     common prefix, which gets split out into an EXACT like node
3267     preceding the TRIE node.
3268
3269     If x(1..n)==tail then we can do a simple trie, if not we make
3270     a "jump" trie, such that when we match the appropriate word
3271     we "jump" to the appropriate tail node. Essentially we turn
3272     a nested if into a case structure of sorts.
3273
3274     */
3275
3276      int made=0;
3277      if (!re_trie_maxbuff) {
3278       re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3279       if (!SvIOK(re_trie_maxbuff))
3280        sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3281      }
3282      if ( SvIV(re_trie_maxbuff)>=0  ) {
3283       regnode *cur;
3284       regnode *first = (regnode *)NULL;
3285       regnode *last = (regnode *)NULL;
3286       regnode *tail = scan;
3287       U8 trietype = 0;
3288       U32 count=0;
3289
3290 #ifdef DEBUGGING
3291       SV * const mysv = sv_newmortal();       /* for dumping */
3292 #endif
3293       /* var tail is used because there may be a TAIL
3294       regop in the way. Ie, the exacts will point to the
3295       thing following the TAIL, but the last branch will
3296       point at the TAIL. So we advance tail. If we
3297       have nested (?:) we may have to move through several
3298       tails.
3299       */
3300
3301       while ( OP( tail ) == TAIL ) {
3302        /* this is the TAIL generated by (?:) */
3303        tail = regnext( tail );
3304       }
3305
3306
3307       DEBUG_TRIE_COMPILE_r({
3308        regprop(RExC_rx, mysv, tail );
3309        PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3310         (int)depth * 2 + 2, "",
3311         "Looking for TRIE'able sequences. Tail node is: ",
3312         SvPV_nolen_const( mysv )
3313        );
3314       });
3315
3316       /*
3317
3318        Step through the branches
3319         cur represents each branch,
3320         noper is the first thing to be matched as part of that branch
3321         noper_next is the regnext() of that node.
3322
3323        We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3324        via a "jump trie" but we also support building with NOJUMPTRIE,
3325        which restricts the trie logic to structures like /FOO|BAR/.
3326
3327        If noper is a trieable nodetype then the branch is a possible optimization
3328        target. If we are building under NOJUMPTRIE then we require that noper_next
3329        is the same as scan (our current position in the regex program).
3330
3331        Once we have two or more consecutive such branches we can create a
3332        trie of the EXACT's contents and stitch it in place into the program.
3333
3334        If the sequence represents all of the branches in the alternation we
3335        replace the entire thing with a single TRIE node.
3336
3337        Otherwise when it is a subsequence we need to stitch it in place and
3338        replace only the relevant branches. This means the first branch has
3339        to remain as it is used by the alternation logic, and its next pointer,
3340        and needs to be repointed at the item on the branch chain following
3341        the last branch we have optimized away.
3342
3343        This could be either a BRANCH, in which case the subsequence is internal,
3344        or it could be the item following the branch sequence in which case the
3345        subsequence is at the end (which does not necessarily mean the first node
3346        is the start of the alternation).
3347
3348        TRIE_TYPE(X) is a define which maps the optype to a trietype.
3349
3350         optype          |  trietype
3351         ----------------+-----------
3352         NOTHING         | NOTHING
3353         EXACT           | EXACT
3354         EXACTFU         | EXACTFU
3355         EXACTFU_SS      | EXACTFU
3356         EXACTFU_TRICKYFOLD | EXACTFU
3357         EXACTFA         | 0
3358
3359
3360       */
3361 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3362      ( EXACT == (X) )   ? EXACT :        \
3363      ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3364      0 )
3365
3366       /* dont use tail as the end marker for this traverse */
3367       for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3368        regnode * const noper = NEXTOPER( cur );
3369        U8 noper_type = OP( noper );
3370        U8 noper_trietype = TRIE_TYPE( noper_type );
3371 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3372        regnode * const noper_next = regnext( noper );
3373        U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3374        U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3375 #endif
3376
3377        DEBUG_TRIE_COMPILE_r({
3378         regprop(RExC_rx, mysv, cur);
3379         PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3380         (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3381
3382         regprop(RExC_rx, mysv, noper);
3383         PerlIO_printf( Perl_debug_log, " -> %s",
3384          SvPV_nolen_const(mysv));
3385
3386         if ( noper_next ) {
3387         regprop(RExC_rx, mysv, noper_next );
3388         PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3389          SvPV_nolen_const(mysv));
3390         }
3391         PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3392         REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3393         PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3394         );
3395        });
3396
3397        /* Is noper a trieable nodetype that can be merged with the
3398        * current trie (if there is one)? */
3399        if ( noper_trietype
3400         &&
3401         (
3402           ( noper_trietype == NOTHING)
3403           || ( trietype == NOTHING )
3404           || ( trietype == noper_trietype )
3405         )
3406 #ifdef NOJUMPTRIE
3407         && noper_next == tail
3408 #endif
3409         && count < U16_MAX)
3410        {
3411         /* Handle mergable triable node
3412         * Either we are the first node in a new trieable sequence,
3413         * in which case we do some bookkeeping, otherwise we update
3414         * the end pointer. */
3415         if ( !first ) {
3416          first = cur;
3417          if ( noper_trietype == NOTHING ) {
3418 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3419           regnode * const noper_next = regnext( noper );
3420           U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3421           U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3422 #endif
3423
3424           if ( noper_next_trietype ) {
3425            trietype = noper_next_trietype;
3426           } else if (noper_next_type)  {
3427            /* a NOTHING regop is 1 regop wide. We need at least two
3428            * for a trie so we can't merge this in */
3429            first = NULL;
3430           }
3431          } else {
3432           trietype = noper_trietype;
3433          }
3434         } else {
3435          if ( trietype == NOTHING )
3436           trietype = noper_trietype;
3437          last = cur;
3438         }
3439         if (first)
3440          count++;
3441        } /* end handle mergable triable node */
3442        else {
3443         /* handle unmergable node -
3444         * noper may either be a triable node which can not be tried
3445         * together with the current trie, or a non triable node */
3446         if ( last ) {
3447          /* If last is set and trietype is not NOTHING then we have found
3448          * at least two triable branch sequences in a row of a similar
3449          * trietype so we can turn them into a trie. If/when we
3450          * allow NOTHING to start a trie sequence this condition will be
3451          * required, and it isn't expensive so we leave it in for now. */
3452          if ( trietype && trietype != NOTHING )
3453           make_trie( pRExC_state,
3454             startbranch, first, cur, tail, count,
3455             trietype, depth+1 );
3456          last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3457         }
3458         if ( noper_trietype
3459 #ifdef NOJUMPTRIE
3460          && noper_next == tail
3461 #endif
3462         ){
3463          /* noper is triable, so we can start a new trie sequence */
3464          count = 1;
3465          first = cur;
3466          trietype = noper_trietype;
3467         } else if (first) {
3468          /* if we already saw a first but the current node is not triable then we have
3469          * to reset the first information. */
3470          count = 0;
3471          first = NULL;
3472          trietype = 0;
3473         }
3474        } /* end handle unmergable node */
3475       } /* loop over branches */
3476       DEBUG_TRIE_COMPILE_r({
3477        regprop(RExC_rx, mysv, cur);
3478        PerlIO_printf( Perl_debug_log,
3479        "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3480        "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3481
3482       });
3483       if ( last && trietype ) {
3484        if ( trietype != NOTHING ) {
3485         /* the last branch of the sequence was part of a trie,
3486         * so we have to construct it here outside of the loop
3487         */
3488         made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3489 #ifdef TRIE_STUDY_OPT
3490         if ( ((made == MADE_EXACT_TRIE &&
3491          startbranch == first)
3492          || ( first_non_open == first )) &&
3493          depth==0 ) {
3494          flags |= SCF_TRIE_RESTUDY;
3495          if ( startbranch == first
3496           && scan == tail )
3497          {
3498           RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3499          }
3500         }
3501 #endif
3502        } else {
3503         /* at this point we know whatever we have is a NOTHING sequence/branch
3504         * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3505         */
3506         if ( startbranch == first ) {
3507          regnode *opt;
3508          /* the entire thing is a NOTHING sequence, something like this:
3509          * (?:|) So we can turn it into a plain NOTHING op. */
3510          DEBUG_TRIE_COMPILE_r({
3511           regprop(RExC_rx, mysv, cur);
3512           PerlIO_printf( Perl_debug_log,
3513           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3514           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3515
3516          });
3517          OP(startbranch)= NOTHING;
3518          NEXT_OFF(startbranch)= tail - startbranch;
3519          for ( opt= startbranch + 1; opt < tail ; opt++ )
3520           OP(opt)= OPTIMIZED;
3521         }
3522        }
3523       } /* end if ( last) */
3524      } /* TRIE_MAXBUF is non zero */
3525
3526     } /* do trie */
3527
3528    }
3529    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3530     scan = NEXTOPER(NEXTOPER(scan));
3531    } else   /* single branch is optimized. */
3532     scan = NEXTOPER(scan);
3533    continue;
3534   } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3535    scan_frame *newframe = NULL;
3536    I32 paren;
3537    regnode *start;
3538    regnode *end;
3539
3540    if (OP(scan) != SUSPEND) {
3541    /* set the pointer */
3542     if (OP(scan) == GOSUB) {
3543      paren = ARG(scan);
3544      RExC_recurse[ARG2L(scan)] = scan;
3545      start = RExC_open_parens[paren-1];
3546      end   = RExC_close_parens[paren-1];
3547     } else {
3548      paren = 0;
3549      start = RExC_rxi->program + 1;
3550      end   = RExC_opend;
3551     }
3552     if (!recursed) {
3553      Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3554      SAVEFREEPV(recursed);
3555     }
3556     if (!PAREN_TEST(recursed,paren+1)) {
3557      PAREN_SET(recursed,paren+1);
3558      Newx(newframe,1,scan_frame);
3559     } else {
3560      if (flags & SCF_DO_SUBSTR) {
3561       SCAN_COMMIT(pRExC_state,data,minlenp);
3562       data->longest = &(data->longest_float);
3563      }
3564      is_inf = is_inf_internal = 1;
3565      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3566       cl_anything(pRExC_state, data->start_class);
3567      flags &= ~SCF_DO_STCLASS;
3568     }
3569    } else {
3570     Newx(newframe,1,scan_frame);
3571     paren = stopparen;
3572     start = scan+2;
3573     end = regnext(scan);
3574    }
3575    if (newframe) {
3576     assert(start);
3577     assert(end);
3578     SAVEFREEPV(newframe);
3579     newframe->next = regnext(scan);
3580     newframe->last = last;
3581     newframe->stop = stopparen;
3582     newframe->prev = frame;
3583
3584     frame = newframe;
3585     scan =  start;
3586     stopparen = paren;
3587     last = end;
3588
3589     continue;
3590    }
3591   }
3592   else if (OP(scan) == EXACT) {
3593    I32 l = STR_LEN(scan);
3594    UV uc;
3595    if (UTF) {
3596     const U8 * const s = (U8*)STRING(scan);
3597     uc = utf8_to_uvchr_buf(s, s + l, NULL);
3598     l = utf8_length(s, s + l);
3599    } else {
3600     uc = *((U8*)STRING(scan));
3601    }
3602    min += l;
3603    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3604     /* The code below prefers earlier match for fixed
3605     offset, later match for variable offset.  */
3606     if (data->last_end == -1) { /* Update the start info. */
3607      data->last_start_min = data->pos_min;
3608      data->last_start_max = is_inf
3609       ? I32_MAX : data->pos_min + data->pos_delta;
3610     }
3611     sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3612     if (UTF)
3613      SvUTF8_on(data->last_found);
3614     {
3615      SV * const sv = data->last_found;
3616      MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3617       mg_find(sv, PERL_MAGIC_utf8) : NULL;
3618      if (mg && mg->mg_len >= 0)
3619       mg->mg_len += utf8_length((U8*)STRING(scan),
3620             (U8*)STRING(scan)+STR_LEN(scan));
3621     }
3622     data->last_end = data->pos_min + l;
3623     data->pos_min += l; /* As in the first entry. */
3624     data->flags &= ~SF_BEFORE_EOL;
3625    }
3626    if (flags & SCF_DO_STCLASS_AND) {
3627     /* Check whether it is compatible with what we know already! */
3628     int compat = 1;
3629
3630
3631     /* If compatible, we or it in below.  It is compatible if is
3632     * in the bitmp and either 1) its bit or its fold is set, or 2)
3633     * it's for a locale.  Even if there isn't unicode semantics
3634     * here, at runtime there may be because of matching against a
3635     * utf8 string, so accept a possible false positive for
3636     * latin1-range folds */
3637     if (uc >= 0x100 ||
3638      (!(data->start_class->flags & ANYOF_LOCALE)
3639      && !ANYOF_BITMAP_TEST(data->start_class, uc)
3640      && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3641       || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3642      )
3643     {
3644      compat = 0;
3645     }
3646     ANYOF_CLASS_ZERO(data->start_class);
3647     ANYOF_BITMAP_ZERO(data->start_class);
3648     if (compat)
3649      ANYOF_BITMAP_SET(data->start_class, uc);
3650     else if (uc >= 0x100) {
3651      int i;
3652
3653      /* Some Unicode code points fold to the Latin1 range; as
3654      * XXX temporary code, instead of figuring out if this is
3655      * one, just assume it is and set all the start class bits
3656      * that could be some such above 255 code point's fold
3657      * which will generate fals positives.  As the code
3658      * elsewhere that does compute the fold settles down, it
3659      * can be extracted out and re-used here */
3660      for (i = 0; i < 256; i++){
3661       if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3662        ANYOF_BITMAP_SET(data->start_class, i);
3663       }
3664      }
3665     }
3666     CLEAR_SSC_EOS(data->start_class);
3667     if (uc < 0x100)
3668     data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3669    }
3670    else if (flags & SCF_DO_STCLASS_OR) {
3671     /* false positive possible if the class is case-folded */
3672     if (uc < 0x100)
3673      ANYOF_BITMAP_SET(data->start_class, uc);
3674     else
3675      data->start_class->flags |= ANYOF_UNICODE_ALL;
3676     CLEAR_SSC_EOS(data->start_class);
3677     cl_and(data->start_class, and_withp);
3678    }
3679    flags &= ~SCF_DO_STCLASS;
3680   }
3681   else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3682    I32 l = STR_LEN(scan);
3683    UV uc = *((U8*)STRING(scan));
3684
3685    /* Search for fixed substrings supports EXACT only. */
3686    if (flags & SCF_DO_SUBSTR) {
3687     assert(data);
3688     SCAN_COMMIT(pRExC_state, data, minlenp);
3689    }
3690    if (UTF) {
3691     const U8 * const s = (U8 *)STRING(scan);
3692     uc = utf8_to_uvchr_buf(s, s + l, NULL);
3693     l = utf8_length(s, s + l);
3694    }
3695    if (has_exactf_sharp_s) {
3696     RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3697    }
3698    min += l - min_subtract;
3699    assert (min >= 0);
3700    delta += min_subtract;
3701    if (flags & SCF_DO_SUBSTR) {
3702     data->pos_min += l - min_subtract;
3703     if (data->pos_min < 0) {
3704      data->pos_min = 0;
3705     }
3706     data->pos_delta += min_subtract;
3707     if (min_subtract) {
3708      data->longest = &(data->longest_float);
3709     }
3710    }
3711    if (flags & SCF_DO_STCLASS_AND) {
3712     /* Check whether it is compatible with what we know already! */
3713     int compat = 1;
3714     if (uc >= 0x100 ||
3715     (!(data->start_class->flags & ANYOF_LOCALE)
3716     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3717     && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3718     {
3719      compat = 0;
3720     }
3721     ANYOF_CLASS_ZERO(data->start_class);
3722     ANYOF_BITMAP_ZERO(data->start_class);
3723     if (compat) {
3724      ANYOF_BITMAP_SET(data->start_class, uc);
3725      CLEAR_SSC_EOS(data->start_class);
3726      if (OP(scan) == EXACTFL) {
3727       /* XXX This set is probably no longer necessary, and
3728       * probably wrong as LOCALE now is on in the initial
3729       * state */
3730       data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3731      }
3732      else {
3733
3734       /* Also set the other member of the fold pair.  In case
3735       * that unicode semantics is called for at runtime, use
3736       * the full latin1 fold.  (Can't do this for locale,
3737       * because not known until runtime) */
3738       ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3739
3740       /* All other (EXACTFL handled above) folds except under
3741       * /iaa that include s, S, and sharp_s also may include
3742       * the others */
3743       if (OP(scan) != EXACTFA) {
3744        if (uc == 's' || uc == 'S') {
3745         ANYOF_BITMAP_SET(data->start_class,
3746             LATIN_SMALL_LETTER_SHARP_S);
3747        }
3748        else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3749         ANYOF_BITMAP_SET(data->start_class, 's');
3750         ANYOF_BITMAP_SET(data->start_class, 'S');
3751        }
3752       }
3753      }
3754     }
3755     else if (uc >= 0x100) {
3756      int i;
3757      for (i = 0; i < 256; i++){
3758       if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3759        ANYOF_BITMAP_SET(data->start_class, i);
3760       }
3761      }
3762     }
3763    }
3764    else if (flags & SCF_DO_STCLASS_OR) {
3765     if (data->start_class->flags & ANYOF_LOC_FOLD) {
3766      /* false positive possible if the class is case-folded.
3767      Assume that the locale settings are the same... */
3768      if (uc < 0x100) {
3769       ANYOF_BITMAP_SET(data->start_class, uc);
3770       if (OP(scan) != EXACTFL) {
3771
3772        /* And set the other member of the fold pair, but
3773        * can't do that in locale because not known until
3774        * run-time */
3775        ANYOF_BITMAP_SET(data->start_class,
3776            PL_fold_latin1[uc]);
3777
3778        /* All folds except under /iaa that include s, S,
3779        * and sharp_s also may include the others */
3780        if (OP(scan) != EXACTFA) {
3781         if (uc == 's' || uc == 'S') {
3782          ANYOF_BITMAP_SET(data->start_class,
3783             LATIN_SMALL_LETTER_SHARP_S);
3784         }
3785         else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3786          ANYOF_BITMAP_SET(data->start_class, 's');
3787          ANYOF_BITMAP_SET(data->start_class, 'S');
3788         }
3789        }
3790       }
3791      }
3792      CLEAR_SSC_EOS(data->start_class);
3793     }
3794     cl_and(data->start_class, and_withp);
3795    }
3796    flags &= ~SCF_DO_STCLASS;
3797   }
3798   else if (REGNODE_VARIES(OP(scan))) {
3799    I32 mincount, maxcount, minnext, deltanext, fl = 0;
3800    I32 f = flags, pos_before = 0;
3801    regnode * const oscan = scan;
3802    struct regnode_charclass_class this_class;
3803    struct regnode_charclass_class *oclass = NULL;
3804    I32 next_is_eval = 0;
3805
3806    switch (PL_regkind[OP(scan)]) {
3807    case WHILEM:  /* End of (?:...)* . */
3808     scan = NEXTOPER(scan);
3809     goto finish;
3810    case PLUS:
3811     if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3812      next = NEXTOPER(scan);
3813      if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3814       mincount = 1;
3815       maxcount = REG_INFTY;
3816       next = regnext(scan);
3817       scan = NEXTOPER(scan);
3818       goto do_curly;
3819      }
3820     }
3821     if (flags & SCF_DO_SUBSTR)
3822      data->pos_min++;
3823     min++;
3824     /* Fall through. */
3825    case STAR:
3826     if (flags & SCF_DO_STCLASS) {
3827      mincount = 0;
3828      maxcount = REG_INFTY;
3829      next = regnext(scan);
3830      scan = NEXTOPER(scan);
3831      goto do_curly;
3832     }
3833     is_inf = is_inf_internal = 1;
3834     scan = regnext(scan);
3835     if (flags & SCF_DO_SUBSTR) {
3836      SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3837      data->longest = &(data->longest_float);
3838     }
3839     goto optimize_curly_tail;
3840    case CURLY:
3841     if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3842      && (scan->flags == stopparen))
3843     {
3844      mincount = 1;
3845      maxcount = 1;
3846     } else {
3847      mincount = ARG1(scan);
3848      maxcount = ARG2(scan);
3849     }
3850     next = regnext(scan);
3851     if (OP(scan) == CURLYX) {
3852      I32 lp = (data ? *(data->last_closep) : 0);
3853      scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3854     }
3855     scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3856     next_is_eval = (OP(scan) == EVAL);
3857    do_curly:
3858     if (flags & SCF_DO_SUBSTR) {
3859      if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3860      pos_before = data->pos_min;
3861     }
3862     if (data) {
3863      fl = data->flags;
3864      data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3865      if (is_inf)
3866       data->flags |= SF_IS_INF;
3867     }
3868     if (flags & SCF_DO_STCLASS) {
3869      cl_init(pRExC_state, &this_class);
3870      oclass = data->start_class;
3871      data->start_class = &this_class;
3872      f |= SCF_DO_STCLASS_AND;
3873      f &= ~SCF_DO_STCLASS_OR;
3874     }
3875     /* Exclude from super-linear cache processing any {n,m}
3876     regops for which the combination of input pos and regex
3877     pos is not enough information to determine if a match
3878     will be possible.
3879
3880     For example, in the regex /foo(bar\s*){4,8}baz/ with the
3881     regex pos at the \s*, the prospects for a match depend not
3882     only on the input position but also on how many (bar\s*)
3883     repeats into the {4,8} we are. */
3884    if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3885      f &= ~SCF_WHILEM_VISITED_POS;
3886
3887     /* This will finish on WHILEM, setting scan, or on NULL: */
3888     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3889          last, data, stopparen, recursed, NULL,
3890          (mincount == 0
3891           ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3892
3893     if (flags & SCF_DO_STCLASS)
3894      data->start_class = oclass;
3895     if (mincount == 0 || minnext == 0) {
3896      if (flags & SCF_DO_STCLASS_OR) {
3897       cl_or(pRExC_state, data->start_class, &this_class);
3898      }
3899      else if (flags & SCF_DO_STCLASS_AND) {
3900       /* Switch to OR mode: cache the old value of
3901       * data->start_class */
3902       INIT_AND_WITHP;
3903       StructCopy(data->start_class, and_withp,
3904         struct regnode_charclass_class);
3905       flags &= ~SCF_DO_STCLASS_AND;
3906       StructCopy(&this_class, data->start_class,
3907         struct regnode_charclass_class);
3908       flags |= SCF_DO_STCLASS_OR;
3909       SET_SSC_EOS(data->start_class);
3910      }
3911     } else {  /* Non-zero len */
3912      if (flags & SCF_DO_STCLASS_OR) {
3913       cl_or(pRExC_state, data->start_class, &this_class);
3914       cl_and(data->start_class, and_withp);
3915      }
3916      else if (flags & SCF_DO_STCLASS_AND)
3917       cl_and(data->start_class, &this_class);
3918      flags &= ~SCF_DO_STCLASS;
3919     }
3920     if (!scan)   /* It was not CURLYX, but CURLY. */
3921      scan = next;
3922     if (!(flags & SCF_TRIE_DOING_RESTUDY)
3923      /* ? quantifier ok, except for (?{ ... }) */
3924      && (next_is_eval || !(mincount == 0 && maxcount == 1))
3925      && (minnext == 0) && (deltanext == 0)
3926      && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3927      && maxcount <= REG_INFTY/3) /* Complement check for big count */
3928     {
3929      /* Fatal warnings may leak the regexp without this: */
3930      SAVEFREESV(RExC_rx_sv);
3931      ckWARNreg(RExC_parse,
3932        "Quantifier unexpected on zero-length expression");
3933      (void)ReREFCNT_inc(RExC_rx_sv);
3934     }
3935
3936     min += minnext * mincount;
3937     is_inf_internal |= deltanext == I32_MAX
3938          || (maxcount == REG_INFTY && minnext + deltanext > 0);
3939     is_inf |= is_inf_internal;
3940     if (is_inf)
3941      delta = I32_MAX;
3942     else
3943      delta += (minnext + deltanext) * maxcount - minnext * mincount;
3944
3945     /* Try powerful optimization CURLYX => CURLYN. */
3946     if (  OP(oscan) == CURLYX && data
3947      && data->flags & SF_IN_PAR
3948      && !(data->flags & SF_HAS_EVAL)
3949      && !deltanext && minnext == 1 ) {
3950      /* Try to optimize to CURLYN.  */
3951      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3952      regnode * const nxt1 = nxt;
3953 #ifdef DEBUGGING
3954      regnode *nxt2;
3955 #endif
3956
3957      /* Skip open. */
3958      nxt = regnext(nxt);
3959      if (!REGNODE_SIMPLE(OP(nxt))
3960       && !(PL_regkind[OP(nxt)] == EXACT
3961        && STR_LEN(nxt) == 1))
3962       goto nogo;
3963 #ifdef DEBUGGING
3964      nxt2 = nxt;
3965 #endif
3966      nxt = regnext(nxt);
3967      if (OP(nxt) != CLOSE)
3968       goto nogo;
3969      if (RExC_open_parens) {
3970       RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3971       RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3972      }
3973      /* Now we know that nxt2 is the only contents: */
3974      oscan->flags = (U8)ARG(nxt);
3975      OP(oscan) = CURLYN;
3976      OP(nxt1) = NOTHING; /* was OPEN. */
3977
3978 #ifdef DEBUGGING
3979      OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3980      NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3981      NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3982      OP(nxt) = OPTIMIZED; /* was CLOSE. */
3983      OP(nxt + 1) = OPTIMIZED; /* was count. */
3984      NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3985 #endif
3986     }
3987    nogo:
3988
3989     /* Try optimization CURLYX => CURLYM. */
3990     if (  OP(oscan) == CURLYX && data
3991      && !(data->flags & SF_HAS_PAR)
3992      && !(data->flags & SF_HAS_EVAL)
3993      && !deltanext /* atom is fixed width */
3994      && minnext != 0 /* CURLYM can't handle zero width */
3995      && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3996     ) {
3997      /* XXXX How to optimize if data == 0? */
3998      /* Optimize to a simpler form.  */
3999      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4000      regnode *nxt2;
4001
4002      OP(oscan) = CURLYM;
4003      while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4004        && (OP(nxt2) != WHILEM))
4005       nxt = nxt2;
4006      OP(nxt2)  = SUCCEED; /* Whas WHILEM */
4007      /* Need to optimize away parenths. */
4008      if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4009       /* Set the parenth number.  */
4010       regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4011
4012       oscan->flags = (U8)ARG(nxt);
4013       if (RExC_open_parens) {
4014        RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4015        RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4016       }
4017       OP(nxt1) = OPTIMIZED; /* was OPEN. */
4018       OP(nxt) = OPTIMIZED; /* was CLOSE. */
4019
4020 #ifdef DEBUGGING
4021       OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4022       OP(nxt + 1) = OPTIMIZED; /* was count. */
4023       NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4024       NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4025 #endif
4026 #if 0
4027       while ( nxt1 && (OP(nxt1) != WHILEM)) {
4028        regnode *nnxt = regnext(nxt1);
4029        if (nnxt == nxt) {
4030         if (reg_off_by_arg[OP(nxt1)])
4031          ARG_SET(nxt1, nxt2 - nxt1);
4032         else if (nxt2 - nxt1 < U16_MAX)
4033          NEXT_OFF(nxt1) = nxt2 - nxt1;
4034         else
4035          OP(nxt) = NOTHING; /* Cannot beautify */
4036        }
4037        nxt1 = nnxt;
4038       }
4039 #endif
4040       /* Optimize again: */
4041       study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4042          NULL, stopparen, recursed, NULL, 0,depth+1);
4043      }
4044      else
4045       oscan->flags = 0;
4046     }
4047     else if ((OP(oscan) == CURLYX)
4048       && (flags & SCF_WHILEM_VISITED_POS)
4049       /* See the comment on a similar expression above.
4050        However, this time it's not a subexpression
4051        we care about, but the expression itself. */
4052       && (maxcount == REG_INFTY)
4053       && data && ++data->whilem_c < 16) {
4054      /* This stays as CURLYX, we can put the count/of pair. */
4055      /* Find WHILEM (as in regexec.c) */
4056      regnode *nxt = oscan + NEXT_OFF(oscan);
4057
4058      if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4059       nxt += ARG(nxt);
4060      PREVOPER(nxt)->flags = (U8)(data->whilem_c
4061       | (RExC_whilem_seen << 4)); /* On WHILEM */
4062     }
4063     if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4064      pars++;
4065     if (flags & SCF_DO_SUBSTR) {
4066      SV *last_str = NULL;
4067      int counted = mincount != 0;
4068
4069      if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4070 #if defined(SPARC64_GCC_WORKAROUND)
4071       I32 b = 0;
4072       STRLEN l = 0;
4073       const char *s = NULL;
4074       I32 old = 0;
4075
4076       if (pos_before >= data->last_start_min)
4077        b = pos_before;
4078       else
4079        b = data->last_start_min;
4080
4081       l = 0;
4082       s = SvPV_const(data->last_found, l);
4083       old = b - data->last_start_min;
4084
4085 #else
4086       I32 b = pos_before >= data->last_start_min
4087        ? pos_before : data->last_start_min;
4088       STRLEN l;
4089       const char * const s = SvPV_const(data->last_found, l);
4090       I32 old = b - data->last_start_min;
4091 #endif
4092
4093       if (UTF)
4094        old = utf8_hop((U8*)s, old) - (U8*)s;
4095       l -= old;
4096       /* Get the added string: */
4097       last_str = newSVpvn_utf8(s  + old, l, UTF);
4098       if (deltanext == 0 && pos_before == b) {
4099        /* What was added is a constant string */
4100        if (mincount > 1) {
4101         SvGROW(last_str, (mincount * l) + 1);
4102         repeatcpy(SvPVX(last_str) + l,
4103           SvPVX_const(last_str), l, mincount - 1);
4104         SvCUR_set(last_str, SvCUR(last_str) * mincount);
4105         /* Add additional parts. */
4106         SvCUR_set(data->last_found,
4107           SvCUR(data->last_found) - l);
4108         sv_catsv(data->last_found, last_str);
4109         {
4110          SV * sv = data->last_found;
4111          MAGIC *mg =
4112           SvUTF8(sv) && SvMAGICAL(sv) ?
4113           mg_find(sv, PERL_MAGIC_utf8) : NULL;
4114          if (mg && mg->mg_len >= 0)
4115           mg->mg_len += CHR_SVLEN(last_str) - l;
4116         }
4117         data->last_end += l * (mincount - 1);
4118        }
4119       } else {
4120        /* start offset must point into the last copy */
4121        data->last_start_min += minnext * (mincount - 1);
4122        data->last_start_max += is_inf ? I32_MAX
4123         : (maxcount - 1) * (minnext + data->pos_delta);
4124       }
4125      }
4126      /* It is counted once already... */
4127      data->pos_min += minnext * (mincount - counted);
4128 #if 0
4129 PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n",
4130  counted, deltanext, I32_MAX, minnext, maxcount, mincount);
4131 if (deltanext != I32_MAX)
4132 PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta);
4133 #endif
4134      if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta)
4135       data->pos_delta = I32_MAX;
4136      else
4137       data->pos_delta += - counted * deltanext +
4138       (minnext + deltanext) * maxcount - minnext * mincount;
4139      if (mincount != maxcount) {
4140       /* Cannot extend fixed substrings found inside
4141        the group.  */
4142       SCAN_COMMIT(pRExC_state,data,minlenp);
4143       if (mincount && last_str) {
4144        SV * const sv = data->last_found;
4145        MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4146         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4147
4148        if (mg)
4149         mg->mg_len = -1;
4150        sv_setsv(sv, last_str);
4151        data->last_end = data->pos_min;
4152        data->last_start_min =
4153         data->pos_min - CHR_SVLEN(last_str);
4154        data->last_start_max = is_inf
4155         ? I32_MAX
4156         : data->pos_min + data->pos_delta
4157         - CHR_SVLEN(last_str);
4158       }
4159       data->longest = &(data->longest_float);
4160      }
4161      SvREFCNT_dec(last_str);
4162     }
4163     if (data && (fl & SF_HAS_EVAL))
4164      data->flags |= SF_HAS_EVAL;
4165    optimize_curly_tail:
4166     if (OP(oscan) != CURLYX) {
4167      while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4168       && NEXT_OFF(next))
4169       NEXT_OFF(oscan) += NEXT_OFF(next);
4170     }
4171     continue;
4172    default:   /* REF, and CLUMP only? */
4173     if (flags & SCF_DO_SUBSTR) {
4174      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4175      data->longest = &(data->longest_float);
4176     }
4177     is_inf = is_inf_internal = 1;
4178     if (flags & SCF_DO_STCLASS_OR)
4179      cl_anything(pRExC_state, data->start_class);
4180     flags &= ~SCF_DO_STCLASS;
4181     break;
4182    }
4183   }
4184   else if (OP(scan) == LNBREAK) {
4185    if (flags & SCF_DO_STCLASS) {
4186     int value = 0;
4187     CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4188      if (flags & SCF_DO_STCLASS_AND) {
4189      for (value = 0; value < 256; value++)
4190       if (!is_VERTWS_cp(value))
4191        ANYOF_BITMAP_CLEAR(data->start_class, value);
4192     }
4193     else {
4194      for (value = 0; value < 256; value++)
4195       if (is_VERTWS_cp(value))
4196        ANYOF_BITMAP_SET(data->start_class, value);
4197     }
4198     if (flags & SCF_DO_STCLASS_OR)
4199      cl_and(data->start_class, and_withp);
4200     flags &= ~SCF_DO_STCLASS;
4201    }
4202    min++;
4203    delta++;    /* Because of the 2 char string cr-lf */
4204    if (flags & SCF_DO_SUBSTR) {
4205      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4206      data->pos_min += 1;
4207     data->pos_delta += 1;
4208     data->longest = &(data->longest_float);
4209     }
4210   }
4211   else if (REGNODE_SIMPLE(OP(scan))) {
4212    int value = 0;
4213
4214    if (flags & SCF_DO_SUBSTR) {
4215     SCAN_COMMIT(pRExC_state,data,minlenp);
4216     data->pos_min++;
4217    }
4218    min++;
4219    if (flags & SCF_DO_STCLASS) {
4220     int loop_max = 256;
4221     CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4222
4223     /* Some of the logic below assumes that switching
4224     locale on will only add false positives. */
4225     switch (PL_regkind[OP(scan)]) {
4226      U8 classnum;
4227
4228     case SANY:
4229     default:
4230 #ifdef DEBUGGING
4231     Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4232 #endif
4233     do_default:
4234      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4235       cl_anything(pRExC_state, data->start_class);
4236      break;
4237     case REG_ANY:
4238      if (OP(scan) == SANY)
4239       goto do_default;
4240      if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4241       value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4242         || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4243       cl_anything(pRExC_state, data->start_class);
4244      }
4245      if (flags & SCF_DO_STCLASS_AND || !value)
4246       ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4247      break;
4248     case ANYOF:
4249      if (flags & SCF_DO_STCLASS_AND)
4250       cl_and(data->start_class,
4251        (struct regnode_charclass_class*)scan);
4252      else
4253       cl_or(pRExC_state, data->start_class,
4254        (struct regnode_charclass_class*)scan);
4255      break;
4256     case POSIXA:
4257      loop_max = 128;
4258      /* FALL THROUGH */
4259     case POSIXL:
4260     case POSIXD:
4261     case POSIXU:
4262      classnum = FLAGS(scan);
4263      if (flags & SCF_DO_STCLASS_AND) {
4264       if (!(data->start_class->flags & ANYOF_LOCALE)) {
4265        ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4266        for (value = 0; value < loop_max; value++) {
4267         if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4268          ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4269         }
4270        }
4271       }
4272      }
4273      else {
4274       if (data->start_class->flags & ANYOF_LOCALE) {
4275        ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4276       }
4277       else {
4278
4279       /* Even if under locale, set the bits for non-locale
4280       * in case it isn't a true locale-node.  This will
4281       * create false positives if it truly is locale */
4282       for (value = 0; value < loop_max; value++) {
4283        if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4284         ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4285        }
4286       }
4287       }
4288      }
4289      break;
4290     case NPOSIXA:
4291      loop_max = 128;
4292      /* FALL THROUGH */
4293     case NPOSIXL:
4294     case NPOSIXU:
4295     case NPOSIXD:
4296      classnum = FLAGS(scan);
4297      if (flags & SCF_DO_STCLASS_AND) {
4298       if (!(data->start_class->flags & ANYOF_LOCALE)) {
4299        ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4300        for (value = 0; value < loop_max; value++) {
4301         if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4302          ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4303         }
4304        }
4305       }
4306      }
4307      else {
4308       if (data->start_class->flags & ANYOF_LOCALE) {
4309        ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4310       }
4311       else {
4312
4313       /* Even if under locale, set the bits for non-locale in
4314       * case it isn't a true locale-node.  This will create
4315       * false positives if it truly is locale */
4316       for (value = 0; value < loop_max; value++) {
4317        if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4318         ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4319        }
4320       }
4321       if (PL_regkind[OP(scan)] == NPOSIXD) {
4322        data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4323       }
4324       }
4325      }
4326      break;
4327     }
4328     if (flags & SCF_DO_STCLASS_OR)
4329      cl_and(data->start_class, and_withp);
4330     flags &= ~SCF_DO_STCLASS;
4331    }
4332   }
4333   else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4334    data->flags |= (OP(scan) == MEOL
4335        ? SF_BEFORE_MEOL
4336        : SF_BEFORE_SEOL);
4337    SCAN_COMMIT(pRExC_state, data, minlenp);
4338
4339   }
4340   else if (  PL_regkind[OP(scan)] == BRANCHJ
4341     /* Lookbehind, or need to calculate parens/evals/stclass: */
4342     && (scan->flags || data || (flags & SCF_DO_STCLASS))
4343     && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4344    if ( OP(scan) == UNLESSM &&
4345     scan->flags == 0 &&
4346     OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4347     OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4348    ) {
4349     regnode *opt;
4350     regnode *upto= regnext(scan);
4351     DEBUG_PARSE_r({
4352      SV * const mysv_val=sv_newmortal();
4353      DEBUG_STUDYDATA("OPFAIL",data,depth);
4354
4355      /*DEBUG_PARSE_MSG("opfail");*/
4356      regprop(RExC_rx, mysv_val, upto);
4357      PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4358         SvPV_nolen_const(mysv_val),
4359         (IV)REG_NODE_NUM(upto),
4360         (IV)(upto - scan)
4361      );
4362     });
4363     OP(scan) = OPFAIL;
4364     NEXT_OFF(scan) = upto - scan;
4365     for (opt= scan + 1; opt < upto ; opt++)
4366      OP(opt) = OPTIMIZED;
4367     scan= upto;
4368     continue;
4369    }
4370    if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4371     || OP(scan) == UNLESSM )
4372    {
4373     /* Negative Lookahead/lookbehind
4374     In this case we can't do fixed string optimisation.
4375     */
4376
4377     I32 deltanext, minnext, fake = 0;
4378     regnode *nscan;
4379     struct regnode_charclass_class intrnl;
4380     int f = 0;
4381
4382     data_fake.flags = 0;
4383     if (data) {
4384      data_fake.whilem_c = data->whilem_c;
4385      data_fake.last_closep = data->last_closep;
4386     }
4387     else
4388      data_fake.last_closep = &fake;
4389     data_fake.pos_delta = delta;
4390     if ( flags & SCF_DO_STCLASS && !scan->flags
4391      && OP(scan) == IFMATCH ) { /* Lookahead */
4392      cl_init(pRExC_state, &intrnl);
4393      data_fake.start_class = &intrnl;
4394      f |= SCF_DO_STCLASS_AND;
4395     }
4396     if (flags & SCF_WHILEM_VISITED_POS)
4397      f |= SCF_WHILEM_VISITED_POS;
4398     next = regnext(scan);
4399     nscan = NEXTOPER(NEXTOPER(scan));
4400     minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4401      last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4402     if (scan->flags) {
4403      if (deltanext) {
4404       FAIL("Variable length lookbehind not implemented");
4405      }
4406      else if (minnext > (I32)U8_MAX) {
4407       FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4408      }
4409      scan->flags = (U8)minnext;
4410     }
4411     if (data) {
4412      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4413       pars++;
4414      if (data_fake.flags & SF_HAS_EVAL)
4415       data->flags |= SF_HAS_EVAL;
4416      data->whilem_c = data_fake.whilem_c;
4417     }
4418     if (f & SCF_DO_STCLASS_AND) {
4419      if (flags & SCF_DO_STCLASS_OR) {
4420       /* OR before, AND after: ideally we would recurse with
4421       * data_fake to get the AND applied by study of the
4422       * remainder of the pattern, and then derecurse;
4423       * *** HACK *** for now just treat as "no information".
4424       * See [perl #56690].
4425       */
4426       cl_init(pRExC_state, data->start_class);
4427      }  else {
4428       /* AND before and after: combine and continue */
4429       const int was = TEST_SSC_EOS(data->start_class);
4430
4431       cl_and(data->start_class, &intrnl);
4432       if (was)
4433        SET_SSC_EOS(data->start_class);
4434      }
4435     }
4436    }
4437 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4438    else {
4439     /* Positive Lookahead/lookbehind
4440     In this case we can do fixed string optimisation,
4441     but we must be careful about it. Note in the case of
4442     lookbehind the positions will be offset by the minimum
4443     length of the pattern, something we won't know about
4444     until after the recurse.
4445     */
4446     I32 deltanext, fake = 0;
4447     regnode *nscan;
4448     struct regnode_charclass_class intrnl;
4449     int f = 0;
4450     /* We use SAVEFREEPV so that when the full compile
4451      is finished perl will clean up the allocated
4452      minlens when it's all done. This way we don't
4453      have to worry about freeing them when we know
4454      they wont be used, which would be a pain.
4455     */
4456     I32 *minnextp;
4457     Newx( minnextp, 1, I32 );
4458     SAVEFREEPV(minnextp);
4459
4460     if (data) {
4461      StructCopy(data, &data_fake, scan_data_t);
4462      if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4463       f |= SCF_DO_SUBSTR;
4464       if (scan->flags)
4465        SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4466       data_fake.last_found=newSVsv(data->last_found);
4467      }
4468     }
4469     else
4470      data_fake.last_closep = &fake;
4471     data_fake.flags = 0;
4472     data_fake.pos_delta = delta;
4473     if (is_inf)
4474      data_fake.flags |= SF_IS_INF;
4475     if ( flags & SCF_DO_STCLASS && !scan->flags
4476      && OP(scan) == IFMATCH ) { /* Lookahead */
4477      cl_init(pRExC_state, &intrnl);
4478      data_fake.start_class = &intrnl;
4479      f |= SCF_DO_STCLASS_AND;
4480     }
4481     if (flags & SCF_WHILEM_VISITED_POS)
4482      f |= SCF_WHILEM_VISITED_POS;
4483     next = regnext(scan);
4484     nscan = NEXTOPER(NEXTOPER(scan));
4485
4486     *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4487      last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4488     if (scan->flags) {
4489      if (deltanext) {
4490       FAIL("Variable length lookbehind not implemented");
4491      }
4492      else if (*minnextp > (I32)U8_MAX) {
4493       FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4494      }
4495      scan->flags = (U8)*minnextp;
4496     }
4497
4498     *minnextp += min;
4499
4500     if (f & SCF_DO_STCLASS_AND) {
4501      const int was = TEST_SSC_EOS(data.start_class);
4502
4503      cl_and(data->start_class, &intrnl);
4504      if (was)
4505       SET_SSC_EOS(data->start_class);
4506     }
4507     if (data) {
4508      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4509       pars++;
4510      if (data_fake.flags & SF_HAS_EVAL)
4511       data->flags |= SF_HAS_EVAL;
4512      data->whilem_c = data_fake.whilem_c;
4513      if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4514       if (RExC_rx->minlen<*minnextp)
4515        RExC_rx->minlen=*minnextp;
4516       SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4517       SvREFCNT_dec_NN(data_fake.last_found);
4518
4519       if ( data_fake.minlen_fixed != minlenp )
4520       {
4521        data->offset_fixed= data_fake.offset_fixed;
4522        data->minlen_fixed= data_fake.minlen_fixed;
4523        data->lookbehind_fixed+= scan->flags;
4524       }
4525       if ( data_fake.minlen_float != minlenp )
4526       {
4527        data->minlen_float= data_fake.minlen_float;
4528        data->offset_float_min=data_fake.offset_float_min;
4529        data->offset_float_max=data_fake.offset_float_max;
4530        data->lookbehind_float+= scan->flags;
4531       }
4532      }
4533     }
4534    }
4535 #endif
4536   }
4537   else if (OP(scan) == OPEN) {
4538    if (stopparen != (I32)ARG(scan))
4539     pars++;
4540   }
4541   else if (OP(scan) == CLOSE) {
4542    if (stopparen == (I32)ARG(scan)) {
4543     break;
4544    }
4545    if ((I32)ARG(scan) == is_par) {
4546     next = regnext(scan);
4547
4548     if ( next && (OP(next) != WHILEM) && next < last)
4549      is_par = 0;  /* Disable optimization */
4550    }
4551    if (data)
4552     *(data->last_closep) = ARG(scan);
4553   }
4554   else if (OP(scan) == EVAL) {
4555     if (data)
4556      data->flags |= SF_HAS_EVAL;
4557   }
4558   else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4559    if (flags & SCF_DO_SUBSTR) {
4560     SCAN_COMMIT(pRExC_state,data,minlenp);
4561     flags &= ~SCF_DO_SUBSTR;
4562    }
4563    if (data && OP(scan)==ACCEPT) {
4564     data->flags |= SCF_SEEN_ACCEPT;
4565     if (stopmin > min)
4566      stopmin = min;
4567    }
4568   }
4569   else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4570   {
4571     if (flags & SCF_DO_SUBSTR) {
4572      SCAN_COMMIT(pRExC_state,data,minlenp);
4573      data->longest = &(data->longest_float);
4574     }
4575     is_inf = is_inf_internal = 1;
4576     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4577      cl_anything(pRExC_state, data->start_class);
4578     flags &= ~SCF_DO_STCLASS;
4579   }
4580   else if (OP(scan) == GPOS) {
4581    if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4582     !(delta || is_inf || (data && data->pos_delta)))
4583    {
4584     if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4585      RExC_rx->extflags |= RXf_ANCH_GPOS;
4586     if (RExC_rx->gofs < (U32)min)
4587      RExC_rx->gofs = min;
4588    } else {
4589     RExC_rx->extflags |= RXf_GPOS_FLOAT;
4590     RExC_rx->gofs = 0;
4591    }
4592   }
4593 #ifdef TRIE_STUDY_OPT
4594 #ifdef FULL_TRIE_STUDY
4595   else if (PL_regkind[OP(scan)] == TRIE) {
4596    /* NOTE - There is similar code to this block above for handling
4597    BRANCH nodes on the initial study.  If you change stuff here
4598    check there too. */
4599    regnode *trie_node= scan;
4600    regnode *tail= regnext(scan);
4601    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4602    I32 max1 = 0, min1 = I32_MAX;
4603    struct regnode_charclass_class accum;
4604
4605    if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4606     SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4607    if (flags & SCF_DO_STCLASS)
4608     cl_init_zero(pRExC_state, &accum);
4609
4610    if (!trie->jump) {
4611     min1= trie->minlen;
4612     max1= trie->maxlen;
4613    } else {
4614     const regnode *nextbranch= NULL;
4615     U32 word;
4616
4617     for ( word=1 ; word <= trie->wordcount ; word++)
4618     {
4619      I32 deltanext=0, minnext=0, f = 0, fake;
4620      struct regnode_charclass_class this_class;
4621
4622      data_fake.flags = 0;
4623      if (data) {
4624       data_fake.whilem_c = data->whilem_c;
4625       data_fake.last_closep = data->last_closep;
4626      }
4627      else
4628       data_fake.last_closep = &fake;
4629      data_fake.pos_delta = delta;
4630      if (flags & SCF_DO_STCLASS) {
4631       cl_init(pRExC_state, &this_class);
4632       data_fake.start_class = &this_class;
4633       f = SCF_DO_STCLASS_AND;
4634      }
4635      if (flags & SCF_WHILEM_VISITED_POS)
4636       f |= SCF_WHILEM_VISITED_POS;
4637
4638      if (trie->jump[word]) {
4639       if (!nextbranch)
4640        nextbranch = trie_node + trie->jump[0];
4641       scan= trie_node + trie->jump[word];
4642       /* We go from the jump point to the branch that follows
4643       it. Note this means we need the vestigal unused branches
4644       even though they arent otherwise used.
4645       */
4646       minnext = study_chunk(pRExC_state, &scan, minlenp,
4647        &deltanext, (regnode *)nextbranch, &data_fake,
4648        stopparen, recursed, NULL, f,depth+1);
4649      }
4650      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4651       nextbranch= regnext((regnode*)nextbranch);
4652
4653      if (min1 > (I32)(minnext + trie->minlen))
4654       min1 = minnext + trie->minlen;
4655      if (deltanext == I32_MAX) {
4656       is_inf = is_inf_internal = 1;
4657       max1 = I32_MAX;
4658      } else if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4659       max1 = minnext + deltanext + trie->maxlen;
4660
4661      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4662       pars++;
4663      if (data_fake.flags & SCF_SEEN_ACCEPT) {
4664       if ( stopmin > min + min1)
4665        stopmin = min + min1;
4666       flags &= ~SCF_DO_SUBSTR;
4667       if (data)
4668        data->flags |= SCF_SEEN_ACCEPT;
4669      }
4670      if (data) {
4671       if (data_fake.flags & SF_HAS_EVAL)
4672        data->flags |= SF_HAS_EVAL;
4673       data->whilem_c = data_fake.whilem_c;
4674      }
4675      if (flags & SCF_DO_STCLASS)
4676       cl_or(pRExC_state, &accum, &this_class);
4677     }
4678    }
4679    if (flags & SCF_DO_SUBSTR) {
4680     data->pos_min += min1;
4681     data->pos_delta += max1 - min1;
4682     if (max1 != min1 || is_inf)
4683      data->longest = &(data->longest_float);
4684    }
4685    min += min1;
4686    delta += max1 - min1;
4687    if (flags & SCF_DO_STCLASS_OR) {
4688     cl_or(pRExC_state, data->start_class, &accum);
4689     if (min1) {
4690      cl_and(data->start_class, and_withp);
4691      flags &= ~SCF_DO_STCLASS;
4692     }
4693    }
4694    else if (flags & SCF_DO_STCLASS_AND) {
4695     if (min1) {
4696      cl_and(data->start_class, &accum);
4697      flags &= ~SCF_DO_STCLASS;
4698     }
4699     else {
4700      /* Switch to OR mode: cache the old value of
4701      * data->start_class */
4702      INIT_AND_WITHP;
4703      StructCopy(data->start_class, and_withp,
4704        struct regnode_charclass_class);
4705      flags &= ~SCF_DO_STCLASS_AND;
4706      StructCopy(&accum, data->start_class,
4707        struct regnode_charclass_class);
4708      flags |= SCF_DO_STCLASS_OR;
4709      SET_SSC_EOS(data->start_class);
4710     }
4711    }
4712    scan= tail;
4713    continue;
4714   }
4715 #else
4716   else if (PL_regkind[OP(scan)] == TRIE) {
4717    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4718    U8*bang=NULL;
4719
4720    min += trie->minlen;
4721    delta += (trie->maxlen - trie->minlen);
4722    flags &= ~SCF_DO_STCLASS; /* xxx */
4723    if (flags & SCF_DO_SUBSTR) {
4724      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4725      data->pos_min += trie->minlen;
4726      data->pos_delta += (trie->maxlen - trie->minlen);
4727     if (trie->maxlen != trie->minlen)
4728      data->longest = &(data->longest_float);
4729     }
4730     if (trie->jump) /* no more substrings -- for now /grr*/
4731      flags &= ~SCF_DO_SUBSTR;
4732   }
4733 #endif /* old or new */
4734 #endif /* TRIE_STUDY_OPT */
4735
4736   /* Else: zero-length, ignore. */
4737   scan = regnext(scan);
4738  }
4739  if (frame) {
4740   last = frame->last;
4741   scan = frame->next;
4742   stopparen = frame->stop;
4743   frame = frame->prev;
4744   goto fake_study_recurse;
4745  }
4746
4747   finish:
4748  assert(!frame);
4749  DEBUG_STUDYDATA("pre-fin:",data,depth);
4750
4751  *scanp = scan;
4752  *deltap = is_inf_internal ? I32_MAX : delta;
4753  if (flags & SCF_DO_SUBSTR && is_inf)
4754   data->pos_delta = I32_MAX - data->pos_min;
4755  if (is_par > (I32)U8_MAX)
4756   is_par = 0;
4757  if (is_par && pars==1 && data) {
4758   data->flags |= SF_IN_PAR;
4759   data->flags &= ~SF_HAS_PAR;
4760  }
4761  else if (pars && data) {
4762   data->flags |= SF_HAS_PAR;
4763   data->flags &= ~SF_IN_PAR;
4764  }
4765  if (flags & SCF_DO_STCLASS_OR)
4766   cl_and(data->start_class, and_withp);
4767  if (flags & SCF_TRIE_RESTUDY)
4768   data->flags |=  SCF_TRIE_RESTUDY;
4769
4770  DEBUG_STUDYDATA("post-fin:",data,depth);
4771
4772  return min < stopmin ? min : stopmin;
4773 }
4774
4775 STATIC U32
4776 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4777 {
4778  U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4779
4780  PERL_ARGS_ASSERT_ADD_DATA;
4781
4782  Renewc(RExC_rxi->data,
4783   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4784   char, struct reg_data);
4785  if(count)
4786   Renew(RExC_rxi->data->what, count + n, U8);
4787  else
4788   Newx(RExC_rxi->data->what, n, U8);
4789  RExC_rxi->data->count = count + n;
4790  Copy(s, RExC_rxi->data->what + count, n, U8);
4791  return count;
4792 }
4793
4794 /*XXX: todo make this not included in a non debugging perl */
4795 #ifndef PERL_IN_XSUB_RE
4796 void
4797 Perl_reginitcolors(pTHX)
4798 {
4799  dVAR;
4800  const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4801  if (s) {
4802   char *t = savepv(s);
4803   int i = 0;
4804   PL_colors[0] = t;
4805   while (++i < 6) {
4806    t = strchr(t, '\t');
4807    if (t) {
4808     *t = '\0';
4809     PL_colors[i] = ++t;
4810    }
4811    else
4812     PL_colors[i] = t = (char *)"";
4813   }
4814  } else {
4815   int i = 0;
4816   while (i < 6)
4817    PL_colors[i++] = (char *)"";
4818  }
4819  PL_colorset = 1;
4820 }
4821 #endif
4822
4823
4824 #ifdef TRIE_STUDY_OPT
4825 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
4826  STMT_START {                                            \
4827   if (                                                \
4828    (data.flags & SCF_TRIE_RESTUDY)               \
4829    && ! restudied++                              \
4830   ) {                                                 \
4831    dOsomething;                                    \
4832    goto reStudy;                                   \
4833   }                                                   \
4834  } STMT_END
4835 #else
4836 #define CHECK_RESTUDY_GOTO_butfirst
4837 #endif
4838
4839 /*
4840  * pregcomp - compile a regular expression into internal code
4841  *
4842  * Decides which engine's compiler to call based on the hint currently in
4843  * scope
4844  */
4845
4846 #ifndef PERL_IN_XSUB_RE
4847
4848 /* return the currently in-scope regex engine (or the default if none)  */
4849
4850 regexp_engine const *
4851 Perl_current_re_engine(pTHX)
4852 {
4853  dVAR;
4854
4855  if (IN_PERL_COMPILETIME) {
4856   HV * const table = GvHV(PL_hintgv);
4857   SV **ptr;
4858
4859   if (!table)
4860    return &reh_regexp_engine;
4861   ptr = hv_fetchs(table, "regcomp", FALSE);
4862   if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4863    return &reh_regexp_engine;
4864   return INT2PTR(regexp_engine*,SvIV(*ptr));
4865  }
4866  else {
4867   SV *ptr;
4868   if (!PL_curcop->cop_hints_hash)
4869    return &reh_regexp_engine;
4870   ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4871   if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4872    return &reh_regexp_engine;
4873   return INT2PTR(regexp_engine*,SvIV(ptr));
4874  }
4875 }
4876
4877
4878 REGEXP *
4879 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4880 {
4881  dVAR;
4882  regexp_engine const *eng = current_re_engine();
4883  GET_RE_DEBUG_FLAGS_DECL;
4884
4885  PERL_ARGS_ASSERT_PREGCOMP;
4886
4887  /* Dispatch a request to compile a regexp to correct regexp engine. */
4888  DEBUG_COMPILE_r({
4889   PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4890       PTR2UV(eng));
4891  });
4892  return CALLREGCOMP_ENG(eng, pattern, flags);
4893 }
4894 #endif
4895
4896 /* public(ish) entry point for the perl core's own regex compiling code.
4897  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4898  * pattern rather than a list of OPs, and uses the internal engine rather
4899  * than the current one */
4900
4901 REGEXP *
4902 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4903 {
4904  SV *pat = pattern; /* defeat constness! */
4905  PERL_ARGS_ASSERT_RE_COMPILE;
4906  return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4907 #ifdef PERL_IN_XSUB_RE
4908         &my_reg_engine,
4909 #else
4910         &reh_regexp_engine,
4911 #endif
4912         NULL, NULL, rx_flags, 0);
4913 }
4914
4915
4916 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
4917  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
4918  * point to the realloced string and length.
4919  *
4920  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
4921  * stuff added */
4922
4923 static void
4924 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
4925      char **pat_p, STRLEN *plen_p, int num_code_blocks)
4926 {
4927  U8 *const src = (U8*)*pat_p;
4928  U8 *dst;
4929  int n=0;
4930  STRLEN s = 0, d = 0;
4931  bool do_end = 0;
4932  GET_RE_DEBUG_FLAGS_DECL;
4933
4934  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4935   "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4936
4937  Newx(dst, *plen_p * 2 + 1, U8);
4938
4939  while (s < *plen_p) {
4940   const UV uv = NATIVE_TO_ASCII(src[s]);
4941   if (UNI_IS_INVARIANT(uv))
4942    dst[d]   = (U8)UTF_TO_NATIVE(uv);
4943   else {
4944    dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
4945    dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
4946   }
4947   if (n < num_code_blocks) {
4948    if (!do_end && pRExC_state->code_blocks[n].start == s) {
4949     pRExC_state->code_blocks[n].start = d;
4950     assert(dst[d] == '(');
4951     do_end = 1;
4952    }
4953    else if (do_end && pRExC_state->code_blocks[n].end == s) {
4954     pRExC_state->code_blocks[n].end = d;
4955     assert(dst[d] == ')');
4956     do_end = 0;
4957     n++;
4958    }
4959   }
4960   s++;
4961   d++;
4962  }
4963  dst[d] = '\0';
4964  *plen_p = d;
4965  *pat_p = (char*) dst;
4966  SAVEFREEPV(*pat_p);
4967  RExC_orig_utf8 = RExC_utf8 = 1;
4968 }
4969
4970
4971
4972 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
4973  * while recording any code block indices, and handling overloading,
4974  * nested qr// objects etc.  If pat is null, it will allocate a new
4975  * string, or just return the first arg, if there's only one.
4976  *
4977  * Returns the malloced/updated pat.
4978  * patternp and pat_count is the array of SVs to be concatted;
4979  * oplist is the optional list of ops that generated the SVs;
4980  * recompile_p is a pointer to a boolean that will be set if
4981  *   the regex will need to be recompiled.
4982  * delim, if non-null is an SV that will be inserted between each element
4983  */
4984
4985 static SV*
4986 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
4987     SV *pat, SV ** const patternp, int pat_count,
4988     OP *oplist, bool *recompile_p, SV *delim)
4989 {
4990  SV **svp;
4991  int n = 0;
4992  bool use_delim = FALSE;
4993  bool alloced = FALSE;
4994
4995  /* if we know we have at least two args, create an empty string,
4996  * then concatenate args to that. For no args, return an empty string */
4997  if (!pat && pat_count != 1) {
4998   pat = newSVpvn("", 0);
4999   SAVEFREESV(pat);
5000   alloced = TRUE;
5001  }
5002
5003  for (svp = patternp; svp < patternp + pat_count; svp++) {
5004   SV *sv;
5005   SV *rx  = NULL;
5006   STRLEN orig_patlen = 0;
5007   bool code = 0;
5008   SV *msv = use_delim ? delim : *svp;
5009
5010   /* if we've got a delimiter, we go round the loop twice for each
5011   * svp slot (except the last), using the delimiter the second
5012   * time round */
5013   if (use_delim) {
5014    svp--;
5015    use_delim = FALSE;
5016   }
5017   else if (delim)
5018    use_delim = TRUE;
5019
5020   if (SvTYPE(msv) == SVt_PVAV) {
5021    /* we've encountered an interpolated array within
5022    * the pattern, e.g. /...@a..../. Expand the list of elements,
5023    * then recursively append elements.
5024    * The code in this block is based on S_pushav() */
5025
5026    AV *const av = (AV*)msv;
5027    const I32 maxarg = AvFILL(av) + 1;
5028    SV **array;
5029
5030    if (oplist) {
5031     assert(oplist->op_type == OP_PADAV
5032      || oplist->op_type == OP_RV2AV);
5033     oplist = oplist->op_sibling;;
5034    }
5035
5036    if (SvRMAGICAL(av)) {
5037     U32 i;
5038
5039     Newx(array, maxarg, SV*);
5040     SAVEFREEPV(array);
5041     for (i=0; i < (U32)maxarg; i++) {
5042      SV ** const svp = av_fetch(av, i, FALSE);
5043      array[i] = svp ? *svp : &PL_sv_undef;
5044     }
5045    }
5046    else
5047     array = AvARRAY(av);
5048
5049    pat = S_concat_pat(aTHX_ pRExC_state, pat,
5050         array, maxarg, NULL, recompile_p,
5051         /* $" */
5052         GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5053
5054    continue;
5055   }
5056
5057
5058   /* we make the assumption here that each op in the list of
5059   * op_siblings maps to one SV pushed onto the stack,
5060   * except for code blocks, with have both an OP_NULL and
5061   * and OP_CONST.
5062   * This allows us to match up the list of SVs against the
5063   * list of OPs to find the next code block.
5064   *
5065   * Note that       PUSHMARK PADSV PADSV ..
5066   * is optimised to
5067   *                 PADRANGE PADSV  PADSV  ..
5068   * so the alignment still works. */
5069
5070   if (oplist) {
5071    if (oplist->op_type == OP_NULL
5072     && (oplist->op_flags & OPf_SPECIAL))
5073    {
5074     assert(n < pRExC_state->num_code_blocks);
5075     pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5076     pRExC_state->code_blocks[n].block = oplist;
5077     pRExC_state->code_blocks[n].src_regex = NULL;
5078     n++;
5079     code = 1;
5080     oplist = oplist->op_sibling; /* skip CONST */
5081     assert(oplist);
5082    }
5083    oplist = oplist->op_sibling;;
5084   }
5085
5086   /* apply magic and QR overloading to arg */
5087
5088   SvGETMAGIC(msv);
5089   if (SvROK(msv) && SvAMAGIC(msv)) {
5090    SV *sv = AMG_CALLunary(msv, regexp_amg);
5091    if (sv) {
5092     if (SvROK(sv))
5093      sv = SvRV(sv);
5094     if (SvTYPE(sv) != SVt_REGEXP)
5095      Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5096     msv = sv;
5097    }
5098   }
5099
5100   /* try concatenation overload ... */
5101   if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5102     (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5103   {
5104    sv_setsv(pat, sv);
5105    /* overloading involved: all bets are off over literal
5106    * code. Pretend we haven't seen it */
5107    pRExC_state->num_code_blocks -= n;
5108    n = 0;
5109   }
5110   else  {
5111    /* ... or failing that, try "" overload */
5112    while (SvAMAGIC(msv)
5113      && (sv = AMG_CALLunary(msv, string_amg))
5114      && sv != msv
5115      &&  !(   SvROK(msv)
5116       && SvROK(sv)
5117       && SvRV(msv) == SvRV(sv))
5118    ) {
5119     msv = sv;
5120     SvGETMAGIC(msv);
5121    }
5122    if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5123     msv = SvRV(msv);
5124
5125    if (pat) {
5126     /* this is a partially unrolled
5127     *     sv_catsv_nomg(pat, msv);
5128     * that allows us to adjust code block indices if
5129     * needed */
5130     STRLEN dlen;
5131     char *dst = SvPV_force_nomg(pat, dlen);
5132     orig_patlen = dlen;
5133     if (SvUTF8(msv) && !SvUTF8(pat)) {
5134      S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5135      sv_setpvn(pat, dst, dlen);
5136      SvUTF8_on(pat);
5137     }
5138     sv_catsv_nomg(pat, msv);
5139     rx = msv;
5140    }
5141    else
5142     pat = msv;
5143
5144    if (code)
5145     pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5146   }
5147
5148   /* extract any code blocks within any embedded qr//'s */
5149   if (rx && SvTYPE(rx) == SVt_REGEXP
5150    && RX_ENGINE((REGEXP*)rx)->op_comp)
5151   {
5152
5153    RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5154    if (ri->num_code_blocks) {
5155     int i;
5156     /* the presence of an embedded qr// with code means
5157     * we should always recompile: the text of the
5158     * qr// may not have changed, but it may be a
5159     * different closure than last time */
5160     *recompile_p = 1;
5161     Renew(pRExC_state->code_blocks,
5162      pRExC_state->num_code_blocks + ri->num_code_blocks,
5163      struct reg_code_block);
5164     pRExC_state->num_code_blocks += ri->num_code_blocks;
5165
5166     for (i=0; i < ri->num_code_blocks; i++) {
5167      struct reg_code_block *src, *dst;
5168      STRLEN offset =  orig_patlen
5169       + ReANY((REGEXP *)rx)->pre_prefix;
5170      assert(n < pRExC_state->num_code_blocks);
5171      src = &ri->code_blocks[i];
5172      dst = &pRExC_state->code_blocks[n];
5173      dst->start     = src->start + offset;
5174      dst->end     = src->end   + offset;
5175      dst->block     = src->block;
5176      dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5177            src->src_regex
5178             ? src->src_regex
5179             : (REGEXP*)rx);
5180      n++;
5181     }
5182    }
5183   }
5184  }
5185  /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5186  if (alloced)
5187   SvSETMAGIC(pat);
5188
5189  return pat;
5190 }
5191
5192
5193
5194 /* see if there are any run-time code blocks in the pattern.
5195  * False positives are allowed */
5196
5197 static bool
5198 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5199      char *pat, STRLEN plen)
5200 {
5201  int n = 0;
5202  STRLEN s;
5203
5204  for (s = 0; s < plen; s++) {
5205   if (n < pRExC_state->num_code_blocks
5206    && s == pRExC_state->code_blocks[n].start)
5207   {
5208    s = pRExC_state->code_blocks[n].end;
5209    n++;
5210    continue;
5211   }
5212   /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5213   * positives here */
5214   if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5215    (pat[s+2] == '{'
5216     || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5217   )
5218    return 1;
5219  }
5220  return 0;
5221 }
5222
5223 /* Handle run-time code blocks. We will already have compiled any direct
5224  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5225  * copy of it, but with any literal code blocks blanked out and
5226  * appropriate chars escaped; then feed it into
5227  *
5228  *    eval "qr'modified_pattern'"
5229  *
5230  * For example,
5231  *
5232  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5233  *
5234  * becomes
5235  *
5236  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5237  *
5238  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5239  * and merge them with any code blocks of the original regexp.
5240  *
5241  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5242  * instead, just save the qr and return FALSE; this tells our caller that
5243  * the original pattern needs upgrading to utf8.
5244  */
5245
5246 static bool
5247 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5248  char *pat, STRLEN plen)
5249 {
5250  SV *qr;
5251
5252  GET_RE_DEBUG_FLAGS_DECL;
5253
5254  if (pRExC_state->runtime_code_qr) {
5255   /* this is the second time we've been called; this should
5256   * only happen if the main pattern got upgraded to utf8
5257   * during compilation; re-use the qr we compiled first time
5258   * round (which should be utf8 too)
5259   */
5260   qr = pRExC_state->runtime_code_qr;
5261   pRExC_state->runtime_code_qr = NULL;
5262   assert(RExC_utf8 && SvUTF8(qr));
5263  }
5264  else {
5265   int n = 0;
5266   STRLEN s;
5267   char *p, *newpat;
5268   int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5269   SV *sv, *qr_ref;
5270   dSP;
5271
5272   /* determine how many extra chars we need for ' and \ escaping */
5273   for (s = 0; s < plen; s++) {
5274    if (pat[s] == '\'' || pat[s] == '\\')
5275     newlen++;
5276   }
5277
5278   Newx(newpat, newlen, char);
5279   p = newpat;
5280   *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5281
5282   for (s = 0; s < plen; s++) {
5283    if (n < pRExC_state->num_code_blocks
5284     && s == pRExC_state->code_blocks[n].start)
5285    {
5286     /* blank out literal code block */
5287     assert(pat[s] == '(');
5288     while (s <= pRExC_state->code_blocks[n].end) {
5289      *p++ = '_';
5290      s++;
5291     }
5292     s--;
5293     n++;
5294     continue;
5295    }
5296    if (pat[s] == '\'' || pat[s] == '\\')
5297     *p++ = '\\';
5298    *p++ = pat[s];
5299   }
5300   *p++ = '\'';
5301   if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5302    *p++ = 'x';
5303   *p++ = '\0';
5304   DEBUG_COMPILE_r({
5305    PerlIO_printf(Perl_debug_log,
5306     "%sre-parsing pattern for runtime code:%s %s\n",
5307     PL_colors[4],PL_colors[5],newpat);
5308   });
5309
5310   sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5311   Safefree(newpat);
5312
5313   ENTER;
5314   SAVETMPS;
5315   save_re_context();
5316   PUSHSTACKi(PERLSI_REQUIRE);
5317   /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5318   * parsing qr''; normally only q'' does this. It also alters
5319   * hints handling */
5320   eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5321   SvREFCNT_dec_NN(sv);
5322   SPAGAIN;
5323   qr_ref = POPs;
5324   PUTBACK;
5325   {
5326    SV * const errsv = ERRSV;
5327    if (SvTRUE_NN(errsv))
5328    {
5329     Safefree(pRExC_state->code_blocks);
5330     /* use croak_sv ? */
5331     Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
5332    }
5333   }
5334   assert(SvROK(qr_ref));
5335   qr = SvRV(qr_ref);
5336   assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5337   /* the leaving below frees the tmp qr_ref.
5338   * Give qr a life of its own */
5339   SvREFCNT_inc(qr);
5340   POPSTACK;
5341   FREETMPS;
5342   LEAVE;
5343
5344  }
5345
5346  if (!RExC_utf8 && SvUTF8(qr)) {
5347   /* first time through; the pattern got upgraded; save the
5348   * qr for the next time through */
5349   assert(!pRExC_state->runtime_code_qr);
5350   pRExC_state->runtime_code_qr = qr;
5351   return 0;
5352  }
5353
5354
5355  /* extract any code blocks within the returned qr//  */
5356
5357
5358  /* merge the main (r1) and run-time (r2) code blocks into one */
5359  {
5360   RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5361   struct reg_code_block *new_block, *dst;
5362   RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5363   int i1 = 0, i2 = 0;
5364
5365   if (!r2->num_code_blocks) /* we guessed wrong */
5366   {
5367    SvREFCNT_dec_NN(qr);
5368    return 1;
5369   }
5370
5371   Newx(new_block,
5372    r1->num_code_blocks + r2->num_code_blocks,
5373    struct reg_code_block);
5374   dst = new_block;
5375
5376   while (    i1 < r1->num_code_blocks
5377     || i2 < r2->num_code_blocks)
5378   {
5379    struct reg_code_block *src;
5380    bool is_qr = 0;
5381
5382    if (i1 == r1->num_code_blocks) {
5383     src = &r2->code_blocks[i2++];
5384     is_qr = 1;
5385    }
5386    else if (i2 == r2->num_code_blocks)
5387     src = &r1->code_blocks[i1++];
5388    else if (  r1->code_blocks[i1].start
5389      < r2->code_blocks[i2].start)
5390    {
5391     src = &r1->code_blocks[i1++];
5392     assert(src->end < r2->code_blocks[i2].start);
5393    }
5394    else {
5395     assert(  r1->code_blocks[i1].start
5396      > r2->code_blocks[i2].start);
5397     src = &r2->code_blocks[i2++];
5398     is_qr = 1;
5399     assert(src->end < r1->code_blocks[i1].start);
5400    }
5401
5402    assert(pat[src->start] == '(');
5403    assert(pat[src->end]   == ')');
5404    dst->start     = src->start;
5405    dst->end     = src->end;
5406    dst->block     = src->block;
5407    dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5408          : src->src_regex;
5409    dst++;
5410   }
5411   r1->num_code_blocks += r2->num_code_blocks;
5412   Safefree(r1->code_blocks);
5413   r1->code_blocks = new_block;
5414  }
5415
5416  SvREFCNT_dec_NN(qr);
5417  return 1;
5418 }
5419
5420
5421 STATIC bool
5422 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5423 {
5424  /* This is the common code for setting up the floating and fixed length
5425  * string data extracted from Perl_re_op_compile() below.  Returns a boolean
5426  * as to whether succeeded or not */
5427
5428  I32 t,ml;
5429
5430  if (! (longest_length
5431   || (eol /* Can't have SEOL and MULTI */
5432    && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5433   )
5434    /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5435   || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5436  {
5437   return FALSE;
5438  }
5439
5440  /* copy the information about the longest from the reg_scan_data
5441   over to the program. */
5442  if (SvUTF8(sv_longest)) {
5443   *rx_utf8 = sv_longest;
5444   *rx_substr = NULL;
5445  } else {
5446   *rx_substr = sv_longest;
5447   *rx_utf8 = NULL;
5448  }
5449  /* end_shift is how many chars that must be matched that
5450   follow this item. We calculate it ahead of time as once the
5451   lookbehind offset is added in we lose the ability to correctly
5452   calculate it.*/
5453  ml = minlen ? *(minlen) : (I32)longest_length;
5454  *rx_end_shift = ml - offset
5455   - longest_length + (SvTAIL(sv_longest) != 0)
5456   + lookbehind;
5457
5458  t = (eol/* Can't have SEOL and MULTI */
5459   && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5460  fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5461
5462  return TRUE;
5463 }
5464
5465 /*
5466  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5467  * regular expression into internal code.
5468  * The pattern may be passed either as:
5469  *    a list of SVs (patternp plus pat_count)
5470  *    a list of OPs (expr)
5471  * If both are passed, the SV list is used, but the OP list indicates
5472  * which SVs are actually pre-compiled code blocks
5473  *
5474  * The SVs in the list have magic and qr overloading applied to them (and
5475  * the list may be modified in-place with replacement SVs in the latter
5476  * case).
5477  *
5478  * If the pattern hasn't changed from old_re, then old_re will be
5479  * returned.
5480  *
5481  * eng is the current engine. If that engine has an op_comp method, then
5482  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5483  * do the initial concatenation of arguments and pass on to the external
5484  * engine.
5485  *
5486  * If is_bare_re is not null, set it to a boolean indicating whether the
5487  * arg list reduced (after overloading) to a single bare regex which has
5488  * been returned (i.e. /$qr/).
5489  *
5490  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5491  *
5492  * pm_flags contains the PMf_* flags, typically based on those from the
5493  * pm_flags field of the related PMOP. Currently we're only interested in
5494  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5495  *
5496  * We can't allocate space until we know how big the compiled form will be,
5497  * but we can't compile it (and thus know how big it is) until we've got a
5498  * place to put the code.  So we cheat:  we compile it twice, once with code
5499  * generation turned off and size counting turned on, and once "for real".
5500  * This also means that we don't allocate space until we are sure that the
5501  * thing really will compile successfully, and we never have to move the
5502  * code and thus invalidate pointers into it.  (Note that it has to be in
5503  * one piece because free() must be able to free it all.) [NB: not true in perl]
5504  *
5505  * Beware that the optimization-preparation code in here knows about some
5506  * of the structure of the compiled regexp.  [I'll say.]
5507  */
5508
5509 REGEXP *
5510 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5511      OP *expr, const regexp_engine* eng, REGEXP *old_re,
5512      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5513 {
5514  dVAR;
5515  REGEXP *rx;
5516  struct regexp *r;
5517  regexp_internal *ri;
5518  STRLEN plen;
5519  char *exp;
5520  regnode *scan;
5521  I32 flags;
5522  I32 minlen = 0;
5523  U32 rx_flags;
5524  SV *pat;
5525  SV *code_blocksv = NULL;
5526  SV** new_patternp = patternp;
5527
5528  /* these are all flags - maybe they should be turned
5529  * into a single int with different bit masks */
5530  I32 sawlookahead = 0;
5531  I32 sawplus = 0;
5532  I32 sawopen = 0;
5533  I32 sawminmod = 0;
5534
5535  regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5536  bool recompile = 0;
5537  bool runtime_code = 0;
5538  scan_data_t data;
5539  RExC_state_t RExC_state;
5540  RExC_state_t * const pRExC_state = &RExC_state;
5541 #ifdef TRIE_STUDY_OPT
5542  int restudied = 0;
5543  RExC_state_t copyRExC_state;
5544 #endif
5545  GET_RE_DEBUG_FLAGS_DECL;
5546
5547  PERL_ARGS_ASSERT_RE_OP_COMPILE;
5548
5549  DEBUG_r(if (!PL_colorset) reginitcolors());
5550
5551 #ifndef PERL_IN_XSUB_RE
5552  /* Initialize these here instead of as-needed, as is quick and avoids
5553  * having to test them each time otherwise */
5554  if (! PL_AboveLatin1) {
5555   PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5556   PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5557   PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5558
5559   PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5560         = _new_invlist_C_array(L1PosixAlnum_invlist);
5561   PL_Posix_ptrs[_CC_ALPHANUMERIC]
5562         = _new_invlist_C_array(PosixAlnum_invlist);
5563
5564   PL_L1Posix_ptrs[_CC_ALPHA]
5565         = _new_invlist_C_array(L1PosixAlpha_invlist);
5566   PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5567
5568   PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5569   PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5570
5571   /* Cased is the same as Alpha in the ASCII range */
5572   PL_L1Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(L1Cased_invlist);
5573   PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PosixAlpha_invlist);
5574
5575   PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5576   PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5577
5578   PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5579   PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5580
5581   PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5582   PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5583
5584   PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5585   PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5586
5587   PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5588   PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5589
5590   PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5591   PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5592
5593   PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5594   PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5595   PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5596   PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5597
5598   PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5599   PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5600
5601   PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5602
5603   PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5604   PL_L1Posix_ptrs[_CC_WORDCHAR]
5605         = _new_invlist_C_array(L1PosixWord_invlist);
5606
5607   PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5608   PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5609
5610   PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5611  }
5612 #endif
5613
5614  pRExC_state->code_blocks = NULL;
5615  pRExC_state->num_code_blocks = 0;
5616
5617  if (is_bare_re)
5618   *is_bare_re = FALSE;
5619
5620  if (expr && (expr->op_type == OP_LIST ||
5621     (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5622   /* allocate code_blocks if needed */
5623   OP *o;
5624   int ncode = 0;
5625
5626   for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
5627    if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5628     ncode++; /* count of DO blocks */
5629   if (ncode) {
5630    pRExC_state->num_code_blocks = ncode;
5631    Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5632   }
5633  }
5634
5635  if (!pat_count) {
5636   /* compile-time pattern with just OP_CONSTs and DO blocks */
5637
5638   int n;
5639   OP *o;
5640
5641   /* find how many CONSTs there are */
5642   assert(expr);
5643   n = 0;
5644   if (expr->op_type == OP_CONST)
5645    n = 1;
5646   else
5647    for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5648     if (o->op_type == OP_CONST)
5649      n++;
5650    }
5651
5652   /* fake up an SV array */
5653
5654   assert(!new_patternp);
5655   Newx(new_patternp, n, SV*);
5656   SAVEFREEPV(new_patternp);
5657   pat_count = n;
5658
5659   n = 0;
5660   if (expr->op_type == OP_CONST)
5661    new_patternp[n] = cSVOPx_sv(expr);
5662   else
5663    for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5664     if (o->op_type == OP_CONST)
5665      new_patternp[n++] = cSVOPo_sv;
5666    }
5667
5668  }
5669
5670  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5671   "Assembling pattern from %d elements%s\n", pat_count,
5672    orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5673
5674  /* set expr to the first arg op */
5675
5676  if (pRExC_state->num_code_blocks
5677   && expr->op_type != OP_CONST)
5678  {
5679    expr = cLISTOPx(expr)->op_first;
5680    assert(   expr->op_type == OP_PUSHMARK
5681     || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
5682     || expr->op_type == OP_PADRANGE);
5683    expr = expr->op_sibling;
5684  }
5685
5686  pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
5687       expr, &recompile, NULL);
5688
5689  /* handle bare (possibly after overloading) regex: foo =~ $re */
5690  {
5691   SV *re = pat;
5692   if (SvROK(re))
5693    re = SvRV(re);
5694   if (SvTYPE(re) == SVt_REGEXP) {
5695    if (is_bare_re)
5696     *is_bare_re = TRUE;
5697    SvREFCNT_inc(re);
5698    Safefree(pRExC_state->code_blocks);
5699    DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5700     "Precompiled pattern%s\n",
5701      orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5702
5703    return (REGEXP*)re;
5704   }
5705  }
5706
5707  exp = SvPV_nomg(pat, plen);
5708
5709  if (!eng->op_comp) {
5710   if ((SvUTF8(pat) && IN_BYTES)
5711     || SvGMAGICAL(pat) || SvAMAGIC(pat))
5712   {
5713    /* make a temporary copy; either to convert to bytes,
5714    * or to avoid repeating get-magic / overloaded stringify */
5715    pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5716           (IN_BYTES ? 0 : SvUTF8(pat)));
5717   }
5718   Safefree(pRExC_state->code_blocks);
5719   return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5720  }
5721
5722  /* ignore the utf8ness if the pattern is 0 length */
5723  RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5724  RExC_uni_semantics = 0;
5725  RExC_contains_locale = 0;
5726  pRExC_state->runtime_code_qr = NULL;
5727
5728  DEBUG_COMPILE_r({
5729    SV *dsv= sv_newmortal();
5730    RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
5731    PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5732       PL_colors[4],PL_colors[5],s);
5733   });
5734
5735   redo_first_pass:
5736  /* we jump here if we upgrade the pattern to utf8 and have to
5737  * recompile */
5738
5739  if ((pm_flags & PMf_USE_RE_EVAL)
5740     /* this second condition covers the non-regex literal case,
5741     * i.e.  $foo =~ '(?{})'. */
5742     || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
5743  )
5744   runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
5745
5746  /* return old regex if pattern hasn't changed */
5747  /* XXX: note in the below we have to check the flags as well as the pattern.
5748  *
5749  * Things get a touch tricky as we have to compare the utf8 flag independently
5750  * from the compile flags.
5751  */
5752
5753  if (   old_re
5754   && !recompile
5755   && !!RX_UTF8(old_re) == !!RExC_utf8
5756   && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
5757   && RX_PRECOMP(old_re)
5758   && RX_PRELEN(old_re) == plen
5759   && memEQ(RX_PRECOMP(old_re), exp, plen)
5760   && !runtime_code /* with runtime code, always recompile */ )
5761  {
5762   Safefree(pRExC_state->code_blocks);
5763   return old_re;
5764  }
5765
5766  rx_flags = orig_rx_flags;
5767
5768  if (initial_charset == REGEX_LOCALE_CHARSET) {
5769   RExC_contains_locale = 1;
5770  }
5771  else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5772
5773   /* Set to use unicode semantics if the pattern is in utf8 and has the
5774   * 'depends' charset specified, as it means unicode when utf8  */
5775   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5776  }
5777
5778  RExC_precomp = exp;
5779  RExC_flags = rx_flags;
5780  RExC_pm_flags = pm_flags;
5781
5782  if (runtime_code) {
5783   if (TAINTING_get && TAINT_get)
5784    Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5785
5786   if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5787    /* whoops, we have a non-utf8 pattern, whilst run-time code
5788    * got compiled as utf8. Try again with a utf8 pattern */
5789    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5790          pRExC_state->num_code_blocks);
5791    goto redo_first_pass;
5792   }
5793  }
5794  assert(!pRExC_state->runtime_code_qr);
5795
5796  RExC_sawback = 0;
5797
5798  RExC_seen = 0;
5799  RExC_in_lookbehind = 0;
5800  RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5801  RExC_extralen = 0;
5802  RExC_override_recoding = 0;
5803  RExC_in_multi_char_class = 0;
5804
5805  /* First pass: determine size, legality. */
5806  RExC_parse = exp;
5807  RExC_start = exp;
5808  RExC_end = exp + plen;
5809  RExC_naughty = 0;
5810  RExC_npar = 1;
5811  RExC_nestroot = 0;
5812  RExC_size = 0L;
5813  RExC_emit = &RExC_emit_dummy;
5814  RExC_whilem_seen = 0;
5815  RExC_open_parens = NULL;
5816  RExC_close_parens = NULL;
5817  RExC_opend = NULL;
5818  RExC_paren_names = NULL;
5819 #ifdef DEBUGGING
5820  RExC_paren_name_list = NULL;
5821 #endif
5822  RExC_recurse = NULL;
5823  RExC_recurse_count = 0;
5824  pRExC_state->code_index = 0;
5825
5826 #if 0 /* REGC() is (currently) a NOP at the first pass.
5827  * Clever compilers notice this and complain. --jhi */
5828  REGC((U8)REG_MAGIC, (char*)RExC_emit);
5829 #endif
5830  DEBUG_PARSE_r(
5831   PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5832   RExC_lastnum=0;
5833   RExC_lastparse=NULL;
5834  );
5835  /* reg may croak on us, not giving us a chance to free
5836  pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
5837  need it to survive as long as the regexp (qr/(?{})/).
5838  We must check that code_blocksv is not already set, because we may
5839  have jumped back to restart the sizing pass. */
5840  if (pRExC_state->code_blocks && !code_blocksv) {
5841   code_blocksv = newSV_type(SVt_PV);
5842   SAVEFREESV(code_blocksv);
5843   SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5844   SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5845  }
5846  if (reg(pRExC_state, 0, &flags,1) == NULL) {
5847   /* It's possible to write a regexp in ascii that represents Unicode
5848   codepoints outside of the byte range, such as via \x{100}. If we
5849   detect such a sequence we have to convert the entire pattern to utf8
5850   and then recompile, as our sizing calculation will have been based
5851   on 1 byte == 1 character, but we will need to use utf8 to encode
5852   at least some part of the pattern, and therefore must convert the whole
5853   thing.
5854   -- dmq */
5855   if (flags & RESTART_UTF8) {
5856    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5857          pRExC_state->num_code_blocks);
5858    goto redo_first_pass;
5859   }
5860   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
5861  }
5862  if (code_blocksv)
5863   SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5864
5865  DEBUG_PARSE_r({
5866   PerlIO_printf(Perl_debug_log,
5867    "Required size %"IVdf" nodes\n"
5868    "Starting second pass (creation)\n",
5869    (IV)RExC_size);
5870   RExC_lastnum=0;
5871   RExC_lastparse=NULL;
5872  });
5873
5874  /* The first pass could have found things that force Unicode semantics */
5875  if ((RExC_utf8 || RExC_uni_semantics)
5876   && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5877  {
5878   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5879  }
5880
5881  /* Small enough for pointer-storage convention?
5882  If extralen==0, this means that we will not need long jumps. */
5883  if (RExC_size >= 0x10000L && RExC_extralen)
5884   RExC_size += RExC_extralen;
5885  else
5886   RExC_extralen = 0;
5887  if (RExC_whilem_seen > 15)
5888   RExC_whilem_seen = 15;
5889
5890  /* Allocate space and zero-initialize. Note, the two step process
5891  of zeroing when in debug mode, thus anything assigned has to
5892  happen after that */
5893  rx = (REGEXP*) newSV_type(SVt_REGEXP);
5894  r = ReANY(rx);
5895  Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5896   char, regexp_internal);
5897  if ( r == NULL || ri == NULL )
5898   FAIL("Regexp out of space");
5899 #ifdef DEBUGGING
5900  /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5901  Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5902 #else
5903  /* bulk initialize base fields with 0. */
5904  Zero(ri, sizeof(regexp_internal), char);
5905 #endif
5906
5907  /* non-zero initialization begins here */
5908  RXi_SET( r, ri );
5909  r->engine= eng;
5910  r->extflags = rx_flags;
5911  RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
5912
5913  if (pm_flags & PMf_IS_QR) {
5914   ri->code_blocks = pRExC_state->code_blocks;
5915   ri->num_code_blocks = pRExC_state->num_code_blocks;
5916  }
5917  else
5918  {
5919   int n;
5920   for (n = 0; n < pRExC_state->num_code_blocks; n++)
5921    if (pRExC_state->code_blocks[n].src_regex)
5922     SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5923   SAVEFREEPV(pRExC_state->code_blocks);
5924  }
5925
5926  {
5927   bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5928   bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5929
5930   /* The caret is output if there are any defaults: if not all the STD
5931   * flags are set, or if no character set specifier is needed */
5932   bool has_default =
5933      (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5934      || ! has_charset);
5935   bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5936   U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5937        >> RXf_PMf_STD_PMMOD_SHIFT);
5938   const char *fptr = STD_PAT_MODS;        /*"msix"*/
5939   char *p;
5940   /* Allocate for the worst case, which is all the std flags are turned
5941   * on.  If more precision is desired, we could do a population count of
5942   * the flags set.  This could be done with a small lookup table, or by
5943   * shifting, masking and adding, or even, when available, assembly
5944   * language for a machine-language population count.
5945   * We never output a minus, as all those are defaults, so are
5946   * covered by the caret */
5947   const STRLEN wraplen = plen + has_p + has_runon
5948    + has_default       /* If needs a caret */
5949
5950     /* If needs a character set specifier */
5951    + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5952    + (sizeof(STD_PAT_MODS) - 1)
5953    + (sizeof("(?:)") - 1);
5954
5955   Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5956   r->xpv_len_u.xpvlenu_pv = p;
5957   if (RExC_utf8)
5958    SvFLAGS(rx) |= SVf_UTF8;
5959   *p++='('; *p++='?';
5960
5961   /* If a default, cover it using the caret */
5962   if (has_default) {
5963    *p++= DEFAULT_PAT_MOD;
5964   }
5965   if (has_charset) {
5966    STRLEN len;
5967    const char* const name = get_regex_charset_name(r->extflags, &len);
5968    Copy(name, p, len, char);
5969    p += len;
5970   }
5971   if (has_p)
5972    *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5973   {
5974    char ch;
5975    while((ch = *fptr++)) {
5976     if(reganch & 1)
5977      *p++ = ch;
5978     reganch >>= 1;
5979    }
5980   }
5981
5982   *p++ = ':';
5983   Copy(RExC_precomp, p, plen, char);
5984   assert ((RX_WRAPPED(rx) - p) < 16);
5985   r->pre_prefix = p - RX_WRAPPED(rx);
5986   p += plen;
5987   if (has_runon)
5988    *p++ = '\n';
5989   *p++ = ')';
5990   *p = 0;
5991   SvCUR_set(rx, p - RX_WRAPPED(rx));
5992  }
5993
5994  r->intflags = 0;
5995  r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5996
5997  if (RExC_seen & REG_SEEN_RECURSE) {
5998   Newxz(RExC_open_parens, RExC_npar,regnode *);
5999   SAVEFREEPV(RExC_open_parens);
6000   Newxz(RExC_close_parens,RExC_npar,regnode *);
6001   SAVEFREEPV(RExC_close_parens);
6002  }
6003
6004  /* Useful during FAIL. */
6005 #ifdef RE_TRACK_PATTERN_OFFSETS
6006  Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6007  DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6008       "%s %"UVuf" bytes for offset annotations.\n",
6009       ri->u.offsets ? "Got" : "Couldn't get",
6010       (UV)((2*RExC_size+1) * sizeof(U32))));
6011 #endif
6012  SetProgLen(ri,RExC_size);
6013  RExC_rx_sv = rx;
6014  RExC_rx = r;
6015  RExC_rxi = ri;
6016  REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
6017
6018  /* Second pass: emit code. */
6019  RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6020  RExC_pm_flags = pm_flags;
6021  RExC_parse = exp;
6022  RExC_end = exp + plen;
6023  RExC_naughty = 0;
6024  RExC_npar = 1;
6025  RExC_emit_start = ri->program;
6026  RExC_emit = ri->program;
6027  RExC_emit_bound = ri->program + RExC_size + 1;
6028  pRExC_state->code_index = 0;
6029
6030  REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6031  if (reg(pRExC_state, 0, &flags,1) == NULL) {
6032   ReREFCNT_dec(rx);
6033   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6034  }
6035  /* XXXX To minimize changes to RE engine we always allocate
6036  3-units-long substrs field. */
6037  Newx(r->substrs, 1, struct reg_substr_data);
6038  if (RExC_recurse_count) {
6039   Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6040   SAVEFREEPV(RExC_recurse);
6041  }
6042
6043 reStudy:
6044  r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6045  Zero(r->substrs, 1, struct reg_substr_data);
6046
6047 #ifdef TRIE_STUDY_OPT
6048  if (!restudied) {
6049   StructCopy(&zero_scan_data, &data, scan_data_t);
6050   copyRExC_state = RExC_state;
6051  } else {
6052   U32 seen=RExC_seen;
6053   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6054
6055   RExC_state = copyRExC_state;
6056   if (seen & REG_TOP_LEVEL_BRANCHES)
6057    RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6058   else
6059    RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6060   StructCopy(&zero_scan_data, &data, scan_data_t);
6061  }
6062 #else
6063  StructCopy(&zero_scan_data, &data, scan_data_t);
6064 #endif
6065
6066  /* Dig out information for optimizations. */
6067  r->extflags = RExC_flags; /* was pm_op */
6068  /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6069
6070  if (UTF)
6071   SvUTF8_on(rx); /* Unicode in it? */
6072  ri->regstclass = NULL;
6073  if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6074   r->intflags |= PREGf_NAUGHTY;
6075  scan = ri->program + 1;  /* First BRANCH. */
6076
6077  /* testing for BRANCH here tells us whether there is "must appear"
6078  data in the pattern. If there is then we can use it for optimisations */
6079  if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
6080   I32 fake;
6081   STRLEN longest_float_length, longest_fixed_length;
6082   struct regnode_charclass_class ch_class; /* pointed to by data */
6083   int stclass_flag;
6084   I32 last_close = 0; /* pointed to by data */
6085   regnode *first= scan;
6086   regnode *first_next= regnext(first);
6087   /*
6088   * Skip introductions and multiplicators >= 1
6089   * so that we can extract the 'meat' of the pattern that must
6090   * match in the large if() sequence following.
6091   * NOTE that EXACT is NOT covered here, as it is normally
6092   * picked up by the optimiser separately.
6093   *
6094   * This is unfortunate as the optimiser isnt handling lookahead
6095   * properly currently.
6096   *
6097   */
6098   while ((OP(first) == OPEN && (sawopen = 1)) ||
6099    /* An OR of *one* alternative - should not happen now. */
6100    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6101    /* for now we can't handle lookbehind IFMATCH*/
6102    (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6103    (OP(first) == PLUS) ||
6104    (OP(first) == MINMOD) ||
6105    /* An {n,m} with n>0 */
6106    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6107    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6108   {
6109     /*
6110     * the only op that could be a regnode is PLUS, all the rest
6111     * will be regnode_1 or regnode_2.
6112     *
6113     * (yves doesn't think this is true)
6114     */
6115     if (OP(first) == PLUS)
6116      sawplus = 1;
6117     else {
6118      if (OP(first) == MINMOD)
6119       sawminmod = 1;
6120      first += regarglen[OP(first)];
6121     }
6122     first = NEXTOPER(first);
6123     first_next= regnext(first);
6124   }
6125
6126   /* Starting-point info. */
6127  again:
6128   DEBUG_PEEP("first:",first,0);
6129   /* Ignore EXACT as we deal with it later. */
6130   if (PL_regkind[OP(first)] == EXACT) {
6131    if (OP(first) == EXACT)
6132     NOOP; /* Empty, get anchored substr later. */
6133    else
6134     ri->regstclass = first;
6135   }
6136 #ifdef TRIE_STCLASS
6137   else if (PL_regkind[OP(first)] == TRIE &&
6138     ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6139   {
6140    regnode *trie_op;
6141    /* this can happen only on restudy */
6142    if ( OP(first) == TRIE ) {
6143     struct regnode_1 *trieop = (struct regnode_1 *)
6144      PerlMemShared_calloc(1, sizeof(struct regnode_1));
6145     StructCopy(first,trieop,struct regnode_1);
6146     trie_op=(regnode *)trieop;
6147    } else {
6148     struct regnode_charclass *trieop = (struct regnode_charclass *)
6149      PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6150     StructCopy(first,trieop,struct regnode_charclass);
6151     trie_op=(regnode *)trieop;
6152    }
6153    OP(trie_op)+=2;
6154    make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6155    ri->regstclass = trie_op;
6156   }
6157 #endif
6158   else if (REGNODE_SIMPLE(OP(first)))
6159    ri->regstclass = first;
6160   else if (PL_regkind[OP(first)] == BOUND ||
6161     PL_regkind[OP(first)] == NBOUND)
6162    ri->regstclass = first;
6163   else if (PL_regkind[OP(first)] == BOL) {
6164    r->extflags |= (OP(first) == MBOL
6165       ? RXf_ANCH_MBOL
6166       : (OP(first) == SBOL
6167        ? RXf_ANCH_SBOL
6168        : RXf_ANCH_BOL));
6169    first = NEXTOPER(first);
6170    goto again;
6171   }
6172   else if (OP(first) == GPOS) {
6173    r->extflags |= RXf_ANCH_GPOS;
6174    first = NEXTOPER(first);
6175    goto again;
6176   }
6177   else if ((!sawopen || !RExC_sawback) &&
6178    (OP(first) == STAR &&
6179    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6180    !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6181   {
6182    /* turn .* into ^.* with an implied $*=1 */
6183    const int type =
6184     (OP(NEXTOPER(first)) == REG_ANY)
6185      ? RXf_ANCH_MBOL
6186      : RXf_ANCH_SBOL;
6187    r->extflags |= type;
6188    r->intflags |= PREGf_IMPLICIT;
6189    first = NEXTOPER(first);
6190    goto again;
6191   }
6192   if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback)
6193    && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6194    /* x+ must match at the 1st pos of run of x's */
6195    r->intflags |= PREGf_SKIP;
6196
6197   /* Scan is after the zeroth branch, first is atomic matcher. */
6198 #ifdef TRIE_STUDY_OPT
6199   DEBUG_PARSE_r(
6200    if (!restudied)
6201     PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6202        (IV)(first - scan + 1))
6203   );
6204 #else
6205   DEBUG_PARSE_r(
6206    PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6207     (IV)(first - scan + 1))
6208   );
6209 #endif
6210
6211
6212   /*
6213   * If there's something expensive in the r.e., find the
6214   * longest literal string that must appear and make it the
6215   * regmust.  Resolve ties in favor of later strings, since
6216   * the regstart check works with the beginning of the r.e.
6217   * and avoiding duplication strengthens checking.  Not a
6218   * strong reason, but sufficient in the absence of others.
6219   * [Now we resolve ties in favor of the earlier string if
6220   * it happens that c_offset_min has been invalidated, since the
6221   * earlier string may buy us something the later one won't.]
6222   */
6223
6224   data.longest_fixed = newSVpvs("");
6225   data.longest_float = newSVpvs("");
6226   data.last_found = newSVpvs("");
6227   data.longest = &(data.longest_fixed);
6228   ENTER_with_name("study_chunk");
6229   SAVEFREESV(data.longest_fixed);
6230   SAVEFREESV(data.longest_float);
6231   SAVEFREESV(data.last_found);
6232   first = scan;
6233   if (!ri->regstclass) {
6234    cl_init(pRExC_state, &ch_class);
6235    data.start_class = &ch_class;
6236    stclass_flag = SCF_DO_STCLASS_AND;
6237   } else    /* XXXX Check for BOUND? */
6238    stclass_flag = 0;
6239   data.last_closep = &last_close;
6240
6241   minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6242    &data, -1, NULL, NULL,
6243    SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6244       | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6245    0);
6246
6247
6248   CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6249
6250
6251   if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6252    && data.last_start_min == 0 && data.last_end > 0
6253    && !RExC_seen_zerolen
6254    && !(RExC_seen & REG_SEEN_VERBARG)
6255    && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6256    r->extflags |= RXf_CHECK_ALL;
6257   scan_commit(pRExC_state, &data,&minlen,0);
6258
6259   longest_float_length = CHR_SVLEN(data.longest_float);
6260
6261   if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6262     && data.offset_fixed == data.offset_float_min
6263     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6264    && S_setup_longest (aTHX_ pRExC_state,
6265          data.longest_float,
6266          &(r->float_utf8),
6267          &(r->float_substr),
6268          &(r->float_end_shift),
6269          data.lookbehind_float,
6270          data.offset_float_min,
6271          data.minlen_float,
6272          longest_float_length,
6273          cBOOL(data.flags & SF_FL_BEFORE_EOL),
6274          cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6275   {
6276    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6277    r->float_max_offset = data.offset_float_max;
6278    if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6279     r->float_max_offset -= data.lookbehind_float;
6280    SvREFCNT_inc_simple_void_NN(data.longest_float);
6281   }
6282   else {
6283    r->float_substr = r->float_utf8 = NULL;
6284    longest_float_length = 0;
6285   }
6286
6287   longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6288
6289   if (S_setup_longest (aTHX_ pRExC_state,
6290         data.longest_fixed,
6291         &(r->anchored_utf8),
6292         &(r->anchored_substr),
6293         &(r->anchored_end_shift),
6294         data.lookbehind_fixed,
6295         data.offset_fixed,
6296         data.minlen_fixed,
6297         longest_fixed_length,
6298         cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6299         cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6300   {
6301    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6302    SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6303   }
6304   else {
6305    r->anchored_substr = r->anchored_utf8 = NULL;
6306    longest_fixed_length = 0;
6307   }
6308   LEAVE_with_name("study_chunk");
6309
6310   if (ri->regstclass
6311    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6312    ri->regstclass = NULL;
6313
6314   if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6315    && stclass_flag
6316    && ! TEST_SSC_EOS(data.start_class)
6317    && !cl_is_anything(data.start_class))
6318   {
6319    const U32 n = add_data(pRExC_state, 1, "f");
6320    OP(data.start_class) = ANYOF_SYNTHETIC;
6321
6322    Newx(RExC_rxi->data->data[n], 1,
6323     struct regnode_charclass_class);
6324    StructCopy(data.start_class,
6325      (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6326      struct regnode_charclass_class);
6327    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6328    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6329    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6330      regprop(r, sv, (regnode*)data.start_class);
6331      PerlIO_printf(Perl_debug_log,
6332          "synthetic stclass \"%s\".\n",
6333          SvPVX_const(sv));});
6334   }
6335
6336   /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6337   if (longest_fixed_length > longest_float_length) {
6338    r->check_end_shift = r->anchored_end_shift;
6339    r->check_substr = r->anchored_substr;
6340    r->check_utf8 = r->anchored_utf8;
6341    r->check_offset_min = r->check_offset_max = r->anchored_offset;
6342    if (r->extflags & RXf_ANCH_SINGLE)
6343     r->extflags |= RXf_NOSCAN;
6344   }
6345   else {
6346    r->check_end_shift = r->float_end_shift;
6347    r->check_substr = r->float_substr;
6348    r->check_utf8 = r->float_utf8;
6349    r->check_offset_min = r->float_min_offset;
6350    r->check_offset_max = r->float_max_offset;
6351   }
6352   if ((r->check_substr || r->check_utf8) ) {
6353    r->extflags |= RXf_USE_INTUIT;
6354    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6355     r->extflags |= RXf_INTUIT_TAIL;
6356   }
6357   /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6358   if ( (STRLEN)minlen < longest_float_length )
6359    minlen= longest_float_length;
6360   if ( (STRLEN)minlen < longest_fixed_length )
6361    minlen= longest_fixed_length;
6362   */
6363  }
6364  else {
6365   /* Several toplevels. Best we can is to set minlen. */
6366   I32 fake;
6367   struct regnode_charclass_class ch_class;
6368   I32 last_close = 0;
6369
6370   DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6371
6372   scan = ri->program + 1;
6373   cl_init(pRExC_state, &ch_class);
6374   data.start_class = &ch_class;
6375   data.last_closep = &last_close;
6376
6377
6378   minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6379    &data, -1, NULL, NULL,
6380    SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS
6381        |(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6382    0);
6383
6384   CHECK_RESTUDY_GOTO_butfirst(NOOP);
6385
6386   r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6387     = r->float_substr = r->float_utf8 = NULL;
6388
6389   if (! TEST_SSC_EOS(data.start_class)
6390    && !cl_is_anything(data.start_class))
6391   {
6392    const U32 n = add_data(pRExC_state, 1, "f");
6393    OP(data.start_class) = ANYOF_SYNTHETIC;
6394
6395    Newx(RExC_rxi->data->data[n], 1,
6396     struct regnode_charclass_class);
6397    StructCopy(data.start_class,
6398      (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6399      struct regnode_charclass_class);
6400    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6401    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6402    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6403      regprop(r, sv, (regnode*)data.start_class);
6404      PerlIO_printf(Perl_debug_log,
6405          "synthetic stclass \"%s\".\n",
6406          SvPVX_const(sv));});
6407   }
6408  }
6409
6410  /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6411  the "real" pattern. */
6412  DEBUG_OPTIMISE_r({
6413   PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6414      (IV)minlen, (IV)r->minlen);
6415  });
6416  r->minlenret = minlen;
6417  if (r->minlen < minlen)
6418   r->minlen = minlen;
6419
6420  if (RExC_seen & REG_SEEN_GPOS)
6421   r->extflags |= RXf_GPOS_SEEN;
6422  if (RExC_seen & REG_SEEN_LOOKBEHIND)
6423   r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
6424  if (pRExC_state->num_code_blocks)
6425   r->extflags |= RXf_EVAL_SEEN;
6426  if (RExC_seen & REG_SEEN_CANY)
6427   r->extflags |= RXf_CANY_SEEN;
6428  if (RExC_seen & REG_SEEN_VERBARG)
6429  {
6430   r->intflags |= PREGf_VERBARG_SEEN;
6431   r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
6432  }
6433  if (RExC_seen & REG_SEEN_CUTGROUP)
6434   r->intflags |= PREGf_CUTGROUP_SEEN;
6435  if (pm_flags & PMf_USE_RE_EVAL)
6436   r->intflags |= PREGf_USE_RE_EVAL;
6437  if (RExC_paren_names)
6438   RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6439  else
6440   RXp_PAREN_NAMES(r) = NULL;
6441
6442  {
6443   regnode *first = ri->program + 1;
6444   U8 fop = OP(first);
6445   regnode *next = NEXTOPER(first);
6446   U8 nop = OP(next);
6447
6448   if (PL_regkind[fop] == NOTHING && nop == END)
6449    r->extflags |= RXf_NULL;
6450   else if (PL_regkind[fop] == BOL && nop == END)
6451    r->extflags |= RXf_START_ONLY;
6452   else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
6453    r->extflags |= RXf_WHITE;
6454   else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
6455    r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6456
6457  }
6458 #ifdef DEBUGGING
6459  if (RExC_paren_names) {
6460   ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6461   ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6462  } else
6463 #endif
6464   ri->name_list_idx = 0;
6465
6466  if (RExC_recurse_count) {
6467   for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6468    const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6469    ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6470   }
6471  }
6472  Newxz(r->offs, RExC_npar, regexp_paren_pair);
6473  /* assume we don't need to swap parens around before we match */
6474
6475  DEBUG_DUMP_r({
6476   PerlIO_printf(Perl_debug_log,"Final program:\n");
6477   regdump(r);
6478  });
6479 #ifdef RE_TRACK_PATTERN_OFFSETS
6480  DEBUG_OFFSETS_r(if (ri->u.offsets) {
6481   const U32 len = ri->u.offsets[0];
6482   U32 i;
6483   GET_RE_DEBUG_FLAGS_DECL;
6484   PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6485   for (i = 1; i <= len; i++) {
6486    if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6487     PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6488     (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6489    }
6490   PerlIO_printf(Perl_debug_log, "\n");
6491  });
6492 #endif
6493
6494 #ifdef USE_ITHREADS
6495  /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6496  * by setting the regexp SV to readonly-only instead. If the
6497  * pattern's been recompiled, the USEDness should remain. */
6498  if (old_re && SvREADONLY(old_re))
6499   SvREADONLY_on(rx);
6500 #endif
6501  return rx;
6502 }
6503
6504
6505 SV*
6506 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6507      const U32 flags)
6508 {
6509  PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6510
6511  PERL_UNUSED_ARG(value);
6512
6513  if (flags & RXapif_FETCH) {
6514   return reg_named_buff_fetch(rx, key, flags);
6515  } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6516   Perl_croak_no_modify();
6517   return NULL;
6518  } else if (flags & RXapif_EXISTS) {
6519   return reg_named_buff_exists(rx, key, flags)
6520    ? &PL_sv_yes
6521    : &PL_sv_no;
6522  } else if (flags & RXapif_REGNAMES) {
6523   return reg_named_buff_all(rx, flags);
6524  } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6525   return reg_named_buff_scalar(rx, flags);
6526  } else {
6527   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6528   return NULL;
6529  }
6530 }
6531
6532 SV*
6533 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6534       const U32 flags)
6535 {
6536  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6537  PERL_UNUSED_ARG(lastkey);
6538
6539  if (flags & RXapif_FIRSTKEY)
6540   return reg_named_buff_firstkey(rx, flags);
6541  else if (flags & RXapif_NEXTKEY)
6542   return reg_named_buff_nextkey(rx, flags);
6543  else {
6544   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6545   return NULL;
6546  }
6547 }
6548
6549 SV*
6550 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6551       const U32 flags)
6552 {
6553  AV *retarray = NULL;
6554  SV *ret;
6555  struct regexp *const rx = ReANY(r);
6556
6557  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6558
6559  if (flags & RXapif_ALL)
6560   retarray=newAV();
6561
6562  if (rx && RXp_PAREN_NAMES(rx)) {
6563   HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6564   if (he_str) {
6565    IV i;
6566    SV* sv_dat=HeVAL(he_str);
6567    I32 *nums=(I32*)SvPVX(sv_dat);
6568    for ( i=0; i<SvIVX(sv_dat); i++ ) {
6569     if ((I32)(rx->nparens) >= nums[i]
6570      && rx->offs[nums[i]].start != -1
6571      && rx->offs[nums[i]].end != -1)
6572     {
6573      ret = newSVpvs("");
6574      CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6575      if (!retarray)
6576       return ret;
6577     } else {
6578      if (retarray)
6579       ret = newSVsv(&PL_sv_undef);
6580     }
6581     if (retarray)
6582      av_push(retarray, ret);
6583    }
6584    if (retarray)
6585     return newRV_noinc(MUTABLE_SV(retarray));
6586   }
6587  }
6588  return NULL;
6589 }
6590
6591 bool
6592 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6593       const U32 flags)
6594 {
6595  struct regexp *const rx = ReANY(r);
6596
6597  PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6598
6599  if (rx && RXp_PAREN_NAMES(rx)) {
6600   if (flags & RXapif_ALL) {
6601    return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6602   } else {
6603    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6604    if (sv) {
6605     SvREFCNT_dec_NN(sv);
6606     return TRUE;
6607    } else {
6608     return FALSE;
6609    }
6610   }
6611  } else {
6612   return FALSE;
6613  }
6614 }
6615
6616 SV*
6617 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6618 {
6619  struct regexp *const rx = ReANY(r);
6620
6621  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6622
6623  if ( rx && RXp_PAREN_NAMES(rx) ) {
6624   (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6625
6626   return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6627  } else {
6628   return FALSE;
6629  }
6630 }
6631
6632 SV*
6633 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6634 {
6635  struct regexp *const rx = ReANY(r);
6636  GET_RE_DEBUG_FLAGS_DECL;
6637
6638  PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6639
6640  if (rx && RXp_PAREN_NAMES(rx)) {
6641   HV *hv = RXp_PAREN_NAMES(rx);
6642   HE *temphe;
6643   while ( (temphe = hv_iternext_flags(hv,0)) ) {
6644    IV i;
6645    IV parno = 0;
6646    SV* sv_dat = HeVAL(temphe);
6647    I32 *nums = (I32*)SvPVX(sv_dat);
6648    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6649     if ((I32)(rx->lastparen) >= nums[i] &&
6650      rx->offs[nums[i]].start != -1 &&
6651      rx->offs[nums[i]].end != -1)
6652     {
6653      parno = nums[i];
6654      break;
6655     }
6656    }
6657    if (parno || flags & RXapif_ALL) {
6658     return newSVhek(HeKEY_hek(temphe));
6659    }
6660   }
6661  }
6662  return NULL;
6663 }
6664
6665 SV*
6666 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6667 {
6668  SV *ret;
6669  AV *av;
6670  I32 length;
6671  struct regexp *const rx = ReANY(r);
6672
6673  PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6674
6675  if (rx && RXp_PAREN_NAMES(rx)) {
6676   if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6677    return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6678   } else if (flags & RXapif_ONE) {
6679    ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6680    av = MUTABLE_AV(SvRV(ret));
6681    length = av_len(av);
6682    SvREFCNT_dec_NN(ret);
6683    return newSViv(length + 1);
6684   } else {
6685    Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6686    return NULL;
6687   }
6688  }
6689  return &PL_sv_undef;
6690 }
6691
6692 SV*
6693 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6694 {
6695  struct regexp *const rx = ReANY(r);
6696  AV *av = newAV();
6697
6698  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6699
6700  if (rx && RXp_PAREN_NAMES(rx)) {
6701   HV *hv= RXp_PAREN_NAMES(rx);
6702   HE *temphe;
6703   (void)hv_iterinit(hv);
6704   while ( (temphe = hv_iternext_flags(hv,0)) ) {
6705    IV i;
6706    IV parno = 0;
6707    SV* sv_dat = HeVAL(temphe);
6708    I32 *nums = (I32*)SvPVX(sv_dat);
6709    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6710     if ((I32)(rx->lastparen) >= nums[i] &&
6711      rx->offs[nums[i]].start != -1 &&
6712      rx->offs[nums[i]].end != -1)
6713     {
6714      parno = nums[i];
6715      break;
6716     }
6717    }
6718    if (parno || flags & RXapif_ALL) {
6719     av_push(av, newSVhek(HeKEY_hek(temphe)));
6720    }
6721   }
6722  }
6723
6724  return newRV_noinc(MUTABLE_SV(av));
6725 }
6726
6727 void
6728 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6729        SV * const sv)
6730 {
6731  struct regexp *const rx = ReANY(r);
6732  char *s = NULL;
6733  I32 i = 0;
6734  I32 s1, t1;
6735  I32 n = paren;
6736
6737  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6738
6739  if (      n == RX_BUFF_IDX_CARET_PREMATCH
6740   || n == RX_BUFF_IDX_CARET_FULLMATCH
6741   || n == RX_BUFF_IDX_CARET_POSTMATCH
6742  )
6743  {
6744   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
6745   if (!keepcopy) {
6746    /* on something like
6747    *    $r = qr/.../;
6748    *    /$qr/p;
6749    * the KEEPCOPY is set on the PMOP rather than the regex */
6750    if (PL_curpm && r == PM_GETRE(PL_curpm))
6751     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
6752   }
6753   if (!keepcopy)
6754    goto ret_undef;
6755  }
6756
6757  if (!rx->subbeg)
6758   goto ret_undef;
6759
6760  if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6761   /* no need to distinguish between them any more */
6762   n = RX_BUFF_IDX_FULLMATCH;
6763
6764  if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6765   && rx->offs[0].start != -1)
6766  {
6767   /* $`, ${^PREMATCH} */
6768   i = rx->offs[0].start;
6769   s = rx->subbeg;
6770  }
6771  else
6772  if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6773   && rx->offs[0].end != -1)
6774  {
6775   /* $', ${^POSTMATCH} */
6776   s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6777   i = rx->sublen + rx->suboffset - rx->offs[0].end;
6778  }
6779  else
6780  if ( 0 <= n && n <= (I32)rx->nparens &&
6781   (s1 = rx->offs[n].start) != -1 &&
6782   (t1 = rx->offs[n].end) != -1)
6783  {
6784   /* $&, ${^MATCH},  $1 ... */
6785   i = t1 - s1;
6786   s = rx->subbeg + s1 - rx->suboffset;
6787  } else {
6788   goto ret_undef;
6789  }
6790
6791  assert(s >= rx->subbeg);
6792  assert(rx->sublen >= (s - rx->subbeg) + i );
6793  if (i >= 0) {
6794 #if NO_TAINT_SUPPORT
6795   sv_setpvn(sv, s, i);
6796 #else
6797   const int oldtainted = TAINT_get;
6798   TAINT_NOT;
6799   sv_setpvn(sv, s, i);
6800   TAINT_set(oldtainted);
6801 #endif
6802   if ( (rx->extflags & RXf_CANY_SEEN)
6803    ? (RXp_MATCH_UTF8(rx)
6804       && (!i || is_utf8_string((U8*)s, i)))
6805    : (RXp_MATCH_UTF8(rx)) )
6806   {
6807    SvUTF8_on(sv);
6808   }
6809   else
6810    SvUTF8_off(sv);
6811   if (TAINTING_get) {
6812    if (RXp_MATCH_TAINTED(rx)) {
6813     if (SvTYPE(sv) >= SVt_PVMG) {
6814      MAGIC* const mg = SvMAGIC(sv);
6815      MAGIC* mgt;
6816      TAINT;
6817      SvMAGIC_set(sv, mg->mg_moremagic);
6818      SvTAINT(sv);
6819      if ((mgt = SvMAGIC(sv))) {
6820       mg->mg_moremagic = mgt;
6821       SvMAGIC_set(sv, mg);
6822      }
6823     } else {
6824      TAINT;
6825      SvTAINT(sv);
6826     }
6827    } else
6828     SvTAINTED_off(sv);
6829   }
6830  } else {
6831  ret_undef:
6832   sv_setsv(sv,&PL_sv_undef);
6833   return;
6834  }
6835 }
6836
6837 void
6838 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6839               SV const * const value)
6840 {
6841  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6842
6843  PERL_UNUSED_ARG(rx);
6844  PERL_UNUSED_ARG(paren);
6845  PERL_UNUSED_ARG(value);
6846
6847  if (!PL_localizing)
6848   Perl_croak_no_modify();
6849 }
6850
6851 I32
6852 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6853        const I32 paren)
6854 {
6855  struct regexp *const rx = ReANY(r);
6856  I32 i;
6857  I32 s1, t1;
6858
6859  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6860
6861  if (   paren == RX_BUFF_IDX_CARET_PREMATCH
6862   || paren == RX_BUFF_IDX_CARET_FULLMATCH
6863   || paren == RX_BUFF_IDX_CARET_POSTMATCH
6864  )
6865  {
6866   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
6867   if (!keepcopy) {
6868    /* on something like
6869    *    $r = qr/.../;
6870    *    /$qr/p;
6871    * the KEEPCOPY is set on the PMOP rather than the regex */
6872    if (PL_curpm && r == PM_GETRE(PL_curpm))
6873     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
6874   }
6875   if (!keepcopy)
6876    goto warn_undef;
6877  }
6878
6879  /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6880  switch (paren) {
6881  case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6882  case RX_BUFF_IDX_PREMATCH:       /* $` */
6883   if (rx->offs[0].start != -1) {
6884       i = rx->offs[0].start;
6885       if (i > 0) {
6886         s1 = 0;
6887         t1 = i;
6888         goto getlen;
6889       }
6890    }
6891   return 0;
6892
6893  case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6894  case RX_BUFF_IDX_POSTMATCH:       /* $' */
6895    if (rx->offs[0].end != -1) {
6896       i = rx->sublen - rx->offs[0].end;
6897       if (i > 0) {
6898         s1 = rx->offs[0].end;
6899         t1 = rx->sublen;
6900         goto getlen;
6901       }
6902    }
6903   return 0;
6904
6905  default: /* $& / ${^MATCH}, $1, $2, ... */
6906    if (paren <= (I32)rx->nparens &&
6907    (s1 = rx->offs[paren].start) != -1 &&
6908    (t1 = rx->offs[paren].end) != -1)
6909    {
6910    i = t1 - s1;
6911    goto getlen;
6912   } else {
6913   warn_undef:
6914    if (ckWARN(WARN_UNINITIALIZED))
6915     report_uninit((const SV *)sv);
6916    return 0;
6917   }
6918  }
6919   getlen:
6920  if (i > 0 && RXp_MATCH_UTF8(rx)) {
6921   const char * const s = rx->subbeg - rx->suboffset + s1;
6922   const U8 *ep;
6923   STRLEN el;
6924
6925   i = t1 - s1;
6926   if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6927       i = el;
6928  }
6929  return i;
6930 }
6931
6932 SV*
6933 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6934 {
6935  PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6936   PERL_UNUSED_ARG(rx);
6937   if (0)
6938    return NULL;
6939   else
6940    return newSVpvs("Regexp");
6941 }
6942
6943 /* Scans the name of a named buffer from the pattern.
6944  * If flags is REG_RSN_RETURN_NULL returns null.
6945  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6946  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6947  * to the parsed name as looked up in the RExC_paren_names hash.
6948  * If there is an error throws a vFAIL().. type exception.
6949  */
6950
6951 #define REG_RSN_RETURN_NULL    0
6952 #define REG_RSN_RETURN_NAME    1
6953 #define REG_RSN_RETURN_DATA    2
6954
6955 STATIC SV*
6956 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6957 {
6958  char *name_start = RExC_parse;
6959
6960  PERL_ARGS_ASSERT_REG_SCAN_NAME;
6961
6962  if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6963   /* skip IDFIRST by using do...while */
6964   if (UTF)
6965    do {
6966     RExC_parse += UTF8SKIP(RExC_parse);
6967    } while (isWORDCHAR_utf8((U8*)RExC_parse));
6968   else
6969    do {
6970     RExC_parse++;
6971    } while (isWORDCHAR(*RExC_parse));
6972  } else {
6973   RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6974   vFAIL("Group name must start with a non-digit word character");
6975  }
6976  if ( flags ) {
6977   SV* sv_name
6978    = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6979        SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6980   if ( flags == REG_RSN_RETURN_NAME)
6981    return sv_name;
6982   else if (flags==REG_RSN_RETURN_DATA) {
6983    HE *he_str = NULL;
6984    SV *sv_dat = NULL;
6985    if ( ! sv_name )      /* should not happen*/
6986     Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6987    if (RExC_paren_names)
6988     he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6989    if ( he_str )
6990     sv_dat = HeVAL(he_str);
6991    if ( ! sv_dat )
6992     vFAIL("Reference to nonexistent named group");
6993    return sv_dat;
6994   }
6995   else {
6996    Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6997      (unsigned long) flags);
6998   }
6999   assert(0); /* NOT REACHED */
7000  }
7001  return NULL;
7002 }
7003
7004 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
7005  int rem=(int)(RExC_end - RExC_parse);                       \
7006  int cut;                                                    \
7007  int num;                                                    \
7008  int iscut=0;                                                \
7009  if (rem>10) {                                               \
7010   rem=10;                                                 \
7011   iscut=1;                                                \
7012  }                                                           \
7013  cut=10-rem;                                                 \
7014  if (RExC_lastparse!=RExC_parse)                             \
7015   PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
7016    rem, RExC_parse,                                    \
7017    cut + 4,                                            \
7018    iscut ? "..." : "<"                                 \
7019   );                                                      \
7020  else                                                        \
7021   PerlIO_printf(Perl_debug_log,"%16s","");                \
7022                 \
7023  if (SIZE_ONLY)                                              \
7024  num = RExC_size + 1;                                     \
7025  else                                                        \
7026  num=REG_NODE_NUM(RExC_emit);                             \
7027  if (RExC_lastnum!=num)                                      \
7028  PerlIO_printf(Perl_debug_log,"|%4d",num);                \
7029  else                                                        \
7030  PerlIO_printf(Perl_debug_log,"|%4s","");                 \
7031  PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
7032   (int)((depth*2)), "",                                   \
7033   (funcname)                                              \
7034  );                                                          \
7035  RExC_lastnum=num;                                           \
7036  RExC_lastparse=RExC_parse;                                  \
7037 })
7038
7039
7040
7041 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
7042  DEBUG_PARSE_MSG((funcname));                            \
7043  PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7044 })
7045 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7046  DEBUG_PARSE_MSG((funcname));                            \
7047  PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7048 })
7049
7050 /* This section of code defines the inversion list object and its methods.  The
7051  * interfaces are highly subject to change, so as much as possible is static to
7052  * this file.  An inversion list is here implemented as a malloc'd C UV array
7053  * as an SVt_INVLIST scalar.
7054  *
7055  * An inversion list for Unicode is an array of code points, sorted by ordinal
7056  * number.  The zeroth element is the first code point in the list.  The 1th
7057  * element is the first element beyond that not in the list.  In other words,
7058  * the first range is
7059  *  invlist[0]..(invlist[1]-1)
7060  * The other ranges follow.  Thus every element whose index is divisible by two
7061  * marks the beginning of a range that is in the list, and every element not
7062  * divisible by two marks the beginning of a range not in the list.  A single
7063  * element inversion list that contains the single code point N generally
7064  * consists of two elements
7065  *  invlist[0] == N
7066  *  invlist[1] == N+1
7067  * (The exception is when N is the highest representable value on the
7068  * machine, in which case the list containing just it would be a single
7069  * element, itself.  By extension, if the last range in the list extends to
7070  * infinity, then the first element of that range will be in the inversion list
7071  * at a position that is divisible by two, and is the final element in the
7072  * list.)
7073  * Taking the complement (inverting) an inversion list is quite simple, if the
7074  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7075  * This implementation reserves an element at the beginning of each inversion
7076  * list to always contain 0; there is an additional flag in the header which
7077  * indicates if the list begins at the 0, or is offset to begin at the next
7078  * element.
7079  *
7080  * More about inversion lists can be found in "Unicode Demystified"
7081  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7082  * More will be coming when functionality is added later.
7083  *
7084  * The inversion list data structure is currently implemented as an SV pointing
7085  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7086  * array of UV whose memory management is automatically handled by the existing
7087  * facilities for SV's.
7088  *
7089  * Some of the methods should always be private to the implementation, and some
7090  * should eventually be made public */
7091
7092 /* The header definitions are in F<inline_invlist.c> */
7093
7094 PERL_STATIC_INLINE UV*
7095 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7096 {
7097  /* Returns a pointer to the first element in the inversion list's array.
7098  * This is called upon initialization of an inversion list.  Where the
7099  * array begins depends on whether the list has the code point U+0000 in it
7100  * or not.  The other parameter tells it whether the code that follows this
7101  * call is about to put a 0 in the inversion list or not.  The first
7102  * element is either the element reserved for 0, if TRUE, or the element
7103  * after it, if FALSE */
7104
7105  bool* offset = get_invlist_offset_addr(invlist);
7106  UV* zero_addr = (UV *) SvPVX(invlist);
7107
7108  PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7109
7110  /* Must be empty */
7111  assert(! _invlist_len(invlist));
7112
7113  *zero_addr = 0;
7114
7115  /* 1^1 = 0; 1^0 = 1 */
7116  *offset = 1 ^ will_have_0;
7117  return zero_addr + *offset;
7118 }
7119
7120 PERL_STATIC_INLINE UV*
7121 S_invlist_array(pTHX_ SV* const invlist)
7122 {
7123  /* Returns the pointer to the inversion list's array.  Every time the
7124  * length changes, this needs to be called in case malloc or realloc moved
7125  * it */
7126
7127  PERL_ARGS_ASSERT_INVLIST_ARRAY;
7128
7129  /* Must not be empty.  If these fail, you probably didn't check for <len>
7130  * being non-zero before trying to get the array */
7131  assert(_invlist_len(invlist));
7132
7133  /* The very first element always contains zero, The array begins either
7134  * there, or if the inversion list is offset, at the element after it.
7135  * The offset header field determines which; it contains 0 or 1 to indicate
7136  * how much additionally to add */
7137  assert(0 == *(SvPVX(invlist)));
7138  return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7139 }
7140
7141 PERL_STATIC_INLINE void
7142 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7143 {
7144  /* Sets the current number of elements stored in the inversion list.
7145  * Updates SvCUR correspondingly */
7146
7147  PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7148
7149  assert(SvTYPE(invlist) == SVt_INVLIST);
7150
7151  SvCUR_set(invlist,
7152    (len == 0)
7153    ? 0
7154    : TO_INTERNAL_SIZE(len + offset));
7155  assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7156 }
7157
7158 PERL_STATIC_INLINE IV*
7159 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7160 {
7161  /* Return the address of the IV that is reserved to hold the cached index
7162  * */
7163
7164  PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7165
7166  assert(SvTYPE(invlist) == SVt_INVLIST);
7167
7168  return &(((XINVLIST*) SvANY(invlist))->prev_index);
7169 }
7170
7171 PERL_STATIC_INLINE IV
7172 S_invlist_previous_index(pTHX_ SV* const invlist)
7173 {
7174  /* Returns cached index of previous search */
7175
7176  PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7177
7178  return *get_invlist_previous_index_addr(invlist);
7179 }
7180
7181 PERL_STATIC_INLINE void
7182 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7183 {
7184  /* Caches <index> for later retrieval */
7185
7186  PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7187
7188  assert(index == 0 || index < (int) _invlist_len(invlist));
7189
7190  *get_invlist_previous_index_addr(invlist) = index;
7191 }
7192
7193 PERL_STATIC_INLINE UV
7194 S_invlist_max(pTHX_ SV* const invlist)
7195 {
7196  /* Returns the maximum number of elements storable in the inversion list's
7197  * array, without having to realloc() */
7198
7199  PERL_ARGS_ASSERT_INVLIST_MAX;
7200
7201  assert(SvTYPE(invlist) == SVt_INVLIST);
7202
7203  /* Assumes worst case, in which the 0 element is not counted in the
7204  * inversion list, so subtracts 1 for that */
7205  return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7206   ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7207   : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7208 }
7209
7210 #ifndef PERL_IN_XSUB_RE
7211 SV*
7212 Perl__new_invlist(pTHX_ IV initial_size)
7213 {
7214
7215  /* Return a pointer to a newly constructed inversion list, with enough
7216  * space to store 'initial_size' elements.  If that number is negative, a
7217  * system default is used instead */
7218
7219  SV* new_list;
7220
7221  if (initial_size < 0) {
7222   initial_size = 10;
7223  }
7224
7225  /* Allocate the initial space */
7226  new_list = newSV_type(SVt_INVLIST);
7227
7228  /* First 1 is in case the zero element isn't in the list; second 1 is for
7229  * trailing NUL */
7230  SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7231  invlist_set_len(new_list, 0, 0);
7232
7233  /* Force iterinit() to be used to get iteration to work */
7234  *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7235
7236  *get_invlist_previous_index_addr(new_list) = 0;
7237
7238  return new_list;
7239 }
7240 #endif
7241
7242 STATIC SV*
7243 S__new_invlist_C_array(pTHX_ const UV* const list)
7244 {
7245  /* Return a pointer to a newly constructed inversion list, initialized to
7246  * point to <list>, which has to be in the exact correct inversion list
7247  * form, including internal fields.  Thus this is a dangerous routine that
7248  * should not be used in the wrong hands.  The passed in 'list' contains
7249  * several header fields at the beginning that are not part of the
7250  * inversion list body proper */
7251
7252  const STRLEN length = (STRLEN) list[0];
7253  const UV version_id =          list[1];
7254  const bool offset   =    cBOOL(list[2]);
7255 #define HEADER_LENGTH 3
7256  /* If any of the above changes in any way, you must change HEADER_LENGTH
7257  * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7258  *      perl -E 'say int(rand 2**31-1)'
7259  */
7260 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7261           data structure type, so that one being
7262           passed in can be validated to be an
7263           inversion list of the correct vintage.
7264          */
7265
7266  SV* invlist = newSV_type(SVt_INVLIST);
7267
7268  PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7269
7270  if (version_id != INVLIST_VERSION_ID) {
7271   Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7272  }
7273
7274  /* The generated array passed in includes header elements that aren't part
7275  * of the list proper, so start it just after them */
7276  SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7277
7278  SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7279        shouldn't touch it */
7280
7281  *(get_invlist_offset_addr(invlist)) = offset;
7282
7283  /* The 'length' passed to us is the physical number of elements in the
7284  * inversion list.  But if there is an offset the logical number is one
7285  * less than that */
7286  invlist_set_len(invlist, length  - offset, offset);
7287
7288  invlist_set_previous_index(invlist, 0);
7289
7290  /* Initialize the iteration pointer. */
7291  invlist_iterfinish(invlist);
7292
7293  return invlist;
7294 }
7295
7296 STATIC void
7297 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7298 {
7299  /* Grow the maximum size of an inversion list */
7300
7301  PERL_ARGS_ASSERT_INVLIST_EXTEND;
7302
7303  assert(SvTYPE(invlist) == SVt_INVLIST);
7304
7305  /* Add one to account for the zero element at the beginning which may not
7306  * be counted by the calling parameters */
7307  SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7308 }
7309
7310 PERL_STATIC_INLINE void
7311 S_invlist_trim(pTHX_ SV* const invlist)
7312 {
7313  PERL_ARGS_ASSERT_INVLIST_TRIM;
7314
7315  assert(SvTYPE(invlist) == SVt_INVLIST);
7316
7317  /* Change the length of the inversion list to how many entries it currently
7318  * has */
7319  SvPV_shrink_to_cur((SV *) invlist);
7320 }
7321
7322 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7323
7324 STATIC void
7325 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7326 {
7327    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7328  * the end of the inversion list.  The range must be above any existing
7329  * ones. */
7330
7331  UV* array;
7332  UV max = invlist_max(invlist);
7333  UV len = _invlist_len(invlist);
7334  bool offset;
7335
7336  PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7337
7338  if (len == 0) { /* Empty lists must be initialized */
7339   offset = start != 0;
7340   array = _invlist_array_init(invlist, ! offset);
7341  }
7342  else {
7343   /* Here, the existing list is non-empty. The current max entry in the
7344   * list is generally the first value not in the set, except when the
7345   * set extends to the end of permissible values, in which case it is
7346   * the first entry in that final set, and so this call is an attempt to
7347   * append out-of-order */
7348
7349   UV final_element = len - 1;
7350   array = invlist_array(invlist);
7351   if (array[final_element] > start
7352    || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7353   {
7354    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7355      array[final_element], start,
7356      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7357   }
7358
7359   /* Here, it is a legal append.  If the new range begins with the first
7360   * value not in the set, it is extending the set, so the new first
7361   * value not in the set is one greater than the newly extended range.
7362   * */
7363   offset = *get_invlist_offset_addr(invlist);
7364   if (array[final_element] == start) {
7365    if (end != UV_MAX) {
7366     array[final_element] = end + 1;
7367    }
7368    else {
7369     /* But if the end is the maximum representable on the machine,
7370     * just let the range that this would extend to have no end */
7371     invlist_set_len(invlist, len - 1, offset);
7372    }
7373    return;
7374   }
7375  }
7376
7377  /* Here the new range doesn't extend any existing set.  Add it */
7378
7379  len += 2; /* Includes an element each for the start and end of range */
7380
7381  /* If wll overflow the existing space, extend, which may cause the array to
7382  * be moved */
7383  if (max < len) {
7384   invlist_extend(invlist, len);
7385
7386   /* Have to set len here to avoid assert failure in invlist_array() */
7387   invlist_set_len(invlist, len, offset);
7388
7389   array = invlist_array(invlist);
7390  }
7391  else {
7392   invlist_set_len(invlist, len, offset);
7393  }
7394
7395  /* The next item on the list starts the range, the one after that is
7396  * one past the new range.  */
7397  array[len - 2] = start;
7398  if (end != UV_MAX) {
7399   array[len - 1] = end + 1;
7400  }
7401  else {
7402   /* But if the end is the maximum representable on the machine, just let
7403   * the range have no end */
7404   invlist_set_len(invlist, len - 1, offset);
7405  }
7406 }
7407
7408 #ifndef PERL_IN_XSUB_RE
7409
7410 IV
7411 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7412 {
7413  /* Searches the inversion list for the entry that contains the input code
7414  * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7415  * return value is the index into the list's array of the range that
7416  * contains <cp> */
7417
7418  IV low = 0;
7419  IV mid;
7420  IV high = _invlist_len(invlist);
7421  const IV highest_element = high - 1;
7422  const UV* array;
7423
7424  PERL_ARGS_ASSERT__INVLIST_SEARCH;
7425
7426  /* If list is empty, return failure. */
7427  if (high == 0) {
7428   return -1;
7429  }
7430
7431  /* (We can't get the array unless we know the list is non-empty) */
7432  array = invlist_array(invlist);
7433
7434  mid = invlist_previous_index(invlist);
7435  assert(mid >=0 && mid <= highest_element);
7436
7437  /* <mid> contains the cache of the result of the previous call to this
7438  * function (0 the first time).  See if this call is for the same result,
7439  * or if it is for mid-1.  This is under the theory that calls to this
7440  * function will often be for related code points that are near each other.
7441  * And benchmarks show that caching gives better results.  We also test
7442  * here if the code point is within the bounds of the list.  These tests
7443  * replace others that would have had to be made anyway to make sure that
7444  * the array bounds were not exceeded, and these give us extra information
7445  * at the same time */
7446  if (cp >= array[mid]) {
7447   if (cp >= array[highest_element]) {
7448    return highest_element;
7449   }
7450
7451   /* Here, array[mid] <= cp < array[highest_element].  This means that
7452   * the final element is not the answer, so can exclude it; it also
7453   * means that <mid> is not the final element, so can refer to 'mid + 1'
7454   * safely */
7455   if (cp < array[mid + 1]) {
7456    return mid;
7457   }
7458   high--;
7459   low = mid + 1;
7460  }
7461  else { /* cp < aray[mid] */
7462   if (cp < array[0]) { /* Fail if outside the array */
7463    return -1;
7464   }
7465   high = mid;
7466   if (cp >= array[mid - 1]) {
7467    goto found_entry;
7468   }
7469  }
7470
7471  /* Binary search.  What we are looking for is <i> such that
7472  * array[i] <= cp < array[i+1]
7473  * The loop below converges on the i+1.  Note that there may not be an
7474  * (i+1)th element in the array, and things work nonetheless */
7475  while (low < high) {
7476   mid = (low + high) / 2;
7477   assert(mid <= highest_element);
7478   if (array[mid] <= cp) { /* cp >= array[mid] */
7479    low = mid + 1;
7480
7481    /* We could do this extra test to exit the loop early.
7482    if (cp < array[low]) {
7483     return mid;
7484    }
7485    */
7486   }
7487   else { /* cp < array[mid] */
7488    high = mid;
7489   }
7490  }
7491
7492   found_entry:
7493  high--;
7494  invlist_set_previous_index(invlist, high);
7495  return high;
7496 }
7497
7498 void
7499 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7500 {
7501  /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7502  * but is used when the swash has an inversion list.  This makes this much
7503  * faster, as it uses a binary search instead of a linear one.  This is
7504  * intimately tied to that function, and perhaps should be in utf8.c,
7505  * except it is intimately tied to inversion lists as well.  It assumes
7506  * that <swatch> is all 0's on input */
7507
7508  UV current = start;
7509  const IV len = _invlist_len(invlist);
7510  IV i;
7511  const UV * array;
7512
7513  PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7514
7515  if (len == 0) { /* Empty inversion list */
7516   return;
7517  }
7518
7519  array = invlist_array(invlist);
7520
7521  /* Find which element it is */
7522  i = _invlist_search(invlist, start);
7523
7524  /* We populate from <start> to <end> */
7525  while (current < end) {
7526   UV upper;
7527
7528   /* The inversion list gives the results for every possible code point
7529   * after the first one in the list.  Only those ranges whose index is
7530   * even are ones that the inversion list matches.  For the odd ones,
7531   * and if the initial code point is not in the list, we have to skip
7532   * forward to the next element */
7533   if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7534    i++;
7535    if (i >= len) { /* Finished if beyond the end of the array */
7536     return;
7537    }
7538    current = array[i];
7539    if (current >= end) {   /* Finished if beyond the end of what we
7540          are populating */
7541     if (LIKELY(end < UV_MAX)) {
7542      return;
7543     }
7544
7545     /* We get here when the upper bound is the maximum
7546     * representable on the machine, and we are looking for just
7547     * that code point.  Have to special case it */
7548     i = len;
7549     goto join_end_of_list;
7550    }
7551   }
7552   assert(current >= start);
7553
7554   /* The current range ends one below the next one, except don't go past
7555   * <end> */
7556   i++;
7557   upper = (i < len && array[i] < end) ? array[i] : end;
7558
7559   /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7560   * for each code point in it */
7561   for (; current < upper; current++) {
7562    const STRLEN offset = (STRLEN)(current - start);
7563    swatch[offset >> 3] |= 1 << (offset & 7);
7564   }
7565
7566  join_end_of_list:
7567
7568   /* Quit if at the end of the list */
7569   if (i >= len) {
7570
7571    /* But first, have to deal with the highest possible code point on
7572    * the platform.  The previous code assumes that <end> is one
7573    * beyond where we want to populate, but that is impossible at the
7574    * platform's infinity, so have to handle it specially */
7575    if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7576    {
7577     const STRLEN offset = (STRLEN)(end - start);
7578     swatch[offset >> 3] |= 1 << (offset & 7);
7579    }
7580    return;
7581   }
7582
7583   /* Advance to the next range, which will be for code points not in the
7584   * inversion list */
7585   current = array[i];
7586  }
7587
7588  return;
7589 }
7590
7591 void
7592 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output)
7593 {
7594  /* Take the union of two inversion lists and point <output> to it.  *output
7595  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7596  * the reference count to that list will be decremented.  The first list,
7597  * <a>, may be NULL, in which case a copy of the second list is returned.
7598  * If <complement_b> is TRUE, the union is taken of the complement
7599  * (inversion) of <b> instead of b itself.
7600  *
7601  * The basis for this comes from "Unicode Demystified" Chapter 13 by
7602  * Richard Gillam, published by Addison-Wesley, and explained at some
7603  * length there.  The preface says to incorporate its examples into your
7604  * code at your own risk.
7605  *
7606  * The algorithm is like a merge sort.
7607  *
7608  * XXX A potential performance improvement is to keep track as we go along
7609  * if only one of the inputs contributes to the result, meaning the other
7610  * is a subset of that one.  In that case, we can skip the final copy and
7611  * return the larger of the input lists, but then outside code might need
7612  * to keep track of whether to free the input list or not */
7613
7614  const UV* array_a;    /* a's array */
7615  const UV* array_b;
7616  UV len_a;     /* length of a's array */
7617  UV len_b;
7618
7619  SV* u;   /* the resulting union */
7620  UV* array_u;
7621  UV len_u;
7622
7623  UV i_a = 0;      /* current index into a's array */
7624  UV i_b = 0;
7625  UV i_u = 0;
7626
7627  /* running count, as explained in the algorithm source book; items are
7628  * stopped accumulating and are output when the count changes to/from 0.
7629  * The count is incremented when we start a range that's in the set, and
7630  * decremented when we start a range that's not in the set.  So its range
7631  * is 0 to 2.  Only when the count is zero is something not in the set.
7632  */
7633  UV count = 0;
7634
7635  PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7636  assert(a != b);
7637
7638  /* If either one is empty, the union is the other one */
7639  if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7640   if (*output == a) {
7641    if (a != NULL) {
7642     SvREFCNT_dec_NN(a);
7643    }
7644   }
7645   if (*output != b) {
7646    *output = invlist_clone(b);
7647    if (complement_b) {
7648     _invlist_invert(*output);
7649    }
7650   } /* else *output already = b; */
7651   return;
7652  }
7653  else if ((len_b = _invlist_len(b)) == 0) {
7654   if (*output == b) {
7655    SvREFCNT_dec_NN(b);
7656   }
7657
7658   /* The complement of an empty list is a list that has everything in it,
7659   * so the union with <a> includes everything too */
7660   if (complement_b) {
7661    if (a == *output) {
7662     SvREFCNT_dec_NN(a);
7663    }
7664    *output = _new_invlist(1);
7665    _append_range_to_invlist(*output, 0, UV_MAX);
7666   }
7667   else if (*output != a) {
7668    *output = invlist_clone(a);
7669   }
7670   /* else *output already = a; */
7671   return;
7672  }
7673
7674  /* Here both lists exist and are non-empty */
7675  array_a = invlist_array(a);
7676  array_b = invlist_array(b);
7677
7678  /* If are to take the union of 'a' with the complement of b, set it
7679  * up so are looking at b's complement. */
7680  if (complement_b) {
7681
7682   /* To complement, we invert: if the first element is 0, remove it.  To
7683   * do this, we just pretend the array starts one later */
7684   if (array_b[0] == 0) {
7685    array_b++;
7686    len_b--;
7687   }
7688   else {
7689
7690    /* But if the first element is not zero, we pretend the list starts
7691    * at the 0 that is always stored immediately before the array. */
7692    array_b--;
7693    len_b++;
7694   }
7695  }
7696
7697  /* Size the union for the worst case: that the sets are completely
7698  * disjoint */
7699  u = _new_invlist(len_a + len_b);
7700
7701  /* Will contain U+0000 if either component does */
7702  array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7703          || (len_b > 0 && array_b[0] == 0));
7704
7705  /* Go through each list item by item, stopping when exhausted one of
7706  * them */
7707  while (i_a < len_a && i_b < len_b) {
7708   UV cp;     /* The element to potentially add to the union's array */
7709   bool cp_in_set;   /* is it in the the input list's set or not */
7710
7711   /* We need to take one or the other of the two inputs for the union.
7712   * Since we are merging two sorted lists, we take the smaller of the
7713   * next items.  In case of a tie, we take the one that is in its set
7714   * first.  If we took one not in the set first, it would decrement the
7715   * count, possibly to 0 which would cause it to be output as ending the
7716   * range, and the next time through we would take the same number, and
7717   * output it again as beginning the next range.  By doing it the
7718   * opposite way, there is no possibility that the count will be
7719   * momentarily decremented to 0, and thus the two adjoining ranges will
7720   * be seamlessly merged.  (In a tie and both are in the set or both not
7721   * in the set, it doesn't matter which we take first.) */
7722   if (array_a[i_a] < array_b[i_b]
7723    || (array_a[i_a] == array_b[i_b]
7724     && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7725   {
7726    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7727    cp= array_a[i_a++];
7728   }
7729   else {
7730    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7731    cp = array_b[i_b++];
7732   }
7733
7734   /* Here, have chosen which of the two inputs to look at.  Only output
7735   * if the running count changes to/from 0, which marks the
7736   * beginning/end of a range in that's in the set */
7737   if (cp_in_set) {
7738    if (count == 0) {
7739     array_u[i_u++] = cp;
7740    }
7741    count++;
7742   }
7743   else {
7744    count--;
7745    if (count == 0) {
7746     array_u[i_u++] = cp;
7747    }
7748   }
7749  }
7750
7751  /* Here, we are finished going through at least one of the lists, which
7752  * means there is something remaining in at most one.  We check if the list
7753  * that hasn't been exhausted is positioned such that we are in the middle
7754  * of a range in its set or not.  (i_a and i_b point to the element beyond
7755  * the one we care about.) If in the set, we decrement 'count'; if 0, there
7756  * is potentially more to output.
7757  * There are four cases:
7758  * 1) Both weren't in their sets, count is 0, and remains 0.  What's left
7759  *    in the union is entirely from the non-exhausted set.
7760  * 2) Both were in their sets, count is 2.  Nothing further should
7761  *    be output, as everything that remains will be in the exhausted
7762  *    list's set, hence in the union; decrementing to 1 but not 0 insures
7763  *    that
7764  * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7765  *    Nothing further should be output because the union includes
7766  *    everything from the exhausted set.  Not decrementing ensures that.
7767  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7768  *    decrementing to 0 insures that we look at the remainder of the
7769  *    non-exhausted set */
7770  if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7771   || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7772  {
7773   count--;
7774  }
7775
7776  /* The final length is what we've output so far, plus what else is about to
7777  * be output.  (If 'count' is non-zero, then the input list we exhausted
7778  * has everything remaining up to the machine's limit in its set, and hence
7779  * in the union, so there will be no further output. */
7780  len_u = i_u;
7781  if (count == 0) {
7782   /* At most one of the subexpressions will be non-zero */
7783   len_u += (len_a - i_a) + (len_b - i_b);
7784  }
7785
7786  /* Set result to final length, which can change the pointer to array_u, so
7787  * re-find it */
7788  if (len_u != _invlist_len(u)) {
7789   invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
7790   invlist_trim(u);
7791   array_u = invlist_array(u);
7792  }
7793
7794  /* When 'count' is 0, the list that was exhausted (if one was shorter than
7795  * the other) ended with everything above it not in its set.  That means
7796  * that the remaining part of the union is precisely the same as the
7797  * non-exhausted list, so can just copy it unchanged.  (If both list were
7798  * exhausted at the same time, then the operations below will be both 0.)
7799  */
7800  if (count == 0) {
7801   IV copy_count; /* At most one will have a non-zero copy count */
7802   if ((copy_count = len_a - i_a) > 0) {
7803    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7804   }
7805   else if ((copy_count = len_b - i_b) > 0) {
7806    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7807   }
7808  }
7809
7810  /*  We may be removing a reference to one of the inputs */
7811  if (a == *output || b == *output) {
7812   assert(! invlist_is_iterating(*output));
7813   SvREFCNT_dec_NN(*output);
7814  }
7815
7816  *output = u;
7817  return;
7818 }
7819
7820 void
7821 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i)
7822 {
7823  /* Take the intersection of two inversion lists and point <i> to it.  *i
7824  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7825  * the reference count to that list will be decremented.
7826  * If <complement_b> is TRUE, the result will be the intersection of <a>
7827  * and the complement (or inversion) of <b> instead of <b> directly.
7828  *
7829  * The basis for this comes from "Unicode Demystified" Chapter 13 by
7830  * Richard Gillam, published by Addison-Wesley, and explained at some
7831  * length there.  The preface says to incorporate its examples into your
7832  * code at your own risk.  In fact, it had bugs
7833  *
7834  * The algorithm is like a merge sort, and is essentially the same as the
7835  * union above
7836  */
7837
7838  const UV* array_a;  /* a's array */
7839  const UV* array_b;
7840  UV len_a; /* length of a's array */
7841  UV len_b;
7842
7843  SV* r;       /* the resulting intersection */
7844  UV* array_r;
7845  UV len_r;
7846
7847  UV i_a = 0;      /* current index into a's array */
7848  UV i_b = 0;
7849  UV i_r = 0;
7850
7851  /* running count, as explained in the algorithm source book; items are
7852  * stopped accumulating and are output when the count changes to/from 2.
7853  * The count is incremented when we start a range that's in the set, and
7854  * decremented when we start a range that's not in the set.  So its range
7855  * is 0 to 2.  Only when the count is 2 is something in the intersection.
7856  */
7857  UV count = 0;
7858
7859  PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7860  assert(a != b);
7861
7862  /* Special case if either one is empty */
7863  len_a = (a == NULL) ? 0 : _invlist_len(a);
7864  if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7865
7866   if (len_a != 0 && complement_b) {
7867
7868    /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7869    * be empty.  Here, also we are using 'b's complement, which hence
7870    * must be every possible code point.  Thus the intersection is
7871    * simply 'a'. */
7872    if (*i != a) {
7873     if (*i == b) {
7874      SvREFCNT_dec_NN(b);
7875     }
7876
7877     *i = invlist_clone(a);
7878    }
7879    /* else *i is already 'a' */
7880    return;
7881   }
7882
7883   /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7884   * intersection must be empty */
7885   if (*i == a) {
7886    SvREFCNT_dec_NN(a);
7887   }
7888   else if (*i == b) {
7889    SvREFCNT_dec_NN(b);
7890   }
7891   *i = _new_invlist(0);
7892   return;
7893  }
7894
7895  /* Here both lists exist and are non-empty */
7896  array_a = invlist_array(a);
7897  array_b = invlist_array(b);
7898
7899  /* If are to take the intersection of 'a' with the complement of b, set it
7900  * up so are looking at b's complement. */
7901  if (complement_b) {
7902
7903   /* To complement, we invert: if the first element is 0, remove it.  To
7904   * do this, we just pretend the array starts one later */
7905   if (array_b[0] == 0) {
7906    array_b++;
7907    len_b--;
7908   }
7909   else {
7910
7911    /* But if the first element is not zero, we pretend the list starts
7912    * at the 0 that is always stored immediately before the array. */
7913    array_b--;
7914    len_b++;
7915   }
7916  }
7917
7918  /* Size the intersection for the worst case: that the intersection ends up
7919  * fragmenting everything to be completely disjoint */
7920  r= _new_invlist(len_a + len_b);
7921
7922  /* Will contain U+0000 iff both components do */
7923  array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7924          && len_b > 0 && array_b[0] == 0);
7925
7926  /* Go through each list item by item, stopping when exhausted one of
7927  * them */
7928  while (i_a < len_a && i_b < len_b) {
7929   UV cp;     /* The element to potentially add to the intersection's
7930      array */
7931   bool cp_in_set; /* Is it in the input list's set or not */
7932
7933   /* We need to take one or the other of the two inputs for the
7934   * intersection.  Since we are merging two sorted lists, we take the
7935   * smaller of the next items.  In case of a tie, we take the one that
7936   * is not in its set first (a difference from the union algorithm).  If
7937   * we took one in the set first, it would increment the count, possibly
7938   * to 2 which would cause it to be output as starting a range in the
7939   * intersection, and the next time through we would take that same
7940   * number, and output it again as ending the set.  By doing it the
7941   * opposite of this, there is no possibility that the count will be
7942   * momentarily incremented to 2.  (In a tie and both are in the set or
7943   * both not in the set, it doesn't matter which we take first.) */
7944   if (array_a[i_a] < array_b[i_b]
7945    || (array_a[i_a] == array_b[i_b]
7946     && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7947   {
7948    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7949    cp= array_a[i_a++];
7950   }
7951   else {
7952    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7953    cp= array_b[i_b++];
7954   }
7955
7956   /* Here, have chosen which of the two inputs to look at.  Only output
7957   * if the running count changes to/from 2, which marks the
7958   * beginning/end of a range that's in the intersection */
7959   if (cp_in_set) {
7960    count++;
7961    if (count == 2) {
7962     array_r[i_r++] = cp;
7963    }
7964   }
7965   else {
7966    if (count == 2) {
7967     array_r[i_r++] = cp;
7968    }
7969    count--;
7970   }
7971  }
7972
7973  /* Here, we are finished going through at least one of the lists, which
7974  * means there is something remaining in at most one.  We check if the list
7975  * that has been exhausted is positioned such that we are in the middle
7976  * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7977  * the ones we care about.)  There are four cases:
7978  * 1) Both weren't in their sets, count is 0, and remains 0.  There's
7979  *    nothing left in the intersection.
7980  * 2) Both were in their sets, count is 2 and perhaps is incremented to
7981  *    above 2.  What should be output is exactly that which is in the
7982  *    non-exhausted set, as everything it has is also in the intersection
7983  *    set, and everything it doesn't have can't be in the intersection
7984  * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7985  *    gets incremented to 2.  Like the previous case, the intersection is
7986  *    everything that remains in the non-exhausted set.
7987  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7988  *    remains 1.  And the intersection has nothing more. */
7989  if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7990   || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7991  {
7992   count++;
7993  }
7994
7995  /* The final length is what we've output so far plus what else is in the
7996  * intersection.  At most one of the subexpressions below will be non-zero */
7997  len_r = i_r;
7998  if (count >= 2) {
7999   len_r += (len_a - i_a) + (len_b - i_b);
8000  }
8001
8002  /* Set result to final length, which can change the pointer to array_r, so
8003  * re-find it */
8004  if (len_r != _invlist_len(r)) {
8005   invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8006   invlist_trim(r);
8007   array_r = invlist_array(r);
8008  }
8009
8010  /* Finish outputting any remaining */
8011  if (count >= 2) { /* At most one will have a non-zero copy count */
8012   IV copy_count;
8013   if ((copy_count = len_a - i_a) > 0) {
8014    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8015   }
8016   else if ((copy_count = len_b - i_b) > 0) {
8017    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8018   }
8019  }
8020
8021  /*  We may be removing a reference to one of the inputs */
8022  if (a == *i || b == *i) {
8023   assert(! invlist_is_iterating(*i));
8024   SvREFCNT_dec_NN(*i);
8025  }
8026
8027  *i = r;
8028  return;
8029 }
8030
8031 SV*
8032 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8033 {
8034  /* Add the range from 'start' to 'end' inclusive to the inversion list's
8035  * set.  A pointer to the inversion list is returned.  This may actually be
8036  * a new list, in which case the passed in one has been destroyed.  The
8037  * passed in inversion list can be NULL, in which case a new one is created
8038  * with just the one range in it */
8039
8040  SV* range_invlist;
8041  UV len;
8042
8043  if (invlist == NULL) {
8044   invlist = _new_invlist(2);
8045   len = 0;
8046  }
8047  else {
8048   len = _invlist_len(invlist);
8049  }
8050
8051  /* If comes after the final entry actually in the list, can just append it
8052  * to the end, */
8053  if (len == 0
8054   || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8055    && start >= invlist_array(invlist)[len - 1]))
8056  {
8057   _append_range_to_invlist(invlist, start, end);
8058   return invlist;
8059  }
8060
8061  /* Here, can't just append things, create and return a new inversion list
8062  * which is the union of this range and the existing inversion list */
8063  range_invlist = _new_invlist(2);
8064  _append_range_to_invlist(range_invlist, start, end);
8065
8066  _invlist_union(invlist, range_invlist, &invlist);
8067
8068  /* The temporary can be freed */
8069  SvREFCNT_dec_NN(range_invlist);
8070
8071  return invlist;
8072 }
8073
8074 #endif
8075
8076 PERL_STATIC_INLINE SV*
8077 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8078  return _add_range_to_invlist(invlist, cp, cp);
8079 }
8080
8081 #ifndef PERL_IN_XSUB_RE
8082 void
8083 Perl__invlist_invert(pTHX_ SV* const invlist)
8084 {
8085  /* Complement the input inversion list.  This adds a 0 if the list didn't
8086  * have a zero; removes it otherwise.  As described above, the data
8087  * structure is set up so that this is very efficient */
8088
8089  PERL_ARGS_ASSERT__INVLIST_INVERT;
8090
8091  assert(! invlist_is_iterating(invlist));
8092
8093  /* The inverse of matching nothing is matching everything */
8094  if (_invlist_len(invlist) == 0) {
8095   _append_range_to_invlist(invlist, 0, UV_MAX);
8096   return;
8097  }
8098
8099  *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8100 }
8101
8102 void
8103 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8104 {
8105  /* Complement the input inversion list (which must be a Unicode property,
8106  * all of which don't match above the Unicode maximum code point.)  And
8107  * Perl has chosen to not have the inversion match above that either.  This
8108  * adds a 0x110000 if the list didn't end with it, and removes it if it did
8109  */
8110
8111  UV len;
8112  UV* array;
8113
8114  PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8115
8116  _invlist_invert(invlist);
8117
8118  len = _invlist_len(invlist);
8119
8120  if (len != 0) { /* If empty do nothing */
8121   array = invlist_array(invlist);
8122   if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8123    /* Add 0x110000.  First, grow if necessary */
8124    len++;
8125    if (invlist_max(invlist) < len) {
8126     invlist_extend(invlist, len);
8127     array = invlist_array(invlist);
8128    }
8129    invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist));
8130    array[len - 1] = PERL_UNICODE_MAX + 1;
8131   }
8132   else {  /* Remove the 0x110000 */
8133    invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist));
8134   }
8135  }
8136
8137  return;
8138 }
8139 #endif
8140
8141 PERL_STATIC_INLINE SV*
8142 S_invlist_clone(pTHX_ SV* const invlist)
8143 {
8144
8145  /* Return a new inversion list that is a copy of the input one, which is
8146  * unchanged */
8147
8148  /* Need to allocate extra space to accommodate Perl's addition of a
8149  * trailing NUL to SvPV's, since it thinks they are always strings */
8150  SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8151  STRLEN physical_length = SvCUR(invlist);
8152  bool offset = *(get_invlist_offset_addr(invlist));
8153
8154  PERL_ARGS_ASSERT_INVLIST_CLONE;
8155
8156  *(get_invlist_offset_addr(new_invlist)) = offset;
8157  invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8158  Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8159
8160  return new_invlist;
8161 }
8162
8163 PERL_STATIC_INLINE STRLEN*
8164 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8165 {
8166  /* Return the address of the UV that contains the current iteration
8167  * position */
8168
8169  PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8170
8171  assert(SvTYPE(invlist) == SVt_INVLIST);
8172
8173  return &(((XINVLIST*) SvANY(invlist))->iterator);
8174 }
8175
8176 PERL_STATIC_INLINE void
8177 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8178 {
8179  PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8180
8181  *get_invlist_iter_addr(invlist) = 0;
8182 }
8183
8184 PERL_STATIC_INLINE void
8185 S_invlist_iterfinish(pTHX_ SV* invlist)
8186 {
8187  /* Terminate iterator for invlist.  This is to catch development errors.
8188  * Any iteration that is interrupted before completed should call this
8189  * function.  Functions that add code points anywhere else but to the end
8190  * of an inversion list assert that they are not in the middle of an
8191  * iteration.  If they were, the addition would make the iteration
8192  * problematical: if the iteration hadn't reached the place where things
8193  * were being added, it would be ok */
8194
8195  PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8196
8197  *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8198 }
8199
8200 STATIC bool
8201 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8202 {
8203  /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8204  * This call sets in <*start> and <*end>, the next range in <invlist>.
8205  * Returns <TRUE> if successful and the next call will return the next
8206  * range; <FALSE> if was already at the end of the list.  If the latter,
8207  * <*start> and <*end> are unchanged, and the next call to this function
8208  * will start over at the beginning of the list */
8209
8210  STRLEN* pos = get_invlist_iter_addr(invlist);
8211  UV len = _invlist_len(invlist);
8212  UV *array;
8213
8214  PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8215
8216  if (*pos >= len) {
8217   *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8218   return FALSE;
8219  }
8220
8221  array = invlist_array(invlist);
8222
8223  *start = array[(*pos)++];
8224
8225  if (*pos >= len) {
8226   *end = UV_MAX;
8227  }
8228  else {
8229   *end = array[(*pos)++] - 1;
8230  }
8231
8232  return TRUE;
8233 }
8234
8235 PERL_STATIC_INLINE bool
8236 S_invlist_is_iterating(pTHX_ SV* const invlist)
8237 {
8238  PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8239
8240  return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8241 }
8242
8243 PERL_STATIC_INLINE UV
8244 S_invlist_highest(pTHX_ SV* const invlist)
8245 {
8246  /* Returns the highest code point that matches an inversion list.  This API
8247  * has an ambiguity, as it returns 0 under either the highest is actually
8248  * 0, or if the list is empty.  If this distinction matters to you, check
8249  * for emptiness before calling this function */
8250
8251  UV len = _invlist_len(invlist);
8252  UV *array;
8253
8254  PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8255
8256  if (len == 0) {
8257   return 0;
8258  }
8259
8260  array = invlist_array(invlist);
8261
8262  /* The last element in the array in the inversion list always starts a
8263  * range that goes to infinity.  That range may be for code points that are
8264  * matched in the inversion list, or it may be for ones that aren't
8265  * matched.  In the latter case, the highest code point in the set is one
8266  * less than the beginning of this range; otherwise it is the final element
8267  * of this range: infinity */
8268  return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8269   ? UV_MAX
8270   : array[len - 1] - 1;
8271 }
8272
8273 #ifndef PERL_IN_XSUB_RE
8274 SV *
8275 Perl__invlist_contents(pTHX_ SV* const invlist)
8276 {
8277  /* Get the contents of an inversion list into a string SV so that they can
8278  * be printed out.  It uses the format traditionally done for debug tracing
8279  */
8280
8281  UV start, end;
8282  SV* output = newSVpvs("\n");
8283
8284  PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8285
8286  assert(! invlist_is_iterating(invlist));
8287
8288  invlist_iterinit(invlist);
8289  while (invlist_iternext(invlist, &start, &end)) {
8290   if (end == UV_MAX) {
8291    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8292   }
8293   else if (end != start) {
8294    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8295      start,       end);
8296   }
8297   else {
8298    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8299   }
8300  }
8301
8302  return output;
8303 }
8304 #endif
8305
8306 #ifndef PERL_IN_XSUB_RE
8307 void
8308 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
8309 {
8310  /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
8311  * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
8312  * the string 'indent'.  The output looks like this:
8313   [0] 0x000A .. 0x000D
8314   [2] 0x0085
8315   [4] 0x2028 .. 0x2029
8316   [6] 0x3104 .. INFINITY
8317  * This means that the first range of code points matched by the list are
8318  * 0xA through 0xD; the second range contains only the single code point
8319  * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
8320  * are used to define each range (except if the final range extends to
8321  * infinity, only a single element is needed).  The array index of the
8322  * first element for the corresponding range is given in brackets. */
8323
8324  UV start, end;
8325  STRLEN count = 0;
8326
8327  PERL_ARGS_ASSERT__INVLIST_DUMP;
8328
8329  if (invlist_is_iterating(invlist)) {
8330   Perl_dump_indent(aTHX_ level, file,
8331    "%sCan't dump inversion list because is in middle of iterating\n",
8332    indent);
8333   return;
8334  }
8335
8336  invlist_iterinit(invlist);
8337  while (invlist_iternext(invlist, &start, &end)) {
8338   if (end == UV_MAX) {
8339    Perl_dump_indent(aTHX_ level, file,
8340          "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
8341         indent, (UV)count, start);
8342   }
8343   else if (end != start) {
8344    Perl_dump_indent(aTHX_ level, file,
8345          "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
8346         indent, (UV)count, start,         end);
8347   }
8348   else {
8349    Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
8350            indent, (UV)count, start);
8351   }
8352   count += 2;
8353  }
8354 }
8355 #endif
8356
8357 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
8358 bool
8359 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
8360 {
8361  /* Return a boolean as to if the two passed in inversion lists are
8362  * identical.  The final argument, if TRUE, says to take the complement of
8363  * the second inversion list before doing the comparison */
8364
8365  const UV* array_a = invlist_array(a);
8366  const UV* array_b = invlist_array(b);
8367  UV len_a = _invlist_len(a);
8368  UV len_b = _invlist_len(b);
8369
8370  UV i = 0;      /* current index into the arrays */
8371  bool retval = TRUE;     /* Assume are identical until proven otherwise */
8372
8373  PERL_ARGS_ASSERT__INVLISTEQ;
8374
8375  /* If are to compare 'a' with the complement of b, set it
8376  * up so are looking at b's complement. */
8377  if (complement_b) {
8378
8379   /* The complement of nothing is everything, so <a> would have to have
8380   * just one element, starting at zero (ending at infinity) */
8381   if (len_b == 0) {
8382    return (len_a == 1 && array_a[0] == 0);
8383   }
8384   else if (array_b[0] == 0) {
8385
8386    /* Otherwise, to complement, we invert.  Here, the first element is
8387    * 0, just remove it.  To do this, we just pretend the array starts
8388    * one later */
8389
8390    array_b++;
8391    len_b--;
8392   }
8393   else {
8394
8395    /* But if the first element is not zero, we pretend the list starts
8396    * at the 0 that is always stored immediately before the array. */
8397    array_b--;
8398    len_b++;
8399   }
8400  }
8401
8402  /* Make sure that the lengths are the same, as well as the final element
8403  * before looping through the remainder.  (Thus we test the length, final,
8404  * and first elements right off the bat) */
8405  if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8406   retval = FALSE;
8407  }
8408  else for (i = 0; i < len_a - 1; i++) {
8409   if (array_a[i] != array_b[i]) {
8410    retval = FALSE;
8411    break;
8412   }
8413  }
8414
8415  return retval;
8416 }
8417 #endif
8418
8419 #undef HEADER_LENGTH
8420 #undef TO_INTERNAL_SIZE
8421 #undef FROM_INTERNAL_SIZE
8422 #undef INVLIST_VERSION_ID
8423
8424 /* End of inversion list object */
8425
8426 STATIC void
8427 S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
8428 {
8429  /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8430  * constructs, and updates RExC_flags with them.  On input, RExC_parse
8431  * should point to the first flag; it is updated on output to point to the
8432  * final ')' or ':'.  There needs to be at least one flag, or this will
8433  * abort */
8434
8435  /* for (?g), (?gc), and (?o) warnings; warning
8436  about (?c) will warn about (?g) -- japhy    */
8437
8438 #define WASTED_O  0x01
8439 #define WASTED_G  0x02
8440 #define WASTED_C  0x04
8441 #define WASTED_GC (WASTED_G|WASTED_C)
8442  I32 wastedflags = 0x00;
8443  U32 posflags = 0, negflags = 0;
8444  U32 *flagsp = &posflags;
8445  char has_charset_modifier = '\0';
8446  regex_charset cs;
8447  bool has_use_defaults = FALSE;
8448  const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8449
8450  PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8451
8452  /* '^' as an initial flag sets certain defaults */
8453  if (UCHARAT(RExC_parse) == '^') {
8454   RExC_parse++;
8455   has_use_defaults = TRUE;
8456   STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8457   set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8458           ? REGEX_UNICODE_CHARSET
8459           : REGEX_DEPENDS_CHARSET);
8460  }
8461
8462  cs = get_regex_charset(RExC_flags);
8463  if (cs == REGEX_DEPENDS_CHARSET
8464   && (RExC_utf8 || RExC_uni_semantics))
8465  {
8466   cs = REGEX_UNICODE_CHARSET;
8467  }
8468
8469  while (*RExC_parse) {
8470   /* && strchr("iogcmsx", *RExC_parse) */
8471   /* (?g), (?gc) and (?o) are useless here
8472   and must be globally applied -- japhy */
8473   switch (*RExC_parse) {
8474
8475    /* Code for the imsx flags */
8476    CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8477
8478    case LOCALE_PAT_MOD:
8479     if (has_charset_modifier) {
8480      goto excess_modifier;
8481     }
8482     else if (flagsp == &negflags) {
8483      goto neg_modifier;
8484     }
8485     cs = REGEX_LOCALE_CHARSET;
8486     has_charset_modifier = LOCALE_PAT_MOD;
8487     RExC_contains_locale = 1;
8488     break;
8489    case UNICODE_PAT_MOD:
8490     if (has_charset_modifier) {
8491      goto excess_modifier;
8492     }
8493     else if (flagsp == &negflags) {
8494      goto neg_modifier;
8495     }
8496     cs = REGEX_UNICODE_CHARSET;
8497     has_charset_modifier = UNICODE_PAT_MOD;
8498     break;
8499    case ASCII_RESTRICT_PAT_MOD:
8500     if (flagsp == &negflags) {
8501      goto neg_modifier;
8502     }
8503     if (has_charset_modifier) {
8504      if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8505       goto excess_modifier;
8506      }
8507      /* Doubled modifier implies more restricted */
8508      cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8509     }
8510     else {
8511      cs = REGEX_ASCII_RESTRICTED_CHARSET;
8512     }
8513     has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8514     break;
8515    case DEPENDS_PAT_MOD:
8516     if (has_use_defaults) {
8517      goto fail_modifiers;
8518     }
8519     else if (flagsp == &negflags) {
8520      goto neg_modifier;
8521     }
8522     else if (has_charset_modifier) {
8523      goto excess_modifier;
8524     }
8525
8526     /* The dual charset means unicode semantics if the
8527     * pattern (or target, not known until runtime) are
8528     * utf8, or something in the pattern indicates unicode
8529     * semantics */
8530     cs = (RExC_utf8 || RExC_uni_semantics)
8531      ? REGEX_UNICODE_CHARSET
8532      : REGEX_DEPENDS_CHARSET;
8533     has_charset_modifier = DEPENDS_PAT_MOD;
8534     break;
8535    excess_modifier:
8536     RExC_parse++;
8537     if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8538      vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8539     }
8540     else if (has_charset_modifier == *(RExC_parse - 1)) {
8541      vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8542     }
8543     else {
8544      vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8545     }
8546     /*NOTREACHED*/
8547    neg_modifier:
8548     RExC_parse++;
8549     vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8550     /*NOTREACHED*/
8551    case ONCE_PAT_MOD: /* 'o' */
8552    case GLOBAL_PAT_MOD: /* 'g' */
8553     if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8554      const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8555      if (! (wastedflags & wflagbit) ) {
8556       wastedflags |= wflagbit;
8557       /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
8558       vWARN5(
8559        RExC_parse + 1,
8560        "Useless (%s%c) - %suse /%c modifier",
8561        flagsp == &negflags ? "?-" : "?",
8562        *RExC_parse,
8563        flagsp == &negflags ? "don't " : "",
8564        *RExC_parse
8565       );
8566      }
8567     }
8568     break;
8569
8570    case CONTINUE_PAT_MOD: /* 'c' */
8571     if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8572      if (! (wastedflags & WASTED_C) ) {
8573       wastedflags |= WASTED_GC;
8574       /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
8575       vWARN3(
8576        RExC_parse + 1,
8577        "Useless (%sc) - %suse /gc modifier",
8578        flagsp == &negflags ? "?-" : "?",
8579        flagsp == &negflags ? "don't " : ""
8580       );
8581      }
8582     }
8583     break;
8584    case KEEPCOPY_PAT_MOD: /* 'p' */
8585     if (flagsp == &negflags) {
8586      if (SIZE_ONLY)
8587       ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8588     } else {
8589      *flagsp |= RXf_PMf_KEEPCOPY;
8590     }
8591     break;
8592    case '-':
8593     /* A flag is a default iff it is following a minus, so
8594     * if there is a minus, it means will be trying to
8595     * re-specify a default which is an error */
8596     if (has_use_defaults || flagsp == &negflags) {
8597      goto fail_modifiers;
8598     }
8599     flagsp = &negflags;
8600     wastedflags = 0;  /* reset so (?g-c) warns twice */
8601     break;
8602    case ':':
8603    case ')':
8604     RExC_flags |= posflags;
8605     RExC_flags &= ~negflags;
8606     set_regex_charset(&RExC_flags, cs);
8607     return;
8608     /*NOTREACHED*/
8609    default:
8610    fail_modifiers:
8611     RExC_parse++;
8612     vFAIL3("Sequence (%.*s...) not recognized",
8613      RExC_parse-seqstart, seqstart);
8614     /*NOTREACHED*/
8615   }
8616
8617   ++RExC_parse;
8618  }
8619 }
8620
8621 /*
8622  - reg - regular expression, i.e. main body or parenthesized thing
8623  *
8624  * Caller must absorb opening parenthesis.
8625  *
8626  * Combining parenthesis handling with the base level of regular expression
8627  * is a trifle forced, but the need to tie the tails of the branches to what
8628  * follows makes it hard to avoid.
8629  */
8630 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8631 #ifdef DEBUGGING
8632 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8633 #else
8634 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8635 #endif
8636
8637 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
8638    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
8639    needs to be restarted.
8640    Otherwise would only return NULL if regbranch() returns NULL, which
8641    cannot happen.  */
8642 STATIC regnode *
8643 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8644  /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
8645  * 2 is like 1, but indicates that nextchar() has been called to advance
8646  * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
8647  * this flag alerts us to the need to check for that */
8648 {
8649  dVAR;
8650  regnode *ret;  /* Will be the head of the group. */
8651  regnode *br;
8652  regnode *lastbr;
8653  regnode *ender = NULL;
8654  I32 parno = 0;
8655  I32 flags;
8656  U32 oregflags = RExC_flags;
8657  bool have_branch = 0;
8658  bool is_open = 0;
8659  I32 freeze_paren = 0;
8660  I32 after_freeze = 0;
8661
8662  char * parse_start = RExC_parse; /* MJD */
8663  char * const oregcomp_parse = RExC_parse;
8664
8665  GET_RE_DEBUG_FLAGS_DECL;
8666
8667  PERL_ARGS_ASSERT_REG;
8668  DEBUG_PARSE("reg ");
8669
8670  *flagp = 0;    /* Tentatively. */
8671
8672
8673  /* Make an OPEN node, if parenthesized. */
8674  if (paren) {
8675
8676   /* Under /x, space and comments can be gobbled up between the '(' and
8677   * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
8678   * intervening space, as the sequence is a token, and a token should be
8679   * indivisible */
8680   bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
8681
8682   if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8683    char *start_verb = RExC_parse;
8684    STRLEN verb_len = 0;
8685    char *start_arg = NULL;
8686    unsigned char op = 0;
8687    int argok = 1;
8688    int internal_argval = 0; /* internal_argval is only useful if !argok */
8689
8690    if (has_intervening_patws && SIZE_ONLY) {
8691     ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
8692    }
8693    while ( *RExC_parse && *RExC_parse != ')' ) {
8694     if ( *RExC_parse == ':' ) {
8695      start_arg = RExC_parse + 1;
8696      break;
8697     }
8698     RExC_parse++;
8699    }
8700    ++start_verb;
8701    verb_len = RExC_parse - start_verb;
8702    if ( start_arg ) {
8703     RExC_parse++;
8704     while ( *RExC_parse && *RExC_parse != ')' )
8705      RExC_parse++;
8706     if ( *RExC_parse != ')' )
8707      vFAIL("Unterminated verb pattern argument");
8708     if ( RExC_parse == start_arg )
8709      start_arg = NULL;
8710    } else {
8711     if ( *RExC_parse != ')' )
8712      vFAIL("Unterminated verb pattern");
8713    }
8714
8715    switch ( *start_verb ) {
8716    case 'A':  /* (*ACCEPT) */
8717     if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8718      op = ACCEPT;
8719      internal_argval = RExC_nestroot;
8720     }
8721     break;
8722    case 'C':  /* (*COMMIT) */
8723     if ( memEQs(start_verb,verb_len,"COMMIT") )
8724      op = COMMIT;
8725     break;
8726    case 'F':  /* (*FAIL) */
8727     if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8728      op = OPFAIL;
8729      argok = 0;
8730     }
8731     break;
8732    case ':':  /* (*:NAME) */
8733    case 'M':  /* (*MARK:NAME) */
8734     if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8735      op = MARKPOINT;
8736      argok = -1;
8737     }
8738     break;
8739    case 'P':  /* (*PRUNE) */
8740     if ( memEQs(start_verb,verb_len,"PRUNE") )
8741      op = PRUNE;
8742     break;
8743    case 'S':   /* (*SKIP) */
8744     if ( memEQs(start_verb,verb_len,"SKIP") )
8745      op = SKIP;
8746     break;
8747    case 'T':  /* (*THEN) */
8748     /* [19:06] <TimToady> :: is then */
8749     if ( memEQs(start_verb,verb_len,"THEN") ) {
8750      op = CUTGROUP;
8751      RExC_seen |= REG_SEEN_CUTGROUP;
8752     }
8753     break;
8754    }
8755    if ( ! op ) {
8756     RExC_parse++;
8757     vFAIL3("Unknown verb pattern '%.*s'",
8758      verb_len, start_verb);
8759    }
8760    if ( argok ) {
8761     if ( start_arg && internal_argval ) {
8762      vFAIL3("Verb pattern '%.*s' may not have an argument",
8763       verb_len, start_verb);
8764     } else if ( argok < 0 && !start_arg ) {
8765      vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8766       verb_len, start_verb);
8767     } else {
8768      ret = reganode(pRExC_state, op, internal_argval);
8769      if ( ! internal_argval && ! SIZE_ONLY ) {
8770       if (start_arg) {
8771        SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8772        ARG(ret) = add_data( pRExC_state, 1, "S" );
8773        RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8774        ret->flags = 0;
8775       } else {
8776        ret->flags = 1;
8777       }
8778      }
8779     }
8780     if (!internal_argval)
8781      RExC_seen |= REG_SEEN_VERBARG;
8782    } else if ( start_arg ) {
8783     vFAIL3("Verb pattern '%.*s' may not have an argument",
8784       verb_len, start_verb);
8785    } else {
8786     ret = reg_node(pRExC_state, op);
8787    }
8788    nextchar(pRExC_state);
8789    return ret;
8790   }
8791   else if (*RExC_parse == '?') { /* (?...) */
8792    bool is_logical = 0;
8793    const char * const seqstart = RExC_parse;
8794    if (has_intervening_patws && SIZE_ONLY) {
8795     ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
8796    }
8797
8798    RExC_parse++;
8799    paren = *RExC_parse++;
8800    ret = NULL;   /* For look-ahead/behind. */
8801    switch (paren) {
8802
8803    case 'P': /* (?P...) variants for those used to PCRE/Python */
8804     paren = *RExC_parse++;
8805     if ( paren == '<')         /* (?P<...>) named capture */
8806      goto named_capture;
8807     else if (paren == '>') {   /* (?P>name) named recursion */
8808      goto named_recursion;
8809     }
8810     else if (paren == '=') {   /* (?P=...)  named backref */
8811      /* this pretty much dupes the code for \k<NAME> in regatom(), if
8812      you change this make sure you change that */
8813      char* name_start = RExC_parse;
8814      U32 num = 0;
8815      SV *sv_dat = reg_scan_name(pRExC_state,
8816       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8817      if (RExC_parse == name_start || *RExC_parse != ')')
8818       vFAIL2("Sequence %.3s... not terminated",parse_start);
8819
8820      if (!SIZE_ONLY) {
8821       num = add_data( pRExC_state, 1, "S" );
8822       RExC_rxi->data->data[num]=(void*)sv_dat;
8823       SvREFCNT_inc_simple_void(sv_dat);
8824      }
8825      RExC_sawback = 1;
8826      ret = reganode(pRExC_state,
8827         ((! FOLD)
8828          ? NREF
8829          : (ASCII_FOLD_RESTRICTED)
8830          ? NREFFA
8831          : (AT_LEAST_UNI_SEMANTICS)
8832           ? NREFFU
8833           : (LOC)
8834           ? NREFFL
8835           : NREFF),
8836          num);
8837      *flagp |= HASWIDTH;
8838
8839      Set_Node_Offset(ret, parse_start+1);
8840      Set_Node_Cur_Length(ret, parse_start);
8841
8842      nextchar(pRExC_state);
8843      return ret;
8844     }
8845     RExC_parse++;
8846     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8847     /*NOTREACHED*/
8848    case '<':           /* (?<...) */
8849     if (*RExC_parse == '!')
8850      paren = ',';
8851     else if (*RExC_parse != '=')
8852    named_capture:
8853     {               /* (?<...>) */
8854      char *name_start;
8855      SV *svname;
8856      paren= '>';
8857    case '\'':          /* (?'...') */
8858       name_start= RExC_parse;
8859       svname = reg_scan_name(pRExC_state,
8860        SIZE_ONLY ?  /* reverse test from the others */
8861        REG_RSN_RETURN_NAME :
8862        REG_RSN_RETURN_NULL);
8863      if (RExC_parse == name_start) {
8864       RExC_parse++;
8865       vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8866       /*NOTREACHED*/
8867      }
8868      if (*RExC_parse != paren)
8869       vFAIL2("Sequence (?%c... not terminated",
8870        paren=='>' ? '<' : paren);
8871      if (SIZE_ONLY) {
8872       HE *he_str;
8873       SV *sv_dat = NULL;
8874       if (!svname) /* shouldn't happen */
8875        Perl_croak(aTHX_
8876         "panic: reg_scan_name returned NULL");
8877       if (!RExC_paren_names) {
8878        RExC_paren_names= newHV();
8879        sv_2mortal(MUTABLE_SV(RExC_paren_names));
8880 #ifdef DEBUGGING
8881        RExC_paren_name_list= newAV();
8882        sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8883 #endif
8884       }
8885       he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8886       if ( he_str )
8887        sv_dat = HeVAL(he_str);
8888       if ( ! sv_dat ) {
8889        /* croak baby croak */
8890        Perl_croak(aTHX_
8891         "panic: paren_name hash element allocation failed");
8892       } else if ( SvPOK(sv_dat) ) {
8893        /* (?|...) can mean we have dupes so scan to check
8894        its already been stored. Maybe a flag indicating
8895        we are inside such a construct would be useful,
8896        but the arrays are likely to be quite small, so
8897        for now we punt -- dmq */
8898        IV count = SvIV(sv_dat);
8899        I32 *pv = (I32*)SvPVX(sv_dat);
8900        IV i;
8901        for ( i = 0 ; i < count ; i++ ) {
8902         if ( pv[i] == RExC_npar ) {
8903          count = 0;
8904          break;
8905         }
8906        }
8907        if ( count ) {
8908         pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8909         SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8910         pv[count] = RExC_npar;
8911         SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8912        }
8913       } else {
8914        (void)SvUPGRADE(sv_dat,SVt_PVNV);
8915        sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8916        SvIOK_on(sv_dat);
8917        SvIV_set(sv_dat, 1);
8918       }
8919 #ifdef DEBUGGING
8920       /* Yes this does cause a memory leak in debugging Perls */
8921       if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8922        SvREFCNT_dec_NN(svname);
8923 #endif
8924
8925       /*sv_dump(sv_dat);*/
8926      }
8927      nextchar(pRExC_state);
8928      paren = 1;
8929      goto capturing_parens;
8930     }
8931     RExC_seen |= REG_SEEN_LOOKBEHIND;
8932     RExC_in_lookbehind++;
8933     RExC_parse++;
8934    case '=':           /* (?=...) */
8935     RExC_seen_zerolen++;
8936     break;
8937    case '!':           /* (?!...) */
8938     RExC_seen_zerolen++;
8939     if (*RExC_parse == ')') {
8940      ret=reg_node(pRExC_state, OPFAIL);
8941      nextchar(pRExC_state);
8942      return ret;
8943     }
8944     break;
8945    case '|':           /* (?|...) */
8946     /* branch reset, behave like a (?:...) except that
8947     buffers in alternations share the same numbers */
8948     paren = ':';
8949     after_freeze = freeze_paren = RExC_npar;
8950     break;
8951    case ':':           /* (?:...) */
8952    case '>':           /* (?>...) */
8953     break;
8954    case '$':           /* (?$...) */
8955    case '@':           /* (?@...) */
8956     vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8957     break;
8958    case '#':           /* (?#...) */
8959     /* XXX As soon as we disallow separating the '?' and '*' (by
8960     * spaces or (?#...) comment), it is believed that this case
8961     * will be unreachable and can be removed.  See
8962     * [perl #117327] */
8963     while (*RExC_parse && *RExC_parse != ')')
8964      RExC_parse++;
8965     if (*RExC_parse != ')')
8966      FAIL("Sequence (?#... not terminated");
8967     nextchar(pRExC_state);
8968     *flagp = TRYAGAIN;
8969     return NULL;
8970    case '0' :           /* (?0) */
8971    case 'R' :           /* (?R) */
8972     if (*RExC_parse != ')')
8973      FAIL("Sequence (?R) not terminated");
8974     ret = reg_node(pRExC_state, GOSTART);
8975     *flagp |= POSTPONED;
8976     nextchar(pRExC_state);
8977     return ret;
8978     /*notreached*/
8979    { /* named and numeric backreferences */
8980     I32 num;
8981    case '&':            /* (?&NAME) */
8982     parse_start = RExC_parse - 1;
8983    named_recursion:
8984     {
8985       SV *sv_dat = reg_scan_name(pRExC_state,
8986        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8987       num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8988     }
8989     goto gen_recurse_regop;
8990     assert(0); /* NOT REACHED */
8991    case '+':
8992     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8993      RExC_parse++;
8994      vFAIL("Illegal pattern");
8995     }
8996     goto parse_recursion;
8997     /* NOT REACHED*/
8998    case '-': /* (?-1) */
8999     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9000      RExC_parse--; /* rewind to let it be handled later */
9001      goto parse_flags;
9002     }
9003     /*FALLTHROUGH */
9004    case '1': case '2': case '3': case '4': /* (?1) */
9005    case '5': case '6': case '7': case '8': case '9':
9006     RExC_parse--;
9007    parse_recursion:
9008     num = atoi(RExC_parse);
9009     parse_start = RExC_parse - 1; /* MJD */
9010     if (*RExC_parse == '-')
9011      RExC_parse++;
9012     while (isDIGIT(*RExC_parse))
9013       RExC_parse++;
9014     if (*RExC_parse!=')')
9015      vFAIL("Expecting close bracket");
9016
9017    gen_recurse_regop:
9018     if ( paren == '-' ) {
9019      /*
9020      Diagram of capture buffer numbering.
9021      Top line is the normal capture buffer numbers
9022      Bottom line is the negative indexing as from
9023      the X (the (?-2))
9024
9025      +   1 2    3 4 5 X          6 7
9026      /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9027      -   5 4    3 2 1 X          x x
9028
9029      */
9030      num = RExC_npar + num;
9031      if (num < 1)  {
9032       RExC_parse++;
9033       vFAIL("Reference to nonexistent group");
9034      }
9035     } else if ( paren == '+' ) {
9036      num = RExC_npar + num - 1;
9037     }
9038
9039     ret = reganode(pRExC_state, GOSUB, num);
9040     if (!SIZE_ONLY) {
9041      if (num > (I32)RExC_rx->nparens) {
9042       RExC_parse++;
9043       vFAIL("Reference to nonexistent group");
9044      }
9045      ARG2L_SET( ret, RExC_recurse_count++);
9046      RExC_emit++;
9047      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9048       "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
9049     } else {
9050      RExC_size++;
9051      }
9052      RExC_seen |= REG_SEEN_RECURSE;
9053     Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9054     Set_Node_Offset(ret, parse_start); /* MJD */
9055
9056     *flagp |= POSTPONED;
9057     nextchar(pRExC_state);
9058     return ret;
9059    } /* named and numeric backreferences */
9060    assert(0); /* NOT REACHED */
9061
9062    case '?':           /* (??...) */
9063     is_logical = 1;
9064     if (*RExC_parse != '{') {
9065      RExC_parse++;
9066      vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9067      /*NOTREACHED*/
9068     }
9069     *flagp |= POSTPONED;
9070     paren = *RExC_parse++;
9071     /* FALL THROUGH */
9072    case '{':           /* (?{...}) */
9073    {
9074     U32 n = 0;
9075     struct reg_code_block *cb;
9076
9077     RExC_seen_zerolen++;
9078
9079     if (   !pRExC_state->num_code_blocks
9080      || pRExC_state->code_index >= pRExC_state->num_code_blocks
9081      || pRExC_state->code_blocks[pRExC_state->code_index].start
9082       != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9083        - RExC_start)
9084     ) {
9085      if (RExC_pm_flags & PMf_USE_RE_EVAL)
9086       FAIL("panic: Sequence (?{...}): no code block found\n");
9087      FAIL("Eval-group not allowed at runtime, use re 'eval'");
9088     }
9089     /* this is a pre-compiled code block (?{...}) */
9090     cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9091     RExC_parse = RExC_start + cb->end;
9092     if (!SIZE_ONLY) {
9093      OP *o = cb->block;
9094      if (cb->src_regex) {
9095       n = add_data(pRExC_state, 2, "rl");
9096       RExC_rxi->data->data[n] =
9097        (void*)SvREFCNT_inc((SV*)cb->src_regex);
9098       RExC_rxi->data->data[n+1] = (void*)o;
9099      }
9100      else {
9101       n = add_data(pRExC_state, 1,
9102        (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
9103       RExC_rxi->data->data[n] = (void*)o;
9104      }
9105     }
9106     pRExC_state->code_index++;
9107     nextchar(pRExC_state);
9108
9109     if (is_logical) {
9110      regnode *eval;
9111      ret = reg_node(pRExC_state, LOGICAL);
9112      eval = reganode(pRExC_state, EVAL, n);
9113      if (!SIZE_ONLY) {
9114       ret->flags = 2;
9115       /* for later propagation into (??{}) return value */
9116       eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9117      }
9118      REGTAIL(pRExC_state, ret, eval);
9119      /* deal with the length of this later - MJD */
9120      return ret;
9121     }
9122     ret = reganode(pRExC_state, EVAL, n);
9123     Set_Node_Length(ret, RExC_parse - parse_start + 1);
9124     Set_Node_Offset(ret, parse_start);
9125     return ret;
9126    }
9127    case '(':           /* (?(?{...})...) and (?(?=...)...) */
9128    {
9129     int is_define= 0;
9130     if (RExC_parse[0] == '?') {        /* (?(?...)) */
9131      if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9132       || RExC_parse[1] == '<'
9133       || RExC_parse[1] == '{') { /* Lookahead or eval. */
9134       I32 flag;
9135       regnode *tail;
9136
9137       ret = reg_node(pRExC_state, LOGICAL);
9138       if (!SIZE_ONLY)
9139        ret->flags = 1;
9140
9141       tail = reg(pRExC_state, 1, &flag, depth+1);
9142       if (flag & RESTART_UTF8) {
9143        *flagp = RESTART_UTF8;
9144        return NULL;
9145       }
9146       REGTAIL(pRExC_state, ret, tail);
9147       goto insert_if;
9148      }
9149     }
9150     else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9151       || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9152     {
9153      char ch = RExC_parse[0] == '<' ? '>' : '\'';
9154      char *name_start= RExC_parse++;
9155      U32 num = 0;
9156      SV *sv_dat=reg_scan_name(pRExC_state,
9157       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9158      if (RExC_parse == name_start || *RExC_parse != ch)
9159       vFAIL2("Sequence (?(%c... not terminated",
9160        (ch == '>' ? '<' : ch));
9161      RExC_parse++;
9162      if (!SIZE_ONLY) {
9163       num = add_data( pRExC_state, 1, "S" );
9164       RExC_rxi->data->data[num]=(void*)sv_dat;
9165       SvREFCNT_inc_simple_void(sv_dat);
9166      }
9167      ret = reganode(pRExC_state,NGROUPP,num);
9168      goto insert_if_check_paren;
9169     }
9170     else if (RExC_parse[0] == 'D' &&
9171       RExC_parse[1] == 'E' &&
9172       RExC_parse[2] == 'F' &&
9173       RExC_parse[3] == 'I' &&
9174       RExC_parse[4] == 'N' &&
9175       RExC_parse[5] == 'E')
9176     {
9177      ret = reganode(pRExC_state,DEFINEP,0);
9178      RExC_parse +=6 ;
9179      is_define = 1;
9180      goto insert_if_check_paren;
9181     }
9182     else if (RExC_parse[0] == 'R') {
9183      RExC_parse++;
9184      parno = 0;
9185      if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9186       parno = atoi(RExC_parse++);
9187       while (isDIGIT(*RExC_parse))
9188        RExC_parse++;
9189      } else if (RExC_parse[0] == '&') {
9190       SV *sv_dat;
9191       RExC_parse++;
9192       sv_dat = reg_scan_name(pRExC_state,
9193         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9194        parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9195      }
9196      ret = reganode(pRExC_state,INSUBP,parno);
9197      goto insert_if_check_paren;
9198     }
9199     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9200      /* (?(1)...) */
9201      char c;
9202      parno = atoi(RExC_parse++);
9203
9204      while (isDIGIT(*RExC_parse))
9205       RExC_parse++;
9206      ret = reganode(pRExC_state, GROUPP, parno);
9207
9208     insert_if_check_paren:
9209      if ((c = *nextchar(pRExC_state)) != ')')
9210       vFAIL("Switch condition not recognized");
9211     insert_if:
9212      REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9213      br = regbranch(pRExC_state, &flags, 1,depth+1);
9214      if (br == NULL) {
9215       if (flags & RESTART_UTF8) {
9216        *flagp = RESTART_UTF8;
9217        return NULL;
9218       }
9219       FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9220        (UV) flags);
9221      } else
9222       REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9223      c = *nextchar(pRExC_state);
9224      if (flags&HASWIDTH)
9225       *flagp |= HASWIDTH;
9226      if (c == '|') {
9227       if (is_define)
9228        vFAIL("(?(DEFINE)....) does not allow branches");
9229       lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9230       if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9231        if (flags & RESTART_UTF8) {
9232         *flagp = RESTART_UTF8;
9233         return NULL;
9234        }
9235        FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9236         (UV) flags);
9237       }
9238       REGTAIL(pRExC_state, ret, lastbr);
9239       if (flags&HASWIDTH)
9240        *flagp |= HASWIDTH;
9241       c = *nextchar(pRExC_state);
9242      }
9243      else
9244       lastbr = NULL;
9245      if (c != ')')
9246       vFAIL("Switch (?(condition)... contains too many branches");
9247      ender = reg_node(pRExC_state, TAIL);
9248      REGTAIL(pRExC_state, br, ender);
9249      if (lastbr) {
9250       REGTAIL(pRExC_state, lastbr, ender);
9251       REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9252      }
9253      else
9254       REGTAIL(pRExC_state, ret, ender);
9255      RExC_size++; /* XXX WHY do we need this?!!
9256          For large programs it seems to be required
9257          but I can't figure out why. -- dmq*/
9258      return ret;
9259     }
9260     else {
9261      vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
9262     }
9263    }
9264    case '[':           /* (?[ ... ]) */
9265     return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9266           oregcomp_parse);
9267    case 0:
9268     RExC_parse--; /* for vFAIL to print correctly */
9269     vFAIL("Sequence (? incomplete");
9270     break;
9271    default: /* e.g., (?i) */
9272     --RExC_parse;
9273    parse_flags:
9274     parse_lparen_question_flags(pRExC_state);
9275     if (UCHARAT(RExC_parse) != ':') {
9276      nextchar(pRExC_state);
9277      *flagp = TRYAGAIN;
9278      return NULL;
9279     }
9280     paren = ':';
9281     nextchar(pRExC_state);
9282     ret = NULL;
9283     goto parse_rest;
9284    } /* end switch */
9285   }
9286   else {                  /* (...) */
9287   capturing_parens:
9288    parno = RExC_npar;
9289    RExC_npar++;
9290
9291    ret = reganode(pRExC_state, OPEN, parno);
9292    if (!SIZE_ONLY ){
9293     if (!RExC_nestroot)
9294      RExC_nestroot = parno;
9295     if (RExC_seen & REG_SEEN_RECURSE
9296      && !RExC_open_parens[parno-1])
9297     {
9298      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9299       "Setting open paren #%"IVdf" to %d\n",
9300       (IV)parno, REG_NODE_NUM(ret)));
9301      RExC_open_parens[parno-1]= ret;
9302     }
9303    }
9304    Set_Node_Length(ret, 1); /* MJD */
9305    Set_Node_Offset(ret, RExC_parse); /* MJD */
9306    is_open = 1;
9307   }
9308  }
9309  else                        /* ! paren */
9310   ret = NULL;
9311
9312    parse_rest:
9313  /* Pick up the branches, linking them together. */
9314  parse_start = RExC_parse;   /* MJD */
9315  br = regbranch(pRExC_state, &flags, 1,depth+1);
9316
9317  /*     branch_len = (paren != 0); */
9318
9319  if (br == NULL) {
9320   if (flags & RESTART_UTF8) {
9321    *flagp = RESTART_UTF8;
9322    return NULL;
9323   }
9324   FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9325  }
9326  if (*RExC_parse == '|') {
9327   if (!SIZE_ONLY && RExC_extralen) {
9328    reginsert(pRExC_state, BRANCHJ, br, depth+1);
9329   }
9330   else {                  /* MJD */
9331    reginsert(pRExC_state, BRANCH, br, depth+1);
9332    Set_Node_Length(br, paren != 0);
9333    Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9334   }
9335   have_branch = 1;
9336   if (SIZE_ONLY)
9337    RExC_extralen += 1;  /* For BRANCHJ-BRANCH. */
9338  }
9339  else if (paren == ':') {
9340   *flagp |= flags&SIMPLE;
9341  }
9342  if (is_open) {    /* Starts with OPEN. */
9343   REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9344  }
9345  else if (paren != '?')  /* Not Conditional */
9346   ret = br;
9347  *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9348  lastbr = br;
9349  while (*RExC_parse == '|') {
9350   if (!SIZE_ONLY && RExC_extralen) {
9351    ender = reganode(pRExC_state, LONGJMP,0);
9352    REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9353   }
9354   if (SIZE_ONLY)
9355    RExC_extralen += 2;  /* Account for LONGJMP. */
9356   nextchar(pRExC_state);
9357   if (freeze_paren) {
9358    if (RExC_npar > after_freeze)
9359     after_freeze = RExC_npar;
9360    RExC_npar = freeze_paren;
9361   }
9362   br = regbranch(pRExC_state, &flags, 0, depth+1);
9363
9364   if (br == NULL) {
9365    if (flags & RESTART_UTF8) {
9366     *flagp = RESTART_UTF8;
9367     return NULL;
9368    }
9369    FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9370   }
9371   REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9372   lastbr = br;
9373   *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9374  }
9375
9376  if (have_branch || paren != ':') {
9377   /* Make a closing node, and hook it on the end. */
9378   switch (paren) {
9379   case ':':
9380    ender = reg_node(pRExC_state, TAIL);
9381    break;
9382   case 1: case 2:
9383    ender = reganode(pRExC_state, CLOSE, parno);
9384    if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9385     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9386       "Setting close paren #%"IVdf" to %d\n",
9387       (IV)parno, REG_NODE_NUM(ender)));
9388     RExC_close_parens[parno-1]= ender;
9389     if (RExC_nestroot == parno)
9390      RExC_nestroot = 0;
9391    }
9392    Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9393    Set_Node_Length(ender,1); /* MJD */
9394    break;
9395   case '<':
9396   case ',':
9397   case '=':
9398   case '!':
9399    *flagp &= ~HASWIDTH;
9400    /* FALL THROUGH */
9401   case '>':
9402    ender = reg_node(pRExC_state, SUCCEED);
9403    break;
9404   case 0:
9405    ender = reg_node(pRExC_state, END);
9406    if (!SIZE_ONLY) {
9407     assert(!RExC_opend); /* there can only be one! */
9408     RExC_opend = ender;
9409    }
9410    break;
9411   }
9412   DEBUG_PARSE_r(if (!SIZE_ONLY) {
9413    SV * const mysv_val1=sv_newmortal();
9414    SV * const mysv_val2=sv_newmortal();
9415    DEBUG_PARSE_MSG("lsbr");
9416    regprop(RExC_rx, mysv_val1, lastbr);
9417    regprop(RExC_rx, mysv_val2, ender);
9418    PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9419       SvPV_nolen_const(mysv_val1),
9420       (IV)REG_NODE_NUM(lastbr),
9421       SvPV_nolen_const(mysv_val2),
9422       (IV)REG_NODE_NUM(ender),
9423       (IV)(ender - lastbr)
9424    );
9425   });
9426   REGTAIL(pRExC_state, lastbr, ender);
9427
9428   if (have_branch && !SIZE_ONLY) {
9429    char is_nothing= 1;
9430    if (depth==1)
9431     RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9432
9433    /* Hook the tails of the branches to the closing node. */
9434    for (br = ret; br; br = regnext(br)) {
9435     const U8 op = PL_regkind[OP(br)];
9436     if (op == BRANCH) {
9437      REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9438      if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9439       is_nothing= 0;
9440     }
9441     else if (op == BRANCHJ) {
9442      REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9443      /* for now we always disable this optimisation * /
9444      if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9445      */
9446       is_nothing= 0;
9447     }
9448    }
9449    if (is_nothing) {
9450     br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9451     DEBUG_PARSE_r(if (!SIZE_ONLY) {
9452      SV * const mysv_val1=sv_newmortal();
9453      SV * const mysv_val2=sv_newmortal();
9454      DEBUG_PARSE_MSG("NADA");
9455      regprop(RExC_rx, mysv_val1, ret);
9456      regprop(RExC_rx, mysv_val2, ender);
9457      PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9458         SvPV_nolen_const(mysv_val1),
9459         (IV)REG_NODE_NUM(ret),
9460         SvPV_nolen_const(mysv_val2),
9461         (IV)REG_NODE_NUM(ender),
9462         (IV)(ender - ret)
9463      );
9464     });
9465     OP(br)= NOTHING;
9466     if (OP(ender) == TAIL) {
9467      NEXT_OFF(br)= 0;
9468      RExC_emit= br + 1;
9469     } else {
9470      regnode *opt;
9471      for ( opt= br + 1; opt < ender ; opt++ )
9472       OP(opt)= OPTIMIZED;
9473      NEXT_OFF(br)= ender - br;
9474     }
9475    }
9476   }
9477  }
9478
9479  {
9480   const char *p;
9481   static const char parens[] = "=!<,>";
9482
9483   if (paren && (p = strchr(parens, paren))) {
9484    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9485    int flag = (p - parens) > 1;
9486
9487    if (paren == '>')
9488     node = SUSPEND, flag = 0;
9489    reginsert(pRExC_state, node,ret, depth+1);
9490    Set_Node_Cur_Length(ret, parse_start);
9491    Set_Node_Offset(ret, parse_start + 1);
9492    ret->flags = flag;
9493    REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9494   }
9495  }
9496
9497  /* Check for proper termination. */
9498  if (paren) {
9499   /* restore original flags, but keep (?p) */
9500   RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
9501   if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9502    RExC_parse = oregcomp_parse;
9503    vFAIL("Unmatched (");
9504   }
9505  }
9506  else if (!paren && RExC_parse < RExC_end) {
9507   if (*RExC_parse == ')') {
9508    RExC_parse++;
9509    vFAIL("Unmatched )");
9510   }
9511   else
9512    FAIL("Junk on end of regexp"); /* "Can't happen". */
9513   assert(0); /* NOTREACHED */
9514  }
9515
9516  if (RExC_in_lookbehind) {
9517   RExC_in_lookbehind--;
9518  }
9519  if (after_freeze > RExC_npar)
9520   RExC_npar = after_freeze;
9521  return(ret);
9522 }
9523
9524 /*
9525  - regbranch - one alternative of an | operator
9526  *
9527  * Implements the concatenation operator.
9528  *
9529  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9530  * restarted.
9531  */
9532 STATIC regnode *
9533 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9534 {
9535  dVAR;
9536  regnode *ret;
9537  regnode *chain = NULL;
9538  regnode *latest;
9539  I32 flags = 0, c = 0;
9540  GET_RE_DEBUG_FLAGS_DECL;
9541
9542  PERL_ARGS_ASSERT_REGBRANCH;
9543
9544  DEBUG_PARSE("brnc");
9545
9546  if (first)
9547   ret = NULL;
9548  else {
9549   if (!SIZE_ONLY && RExC_extralen)
9550    ret = reganode(pRExC_state, BRANCHJ,0);
9551   else {
9552    ret = reg_node(pRExC_state, BRANCH);
9553    Set_Node_Length(ret, 1);
9554   }
9555  }
9556
9557  if (!first && SIZE_ONLY)
9558   RExC_extralen += 1;   /* BRANCHJ */
9559
9560  *flagp = WORST;   /* Tentatively. */
9561
9562  RExC_parse--;
9563  nextchar(pRExC_state);
9564  while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9565   flags &= ~TRYAGAIN;
9566   latest = regpiece(pRExC_state, &flags,depth+1);
9567   if (latest == NULL) {
9568    if (flags & TRYAGAIN)
9569     continue;
9570    if (flags & RESTART_UTF8) {
9571     *flagp = RESTART_UTF8;
9572     return NULL;
9573    }
9574    FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
9575   }
9576   else if (ret == NULL)
9577    ret = latest;
9578   *flagp |= flags&(HASWIDTH|POSTPONED);
9579   if (chain == NULL)  /* First piece. */
9580    *flagp |= flags&SPSTART;
9581   else {
9582    RExC_naughty++;
9583    REGTAIL(pRExC_state, chain, latest);
9584   }
9585   chain = latest;
9586   c++;
9587  }
9588  if (chain == NULL) { /* Loop ran zero times. */
9589   chain = reg_node(pRExC_state, NOTHING);
9590   if (ret == NULL)
9591    ret = chain;
9592  }
9593  if (c == 1) {
9594   *flagp |= flags&SIMPLE;
9595  }
9596
9597  return ret;
9598 }
9599
9600 /*
9601  - regpiece - something followed by possible [*+?]
9602  *
9603  * Note that the branching code sequences used for ? and the general cases
9604  * of * and + are somewhat optimized:  they use the same NOTHING node as
9605  * both the endmarker for their branch list and the body of the last branch.
9606  * It might seem that this node could be dispensed with entirely, but the
9607  * endmarker role is not redundant.
9608  *
9609  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
9610  * TRYAGAIN.
9611  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9612  * restarted.
9613  */
9614 STATIC regnode *
9615 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9616 {
9617  dVAR;
9618  regnode *ret;
9619  char op;
9620  char *next;
9621  I32 flags;
9622  const char * const origparse = RExC_parse;
9623  I32 min;
9624  I32 max = REG_INFTY;
9625 #ifdef RE_TRACK_PATTERN_OFFSETS
9626  char *parse_start;
9627 #endif
9628  const char *maxpos = NULL;
9629
9630  /* Save the original in case we change the emitted regop to a FAIL. */
9631  regnode * const orig_emit = RExC_emit;
9632
9633  GET_RE_DEBUG_FLAGS_DECL;
9634
9635  PERL_ARGS_ASSERT_REGPIECE;
9636
9637  DEBUG_PARSE("piec");
9638
9639  ret = regatom(pRExC_state, &flags,depth+1);
9640  if (ret == NULL) {
9641   if (flags & (TRYAGAIN|RESTART_UTF8))
9642    *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
9643   else
9644    FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
9645   return(NULL);
9646  }
9647
9648  op = *RExC_parse;
9649
9650  if (op == '{' && regcurly(RExC_parse, FALSE)) {
9651   maxpos = NULL;
9652 #ifdef RE_TRACK_PATTERN_OFFSETS
9653   parse_start = RExC_parse; /* MJD */
9654 #endif
9655   next = RExC_parse + 1;
9656   while (isDIGIT(*next) || *next == ',') {
9657    if (*next == ',') {
9658     if (maxpos)
9659      break;
9660     else
9661      maxpos = next;
9662    }
9663    next++;
9664   }
9665   if (*next == '}') {  /* got one */
9666    if (!maxpos)
9667     maxpos = next;
9668    RExC_parse++;
9669    min = atoi(RExC_parse);
9670    if (*maxpos == ',')
9671     maxpos++;
9672    else
9673     maxpos = RExC_parse;
9674    max = atoi(maxpos);
9675    if (!max && *maxpos != '0')
9676     max = REG_INFTY;  /* meaning "infinity" */
9677    else if (max >= REG_INFTY)
9678     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9679    RExC_parse = next;
9680    nextchar(pRExC_state);
9681    if (max < min) {    /* If can't match, warn and optimize to fail
9682         unconditionally */
9683     if (SIZE_ONLY) {
9684      ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9685
9686      /* We can't back off the size because we have to reserve
9687      * enough space for all the things we are about to throw
9688      * away, but we can shrink it by the ammount we are about
9689      * to re-use here */
9690      RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9691     }
9692     else {
9693      RExC_emit = orig_emit;
9694     }
9695     ret = reg_node(pRExC_state, OPFAIL);
9696     return ret;
9697    }
9698
9699   do_curly:
9700    if ((flags&SIMPLE)) {
9701     RExC_naughty += 2 + RExC_naughty / 2;
9702     reginsert(pRExC_state, CURLY, ret, depth+1);
9703     Set_Node_Offset(ret, parse_start+1); /* MJD */
9704     Set_Node_Cur_Length(ret, parse_start);
9705    }
9706    else {
9707     regnode * const w = reg_node(pRExC_state, WHILEM);
9708
9709     w->flags = 0;
9710     REGTAIL(pRExC_state, ret, w);
9711     if (!SIZE_ONLY && RExC_extralen) {
9712      reginsert(pRExC_state, LONGJMP,ret, depth+1);
9713      reginsert(pRExC_state, NOTHING,ret, depth+1);
9714      NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9715     }
9716     reginsert(pRExC_state, CURLYX,ret, depth+1);
9717         /* MJD hk */
9718     Set_Node_Offset(ret, parse_start+1);
9719     Set_Node_Length(ret,
9720         op == '{' ? (RExC_parse - parse_start) : 1);
9721
9722     if (!SIZE_ONLY && RExC_extralen)
9723      NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9724     REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9725     if (SIZE_ONLY)
9726      RExC_whilem_seen++, RExC_extralen += 3;
9727     RExC_naughty += 4 + RExC_naughty; /* compound interest */
9728    }
9729    ret->flags = 0;
9730
9731    if (min > 0)
9732     *flagp = WORST;
9733    if (max > 0)
9734     *flagp |= HASWIDTH;
9735    if (!SIZE_ONLY) {
9736     ARG1_SET(ret, (U16)min);
9737     ARG2_SET(ret, (U16)max);
9738    }
9739
9740    goto nest_check;
9741   }
9742  }
9743
9744  if (!ISMULT1(op)) {
9745   *flagp = flags;
9746   return(ret);
9747  }
9748
9749 #if 0    /* Now runtime fix should be reliable. */
9750
9751  /* if this is reinstated, don't forget to put this back into perldiag:
9752
9753    =item Regexp *+ operand could be empty at {#} in regex m/%s/
9754
9755   (F) The part of the regexp subject to either the * or + quantifier
9756   could match an empty string. The {#} shows in the regular
9757   expression about where the problem was discovered.
9758
9759  */
9760
9761  if (!(flags&HASWIDTH) && op != '?')
9762  vFAIL("Regexp *+ operand could be empty");
9763 #endif
9764
9765 #ifdef RE_TRACK_PATTERN_OFFSETS
9766  parse_start = RExC_parse;
9767 #endif
9768  nextchar(pRExC_state);
9769
9770  *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9771
9772  if (op == '*' && (flags&SIMPLE)) {
9773   reginsert(pRExC_state, STAR, ret, depth+1);
9774   ret->flags = 0;
9775   RExC_naughty += 4;
9776  }
9777  else if (op == '*') {
9778   min = 0;
9779   goto do_curly;
9780  }
9781  else if (op == '+' && (flags&SIMPLE)) {
9782   reginsert(pRExC_state, PLUS, ret, depth+1);
9783   ret->flags = 0;
9784   RExC_naughty += 3;
9785  }
9786  else if (op == '+') {
9787   min = 1;
9788   goto do_curly;
9789  }
9790  else if (op == '?') {
9791   min = 0; max = 1;
9792   goto do_curly;
9793  }
9794   nest_check:
9795  if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9796   SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9797   ckWARN3reg(RExC_parse,
9798     "%.*s matches null string many times",
9799     (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9800     origparse);
9801   (void)ReREFCNT_inc(RExC_rx_sv);
9802  }
9803
9804  if (RExC_parse < RExC_end && *RExC_parse == '?') {
9805   nextchar(pRExC_state);
9806   reginsert(pRExC_state, MINMOD, ret, depth+1);
9807   REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9808  }
9809  else
9810  if (RExC_parse < RExC_end && *RExC_parse == '+') {
9811   regnode *ender;
9812   nextchar(pRExC_state);
9813   ender = reg_node(pRExC_state, SUCCEED);
9814   REGTAIL(pRExC_state, ret, ender);
9815   reginsert(pRExC_state, SUSPEND, ret, depth+1);
9816   ret->flags = 0;
9817   ender = reg_node(pRExC_state, TAIL);
9818   REGTAIL(pRExC_state, ret, ender);
9819  }
9820
9821  if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9822   RExC_parse++;
9823   vFAIL("Nested quantifiers");
9824  }
9825
9826  return(ret);
9827 }
9828
9829 STATIC bool
9830 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9831   const bool strict   /* Apply stricter parsing rules? */
9832  )
9833 {
9834
9835  /* This is expected to be called by a parser routine that has recognized '\N'
9836    and needs to handle the rest. RExC_parse is expected to point at the first
9837    char following the N at the time of the call.  On successful return,
9838    RExC_parse has been updated to point to just after the sequence identified
9839    by this routine, and <*flagp> has been updated.
9840
9841    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9842    character class.
9843
9844    \N may begin either a named sequence, or if outside a character class, mean
9845    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9846    attempted to decide which, and in the case of a named sequence, converted it
9847    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9848    where c1... are the characters in the sequence.  For single-quoted regexes,
9849    the tokenizer passes the \N sequence through unchanged; this code will not
9850    attempt to determine this nor expand those, instead raising a syntax error.
9851    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9852    or there is no '}', it signals that this \N occurrence means to match a
9853    non-newline.
9854
9855    Only the \N{U+...} form should occur in a character class, for the same
9856    reason that '.' inside a character class means to just match a period: it
9857    just doesn't make sense.
9858
9859    The function raises an error (via vFAIL), and doesn't return for various
9860    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
9861    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
9862    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
9863    only possible if node_p is non-NULL.
9864
9865
9866    If <valuep> is non-null, it means the caller can accept an input sequence
9867    consisting of a just a single code point; <*valuep> is set to that value
9868    if the input is such.
9869
9870    If <node_p> is non-null it signifies that the caller can accept any other
9871    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
9872    is set as follows:
9873  1) \N means not-a-NL: points to a newly created REG_ANY node;
9874  2) \N{}:              points to a new NOTHING node;
9875  3) otherwise:         points to a new EXACT node containing the resolved
9876       string.
9877    Note that FALSE is returned for single code point sequences if <valuep> is
9878    null.
9879  */
9880
9881  char * endbrace;    /* '}' following the name */
9882  char* p;
9883  char *endchar; /* Points to '.' or '}' ending cur char in the input
9884       stream */
9885  bool has_multiple_chars; /* true if the input stream contains a sequence of
9886         more than one character */
9887
9888  GET_RE_DEBUG_FLAGS_DECL;
9889
9890  PERL_ARGS_ASSERT_GROK_BSLASH_N;
9891
9892  GET_RE_DEBUG_FLAGS;
9893
9894  assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
9895
9896  /* The [^\n] meaning of \N ignores spaces and comments under the /x
9897  * modifier.  The other meaning does not */
9898  p = (RExC_flags & RXf_PMf_EXTENDED)
9899   ? regwhite( pRExC_state, RExC_parse )
9900   : RExC_parse;
9901
9902  /* Disambiguate between \N meaning a named character versus \N meaning
9903  * [^\n].  The former is assumed when it can't be the latter. */
9904  if (*p != '{' || regcurly(p, FALSE)) {
9905   RExC_parse = p;
9906   if (! node_p) {
9907    /* no bare \N in a charclass */
9908    if (in_char_class) {
9909     vFAIL("\\N in a character class must be a named character: \\N{...}");
9910    }
9911    return FALSE;
9912   }
9913   nextchar(pRExC_state);
9914   *node_p = reg_node(pRExC_state, REG_ANY);
9915   *flagp |= HASWIDTH|SIMPLE;
9916   RExC_naughty++;
9917   RExC_parse--;
9918   Set_Node_Length(*node_p, 1); /* MJD */
9919   return TRUE;
9920  }
9921
9922  /* Here, we have decided it should be a named character or sequence */
9923
9924  /* The test above made sure that the next real character is a '{', but
9925  * under the /x modifier, it could be separated by space (or a comment and
9926  * \n) and this is not allowed (for consistency with \x{...} and the
9927  * tokenizer handling of \N{NAME}). */
9928  if (*RExC_parse != '{') {
9929   vFAIL("Missing braces on \\N{}");
9930  }
9931
9932  RExC_parse++; /* Skip past the '{' */
9933
9934  if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9935   || ! (endbrace == RExC_parse  /* nothing between the {} */
9936    || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9937     && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9938  {
9939   if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9940   vFAIL("\\N{NAME} must be resolved by the lexer");
9941  }
9942
9943  if (endbrace == RExC_parse) {   /* empty: \N{} */
9944   bool ret = TRUE;
9945   if (node_p) {
9946    *node_p = reg_node(pRExC_state,NOTHING);
9947   }
9948   else if (in_char_class) {
9949    if (SIZE_ONLY && in_char_class) {
9950     if (strict) {
9951      RExC_parse++;   /* Position after the "}" */
9952      vFAIL("Zero length \\N{}");
9953     }
9954     else {
9955      ckWARNreg(RExC_parse,
9956        "Ignoring zero length \\N{} in character class");
9957     }
9958    }
9959    ret = FALSE;
9960   }
9961   else {
9962    return FALSE;
9963   }
9964   nextchar(pRExC_state);
9965   return ret;
9966  }
9967
9968  RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9969  RExC_parse += 2; /* Skip past the 'U+' */
9970
9971  endchar = RExC_parse + strcspn(RExC_parse, ".}");
9972
9973  /* Code points are separated by dots.  If none, there is only one code
9974  * point, and is terminated by the brace */
9975  has_multiple_chars = (endchar < endbrace);
9976
9977  if (valuep && (! has_multiple_chars || in_char_class)) {
9978   /* We only pay attention to the first char of
9979   multichar strings being returned in char classes. I kinda wonder
9980   if this makes sense as it does change the behaviour
9981   from earlier versions, OTOH that behaviour was broken
9982   as well. XXX Solution is to recharacterize as
9983   [rest-of-class]|multi1|multi2... */
9984
9985   STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9986   I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9987    | PERL_SCAN_DISALLOW_PREFIX
9988    | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9989
9990   *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9991
9992   /* The tokenizer should have guaranteed validity, but it's possible to
9993   * bypass it by using single quoting, so check */
9994   if (length_of_hex == 0
9995    || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9996   {
9997    RExC_parse += length_of_hex; /* Includes all the valid */
9998    RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9999        ? UTF8SKIP(RExC_parse)
10000        : 1;
10001    /* Guard against malformed utf8 */
10002    if (RExC_parse >= endchar) {
10003     RExC_parse = endchar;
10004    }
10005    vFAIL("Invalid hexadecimal number in \\N{U+...}");
10006   }
10007
10008   if (in_char_class && has_multiple_chars) {
10009    if (strict) {
10010     RExC_parse = endbrace;
10011     vFAIL("\\N{} in character class restricted to one character");
10012    }
10013    else {
10014     ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10015    }
10016   }
10017
10018   RExC_parse = endbrace + 1;
10019  }
10020  else if (! node_p || ! has_multiple_chars) {
10021
10022   /* Here, the input is legal, but not according to the caller's
10023   * options.  We fail without advancing the parse, so that the
10024   * caller can try again */
10025   RExC_parse = p;
10026   return FALSE;
10027  }
10028  else {
10029
10030   /* What is done here is to convert this to a sub-pattern of the form
10031   * (?:\x{char1}\x{char2}...)
10032   * and then call reg recursively.  That way, it retains its atomicness,
10033   * while not having to worry about special handling that some code
10034   * points may have.  toke.c has converted the original Unicode values
10035   * to native, so that we can just pass on the hex values unchanged.  We
10036   * do have to set a flag to keep recoding from happening in the
10037   * recursion */
10038
10039   SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10040   STRLEN len;
10041   char *orig_end = RExC_end;
10042   I32 flags;
10043
10044   while (RExC_parse < endbrace) {
10045
10046    /* Convert to notation the rest of the code understands */
10047    sv_catpv(substitute_parse, "\\x{");
10048    sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10049    sv_catpv(substitute_parse, "}");
10050
10051    /* Point to the beginning of the next character in the sequence. */
10052    RExC_parse = endchar + 1;
10053    endchar = RExC_parse + strcspn(RExC_parse, ".}");
10054   }
10055   sv_catpv(substitute_parse, ")");
10056
10057   RExC_parse = SvPV(substitute_parse, len);
10058
10059   /* Don't allow empty number */
10060   if (len < 8) {
10061    vFAIL("Invalid hexadecimal number in \\N{U+...}");
10062   }
10063   RExC_end = RExC_parse + len;
10064
10065   /* The values are Unicode, and therefore not subject to recoding */
10066   RExC_override_recoding = 1;
10067
10068   if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10069    if (flags & RESTART_UTF8) {
10070     *flagp = RESTART_UTF8;
10071     return FALSE;
10072    }
10073    FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10074     (UV) flags);
10075   }
10076   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10077
10078   RExC_parse = endbrace;
10079   RExC_end = orig_end;
10080   RExC_override_recoding = 0;
10081
10082   nextchar(pRExC_state);
10083  }
10084
10085  return TRUE;
10086 }
10087
10088
10089 /*
10090  * reg_recode
10091  *
10092  * It returns the code point in utf8 for the value in *encp.
10093  *    value: a code value in the source encoding
10094  *    encp:  a pointer to an Encode object
10095  *
10096  * If the result from Encode is not a single character,
10097  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10098  */
10099 STATIC UV
10100 S_reg_recode(pTHX_ const char value, SV **encp)
10101 {
10102  STRLEN numlen = 1;
10103  SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10104  const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10105  const STRLEN newlen = SvCUR(sv);
10106  UV uv = UNICODE_REPLACEMENT;
10107
10108  PERL_ARGS_ASSERT_REG_RECODE;
10109
10110  if (newlen)
10111   uv = SvUTF8(sv)
10112    ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10113    : *(U8*)s;
10114
10115  if (!newlen || numlen != newlen) {
10116   uv = UNICODE_REPLACEMENT;
10117   *encp = NULL;
10118  }
10119  return uv;
10120 }
10121
10122 PERL_STATIC_INLINE U8
10123 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10124 {
10125  U8 op;
10126
10127  PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10128
10129  if (! FOLD) {
10130   return EXACT;
10131  }
10132
10133  op = get_regex_charset(RExC_flags);
10134  if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10135   op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10136     been, so there is no hole */
10137  }
10138
10139  return op + EXACTF;
10140 }
10141
10142 PERL_STATIC_INLINE void
10143 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10144 {
10145  /* This knows the details about sizing an EXACTish node, setting flags for
10146  * it (by setting <*flagp>, and potentially populating it with a single
10147  * character.
10148  *
10149  * If <len> (the length in bytes) is non-zero, this function assumes that
10150  * the node has already been populated, and just does the sizing.  In this
10151  * case <code_point> should be the final code point that has already been
10152  * placed into the node.  This value will be ignored except that under some
10153  * circumstances <*flagp> is set based on it.
10154  *
10155  * If <len> is zero, the function assumes that the node is to contain only
10156  * the single character given by <code_point> and calculates what <len>
10157  * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10158  * additionally will populate the node's STRING with <code_point>, if <len>
10159  * is 0.  In both cases <*flagp> is appropriately set
10160  *
10161  * It knows that under FOLD, the Latin Sharp S and UTF characters above
10162  * 255, must be folded (the former only when the rules indicate it can
10163  * match 'ss') */
10164
10165  bool len_passed_in = cBOOL(len != 0);
10166  U8 character[UTF8_MAXBYTES_CASE+1];
10167
10168  PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10169
10170  if (! len_passed_in) {
10171   if (UTF) {
10172    if (FOLD && (! LOC || code_point > 255)) {
10173     _to_uni_fold_flags(NATIVE_TO_UNI(code_point),
10174         character,
10175         &len,
10176         FOLD_FLAGS_FULL | ((LOC)
10177              ? FOLD_FLAGS_LOCALE
10178              : (ASCII_FOLD_RESTRICTED)
10179              ? FOLD_FLAGS_NOMIX_ASCII
10180              : 0));
10181    }
10182    else {
10183     uvchr_to_utf8( character, code_point);
10184     len = UTF8SKIP(character);
10185    }
10186   }
10187   else if (! FOLD
10188     || code_point != LATIN_SMALL_LETTER_SHARP_S
10189     || ASCII_FOLD_RESTRICTED
10190     || ! AT_LEAST_UNI_SEMANTICS)
10191   {
10192    *character = (U8) code_point;
10193    len = 1;
10194   }
10195   else {
10196    *character = 's';
10197    *(character + 1) = 's';
10198    len = 2;
10199   }
10200  }
10201
10202  if (SIZE_ONLY) {
10203   RExC_size += STR_SZ(len);
10204  }
10205  else {
10206   RExC_emit += STR_SZ(len);
10207   STR_LEN(node) = len;
10208   if (! len_passed_in) {
10209    Copy((char *) character, STRING(node), len, char);
10210   }
10211  }
10212
10213  *flagp |= HASWIDTH;
10214
10215  /* A single character node is SIMPLE, except for the special-cased SHARP S
10216  * under /di. */
10217  if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10218   && (code_point != LATIN_SMALL_LETTER_SHARP_S
10219    || ! FOLD || ! DEPENDS_SEMANTICS))
10220  {
10221   *flagp |= SIMPLE;
10222  }
10223 }
10224
10225 /*
10226  - regatom - the lowest level
10227
10228    Try to identify anything special at the start of the pattern. If there
10229    is, then handle it as required. This may involve generating a single regop,
10230    such as for an assertion; or it may involve recursing, such as to
10231    handle a () structure.
10232
10233    If the string doesn't start with something special then we gobble up
10234    as much literal text as we can.
10235
10236    Once we have been able to handle whatever type of thing started the
10237    sequence, we return.
10238
10239    Note: we have to be careful with escapes, as they can be both literal
10240    and special, and in the case of \10 and friends, context determines which.
10241
10242    A summary of the code structure is:
10243
10244    switch (first_byte) {
10245   cases for each special:
10246    handle this special;
10247    break;
10248   case '\\':
10249    switch (2nd byte) {
10250     cases for each unambiguous special:
10251      handle this special;
10252      break;
10253     cases for each ambigous special/literal:
10254      disambiguate;
10255      if (special)  handle here
10256      else goto defchar;
10257     default: // unambiguously literal:
10258      goto defchar;
10259    }
10260   default:  // is a literal char
10261    // FALL THROUGH
10262   defchar:
10263    create EXACTish node for literal;
10264    while (more input and node isn't full) {
10265     switch (input_byte) {
10266     cases for each special;
10267      make sure parse pointer is set so that the next call to
10268       regatom will see this special first
10269      goto loopdone; // EXACTish node terminated by prev. char
10270     default:
10271      append char to EXACTISH node;
10272     }
10273     get next input byte;
10274    }
10275   loopdone:
10276    }
10277    return the generated node;
10278
10279    Specifically there are two separate switches for handling
10280    escape sequences, with the one for handling literal escapes requiring
10281    a dummy entry for all of the special escapes that are actually handled
10282    by the other.
10283
10284    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10285    TRYAGAIN.
10286    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10287    restarted.
10288    Otherwise does not return NULL.
10289 */
10290
10291 STATIC regnode *
10292 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10293 {
10294  dVAR;
10295  regnode *ret = NULL;
10296  I32 flags = 0;
10297  char *parse_start = RExC_parse;
10298  U8 op;
10299  int invert = 0;
10300
10301  GET_RE_DEBUG_FLAGS_DECL;
10302
10303  *flagp = WORST;  /* Tentatively. */
10304
10305  DEBUG_PARSE("atom");
10306
10307  PERL_ARGS_ASSERT_REGATOM;
10308
10309 tryagain:
10310  switch ((U8)*RExC_parse) {
10311  case '^':
10312   RExC_seen_zerolen++;
10313   nextchar(pRExC_state);
10314   if (RExC_flags & RXf_PMf_MULTILINE)
10315    ret = reg_node(pRExC_state, MBOL);
10316   else if (RExC_flags & RXf_PMf_SINGLELINE)
10317    ret = reg_node(pRExC_state, SBOL);
10318   else
10319    ret = reg_node(pRExC_state, BOL);
10320   Set_Node_Length(ret, 1); /* MJD */
10321   break;
10322  case '$':
10323   nextchar(pRExC_state);
10324   if (*RExC_parse)
10325    RExC_seen_zerolen++;
10326   if (RExC_flags & RXf_PMf_MULTILINE)
10327    ret = reg_node(pRExC_state, MEOL);
10328   else if (RExC_flags & RXf_PMf_SINGLELINE)
10329    ret = reg_node(pRExC_state, SEOL);
10330   else
10331    ret = reg_node(pRExC_state, EOL);
10332   Set_Node_Length(ret, 1); /* MJD */
10333   break;
10334  case '.':
10335   nextchar(pRExC_state);
10336   if (RExC_flags & RXf_PMf_SINGLELINE)
10337    ret = reg_node(pRExC_state, SANY);
10338   else
10339    ret = reg_node(pRExC_state, REG_ANY);
10340   *flagp |= HASWIDTH|SIMPLE;
10341   RExC_naughty++;
10342   Set_Node_Length(ret, 1); /* MJD */
10343   break;
10344  case '[':
10345  {
10346   char * const oregcomp_parse = ++RExC_parse;
10347   ret = regclass(pRExC_state, flagp,depth+1,
10348      FALSE, /* means parse the whole char class */
10349      TRUE, /* allow multi-char folds */
10350      FALSE, /* don't silence non-portable warnings. */
10351      NULL);
10352   if (*RExC_parse != ']') {
10353    RExC_parse = oregcomp_parse;
10354    vFAIL("Unmatched [");
10355   }
10356   if (ret == NULL) {
10357    if (*flagp & RESTART_UTF8)
10358     return NULL;
10359    FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10360     (UV) *flagp);
10361   }
10362   nextchar(pRExC_state);
10363   Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10364   break;
10365  }
10366  case '(':
10367   nextchar(pRExC_state);
10368   ret = reg(pRExC_state, 2, &flags,depth+1);
10369   if (ret == NULL) {
10370     if (flags & TRYAGAIN) {
10371      if (RExC_parse == RExC_end) {
10372       /* Make parent create an empty node if needed. */
10373       *flagp |= TRYAGAIN;
10374       return(NULL);
10375      }
10376      goto tryagain;
10377     }
10378     if (flags & RESTART_UTF8) {
10379      *flagp = RESTART_UTF8;
10380      return NULL;
10381     }
10382     FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags);
10383   }
10384   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10385   break;
10386  case '|':
10387  case ')':
10388   if (flags & TRYAGAIN) {
10389    *flagp |= TRYAGAIN;
10390    return NULL;
10391   }
10392   vFAIL("Internal urp");
10393         /* Supposed to be caught earlier. */
10394   break;
10395  case '{':
10396   if (!regcurly(RExC_parse, FALSE)) {
10397    RExC_parse++;
10398    goto defchar;
10399   }
10400   /* FALL THROUGH */
10401  case '?':
10402  case '+':
10403  case '*':
10404   RExC_parse++;
10405   vFAIL("Quantifier follows nothing");
10406   break;
10407  case '\\':
10408   /* Special Escapes
10409
10410   This switch handles escape sequences that resolve to some kind
10411   of special regop and not to literal text. Escape sequnces that
10412   resolve to literal text are handled below in the switch marked
10413   "Literal Escapes".
10414
10415   Every entry in this switch *must* have a corresponding entry
10416   in the literal escape switch. However, the opposite is not
10417   required, as the default for this switch is to jump to the
10418   literal text handling code.
10419   */
10420   switch ((U8)*++RExC_parse) {
10421    U8 arg;
10422   /* Special Escapes */
10423   case 'A':
10424    RExC_seen_zerolen++;
10425    ret = reg_node(pRExC_state, SBOL);
10426    *flagp |= SIMPLE;
10427    goto finish_meta_pat;
10428   case 'G':
10429    ret = reg_node(pRExC_state, GPOS);
10430    RExC_seen |= REG_SEEN_GPOS;
10431    *flagp |= SIMPLE;
10432    goto finish_meta_pat;
10433   case 'K':
10434    RExC_seen_zerolen++;
10435    ret = reg_node(pRExC_state, KEEPS);
10436    *flagp |= SIMPLE;
10437    /* XXX:dmq : disabling in-place substitution seems to
10438    * be necessary here to avoid cases of memory corruption, as
10439    * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10440    */
10441    RExC_seen |= REG_SEEN_LOOKBEHIND;
10442    goto finish_meta_pat;
10443   case 'Z':
10444    ret = reg_node(pRExC_state, SEOL);
10445    *flagp |= SIMPLE;
10446    RExC_seen_zerolen++;  /* Do not optimize RE away */
10447    goto finish_meta_pat;
10448   case 'z':
10449    ret = reg_node(pRExC_state, EOS);
10450    *flagp |= SIMPLE;
10451    RExC_seen_zerolen++;  /* Do not optimize RE away */
10452    goto finish_meta_pat;
10453   case 'C':
10454    ret = reg_node(pRExC_state, CANY);
10455    RExC_seen |= REG_SEEN_CANY;
10456    *flagp |= HASWIDTH|SIMPLE;
10457    goto finish_meta_pat;
10458   case 'X':
10459    ret = reg_node(pRExC_state, CLUMP);
10460    *flagp |= HASWIDTH;
10461    goto finish_meta_pat;
10462
10463   case 'W':
10464    invert = 1;
10465    /* FALLTHROUGH */
10466   case 'w':
10467    arg = ANYOF_WORDCHAR;
10468    goto join_posix;
10469
10470   case 'b':
10471    RExC_seen_zerolen++;
10472    RExC_seen |= REG_SEEN_LOOKBEHIND;
10473    op = BOUND + get_regex_charset(RExC_flags);
10474    if (op > BOUNDA) {  /* /aa is same as /a */
10475     op = BOUNDA;
10476    }
10477    ret = reg_node(pRExC_state, op);
10478    FLAGS(ret) = get_regex_charset(RExC_flags);
10479    *flagp |= SIMPLE;
10480    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10481     ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10482    }
10483    goto finish_meta_pat;
10484   case 'B':
10485    RExC_seen_zerolen++;
10486    RExC_seen |= REG_SEEN_LOOKBEHIND;
10487    op = NBOUND + get_regex_charset(RExC_flags);
10488    if (op > NBOUNDA) { /* /aa is same as /a */
10489     op = NBOUNDA;
10490    }
10491    ret = reg_node(pRExC_state, op);
10492    FLAGS(ret) = get_regex_charset(RExC_flags);
10493    *flagp |= SIMPLE;
10494    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10495     ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10496    }
10497    goto finish_meta_pat;
10498
10499   case 'D':
10500    invert = 1;
10501    /* FALLTHROUGH */
10502   case 'd':
10503    arg = ANYOF_DIGIT;
10504    goto join_posix;
10505
10506   case 'R':
10507    ret = reg_node(pRExC_state, LNBREAK);
10508    *flagp |= HASWIDTH|SIMPLE;
10509    goto finish_meta_pat;
10510
10511   case 'H':
10512    invert = 1;
10513    /* FALLTHROUGH */
10514   case 'h':
10515    arg = ANYOF_BLANK;
10516    op = POSIXU;
10517    goto join_posix_op_known;
10518
10519   case 'V':
10520    invert = 1;
10521    /* FALLTHROUGH */
10522   case 'v':
10523    arg = ANYOF_VERTWS;
10524    op = POSIXU;
10525    goto join_posix_op_known;
10526
10527   case 'S':
10528    invert = 1;
10529    /* FALLTHROUGH */
10530   case 's':
10531    arg = ANYOF_SPACE;
10532
10533   join_posix:
10534
10535    op = POSIXD + get_regex_charset(RExC_flags);
10536    if (op > POSIXA) {  /* /aa is same as /a */
10537     op = POSIXA;
10538    }
10539
10540   join_posix_op_known:
10541
10542    if (invert) {
10543     op += NPOSIXD - POSIXD;
10544    }
10545
10546    ret = reg_node(pRExC_state, op);
10547    if (! SIZE_ONLY) {
10548     FLAGS(ret) = namedclass_to_classnum(arg);
10549    }
10550
10551    *flagp |= HASWIDTH|SIMPLE;
10552    /* FALL THROUGH */
10553
10554   finish_meta_pat:
10555    nextchar(pRExC_state);
10556    Set_Node_Length(ret, 2); /* MJD */
10557    break;
10558   case 'p':
10559   case 'P':
10560    {
10561 #ifdef DEBUGGING
10562     char* parse_start = RExC_parse - 2;
10563 #endif
10564
10565     RExC_parse--;
10566
10567     ret = regclass(pRExC_state, flagp,depth+1,
10568        TRUE, /* means just parse this element */
10569        FALSE, /* don't allow multi-char folds */
10570        FALSE, /* don't silence non-portable warnings.
10571           It would be a bug if these returned
10572           non-portables */
10573        NULL);
10574     /* regclass() can only return RESTART_UTF8 if multi-char folds
10575     are allowed.  */
10576     if (!ret)
10577      FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10578       (UV) *flagp);
10579
10580     RExC_parse--;
10581
10582     Set_Node_Offset(ret, parse_start + 2);
10583     Set_Node_Cur_Length(ret, parse_start);
10584     nextchar(pRExC_state);
10585    }
10586    break;
10587   case 'N':
10588    /* Handle \N and \N{NAME} with multiple code points here and not
10589    * below because it can be multicharacter. join_exact() will join
10590    * them up later on.  Also this makes sure that things like
10591    * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10592    * The options to the grok function call causes it to fail if the
10593    * sequence is just a single code point.  We then go treat it as
10594    * just another character in the current EXACT node, and hence it
10595    * gets uniform treatment with all the other characters.  The
10596    * special treatment for quantifiers is not needed for such single
10597    * character sequences */
10598    ++RExC_parse;
10599    if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10600         FALSE /* not strict */ )) {
10601     if (*flagp & RESTART_UTF8)
10602      return NULL;
10603     RExC_parse--;
10604     goto defchar;
10605    }
10606    break;
10607   case 'k':    /* Handle \k<NAME> and \k'NAME' */
10608   parse_named_seq:
10609   {
10610    char ch= RExC_parse[1];
10611    if (ch != '<' && ch != '\'' && ch != '{') {
10612     RExC_parse++;
10613     vFAIL2("Sequence %.2s... not terminated",parse_start);
10614    } else {
10615     /* this pretty much dupes the code for (?P=...) in reg(), if
10616     you change this make sure you change that */
10617     char* name_start = (RExC_parse += 2);
10618     U32 num = 0;
10619     SV *sv_dat = reg_scan_name(pRExC_state,
10620      SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10621     ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10622     if (RExC_parse == name_start || *RExC_parse != ch)
10623      vFAIL2("Sequence %.3s... not terminated",parse_start);
10624
10625     if (!SIZE_ONLY) {
10626      num = add_data( pRExC_state, 1, "S" );
10627      RExC_rxi->data->data[num]=(void*)sv_dat;
10628      SvREFCNT_inc_simple_void(sv_dat);
10629     }
10630
10631     RExC_sawback = 1;
10632     ret = reganode(pRExC_state,
10633        ((! FOLD)
10634         ? NREF
10635         : (ASCII_FOLD_RESTRICTED)
10636         ? NREFFA
10637         : (AT_LEAST_UNI_SEMANTICS)
10638          ? NREFFU
10639          : (LOC)
10640          ? NREFFL
10641          : NREFF),
10642         num);
10643     *flagp |= HASWIDTH;
10644
10645     /* override incorrect value set in reganode MJD */
10646     Set_Node_Offset(ret, parse_start+1);
10647     Set_Node_Cur_Length(ret, parse_start);
10648     nextchar(pRExC_state);
10649
10650    }
10651    break;
10652   }
10653   case 'g':
10654   case '1': case '2': case '3': case '4':
10655   case '5': case '6': case '7': case '8': case '9':
10656    {
10657     I32 num;
10658     bool isg = *RExC_parse == 'g';
10659     bool isrel = 0;
10660     bool hasbrace = 0;
10661     if (isg) {
10662      RExC_parse++;
10663      if (*RExC_parse == '{') {
10664       RExC_parse++;
10665       hasbrace = 1;
10666      }
10667      if (*RExC_parse == '-') {
10668       RExC_parse++;
10669       isrel = 1;
10670      }
10671      if (hasbrace && !isDIGIT(*RExC_parse)) {
10672       if (isrel) RExC_parse--;
10673       RExC_parse -= 2;
10674       goto parse_named_seq;
10675     }   }
10676     num = atoi(RExC_parse);
10677     if (isg && num == 0) {
10678      if (*RExC_parse == '0') {
10679       vFAIL("Reference to invalid group 0");
10680      }
10681      else {
10682       vFAIL("Unterminated \\g... pattern");
10683      }
10684     }
10685     if (isrel) {
10686      num = RExC_npar - num;
10687      if (num < 1)
10688       vFAIL("Reference to nonexistent or unclosed group");
10689     }
10690     if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9')
10691      /* Probably a character specified in octal, e.g. \35 */
10692      goto defchar;
10693     else {
10694 #ifdef RE_TRACK_PATTERN_OFFSETS
10695      char * const parse_start = RExC_parse - 1; /* MJD */
10696 #endif
10697      while (isDIGIT(*RExC_parse))
10698       RExC_parse++;
10699      if (hasbrace) {
10700       if (*RExC_parse != '}')
10701        vFAIL("Unterminated \\g{...} pattern");
10702       RExC_parse++;
10703      }
10704      if (!SIZE_ONLY) {
10705       if (num > (I32)RExC_rx->nparens)
10706        vFAIL("Reference to nonexistent group");
10707      }
10708      RExC_sawback = 1;
10709      ret = reganode(pRExC_state,
10710         ((! FOLD)
10711          ? REF
10712          : (ASCII_FOLD_RESTRICTED)
10713          ? REFFA
10714          : (AT_LEAST_UNI_SEMANTICS)
10715           ? REFFU
10716           : (LOC)
10717           ? REFFL
10718           : REFF),
10719          num);
10720      *flagp |= HASWIDTH;
10721
10722      /* override incorrect value set in reganode MJD */
10723      Set_Node_Offset(ret, parse_start+1);
10724      Set_Node_Cur_Length(ret, parse_start);
10725      RExC_parse--;
10726      nextchar(pRExC_state);
10727     }
10728    }
10729    break;
10730   case '\0':
10731    if (RExC_parse >= RExC_end)
10732     FAIL("Trailing \\");
10733    /* FALL THROUGH */
10734   default:
10735    /* Do not generate "unrecognized" warnings here, we fall
10736    back into the quick-grab loop below */
10737    parse_start--;
10738    goto defchar;
10739   }
10740   break;
10741
10742  case '#':
10743   if (RExC_flags & RXf_PMf_EXTENDED) {
10744    if ( reg_skipcomment( pRExC_state ) )
10745     goto tryagain;
10746   }
10747   /* FALL THROUGH */
10748
10749  default:
10750
10751    parse_start = RExC_parse - 1;
10752
10753    RExC_parse++;
10754
10755   defchar: {
10756    STRLEN len = 0;
10757    UV ender = 0;
10758    char *p;
10759    char *s;
10760 #define MAX_NODE_STRING_SIZE 127
10761    char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10762    char *s0;
10763    U8 upper_parse = MAX_NODE_STRING_SIZE;
10764    STRLEN foldlen;
10765    U8 node_type = compute_EXACTish(pRExC_state);
10766    bool next_is_quantifier;
10767    char * oldp = NULL;
10768
10769    /* We can convert EXACTF nodes to EXACTFU if they contain only
10770    * characters that match identically regardless of the target
10771    * string's UTF8ness.  The reason to do this is that EXACTF is not
10772    * trie-able, EXACTFU is.  (We don't need to figure this out until
10773    * pass 2) */
10774    bool maybe_exactfu = node_type == EXACTF && PASS2;
10775
10776    /* If a folding node contains only code points that don't
10777    * participate in folds, it can be changed into an EXACT node,
10778    * which allows the optimizer more things to look for */
10779    bool maybe_exact;
10780
10781    ret = reg_node(pRExC_state, node_type);
10782
10783    /* In pass1, folded, we use a temporary buffer instead of the
10784    * actual node, as the node doesn't exist yet */
10785    s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10786
10787    s0 = s;
10788
10789   reparse:
10790
10791    /* We do the EXACTFish to EXACT node only if folding, and not if in
10792    * locale, as whether a character folds or not isn't known until
10793    * runtime.  (And we don't need to figure this out until pass 2) */
10794    maybe_exact = FOLD && ! LOC && PASS2;
10795
10796    /* XXX The node can hold up to 255 bytes, yet this only goes to
10797    * 127.  I (khw) do not know why.  Keeping it somewhat less than
10798    * 255 allows us to not have to worry about overflow due to
10799    * converting to utf8 and fold expansion, but that value is
10800    * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10801    * split up by this limit into a single one using the real max of
10802    * 255.  Even at 127, this breaks under rare circumstances.  If
10803    * folding, we do not want to split a node at a character that is a
10804    * non-final in a multi-char fold, as an input string could just
10805    * happen to want to match across the node boundary.  The join
10806    * would solve that problem if the join actually happens.  But a
10807    * series of more than two nodes in a row each of 127 would cause
10808    * the first join to succeed to get to 254, but then there wouldn't
10809    * be room for the next one, which could at be one of those split
10810    * multi-char folds.  I don't know of any fool-proof solution.  One
10811    * could back off to end with only a code point that isn't such a
10812    * non-final, but it is possible for there not to be any in the
10813    * entire node. */
10814    for (p = RExC_parse - 1;
10815     len < upper_parse && p < RExC_end;
10816     len++)
10817    {
10818     oldp = p;
10819
10820     if (RExC_flags & RXf_PMf_EXTENDED)
10821      p = regwhite( pRExC_state, p );
10822     switch ((U8)*p) {
10823     case '^':
10824     case '$':
10825     case '.':
10826     case '[':
10827     case '(':
10828     case ')':
10829     case '|':
10830      goto loopdone;
10831     case '\\':
10832      /* Literal Escapes Switch
10833
10834      This switch is meant to handle escape sequences that
10835      resolve to a literal character.
10836
10837      Every escape sequence that represents something
10838      else, like an assertion or a char class, is handled
10839      in the switch marked 'Special Escapes' above in this
10840      routine, but also has an entry here as anything that
10841      isn't explicitly mentioned here will be treated as
10842      an unescaped equivalent literal.
10843      */
10844
10845      switch ((U8)*++p) {
10846      /* These are all the special escapes. */
10847      case 'A':             /* Start assertion */
10848      case 'b': case 'B':   /* Word-boundary assertion*/
10849      case 'C':             /* Single char !DANGEROUS! */
10850      case 'd': case 'D':   /* digit class */
10851      case 'g': case 'G':   /* generic-backref, pos assertion */
10852      case 'h': case 'H':   /* HORIZWS */
10853      case 'k': case 'K':   /* named backref, keep marker */
10854      case 'p': case 'P':   /* Unicode property */
10855        case 'R':   /* LNBREAK */
10856      case 's': case 'S':   /* space class */
10857      case 'v': case 'V':   /* VERTWS */
10858      case 'w': case 'W':   /* word class */
10859      case 'X':             /* eXtended Unicode "combining character sequence" */
10860      case 'z': case 'Z':   /* End of line/string assertion */
10861       --p;
10862       goto loopdone;
10863
10864      /* Anything after here is an escape that resolves to a
10865      literal. (Except digits, which may or may not)
10866      */
10867      case 'n':
10868       ender = '\n';
10869       p++;
10870       break;
10871      case 'N': /* Handle a single-code point named character. */
10872       /* The options cause it to fail if a multiple code
10873       * point sequence.  Handle those in the switch() above
10874       * */
10875       RExC_parse = p + 1;
10876       if (! grok_bslash_N(pRExC_state, NULL, &ender,
10877            flagp, depth, FALSE,
10878            FALSE /* not strict */ ))
10879       {
10880        if (*flagp & RESTART_UTF8)
10881         FAIL("panic: grok_bslash_N set RESTART_UTF8");
10882        RExC_parse = p = oldp;
10883        goto loopdone;
10884       }
10885       p = RExC_parse;
10886       if (ender > 0xff) {
10887        REQUIRE_UTF8;
10888       }
10889       break;
10890      case 'r':
10891       ender = '\r';
10892       p++;
10893       break;
10894      case 't':
10895       ender = '\t';
10896       p++;
10897       break;
10898      case 'f':
10899       ender = '\f';
10900       p++;
10901       break;
10902      case 'e':
10903       ender = ASCII_TO_NATIVE('\033');
10904       p++;
10905       break;
10906      case 'a':
10907       ender = ASCII_TO_NATIVE('\007');
10908       p++;
10909       break;
10910      case 'o':
10911       {
10912        UV result;
10913        const char* error_msg;
10914
10915        bool valid = grok_bslash_o(&p,
10916              &result,
10917              &error_msg,
10918              TRUE, /* out warnings */
10919              FALSE, /* not strict */
10920              TRUE, /* Output warnings
10921                 for non-
10922                 portables */
10923              UTF);
10924        if (! valid) {
10925         RExC_parse = p; /* going to die anyway; point
10926             to exact spot of failure */
10927         vFAIL(error_msg);
10928        }
10929        ender = result;
10930        if (PL_encoding && ender < 0x100) {
10931         goto recode_encoding;
10932        }
10933        if (ender > 0xff) {
10934         REQUIRE_UTF8;
10935        }
10936        break;
10937       }
10938      case 'x':
10939       {
10940        UV result = UV_MAX; /* initialize to erroneous
10941             value */
10942        const char* error_msg;
10943
10944        bool valid = grok_bslash_x(&p,
10945              &result,
10946              &error_msg,
10947              TRUE, /* out warnings */
10948              FALSE, /* not strict */
10949              TRUE, /* Output warnings
10950                 for non-
10951                 portables */
10952              UTF);
10953        if (! valid) {
10954         RExC_parse = p; /* going to die anyway; point
10955             to exact spot of failure */
10956         vFAIL(error_msg);
10957        }
10958        ender = result;
10959
10960        if (PL_encoding && ender < 0x100) {
10961         goto recode_encoding;
10962        }
10963        if (ender > 0xff) {
10964         REQUIRE_UTF8;
10965        }
10966        break;
10967       }
10968      case 'c':
10969       p++;
10970       ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10971       break;
10972      case '8': case '9': /* must be a backreference */
10973       --p;
10974       goto loopdone;
10975      case '1': case '2': case '3':case '4':
10976      case '5': case '6': case '7':
10977       /* When we parse backslash escapes there is ambiguity between
10978       * backreferences and octal escapes. Any escape from \1 - \9 is
10979       * a backreference, any multi-digit escape which does not start with
10980       * 0 and which when evaluated as decimal could refer to an already
10981       * parsed capture buffer is a backslash. Anything else is octal.
10982       *
10983       * Note this implies that \118 could be interpreted as 118 OR as
10984       * "\11" . "8" depending on whether there were 118 capture buffers
10985       * defined already in the pattern.
10986       */
10987       if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar )
10988       {  /* Not to be treated as an octal constant, go
10989         find backref */
10990        --p;
10991        goto loopdone;
10992       }
10993      case '0':
10994       {
10995        I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10996        STRLEN numlen = 3;
10997        ender = grok_oct(p, &numlen, &flags, NULL);
10998        if (ender > 0xff) {
10999         REQUIRE_UTF8;
11000        }
11001        p += numlen;
11002        if (SIZE_ONLY   /* like \08, \178 */
11003         && numlen < 3
11004         && p < RExC_end
11005         && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11006        {
11007         reg_warn_non_literal_string(
11008           p + 1,
11009           form_short_octal_warning(p, numlen));
11010        }
11011       }
11012       if (PL_encoding && ender < 0x100)
11013        goto recode_encoding;
11014       break;
11015      recode_encoding:
11016       if (! RExC_override_recoding) {
11017        SV* enc = PL_encoding;
11018        ender = reg_recode((const char)(U8)ender, &enc);
11019        if (!enc && SIZE_ONLY)
11020         ckWARNreg(p, "Invalid escape in the specified encoding");
11021        REQUIRE_UTF8;
11022       }
11023       break;
11024      case '\0':
11025       if (p >= RExC_end)
11026        FAIL("Trailing \\");
11027       /* FALL THROUGH */
11028      default:
11029       if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11030        /* Include any { following the alpha to emphasize
11031        * that it could be part of an escape at some point
11032        * in the future */
11033        int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11034        ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11035       }
11036       goto normal_default;
11037      } /* End of switch on '\' */
11038      break;
11039     default:    /* A literal character */
11040
11041      if (! SIZE_ONLY
11042       && RExC_flags & RXf_PMf_EXTENDED
11043       && ckWARN_d(WARN_DEPRECATED)
11044       && is_PATWS_non_low(p, UTF))
11045      {
11046       vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11047         "Escape literal pattern white space under /x");
11048      }
11049
11050     normal_default:
11051      if (UTF8_IS_START(*p) && UTF) {
11052       STRLEN numlen;
11053       ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11054            &numlen, UTF8_ALLOW_DEFAULT);
11055       p += numlen;
11056      }
11057      else
11058       ender = (U8) *p++;
11059      break;
11060     } /* End of switch on the literal */
11061
11062     /* Here, have looked at the literal character and <ender>
11063     * contains its ordinal, <p> points to the character after it
11064     */
11065
11066     if ( RExC_flags & RXf_PMf_EXTENDED)
11067      p = regwhite( pRExC_state, p );
11068
11069     /* If the next thing is a quantifier, it applies to this
11070     * character only, which means that this character has to be in
11071     * its own node and can't just be appended to the string in an
11072     * existing node, so if there are already other characters in
11073     * the node, close the node with just them, and set up to do
11074     * this character again next time through, when it will be the
11075     * only thing in its new node */
11076     if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11077     {
11078      p = oldp;
11079      goto loopdone;
11080     }
11081
11082     if (! FOLD) {
11083      if (UTF) {
11084       const STRLEN unilen = reguni(pRExC_state, ender, s);
11085       if (unilen > 0) {
11086       s   += unilen;
11087       len += unilen;
11088       }
11089
11090       /* The loop increments <len> each time, as all but this
11091       * path (and one other) through it add a single byte to
11092       * the EXACTish node.  But this one has changed len to
11093       * be the correct final value, so subtract one to
11094       * cancel out the increment that follows */
11095       len--;
11096      }
11097      else {
11098       REGC((char)ender, s++);
11099      }
11100     }
11101     else /* FOLD */
11102      if (! ( UTF
11103       /* See comments for join_exact() as to why we fold this
11104       * non-UTF at compile time */
11105       || (node_type == EXACTFU
11106        && ender == LATIN_SMALL_LETTER_SHARP_S)))
11107     {
11108      if (IS_IN_SOME_FOLD_L1(ender)) {
11109       maybe_exact = FALSE;
11110
11111       /* See if the character's fold differs between /d and
11112       * /u.  This includes the multi-char fold SHARP S to
11113       * 'ss' */
11114       if (maybe_exactfu
11115        && (PL_fold[ender] != PL_fold_latin1[ender]
11116         || ender == LATIN_SMALL_LETTER_SHARP_S
11117         || (len > 0
11118         && isARG2_lower_or_UPPER_ARG1('s', ender)
11119         && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
11120       {
11121        maybe_exactfu = FALSE;
11122       }
11123      }
11124      *(s++) = (char) ender;
11125     }
11126     else {  /* UTF */
11127
11128      /* Prime the casefolded buffer.  Locale rules, which apply
11129      * only to code points < 256, aren't known until execution,
11130      * so for them, just output the original character using
11131      * utf8.  If we start to fold non-UTF patterns, be sure to
11132      * update join_exact() */
11133      if (LOC && ender < 256) {
11134       if (UNI_IS_INVARIANT(ender)) {
11135        *s = (U8) ender;
11136        foldlen = 1;
11137       } else {
11138        *s = UTF8_TWO_BYTE_HI(ender);
11139        *(s + 1) = UTF8_TWO_BYTE_LO(ender);
11140        foldlen = 2;
11141       }
11142      }
11143      else {
11144       UV folded = _to_uni_fold_flags(
11145          ender,
11146          (U8 *) s,
11147          &foldlen,
11148          FOLD_FLAGS_FULL
11149          | ((LOC) ?  FOLD_FLAGS_LOCALE
11150             : (ASCII_FOLD_RESTRICTED)
11151             ? FOLD_FLAGS_NOMIX_ASCII
11152             : 0)
11153           );
11154
11155       /* If this node only contains non-folding code points
11156       * so far, see if this new one is also non-folding */
11157       if (maybe_exact) {
11158        if (folded != ender) {
11159         maybe_exact = FALSE;
11160        }
11161        else {
11162         /* Here the fold is the original; we have
11163         * to check further to see if anything
11164         * folds to it */
11165         if (! PL_utf8_foldable) {
11166          SV* swash = swash_init("utf8",
11167              "_Perl_Any_Folds",
11168              &PL_sv_undef, 1, 0);
11169          PL_utf8_foldable =
11170             _get_swash_invlist(swash);
11171          SvREFCNT_dec_NN(swash);
11172         }
11173         if (_invlist_contains_cp(PL_utf8_foldable,
11174               ender))
11175         {
11176          maybe_exact = FALSE;
11177         }
11178        }
11179       }
11180       ender = folded;
11181      }
11182      s += foldlen;
11183
11184      /* The loop increments <len> each time, as all but this
11185      * path (and one other) through it add a single byte to the
11186      * EXACTish node.  But this one has changed len to be the
11187      * correct final value, so subtract one to cancel out the
11188      * increment that follows */
11189      len += foldlen - 1;
11190     }
11191
11192     if (next_is_quantifier) {
11193
11194      /* Here, the next input is a quantifier, and to get here,
11195      * the current character is the only one in the node.
11196      * Also, here <len> doesn't include the final byte for this
11197      * character */
11198      len++;
11199      goto loopdone;
11200     }
11201
11202    } /* End of loop through literal characters */
11203
11204    /* Here we have either exhausted the input or ran out of room in
11205    * the node.  (If we encountered a character that can't be in the
11206    * node, transfer is made directly to <loopdone>, and so we
11207    * wouldn't have fallen off the end of the loop.)  In the latter
11208    * case, we artificially have to split the node into two, because
11209    * we just don't have enough space to hold everything.  This
11210    * creates a problem if the final character participates in a
11211    * multi-character fold in the non-final position, as a match that
11212    * should have occurred won't, due to the way nodes are matched,
11213    * and our artificial boundary.  So back off until we find a non-
11214    * problematic character -- one that isn't at the beginning or
11215    * middle of such a fold.  (Either it doesn't participate in any
11216    * folds, or appears only in the final position of all the folds it
11217    * does participate in.)  A better solution with far fewer false
11218    * positives, and that would fill the nodes more completely, would
11219    * be to actually have available all the multi-character folds to
11220    * test against, and to back-off only far enough to be sure that
11221    * this node isn't ending with a partial one.  <upper_parse> is set
11222    * further below (if we need to reparse the node) to include just
11223    * up through that final non-problematic character that this code
11224    * identifies, so when it is set to less than the full node, we can
11225    * skip the rest of this */
11226    if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11227
11228     const STRLEN full_len = len;
11229
11230     assert(len >= MAX_NODE_STRING_SIZE);
11231
11232     /* Here, <s> points to the final byte of the final character.
11233     * Look backwards through the string until find a non-
11234     * problematic character */
11235
11236     if (! UTF) {
11237
11238      /* These two have no multi-char folds to non-UTF characters
11239      */
11240      if (ASCII_FOLD_RESTRICTED || LOC) {
11241       goto loopdone;
11242      }
11243
11244      while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11245      len = s - s0 + 1;
11246     }
11247     else {
11248      if (!  PL_NonL1NonFinalFold) {
11249       PL_NonL1NonFinalFold = _new_invlist_C_array(
11250           NonL1_Perl_Non_Final_Folds_invlist);
11251      }
11252
11253      /* Point to the first byte of the final character */
11254      s = (char *) utf8_hop((U8 *) s, -1);
11255
11256      while (s >= s0) {   /* Search backwards until find
11257           non-problematic char */
11258       if (UTF8_IS_INVARIANT(*s)) {
11259
11260        /* There are no ascii characters that participate
11261        * in multi-char folds under /aa.  In EBCDIC, the
11262        * non-ascii invariants are all control characters,
11263        * so don't ever participate in any folds. */
11264        if (ASCII_FOLD_RESTRICTED
11265         || ! IS_NON_FINAL_FOLD(*s))
11266        {
11267         break;
11268        }
11269       }
11270       else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11271
11272        /* No Latin1 characters participate in multi-char
11273        * folds under /l */
11274        if (LOC
11275         || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
11276                 *s, *(s+1))))
11277        {
11278         break;
11279        }
11280       }
11281       else if (! _invlist_contains_cp(
11282           PL_NonL1NonFinalFold,
11283           valid_utf8_to_uvchr((U8 *) s, NULL)))
11284       {
11285        break;
11286       }
11287
11288       /* Here, the current character is problematic in that
11289       * it does occur in the non-final position of some
11290       * fold, so try the character before it, but have to
11291       * special case the very first byte in the string, so
11292       * we don't read outside the string */
11293       s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11294      } /* End of loop backwards through the string */
11295
11296      /* If there were only problematic characters in the string,
11297      * <s> will point to before s0, in which case the length
11298      * should be 0, otherwise include the length of the
11299      * non-problematic character just found */
11300      len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11301     }
11302
11303     /* Here, have found the final character, if any, that is
11304     * non-problematic as far as ending the node without splitting
11305     * it across a potential multi-char fold.  <len> contains the
11306     * number of bytes in the node up-to and including that
11307     * character, or is 0 if there is no such character, meaning
11308     * the whole node contains only problematic characters.  In
11309     * this case, give up and just take the node as-is.  We can't
11310     * do any better */
11311     if (len == 0) {
11312      len = full_len;
11313
11314      /* If the node ends in an 's' we make sure it stays EXACTF,
11315      * as if it turns into an EXACTFU, it could later get
11316      * joined with another 's' that would then wrongly match
11317      * the sharp s */
11318      if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
11319      {
11320       maybe_exactfu = FALSE;
11321      }
11322     } else {
11323
11324      /* Here, the node does contain some characters that aren't
11325      * problematic.  If one such is the final character in the
11326      * node, we are done */
11327      if (len == full_len) {
11328       goto loopdone;
11329      }
11330      else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11331
11332       /* If the final character is problematic, but the
11333       * penultimate is not, back-off that last character to
11334       * later start a new node with it */
11335       p = oldp;
11336       goto loopdone;
11337      }
11338
11339      /* Here, the final non-problematic character is earlier
11340      * in the input than the penultimate character.  What we do
11341      * is reparse from the beginning, going up only as far as
11342      * this final ok one, thus guaranteeing that the node ends
11343      * in an acceptable character.  The reason we reparse is
11344      * that we know how far in the character is, but we don't
11345      * know how to correlate its position with the input parse.
11346      * An alternate implementation would be to build that
11347      * correlation as we go along during the original parse,
11348      * but that would entail extra work for every node, whereas
11349      * this code gets executed only when the string is too
11350      * large for the node, and the final two characters are
11351      * problematic, an infrequent occurrence.  Yet another
11352      * possible strategy would be to save the tail of the
11353      * string, and the next time regatom is called, initialize
11354      * with that.  The problem with this is that unless you
11355      * back off one more character, you won't be guaranteed
11356      * regatom will get called again, unless regbranch,
11357      * regpiece ... are also changed.  If you do back off that
11358      * extra character, so that there is input guaranteed to
11359      * force calling regatom, you can't handle the case where
11360      * just the first character in the node is acceptable.  I
11361      * (khw) decided to try this method which doesn't have that
11362      * pitfall; if performance issues are found, we can do a
11363      * combination of the current approach plus that one */
11364      upper_parse = len;
11365      len = 0;
11366      s = s0;
11367      goto reparse;
11368     }
11369    }   /* End of verifying node ends with an appropriate char */
11370
11371   loopdone:   /* Jumped to when encounters something that shouldn't be in
11372      the node */
11373
11374    /* I (khw) don't know if you can get here with zero length, but the
11375    * old code handled this situation by creating a zero-length EXACT
11376    * node.  Might as well be NOTHING instead */
11377    if (len == 0) {
11378     OP(ret) = NOTHING;
11379    }
11380    else {
11381     if (FOLD) {
11382      /* If 'maybe_exact' is still set here, means there are no
11383      * code points in the node that participate in folds;
11384      * similarly for 'maybe_exactfu' and code points that match
11385      * differently depending on UTF8ness of the target string
11386      * */
11387      if (maybe_exact) {
11388       OP(ret) = EXACT;
11389      }
11390      else if (maybe_exactfu) {
11391       OP(ret) = EXACTFU;
11392      }
11393     }
11394     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11395    }
11396
11397    RExC_parse = p - 1;
11398    Set_Node_Cur_Length(ret, parse_start);
11399    nextchar(pRExC_state);
11400    {
11401     /* len is STRLEN which is unsigned, need to copy to signed */
11402     IV iv = len;
11403     if (iv < 0)
11404      vFAIL("Internal disaster");
11405    }
11406
11407   } /* End of label 'defchar:' */
11408   break;
11409  } /* End of giant switch on input character */
11410
11411  return(ret);
11412 }
11413
11414 STATIC char *
11415 S_regwhite( RExC_state_t *pRExC_state, char *p )
11416 {
11417  const char *e = RExC_end;
11418
11419  PERL_ARGS_ASSERT_REGWHITE;
11420
11421  while (p < e) {
11422   if (isSPACE(*p))
11423    ++p;
11424   else if (*p == '#') {
11425    bool ended = 0;
11426    do {
11427     if (*p++ == '\n') {
11428      ended = 1;
11429      break;
11430     }
11431    } while (p < e);
11432    if (!ended)
11433     RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11434   }
11435   else
11436    break;
11437  }
11438  return p;
11439 }
11440
11441 STATIC char *
11442 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11443 {
11444  /* Returns the next non-pattern-white space, non-comment character (the
11445  * latter only if 'recognize_comment is true) in the string p, which is
11446  * ended by RExC_end.  If there is no line break ending a comment,
11447  * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11448  const char *e = RExC_end;
11449
11450  PERL_ARGS_ASSERT_REGPATWS;
11451
11452  while (p < e) {
11453   STRLEN len;
11454   if ((len = is_PATWS_safe(p, e, UTF))) {
11455    p += len;
11456   }
11457   else if (recognize_comment && *p == '#') {
11458    bool ended = 0;
11459    do {
11460     p++;
11461     if (is_LNBREAK_safe(p, e, UTF)) {
11462      ended = 1;
11463      break;
11464     }
11465    } while (p < e);
11466    if (!ended)
11467     RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11468   }
11469   else
11470    break;
11471  }
11472  return p;
11473 }
11474
11475 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11476    Character classes ([:foo:]) can also be negated ([:^foo:]).
11477    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11478    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11479    but trigger failures because they are currently unimplemented. */
11480
11481 #define POSIXCC_DONE(c)   ((c) == ':')
11482 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11483 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11484
11485 PERL_STATIC_INLINE I32
11486 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
11487 {
11488  dVAR;
11489  I32 namedclass = OOB_NAMEDCLASS;
11490
11491  PERL_ARGS_ASSERT_REGPPOSIXCC;
11492
11493  if (value == '[' && RExC_parse + 1 < RExC_end &&
11494   /* I smell either [: or [= or [. -- POSIX has been here, right? */
11495   POSIXCC(UCHARAT(RExC_parse)))
11496  {
11497   const char c = UCHARAT(RExC_parse);
11498   char* const s = RExC_parse++;
11499
11500   while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11501    RExC_parse++;
11502   if (RExC_parse == RExC_end) {
11503    if (strict) {
11504
11505     /* Try to give a better location for the error (than the end of
11506     * the string) by looking for the matching ']' */
11507     RExC_parse = s;
11508     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11509      RExC_parse++;
11510     }
11511     vFAIL2("Unmatched '%c' in POSIX class", c);
11512    }
11513    /* Grandfather lone [:, [=, [. */
11514    RExC_parse = s;
11515   }
11516   else {
11517    const char* const t = RExC_parse++; /* skip over the c */
11518    assert(*t == c);
11519
11520    if (UCHARAT(RExC_parse) == ']') {
11521     const char *posixcc = s + 1;
11522     RExC_parse++; /* skip over the ending ] */
11523
11524     if (*s == ':') {
11525      const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11526      const I32 skip = t - posixcc;
11527
11528      /* Initially switch on the length of the name.  */
11529      switch (skip) {
11530      case 4:
11531       if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11532               this is the Perl \w
11533               */
11534        namedclass = ANYOF_WORDCHAR;
11535       break;
11536      case 5:
11537       /* Names all of length 5.  */
11538       /* alnum alpha ascii blank cntrl digit graph lower
11539       print punct space upper  */
11540       /* Offset 4 gives the best switch position.  */
11541       switch (posixcc[4]) {
11542       case 'a':
11543        if (memEQ(posixcc, "alph", 4)) /* alpha */
11544         namedclass = ANYOF_ALPHA;
11545        break;
11546       case 'e':
11547        if (memEQ(posixcc, "spac", 4)) /* space */
11548         namedclass = ANYOF_PSXSPC;
11549        break;
11550       case 'h':
11551        if (memEQ(posixcc, "grap", 4)) /* graph */
11552         namedclass = ANYOF_GRAPH;
11553        break;
11554       case 'i':
11555        if (memEQ(posixcc, "asci", 4)) /* ascii */
11556         namedclass = ANYOF_ASCII;
11557        break;
11558       case 'k':
11559        if (memEQ(posixcc, "blan", 4)) /* blank */
11560         namedclass = ANYOF_BLANK;
11561        break;
11562       case 'l':
11563        if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11564         namedclass = ANYOF_CNTRL;
11565        break;
11566       case 'm':
11567        if (memEQ(posixcc, "alnu", 4)) /* alnum */
11568         namedclass = ANYOF_ALPHANUMERIC;
11569        break;
11570       case 'r':
11571        if (memEQ(posixcc, "lowe", 4)) /* lower */
11572         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11573        else if (memEQ(posixcc, "uppe", 4)) /* upper */
11574         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11575        break;
11576       case 't':
11577        if (memEQ(posixcc, "digi", 4)) /* digit */
11578         namedclass = ANYOF_DIGIT;
11579        else if (memEQ(posixcc, "prin", 4)) /* print */
11580         namedclass = ANYOF_PRINT;
11581        else if (memEQ(posixcc, "punc", 4)) /* punct */
11582         namedclass = ANYOF_PUNCT;
11583        break;
11584       }
11585       break;
11586      case 6:
11587       if (memEQ(posixcc, "xdigit", 6))
11588        namedclass = ANYOF_XDIGIT;
11589       break;
11590      }
11591
11592      if (namedclass == OOB_NAMEDCLASS)
11593       Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11594          t - s - 1, s + 1);
11595
11596      /* The #defines are structured so each complement is +1 to
11597      * the normal one */
11598      if (complement) {
11599       namedclass++;
11600      }
11601      assert (posixcc[skip] == ':');
11602      assert (posixcc[skip+1] == ']');
11603     } else if (!SIZE_ONLY) {
11604      /* [[=foo=]] and [[.foo.]] are still future. */
11605
11606      /* adjust RExC_parse so the warning shows after
11607      the class closes */
11608      while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11609       RExC_parse++;
11610      vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11611     }
11612    } else {
11613     /* Maternal grandfather:
11614     * "[:" ending in ":" but not in ":]" */
11615     if (strict) {
11616      vFAIL("Unmatched '[' in POSIX class");
11617     }
11618
11619     /* Grandfather lone [:, [=, [. */
11620     RExC_parse = s;
11621    }
11622   }
11623  }
11624
11625  return namedclass;
11626 }
11627
11628 STATIC bool
11629 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
11630 {
11631  /* This applies some heuristics at the current parse position (which should
11632  * be at a '[') to see if what follows might be intended to be a [:posix:]
11633  * class.  It returns true if it really is a posix class, of course, but it
11634  * also can return true if it thinks that what was intended was a posix
11635  * class that didn't quite make it.
11636  *
11637  * It will return true for
11638  *      [:alphanumerics:
11639  *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
11640  *                         ')' indicating the end of the (?[
11641  *      [:any garbage including %^&$ punctuation:]
11642  *
11643  * This is designed to be called only from S_handle_regex_sets; it could be
11644  * easily adapted to be called from the spot at the beginning of regclass()
11645  * that checks to see in a normal bracketed class if the surrounding []
11646  * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
11647  * change long-standing behavior, so I (khw) didn't do that */
11648  char* p = RExC_parse + 1;
11649  char first_char = *p;
11650
11651  PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
11652
11653  assert(*(p - 1) == '[');
11654
11655  if (! POSIXCC(first_char)) {
11656   return FALSE;
11657  }
11658
11659  p++;
11660  while (p < RExC_end && isWORDCHAR(*p)) p++;
11661
11662  if (p >= RExC_end) {
11663   return FALSE;
11664  }
11665
11666  if (p - RExC_parse > 2    /* Got at least 1 word character */
11667   && (*p == first_char
11668    || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11669  {
11670   return TRUE;
11671  }
11672
11673  p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11674
11675  return (p
11676    && p - RExC_parse > 2 /* [:] evaluates to colon;
11677          [::] is a bad posix class. */
11678    && first_char == *(p - 1));
11679 }
11680
11681 STATIC regnode *
11682 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
11683     char * const oregcomp_parse)
11684 {
11685  /* Handle the (?[...]) construct to do set operations */
11686
11687  U8 curchar;
11688  UV start, end; /* End points of code point ranges */
11689  SV* result_string;
11690  char *save_end, *save_parse;
11691  SV* final;
11692  STRLEN len;
11693  regnode* node;
11694  AV* stack;
11695  const bool save_fold = FOLD;
11696
11697  GET_RE_DEBUG_FLAGS_DECL;
11698
11699  PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
11700
11701  if (LOC) {
11702   vFAIL("(?[...]) not valid in locale");
11703  }
11704  RExC_uni_semantics = 1;
11705
11706  /* This will return only an ANYOF regnode, or (unlikely) something smaller
11707  * (such as EXACT).  Thus we can skip most everything if just sizing.  We
11708  * call regclass to handle '[]' so as to not have to reinvent its parsing
11709  * rules here (throwing away the size it computes each time).  And, we exit
11710  * upon an unescaped ']' that isn't one ending a regclass.  To do both
11711  * these things, we need to realize that something preceded by a backslash
11712  * is escaped, so we have to keep track of backslashes */
11713  if (SIZE_ONLY) {
11714   UV depth = 0; /* how many nested (?[...]) constructs */
11715
11716   Perl_ck_warner_d(aTHX_
11717    packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11718    "The regex_sets feature is experimental" REPORT_LOCATION,
11719    (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
11720
11721   while (RExC_parse < RExC_end) {
11722    SV* current = NULL;
11723    RExC_parse = regpatws(pRExC_state, RExC_parse,
11724         TRUE); /* means recognize comments */
11725    switch (*RExC_parse) {
11726     case '?':
11727      if (RExC_parse[1] == '[') depth++, RExC_parse++;
11728      /* FALL THROUGH */
11729     default:
11730      break;
11731     case '\\':
11732      /* Skip the next byte (which could cause us to end up in
11733      * the middle of a UTF-8 character, but since none of those
11734      * are confusable with anything we currently handle in this
11735      * switch (invariants all), it's safe.  We'll just hit the
11736      * default: case next time and keep on incrementing until
11737      * we find one of the invariants we do handle. */
11738      RExC_parse++;
11739      break;
11740     case '[':
11741     {
11742      /* If this looks like it is a [:posix:] class, leave the
11743      * parse pointer at the '[' to fool regclass() into
11744      * thinking it is part of a '[[:posix:]]'.  That function
11745      * will use strict checking to force a syntax error if it
11746      * doesn't work out to a legitimate class */
11747      bool is_posix_class
11748          = could_it_be_a_POSIX_class(pRExC_state);
11749      if (! is_posix_class) {
11750       RExC_parse++;
11751      }
11752
11753      /* regclass() can only return RESTART_UTF8 if multi-char
11754      folds are allowed.  */
11755      if (!regclass(pRExC_state, flagp,depth+1,
11756         is_posix_class, /* parse the whole char
11757              class only if not a
11758              posix class */
11759         FALSE, /* don't allow multi-char folds */
11760         TRUE, /* silence non-portable warnings. */
11761         &current))
11762       FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
11763        (UV) *flagp);
11764
11765      /* function call leaves parse pointing to the ']', except
11766      * if we faked it */
11767      if (is_posix_class) {
11768       RExC_parse--;
11769      }
11770
11771      SvREFCNT_dec(current);   /* In case it returned something */
11772      break;
11773     }
11774
11775     case ']':
11776      if (depth--) break;
11777      RExC_parse++;
11778      if (RExC_parse < RExC_end
11779       && *RExC_parse == ')')
11780      {
11781       node = reganode(pRExC_state, ANYOF, 0);
11782       RExC_size += ANYOF_SKIP;
11783       nextchar(pRExC_state);
11784       Set_Node_Length(node,
11785         RExC_parse - oregcomp_parse + 1); /* MJD */
11786       return node;
11787      }
11788      goto no_close;
11789    }
11790    RExC_parse++;
11791   }
11792
11793   no_close:
11794   FAIL("Syntax error in (?[...])");
11795  }
11796
11797  /* Pass 2 only after this.  Everything in this construct is a
11798  * metacharacter.  Operands begin with either a '\' (for an escape
11799  * sequence), or a '[' for a bracketed character class.  Any other
11800  * character should be an operator, or parenthesis for grouping.  Both
11801  * types of operands are handled by calling regclass() to parse them.  It
11802  * is called with a parameter to indicate to return the computed inversion
11803  * list.  The parsing here is implemented via a stack.  Each entry on the
11804  * stack is a single character representing one of the operators, or the
11805  * '('; or else a pointer to an operand inversion list. */
11806
11807 #define IS_OPERAND(a)  (! SvIOK(a))
11808
11809  /* The stack starts empty.  It is a syntax error if the first thing parsed
11810  * is a binary operator; everything else is pushed on the stack.  When an
11811  * operand is parsed, the top of the stack is examined.  If it is a binary
11812  * operator, the item before it should be an operand, and both are replaced
11813  * by the result of doing that operation on the new operand and the one on
11814  * the stack.   Thus a sequence of binary operands is reduced to a single
11815  * one before the next one is parsed.
11816  *
11817  * A unary operator may immediately follow a binary in the input, for
11818  * example
11819  *      [a] + ! [b]
11820  * When an operand is parsed and the top of the stack is a unary operator,
11821  * the operation is performed, and then the stack is rechecked to see if
11822  * this new operand is part of a binary operation; if so, it is handled as
11823  * above.
11824  *
11825  * A '(' is simply pushed on the stack; it is valid only if the stack is
11826  * empty, or the top element of the stack is an operator or another '('
11827  * (for which the parenthesized expression will become an operand).  By the
11828  * time the corresponding ')' is parsed everything in between should have
11829  * been parsed and evaluated to a single operand (or else is a syntax
11830  * error), and is handled as a regular operand */
11831
11832  sv_2mortal((SV *)(stack = newAV()));
11833
11834  while (RExC_parse < RExC_end) {
11835   I32 top_index = av_tindex(stack);
11836   SV** top_ptr;
11837   SV* current = NULL;
11838
11839   /* Skip white space */
11840   RExC_parse = regpatws(pRExC_state, RExC_parse,
11841         TRUE); /* means recognize comments */
11842   if (RExC_parse >= RExC_end) {
11843    Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
11844   }
11845   if ((curchar = UCHARAT(RExC_parse)) == ']') {
11846    break;
11847   }
11848
11849   switch (curchar) {
11850
11851    case '?':
11852     if (av_tindex(stack) >= 0   /* This makes sure that we can
11853            safely subtract 1 from
11854            RExC_parse in the next clause.
11855            If we have something on the
11856            stack, we have parsed something
11857            */
11858      && UCHARAT(RExC_parse - 1) == '('
11859      && RExC_parse < RExC_end)
11860     {
11861      /* If is a '(?', could be an embedded '(?flags:(?[...])'.
11862      * This happens when we have some thing like
11863      *
11864      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
11865      *   ...
11866      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
11867      *
11868      * Here we would be handling the interpolated
11869      * '$thai_or_lao'.  We handle this by a recursive call to
11870      * ourselves which returns the inversion list the
11871      * interpolated expression evaluates to.  We use the flags
11872      * from the interpolated pattern. */
11873      U32 save_flags = RExC_flags;
11874      const char * const save_parse = ++RExC_parse;
11875
11876      parse_lparen_question_flags(pRExC_state);
11877
11878      if (RExC_parse == save_parse  /* Makes sure there was at
11879              least one flag (or this
11880              embedding wasn't compiled)
11881             */
11882       || RExC_parse >= RExC_end - 4
11883       || UCHARAT(RExC_parse) != ':'
11884       || UCHARAT(++RExC_parse) != '('
11885       || UCHARAT(++RExC_parse) != '?'
11886       || UCHARAT(++RExC_parse) != '[')
11887      {
11888
11889       /* In combination with the above, this moves the
11890       * pointer to the point just after the first erroneous
11891       * character (or if there are no flags, to where they
11892       * should have been) */
11893       if (RExC_parse >= RExC_end - 4) {
11894        RExC_parse = RExC_end;
11895       }
11896       else if (RExC_parse != save_parse) {
11897        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11898       }
11899       vFAIL("Expecting '(?flags:(?[...'");
11900      }
11901      RExC_parse++;
11902      (void) handle_regex_sets(pRExC_state, &current, flagp,
11903              depth+1, oregcomp_parse);
11904
11905      /* Here, 'current' contains the embedded expression's
11906      * inversion list, and RExC_parse points to the trailing
11907      * ']'; the next character should be the ')' which will be
11908      * paired with the '(' that has been put on the stack, so
11909      * the whole embedded expression reduces to '(operand)' */
11910      RExC_parse++;
11911
11912      RExC_flags = save_flags;
11913      goto handle_operand;
11914     }
11915     /* FALL THROUGH */
11916
11917    default:
11918     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11919     vFAIL("Unexpected character");
11920
11921    case '\\':
11922     /* regclass() can only return RESTART_UTF8 if multi-char
11923     folds are allowed.  */
11924     if (!regclass(pRExC_state, flagp,depth+1,
11925        TRUE, /* means parse just the next thing */
11926        FALSE, /* don't allow multi-char folds */
11927        FALSE, /* don't silence non-portable warnings.  */
11928        &current))
11929      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
11930       (UV) *flagp);
11931     /* regclass() will return with parsing just the \ sequence,
11932     * leaving the parse pointer at the next thing to parse */
11933     RExC_parse--;
11934     goto handle_operand;
11935
11936    case '[':   /* Is a bracketed character class */
11937    {
11938     bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
11939
11940     if (! is_posix_class) {
11941      RExC_parse++;
11942     }
11943
11944     /* regclass() can only return RESTART_UTF8 if multi-char
11945     folds are allowed.  */
11946     if(!regclass(pRExC_state, flagp,depth+1,
11947        is_posix_class, /* parse the whole char class
11948             only if not a posix class */
11949        FALSE, /* don't allow multi-char folds */
11950        FALSE, /* don't silence non-portable warnings.  */
11951        &current))
11952      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
11953       (UV) *flagp);
11954     /* function call leaves parse pointing to the ']', except if we
11955     * faked it */
11956     if (is_posix_class) {
11957      RExC_parse--;
11958     }
11959
11960     goto handle_operand;
11961    }
11962
11963    case '&':
11964    case '|':
11965    case '+':
11966    case '-':
11967    case '^':
11968     if (top_index < 0
11969      || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
11970      || ! IS_OPERAND(*top_ptr))
11971     {
11972      RExC_parse++;
11973      vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
11974     }
11975     av_push(stack, newSVuv(curchar));
11976     break;
11977
11978    case '!':
11979     av_push(stack, newSVuv(curchar));
11980     break;
11981
11982    case '(':
11983     if (top_index >= 0) {
11984      top_ptr = av_fetch(stack, top_index, FALSE);
11985      assert(top_ptr);
11986      if (IS_OPERAND(*top_ptr)) {
11987       RExC_parse++;
11988       vFAIL("Unexpected '(' with no preceding operator");
11989      }
11990     }
11991     av_push(stack, newSVuv(curchar));
11992     break;
11993
11994    case ')':
11995    {
11996     SV* lparen;
11997     if (top_index < 1
11998      || ! (current = av_pop(stack))
11999      || ! IS_OPERAND(current)
12000      || ! (lparen = av_pop(stack))
12001      || IS_OPERAND(lparen)
12002      || SvUV(lparen) != '(')
12003     {
12004      SvREFCNT_dec(current);
12005      RExC_parse++;
12006      vFAIL("Unexpected ')'");
12007     }
12008     top_index -= 2;
12009     SvREFCNT_dec_NN(lparen);
12010
12011     /* FALL THROUGH */
12012    }
12013
12014    handle_operand:
12015
12016     /* Here, we have an operand to process, in 'current' */
12017
12018     if (top_index < 0) {    /* Just push if stack is empty */
12019      av_push(stack, current);
12020     }
12021     else {
12022      SV* top = av_pop(stack);
12023      SV *prev = NULL;
12024      char current_operator;
12025
12026      if (IS_OPERAND(top)) {
12027       SvREFCNT_dec_NN(top);
12028       SvREFCNT_dec_NN(current);
12029       vFAIL("Operand with no preceding operator");
12030      }
12031      current_operator = (char) SvUV(top);
12032      switch (current_operator) {
12033       case '(':   /* Push the '(' back on followed by the new
12034          operand */
12035        av_push(stack, top);
12036        av_push(stack, current);
12037        SvREFCNT_inc(top);  /* Counters the '_dec' done
12038             just after the 'break', so
12039             it doesn't get wrongly freed
12040             */
12041        break;
12042
12043       case '!':
12044        _invlist_invert(current);
12045
12046        /* Unlike binary operators, the top of the stack,
12047        * now that this unary one has been popped off, may
12048        * legally be an operator, and we now have operand
12049        * for it. */
12050        top_index--;
12051        SvREFCNT_dec_NN(top);
12052        goto handle_operand;
12053
12054       case '&':
12055        prev = av_pop(stack);
12056        _invlist_intersection(prev,
12057             current,
12058             &current);
12059        av_push(stack, current);
12060        break;
12061
12062       case '|':
12063       case '+':
12064        prev = av_pop(stack);
12065        _invlist_union(prev, current, &current);
12066        av_push(stack, current);
12067        break;
12068
12069       case '-':
12070        prev = av_pop(stack);;
12071        _invlist_subtract(prev, current, &current);
12072        av_push(stack, current);
12073        break;
12074
12075       case '^':   /* The union minus the intersection */
12076       {
12077        SV* i = NULL;
12078        SV* u = NULL;
12079        SV* element;
12080
12081        prev = av_pop(stack);
12082        _invlist_union(prev, current, &u);
12083        _invlist_intersection(prev, current, &i);
12084        /* _invlist_subtract will overwrite current
12085         without freeing what it already contains */
12086        element = current;
12087        _invlist_subtract(u, i, &current);
12088        av_push(stack, current);
12089        SvREFCNT_dec_NN(i);
12090        SvREFCNT_dec_NN(u);
12091        SvREFCNT_dec_NN(element);
12092        break;
12093       }
12094
12095       default:
12096        Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
12097     }
12098     SvREFCNT_dec_NN(top);
12099     SvREFCNT_dec(prev);
12100    }
12101   }
12102
12103   RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12104  }
12105
12106  if (av_tindex(stack) < 0   /* Was empty */
12107   || ((final = av_pop(stack)) == NULL)
12108   || ! IS_OPERAND(final)
12109   || av_tindex(stack) >= 0)  /* More left on stack */
12110  {
12111   vFAIL("Incomplete expression within '(?[ ])'");
12112  }
12113
12114  /* Here, 'final' is the resultant inversion list from evaluating the
12115  * expression.  Return it if so requested */
12116  if (return_invlist) {
12117   *return_invlist = final;
12118   return END;
12119  }
12120
12121  /* Otherwise generate a resultant node, based on 'final'.  regclass() is
12122  * expecting a string of ranges and individual code points */
12123  invlist_iterinit(final);
12124  result_string = newSVpvs("");
12125  while (invlist_iternext(final, &start, &end)) {
12126   if (start == end) {
12127    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
12128   }
12129   else {
12130    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
12131              start,          end);
12132   }
12133  }
12134
12135  save_parse = RExC_parse;
12136  RExC_parse = SvPV(result_string, len);
12137  save_end = RExC_end;
12138  RExC_end = RExC_parse + len;
12139
12140  /* We turn off folding around the call, as the class we have constructed
12141  * already has all folding taken into consideration, and we don't want
12142  * regclass() to add to that */
12143  RExC_flags &= ~RXf_PMf_FOLD;
12144  /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
12145  */
12146  node = regclass(pRExC_state, flagp,depth+1,
12147      FALSE, /* means parse the whole char class */
12148      FALSE, /* don't allow multi-char folds */
12149      TRUE, /* silence non-portable warnings.  The above may very
12150        well have generated non-portable code points, but
12151        they're valid on this machine */
12152      NULL);
12153  if (!node)
12154   FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
12155      PTR2UV(flagp));
12156  if (save_fold) {
12157   RExC_flags |= RXf_PMf_FOLD;
12158  }
12159  RExC_parse = save_parse + 1;
12160  RExC_end = save_end;
12161  SvREFCNT_dec_NN(final);
12162  SvREFCNT_dec_NN(result_string);
12163
12164  nextchar(pRExC_state);
12165  Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
12166  return node;
12167 }
12168 #undef IS_OPERAND
12169
12170 /* The names of properties whose definitions are not known at compile time are
12171  * stored in this SV, after a constant heading.  So if the length has been
12172  * changed since initialization, then there is a run-time definition. */
12173 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12174
12175 STATIC regnode *
12176 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12177     const bool stop_at_1,  /* Just parse the next thing, don't
12178           look for a full character class */
12179     bool allow_multi_folds,
12180     const bool silence_non_portable,   /* Don't output warnings
12181              about too large
12182              characters */
12183     SV** ret_invlist)  /* Return an inversion list, not a node */
12184 {
12185  /* parse a bracketed class specification.  Most of these will produce an
12186  * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12187  * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
12188  * under /i with multi-character folds: it will be rewritten following the
12189  * paradigm of this example, where the <multi-fold>s are characters which
12190  * fold to multiple character sequences:
12191  *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12192  * gets effectively rewritten as:
12193  *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12194  * reg() gets called (recursively) on the rewritten version, and this
12195  * function will return what it constructs.  (Actually the <multi-fold>s
12196  * aren't physically removed from the [abcdefghi], it's just that they are
12197  * ignored in the recursion by means of a flag:
12198  * <RExC_in_multi_char_class>.)
12199  *
12200  * ANYOF nodes contain a bit map for the first 256 characters, with the
12201  * corresponding bit set if that character is in the list.  For characters
12202  * above 255, a range list or swash is used.  There are extra bits for \w,
12203  * etc. in locale ANYOFs, as what these match is not determinable at
12204  * compile time
12205  *
12206  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12207  * to be restarted.  This can only happen if ret_invlist is non-NULL.
12208  */
12209
12210  dVAR;
12211  UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12212  IV range = 0;
12213  UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12214  regnode *ret;
12215  STRLEN numlen;
12216  IV namedclass = OOB_NAMEDCLASS;
12217  char *rangebegin = NULL;
12218  bool need_class = 0;
12219  SV *listsv = NULL;
12220  STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12221          than just initialized.  */
12222  SV* properties = NULL;    /* Code points that match \p{} \P{} */
12223  SV* posixes = NULL;     /* Code points that match classes like, [:word:],
12224        extended beyond the Latin1 range */
12225  UV element_count = 0;   /* Number of distinct elements in the class.
12226        Optimizations may be possible if this is tiny */
12227  AV * multi_char_matches = NULL; /* Code points that fold to more than one
12228          character; used under /i */
12229  UV n;
12230  char * stop_ptr = RExC_end;    /* where to stop parsing */
12231  const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12232             space? */
12233  const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12234
12235  /* Unicode properties are stored in a swash; this holds the current one
12236  * being parsed.  If this swash is the only above-latin1 component of the
12237  * character class, an optimization is to pass it directly on to the
12238  * execution engine.  Otherwise, it is set to NULL to indicate that there
12239  * are other things in the class that have to be dealt with at execution
12240  * time */
12241  SV* swash = NULL;  /* Code points that match \p{} \P{} */
12242
12243  /* Set if a component of this character class is user-defined; just passed
12244  * on to the engine */
12245  bool has_user_defined_property = FALSE;
12246
12247  /* inversion list of code points this node matches only when the target
12248  * string is in UTF-8.  (Because is under /d) */
12249  SV* depends_list = NULL;
12250
12251  /* inversion list of code points this node matches.  For much of the
12252  * function, it includes only those that match regardless of the utf8ness
12253  * of the target string */
12254  SV* cp_list = NULL;
12255
12256 #ifdef EBCDIC
12257  /* In a range, counts how many 0-2 of the ends of it came from literals,
12258  * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
12259  UV literal_endpoint = 0;
12260 #endif
12261  bool invert = FALSE;    /* Is this class to be complemented */
12262
12263  /* Is there any thing like \W or [:^digit:] that matches above the legal
12264  * Unicode range? */
12265  bool runtime_posix_matches_above_Unicode = FALSE;
12266
12267  regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12268   case we need to change the emitted regop to an EXACT. */
12269  const char * orig_parse = RExC_parse;
12270  const I32 orig_size = RExC_size;
12271  GET_RE_DEBUG_FLAGS_DECL;
12272
12273  PERL_ARGS_ASSERT_REGCLASS;
12274 #ifndef DEBUGGING
12275  PERL_UNUSED_ARG(depth);
12276 #endif
12277
12278  DEBUG_PARSE("clas");
12279
12280  /* Assume we are going to generate an ANYOF node. */
12281  ret = reganode(pRExC_state, ANYOF, 0);
12282
12283  if (SIZE_ONLY) {
12284   RExC_size += ANYOF_SKIP;
12285   listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12286  }
12287  else {
12288   ANYOF_FLAGS(ret) = 0;
12289
12290   RExC_emit += ANYOF_SKIP;
12291   if (LOC) {
12292    ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12293   }
12294   listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12295   initial_listsv_len = SvCUR(listsv);
12296   SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
12297  }
12298
12299  if (skip_white) {
12300   RExC_parse = regpatws(pRExC_state, RExC_parse,
12301        FALSE /* means don't recognize comments */);
12302  }
12303
12304  if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
12305   RExC_parse++;
12306   invert = TRUE;
12307   allow_multi_folds = FALSE;
12308   RExC_naughty++;
12309   if (skip_white) {
12310    RExC_parse = regpatws(pRExC_state, RExC_parse,
12311         FALSE /* means don't recognize comments */);
12312   }
12313  }
12314
12315  /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12316  if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12317   const char *s = RExC_parse;
12318   const char  c = *s++;
12319
12320   while (isWORDCHAR(*s))
12321    s++;
12322   if (*s && c == *s && s[1] == ']') {
12323    SAVEFREESV(RExC_rx_sv);
12324    ckWARN3reg(s+2,
12325      "POSIX syntax [%c %c] belongs inside character classes",
12326      c, c);
12327    (void)ReREFCNT_inc(RExC_rx_sv);
12328   }
12329  }
12330
12331  /* If the caller wants us to just parse a single element, accomplish this
12332  * by faking the loop ending condition */
12333  if (stop_at_1 && RExC_end > RExC_parse) {
12334   stop_ptr = RExC_parse + 1;
12335  }
12336
12337  /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12338  if (UCHARAT(RExC_parse) == ']')
12339   goto charclassloop;
12340
12341 parseit:
12342  while (1) {
12343   if  (RExC_parse >= stop_ptr) {
12344    break;
12345   }
12346
12347   if (skip_white) {
12348    RExC_parse = regpatws(pRExC_state, RExC_parse,
12349         FALSE /* means don't recognize comments */);
12350   }
12351
12352   if  (UCHARAT(RExC_parse) == ']') {
12353    break;
12354   }
12355
12356  charclassloop:
12357
12358   namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12359   save_value = value;
12360   save_prevvalue = prevvalue;
12361
12362   if (!range) {
12363    rangebegin = RExC_parse;
12364    element_count++;
12365   }
12366   if (UTF) {
12367    value = utf8n_to_uvchr((U8*)RExC_parse,
12368         RExC_end - RExC_parse,
12369         &numlen, UTF8_ALLOW_DEFAULT);
12370    RExC_parse += numlen;
12371   }
12372   else
12373    value = UCHARAT(RExC_parse++);
12374
12375   if (value == '['
12376    && RExC_parse < RExC_end
12377    && POSIXCC(UCHARAT(RExC_parse)))
12378   {
12379    namedclass = regpposixcc(pRExC_state, value, strict);
12380   }
12381   else if (value == '\\') {
12382    if (UTF) {
12383     value = utf8n_to_uvchr((U8*)RExC_parse,
12384         RExC_end - RExC_parse,
12385         &numlen, UTF8_ALLOW_DEFAULT);
12386     RExC_parse += numlen;
12387    }
12388    else
12389     value = UCHARAT(RExC_parse++);
12390
12391    /* Some compilers cannot handle switching on 64-bit integer
12392    * values, therefore value cannot be an UV.  Yes, this will
12393    * be a problem later if we want switch on Unicode.
12394    * A similar issue a little bit later when switching on
12395    * namedclass. --jhi */
12396
12397    /* If the \ is escaping white space when white space is being
12398    * skipped, it means that that white space is wanted literally, and
12399    * is already in 'value'.  Otherwise, need to translate the escape
12400    * into what it signifies. */
12401    if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12402
12403    case 'w': namedclass = ANYOF_WORDCHAR; break;
12404    case 'W': namedclass = ANYOF_NWORDCHAR; break;
12405    case 's': namedclass = ANYOF_SPACE; break;
12406    case 'S': namedclass = ANYOF_NSPACE; break;
12407    case 'd': namedclass = ANYOF_DIGIT; break;
12408    case 'D': namedclass = ANYOF_NDIGIT; break;
12409    case 'v': namedclass = ANYOF_VERTWS; break;
12410    case 'V': namedclass = ANYOF_NVERTWS; break;
12411    case 'h': namedclass = ANYOF_HORIZWS; break;
12412    case 'H': namedclass = ANYOF_NHORIZWS; break;
12413    case 'N':  /* Handle \N{NAME} in class */
12414     {
12415      /* We only pay attention to the first char of
12416      multichar strings being returned. I kinda wonder
12417      if this makes sense as it does change the behaviour
12418      from earlier versions, OTOH that behaviour was broken
12419      as well. */
12420      if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12421          TRUE, /* => charclass */
12422          strict))
12423      {
12424       if (*flagp & RESTART_UTF8)
12425        FAIL("panic: grok_bslash_N set RESTART_UTF8");
12426       goto parseit;
12427      }
12428     }
12429     break;
12430    case 'p':
12431    case 'P':
12432     {
12433     char *e;
12434
12435     /* We will handle any undefined properties ourselves */
12436     U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12437
12438     if (RExC_parse >= RExC_end)
12439      vFAIL2("Empty \\%c{}", (U8)value);
12440     if (*RExC_parse == '{') {
12441      const U8 c = (U8)value;
12442      e = strchr(RExC_parse++, '}');
12443      if (!e)
12444       vFAIL2("Missing right brace on \\%c{}", c);
12445      while (isSPACE(UCHARAT(RExC_parse)))
12446       RExC_parse++;
12447      if (e == RExC_parse)
12448       vFAIL2("Empty \\%c{}", c);
12449      n = e - RExC_parse;
12450      while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12451       n--;
12452     }
12453     else {
12454      e = RExC_parse;
12455      n = 1;
12456     }
12457     if (!SIZE_ONLY) {
12458      SV* invlist;
12459      char* name;
12460
12461      if (UCHARAT(RExC_parse) == '^') {
12462       RExC_parse++;
12463       n--;
12464       /* toggle.  (The rhs xor gets the single bit that
12465       * differs between P and p; the other xor inverts just
12466       * that bit) */
12467       value ^= 'P' ^ 'p';
12468
12469       while (isSPACE(UCHARAT(RExC_parse))) {
12470        RExC_parse++;
12471        n--;
12472       }
12473      }
12474      /* Try to get the definition of the property into
12475      * <invlist>.  If /i is in effect, the effective property
12476      * will have its name be <__NAME_i>.  The design is
12477      * discussed in commit
12478      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12479      Newx(name, n + sizeof("_i__\n"), char);
12480
12481      sprintf(name, "%s%.*s%s\n",
12482          (FOLD) ? "__" : "",
12483          (int)n,
12484          RExC_parse,
12485          (FOLD) ? "_i" : ""
12486      );
12487
12488      /* Look up the property name, and get its swash and
12489      * inversion list, if the property is found  */
12490      if (swash) {
12491       SvREFCNT_dec_NN(swash);
12492      }
12493      swash = _core_swash_init("utf8", name, &PL_sv_undef,
12494            1, /* binary */
12495            0, /* not tr/// */
12496            NULL, /* No inversion list */
12497            &swash_init_flags
12498            );
12499      if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12500       if (swash) {
12501        SvREFCNT_dec_NN(swash);
12502        swash = NULL;
12503       }
12504
12505       /* Here didn't find it.  It could be a user-defined
12506       * property that will be available at run-time.  If we
12507       * accept only compile-time properties, is an error;
12508       * otherwise add it to the list for run-time look up */
12509       if (ret_invlist) {
12510        RExC_parse = e + 1;
12511        vFAIL3("Property '%.*s' is unknown", (int) n, name);
12512       }
12513       Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
12514           (value == 'p' ? '+' : '!'),
12515           name);
12516       has_user_defined_property = TRUE;
12517
12518       /* We don't know yet, so have to assume that the
12519       * property could match something in the Latin1 range,
12520       * hence something that isn't utf8.  Note that this
12521       * would cause things in <depends_list> to match
12522       * inappropriately, except that any \p{}, including
12523       * this one forces Unicode semantics, which means there
12524       * is <no depends_list> */
12525       ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12526      }
12527      else {
12528
12529       /* Here, did get the swash and its inversion list.  If
12530       * the swash is from a user-defined property, then this
12531       * whole character class should be regarded as such */
12532       has_user_defined_property =
12533          (swash_init_flags
12534          & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12535
12536       /* Invert if asking for the complement */
12537       if (value == 'P') {
12538        _invlist_union_complement_2nd(properties,
12539               invlist,
12540               &properties);
12541
12542        /* The swash can't be used as-is, because we've
12543        * inverted things; delay removing it to here after
12544        * have copied its invlist above */
12545        SvREFCNT_dec_NN(swash);
12546        swash = NULL;
12547       }
12548       else {
12549        _invlist_union(properties, invlist, &properties);
12550       }
12551      }
12552      Safefree(name);
12553     }
12554     RExC_parse = e + 1;
12555     namedclass = ANYOF_UNIPROP;  /* no official name, but it's
12556             named */
12557
12558     /* \p means they want Unicode semantics */
12559     RExC_uni_semantics = 1;
12560     }
12561     break;
12562    case 'n': value = '\n';   break;
12563    case 'r': value = '\r';   break;
12564    case 't': value = '\t';   break;
12565    case 'f': value = '\f';   break;
12566    case 'b': value = '\b';   break;
12567    case 'e': value = ASCII_TO_NATIVE('\033');break;
12568    case 'a': value = ASCII_TO_NATIVE('\007');break;
12569    case 'o':
12570     RExC_parse--; /* function expects to be pointed at the 'o' */
12571     {
12572      const char* error_msg;
12573      bool valid = grok_bslash_o(&RExC_parse,
12574            &value,
12575            &error_msg,
12576            SIZE_ONLY,   /* warnings in pass
12577                1 only */
12578            strict,
12579            silence_non_portable,
12580            UTF);
12581      if (! valid) {
12582       vFAIL(error_msg);
12583      }
12584     }
12585     if (PL_encoding && value < 0x100) {
12586      goto recode_encoding;
12587     }
12588     break;
12589    case 'x':
12590     RExC_parse--; /* function expects to be pointed at the 'x' */
12591     {
12592      const char* error_msg;
12593      bool valid = grok_bslash_x(&RExC_parse,
12594            &value,
12595            &error_msg,
12596            TRUE, /* Output warnings */
12597            strict,
12598            silence_non_portable,
12599            UTF);
12600      if (! valid) {
12601       vFAIL(error_msg);
12602      }
12603     }
12604     if (PL_encoding && value < 0x100)
12605      goto recode_encoding;
12606     break;
12607    case 'c':
12608     value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12609     break;
12610    case '0': case '1': case '2': case '3': case '4':
12611    case '5': case '6': case '7':
12612     {
12613      /* Take 1-3 octal digits */
12614      I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12615      numlen = (strict) ? 4 : 3;
12616      value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12617      RExC_parse += numlen;
12618      if (numlen != 3) {
12619       if (strict) {
12620        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12621        vFAIL("Need exactly 3 octal digits");
12622       }
12623       else if (! SIZE_ONLY /* like \08, \178 */
12624         && numlen < 3
12625         && RExC_parse < RExC_end
12626         && isDIGIT(*RExC_parse)
12627         && ckWARN(WARN_REGEXP))
12628       {
12629        SAVEFREESV(RExC_rx_sv);
12630        reg_warn_non_literal_string(
12631         RExC_parse + 1,
12632         form_short_octal_warning(RExC_parse, numlen));
12633        (void)ReREFCNT_inc(RExC_rx_sv);
12634       }
12635      }
12636      if (PL_encoding && value < 0x100)
12637       goto recode_encoding;
12638      break;
12639     }
12640    recode_encoding:
12641     if (! RExC_override_recoding) {
12642      SV* enc = PL_encoding;
12643      value = reg_recode((const char)(U8)value, &enc);
12644      if (!enc) {
12645       if (strict) {
12646        vFAIL("Invalid escape in the specified encoding");
12647       }
12648       else if (SIZE_ONLY) {
12649        ckWARNreg(RExC_parse,
12650         "Invalid escape in the specified encoding");
12651       }
12652      }
12653      break;
12654     }
12655    default:
12656     /* Allow \_ to not give an error */
12657     if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12658      if (strict) {
12659       vFAIL2("Unrecognized escape \\%c in character class",
12660        (int)value);
12661      }
12662      else {
12663       SAVEFREESV(RExC_rx_sv);
12664       ckWARN2reg(RExC_parse,
12665        "Unrecognized escape \\%c in character class passed through",
12666        (int)value);
12667       (void)ReREFCNT_inc(RExC_rx_sv);
12668      }
12669     }
12670     break;
12671    }   /* End of switch on char following backslash */
12672   } /* end of handling backslash escape sequences */
12673 #ifdef EBCDIC
12674   else
12675    literal_endpoint++;
12676 #endif
12677
12678   /* Here, we have the current token in 'value' */
12679
12680   /* What matches in a locale is not known until runtime.  This includes
12681   * what the Posix classes (like \w, [:space:]) match.  Room must be
12682   * reserved (one time per class) to store such classes, either if Perl
12683   * is compiled so that locale nodes always should have this space, or
12684   * if there is such class info to be stored.  The space will contain a
12685   * bit for each named class that is to be matched against.  This isn't
12686   * needed for \p{} and pseudo-classes, as they are not affected by
12687   * locale, and hence are dealt with separately */
12688   if (LOC
12689    && ! need_class
12690    && (ANYOF_LOCALE == ANYOF_CLASS
12691     || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12692   {
12693    need_class = 1;
12694    if (SIZE_ONLY) {
12695     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12696    }
12697    else {
12698     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12699     ANYOF_CLASS_ZERO(ret);
12700    }
12701    ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12702   }
12703
12704   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12705
12706    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
12707    * literal, as is the character that began the false range, i.e.
12708    * the 'a' in the examples */
12709    if (range) {
12710     if (!SIZE_ONLY) {
12711      const int w = (RExC_parse >= rangebegin)
12712         ? RExC_parse - rangebegin
12713         : 0;
12714      if (strict) {
12715       vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
12716      }
12717      else {
12718       SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12719       ckWARN4reg(RExC_parse,
12720         "False [] range \"%*.*s\"",
12721         w, w, rangebegin);
12722       (void)ReREFCNT_inc(RExC_rx_sv);
12723       cp_list = add_cp_to_invlist(cp_list, '-');
12724       cp_list = add_cp_to_invlist(cp_list, prevvalue);
12725      }
12726     }
12727
12728     range = 0; /* this was not a true range */
12729     element_count += 2; /* So counts for three values */
12730    }
12731
12732    if (! SIZE_ONLY) {
12733     U8 classnum = namedclass_to_classnum(namedclass);
12734     if (namedclass >= ANYOF_MAX) {  /* If a special class */
12735      if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12736
12737       /* Here, should be \h, \H, \v, or \V.  Neither /d nor
12738       * /l make a difference in what these match.  There
12739       * would be problems if these characters had folds
12740       * other than themselves, as cp_list is subject to
12741       * folding. */
12742       if (classnum != _CC_VERTSPACE) {
12743        assert(   namedclass == ANYOF_HORIZWS
12744         || namedclass == ANYOF_NHORIZWS);
12745
12746        /* It turns out that \h is just a synonym for
12747        * XPosixBlank */
12748        classnum = _CC_BLANK;
12749       }
12750
12751       _invlist_union_maybe_complement_2nd(
12752         cp_list,
12753         PL_XPosix_ptrs[classnum],
12754         cBOOL(namedclass % 2), /* Complement if odd
12755               (NHORIZWS, NVERTWS)
12756               */
12757         &cp_list);
12758      }
12759     }
12760     else if (classnum == _CC_ASCII) {
12761 #ifdef HAS_ISASCII
12762      if (LOC) {
12763       ANYOF_CLASS_SET(ret, namedclass);
12764      }
12765      else
12766 #endif  /* Not isascii(); just use the hard-coded definition for it */
12767       _invlist_union_maybe_complement_2nd(
12768         posixes,
12769         PL_ASCII,
12770         cBOOL(namedclass % 2), /* Complement if odd
12771               (NASCII) */
12772         &posixes);
12773     }
12774     else {  /* Garden variety class */
12775
12776      /* The ascii range inversion list */
12777      SV* ascii_source = PL_Posix_ptrs[classnum];
12778
12779      /* The full Latin1 range inversion list */
12780      SV* l1_source = PL_L1Posix_ptrs[classnum];
12781
12782      /* This code is structured into two major clauses.  The
12783      * first is for classes whose complete definitions may not
12784      * already be known.  It not, the Latin1 definition
12785      * (guaranteed to already known) is used plus code is
12786      * generated to load the rest at run-time (only if needed).
12787      * If the complete definition is known, it drops down to
12788      * the second clause, where the complete definition is
12789      * known */
12790
12791      if (classnum < _FIRST_NON_SWASH_CC) {
12792
12793       /* Here, the class has a swash, which may or not
12794       * already be loaded */
12795
12796       /* The name of the property to use to match the full
12797       * eXtended Unicode range swash for this character
12798       * class */
12799       const char *Xname = swash_property_names[classnum];
12800
12801       /* If returning the inversion list, we can't defer
12802       * getting this until runtime */
12803       if (ret_invlist && !  PL_utf8_swash_ptrs[classnum]) {
12804        PL_utf8_swash_ptrs[classnum] =
12805         _core_swash_init("utf8", Xname, &PL_sv_undef,
12806            1, /* binary */
12807            0, /* not tr/// */
12808            NULL, /* No inversion list */
12809            NULL  /* No flags */
12810            );
12811        assert(PL_utf8_swash_ptrs[classnum]);
12812       }
12813       if ( !  PL_utf8_swash_ptrs[classnum]) {
12814        if (namedclass % 2 == 0) { /* A non-complemented
12815               class */
12816         /* If not /a matching, there are code points we
12817         * don't know at compile time.  Arrange for the
12818         * unknown matches to be loaded at run-time, if
12819         * needed */
12820         if (! AT_LEAST_ASCII_RESTRICTED) {
12821          Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12822                 Xname);
12823         }
12824         if (LOC) {  /* Under locale, set run-time
12825            lookup */
12826          ANYOF_CLASS_SET(ret, namedclass);
12827         }
12828         else {
12829          /* Add the current class's code points to
12830          * the running total */
12831          _invlist_union(posixes,
12832             (AT_LEAST_ASCII_RESTRICTED)
12833               ? ascii_source
12834               : l1_source,
12835             &posixes);
12836         }
12837        }
12838        else {  /* A complemented class */
12839         if (AT_LEAST_ASCII_RESTRICTED) {
12840          /* Under /a should match everything above
12841          * ASCII, plus the complement of the set's
12842          * ASCII matches */
12843          _invlist_union_complement_2nd(posixes,
12844                 ascii_source,
12845                 &posixes);
12846         }
12847         else {
12848          /* Arrange for the unknown matches to be
12849          * loaded at run-time, if needed */
12850          Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12851                 Xname);
12852          runtime_posix_matches_above_Unicode = TRUE;
12853          if (LOC) {
12854           ANYOF_CLASS_SET(ret, namedclass);
12855          }
12856          else {
12857
12858           /* We want to match everything in
12859           * Latin1, except those things that
12860           * l1_source matches */
12861           SV* scratch_list = NULL;
12862           _invlist_subtract(PL_Latin1, l1_source,
12863               &scratch_list);
12864
12865           /* Add the list from this class to the
12866           * running total */
12867           if (! posixes) {
12868            posixes = scratch_list;
12869           }
12870           else {
12871            _invlist_union(posixes,
12872               scratch_list,
12873               &posixes);
12874            SvREFCNT_dec_NN(scratch_list);
12875           }
12876           if (DEPENDS_SEMANTICS) {
12877            ANYOF_FLAGS(ret)
12878             |= ANYOF_NON_UTF8_LATIN1_ALL;
12879           }
12880          }
12881         }
12882        }
12883        goto namedclass_done;
12884       }
12885
12886       /* Here, there is a swash loaded for the class.  If no
12887       * inversion list for it yet, get it */
12888       if (! PL_XPosix_ptrs[classnum]) {
12889        PL_XPosix_ptrs[classnum]
12890        = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12891       }
12892      }
12893
12894      /* Here there is an inversion list already loaded for the
12895      * entire class */
12896
12897      if (namedclass % 2 == 0) {  /* A non-complemented class,
12898             like ANYOF_PUNCT */
12899       if (! LOC) {
12900        /* For non-locale, just add it to any existing list
12901        * */
12902        _invlist_union(posixes,
12903           (AT_LEAST_ASCII_RESTRICTED)
12904            ? ascii_source
12905            : PL_XPosix_ptrs[classnum],
12906           &posixes);
12907       }
12908       else {  /* Locale */
12909        SV* scratch_list = NULL;
12910
12911        /* For above Latin1 code points, we use the full
12912        * Unicode range */
12913        _invlist_intersection(PL_AboveLatin1,
12914             PL_XPosix_ptrs[classnum],
12915             &scratch_list);
12916        /* And set the output to it, adding instead if
12917        * there already is an output.  Checking if
12918        * 'posixes' is NULL first saves an extra clone.
12919        * Its reference count will be decremented at the
12920        * next union, etc, or if this is the only
12921        * instance, at the end of the routine */
12922        if (! posixes) {
12923         posixes = scratch_list;
12924        }
12925        else {
12926         _invlist_union(posixes, scratch_list, &posixes);
12927         SvREFCNT_dec_NN(scratch_list);
12928        }
12929
12930 #ifndef HAS_ISBLANK
12931        if (namedclass != ANYOF_BLANK) {
12932 #endif
12933         /* Set this class in the node for runtime
12934         * matching */
12935         ANYOF_CLASS_SET(ret, namedclass);
12936 #ifndef HAS_ISBLANK
12937        }
12938        else {
12939         /* No isblank(), use the hard-coded ASCII-range
12940         * blanks, adding them to the running total. */
12941
12942         _invlist_union(posixes, ascii_source, &posixes);
12943        }
12944 #endif
12945       }
12946      }
12947      else {  /* A complemented class, like ANYOF_NPUNCT */
12948       if (! LOC) {
12949        _invlist_union_complement_2nd(
12950             posixes,
12951             (AT_LEAST_ASCII_RESTRICTED)
12952              ? ascii_source
12953              : PL_XPosix_ptrs[classnum],
12954             &posixes);
12955        /* Under /d, everything in the upper half of the
12956        * Latin1 range matches this complement */
12957        if (DEPENDS_SEMANTICS) {
12958         ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12959        }
12960       }
12961       else {  /* Locale */
12962        SV* scratch_list = NULL;
12963        _invlist_subtract(PL_AboveLatin1,
12964            PL_XPosix_ptrs[classnum],
12965            &scratch_list);
12966        if (! posixes) {
12967         posixes = scratch_list;
12968        }
12969        else {
12970         _invlist_union(posixes, scratch_list, &posixes);
12971         SvREFCNT_dec_NN(scratch_list);
12972        }
12973 #ifndef HAS_ISBLANK
12974        if (namedclass != ANYOF_NBLANK) {
12975 #endif
12976         ANYOF_CLASS_SET(ret, namedclass);
12977 #ifndef HAS_ISBLANK
12978        }
12979        else {
12980         /* Get the list of all code points in Latin1
12981         * that are not ASCII blanks, and add them to
12982         * the running total */
12983         _invlist_subtract(PL_Latin1, ascii_source,
12984             &scratch_list);
12985         _invlist_union(posixes, scratch_list, &posixes);
12986         SvREFCNT_dec_NN(scratch_list);
12987        }
12988 #endif
12989       }
12990      }
12991     }
12992    namedclass_done:
12993     continue;   /* Go get next character */
12994    }
12995   } /* end of namedclass \blah */
12996
12997   /* Here, we have a single value.  If 'range' is set, it is the ending
12998   * of a range--check its validity.  Later, we will handle each
12999   * individual code point in the range.  If 'range' isn't set, this
13000   * could be the beginning of a range, so check for that by looking
13001   * ahead to see if the next real character to be processed is the range
13002   * indicator--the minus sign */
13003
13004   if (skip_white) {
13005    RExC_parse = regpatws(pRExC_state, RExC_parse,
13006         FALSE /* means don't recognize comments */);
13007   }
13008
13009   if (range) {
13010    if (prevvalue > value) /* b-a */ {
13011     const int w = RExC_parse - rangebegin;
13012     Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
13013     range = 0; /* not a valid range */
13014    }
13015   }
13016   else {
13017    prevvalue = value; /* save the beginning of the potential range */
13018    if (! stop_at_1     /* Can't be a range if parsing just one thing */
13019     && *RExC_parse == '-')
13020    {
13021     char* next_char_ptr = RExC_parse + 1;
13022     if (skip_white) {   /* Get the next real char after the '-' */
13023      next_char_ptr = regpatws(pRExC_state,
13024            RExC_parse + 1,
13025            FALSE); /* means don't recognize
13026               comments */
13027     }
13028
13029     /* If the '-' is at the end of the class (just before the ']',
13030     * it is a literal minus; otherwise it is a range */
13031     if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13032      RExC_parse = next_char_ptr;
13033
13034      /* a bad range like \w-, [:word:]- ? */
13035      if (namedclass > OOB_NAMEDCLASS) {
13036       if (strict || ckWARN(WARN_REGEXP)) {
13037        const int w =
13038         RExC_parse >= rangebegin ?
13039         RExC_parse - rangebegin : 0;
13040        if (strict) {
13041         vFAIL4("False [] range \"%*.*s\"",
13042          w, w, rangebegin);
13043        }
13044        else {
13045         vWARN4(RExC_parse,
13046          "False [] range \"%*.*s\"",
13047          w, w, rangebegin);
13048        }
13049       }
13050       if (!SIZE_ONLY) {
13051        cp_list = add_cp_to_invlist(cp_list, '-');
13052       }
13053       element_count++;
13054      } else
13055       range = 1; /* yeah, it's a range! */
13056      continue; /* but do it the next time */
13057     }
13058    }
13059   }
13060
13061   /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13062   * if not */
13063
13064   /* non-Latin1 code point implies unicode semantics.  Must be set in
13065   * pass1 so is there for the whole of pass 2 */
13066   if (value > 255) {
13067    RExC_uni_semantics = 1;
13068   }
13069
13070   /* Ready to process either the single value, or the completed range.
13071   * For single-valued non-inverted ranges, we consider the possibility
13072   * of multi-char folds.  (We made a conscious decision to not do this
13073   * for the other cases because it can often lead to non-intuitive
13074   * results.  For example, you have the peculiar case that:
13075   *  "s s" =~ /^[^\xDF]+$/i => Y
13076   *  "ss"  =~ /^[^\xDF]+$/i => N
13077   *
13078   * See [perl #89750] */
13079   if (FOLD && allow_multi_folds && value == prevvalue) {
13080    if (value == LATIN_SMALL_LETTER_SHARP_S
13081     || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13082               value)))
13083    {
13084     /* Here <value> is indeed a multi-char fold.  Get what it is */
13085
13086     U8 foldbuf[UTF8_MAXBYTES_CASE];
13087     STRLEN foldlen;
13088
13089     UV folded = _to_uni_fold_flags(
13090         value,
13091         foldbuf,
13092         &foldlen,
13093         FOLD_FLAGS_FULL
13094         | ((LOC) ?  FOLD_FLAGS_LOCALE
13095            : (ASCII_FOLD_RESTRICTED)
13096            ? FOLD_FLAGS_NOMIX_ASCII
13097            : 0)
13098         );
13099
13100     /* Here, <folded> should be the first character of the
13101     * multi-char fold of <value>, with <foldbuf> containing the
13102     * whole thing.  But, if this fold is not allowed (because of
13103     * the flags), <fold> will be the same as <value>, and should
13104     * be processed like any other character, so skip the special
13105     * handling */
13106     if (folded != value) {
13107
13108      /* Skip if we are recursed, currently parsing the class
13109      * again.  Otherwise add this character to the list of
13110      * multi-char folds. */
13111      if (! RExC_in_multi_char_class) {
13112       AV** this_array_ptr;
13113       AV* this_array;
13114       STRLEN cp_count = utf8_length(foldbuf,
13115              foldbuf + foldlen);
13116       SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13117
13118       Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13119
13120
13121       if (! multi_char_matches) {
13122        multi_char_matches = newAV();
13123       }
13124
13125       /* <multi_char_matches> is actually an array of arrays.
13126       * There will be one or two top-level elements: [2],
13127       * and/or [3].  The [2] element is an array, each
13128       * element thereof is a character which folds to TWO
13129       * characters; [3] is for folds to THREE characters.
13130       * (Unicode guarantees a maximum of 3 characters in any
13131       * fold.)  When we rewrite the character class below,
13132       * we will do so such that the longest folds are
13133       * written first, so that it prefers the longest
13134       * matching strings first.  This is done even if it
13135       * turns out that any quantifier is non-greedy, out of
13136       * programmer laziness.  Tom Christiansen has agreed
13137       * that this is ok.  This makes the test for the
13138       * ligature 'ffi' come before the test for 'ff' */
13139       if (av_exists(multi_char_matches, cp_count)) {
13140        this_array_ptr = (AV**) av_fetch(multi_char_matches,
13141                cp_count, FALSE);
13142        this_array = *this_array_ptr;
13143       }
13144       else {
13145        this_array = newAV();
13146        av_store(multi_char_matches, cp_count,
13147          (SV*) this_array);
13148       }
13149       av_push(this_array, multi_fold);
13150      }
13151
13152      /* This element should not be processed further in this
13153      * class */
13154      element_count--;
13155      value = save_value;
13156      prevvalue = save_prevvalue;
13157      continue;
13158     }
13159    }
13160   }
13161
13162   /* Deal with this element of the class */
13163   if (! SIZE_ONLY) {
13164 #ifndef EBCDIC
13165    cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
13166 #else
13167    SV* this_range = _new_invlist(1);
13168    _append_range_to_invlist(this_range, prevvalue, value);
13169
13170    /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
13171    * If this range was specified using something like 'i-j', we want
13172    * to include only the 'i' and the 'j', and not anything in
13173    * between, so exclude non-ASCII, non-alphabetics from it.
13174    * However, if the range was specified with something like
13175    * [\x89-\x91] or [\x89-j], all code points within it should be
13176    * included.  literal_endpoint==2 means both ends of the range used
13177    * a literal character, not \x{foo} */
13178    if (literal_endpoint == 2
13179     && (prevvalue >= 'a' && value <= 'z')
13180      || (prevvalue >= 'A' && value <= 'Z'))
13181    {
13182     _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13183          &this_range);
13184    }
13185    _invlist_union(cp_list, this_range, &cp_list);
13186    literal_endpoint = 0;
13187 #endif
13188   }
13189
13190   range = 0; /* this range (if it was one) is done now */
13191  } /* End of loop through all the text within the brackets */
13192
13193  /* If anything in the class expands to more than one character, we have to
13194  * deal with them by building up a substitute parse string, and recursively
13195  * calling reg() on it, instead of proceeding */
13196  if (multi_char_matches) {
13197   SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13198   I32 cp_count;
13199   STRLEN len;
13200   char *save_end = RExC_end;
13201   char *save_parse = RExC_parse;
13202   bool first_time = TRUE;     /* First multi-char occurrence doesn't get
13203          a "|" */
13204   I32 reg_flags;
13205
13206   assert(! invert);
13207 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
13208   because too confusing */
13209   if (invert) {
13210    sv_catpv(substitute_parse, "(?:");
13211   }
13212 #endif
13213
13214   /* Look at the longest folds first */
13215   for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13216
13217    if (av_exists(multi_char_matches, cp_count)) {
13218     AV** this_array_ptr;
13219     SV* this_sequence;
13220
13221     this_array_ptr = (AV**) av_fetch(multi_char_matches,
13222             cp_count, FALSE);
13223     while ((this_sequence = av_pop(*this_array_ptr)) !=
13224                 &PL_sv_undef)
13225     {
13226      if (! first_time) {
13227       sv_catpv(substitute_parse, "|");
13228      }
13229      first_time = FALSE;
13230
13231      sv_catpv(substitute_parse, SvPVX(this_sequence));
13232     }
13233    }
13234   }
13235
13236   /* If the character class contains anything else besides these
13237   * multi-character folds, have to include it in recursive parsing */
13238   if (element_count) {
13239    sv_catpv(substitute_parse, "|[");
13240    sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13241    sv_catpv(substitute_parse, "]");
13242   }
13243
13244   sv_catpv(substitute_parse, ")");
13245 #if 0
13246   if (invert) {
13247    /* This is a way to get the parse to skip forward a whole named
13248    * sequence instead of matching the 2nd character when it fails the
13249    * first */
13250    sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13251   }
13252 #endif
13253
13254   RExC_parse = SvPV(substitute_parse, len);
13255   RExC_end = RExC_parse + len;
13256   RExC_in_multi_char_class = 1;
13257   RExC_emit = (regnode *)orig_emit;
13258
13259   ret = reg(pRExC_state, 1, &reg_flags, depth+1);
13260
13261   *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13262
13263   RExC_parse = save_parse;
13264   RExC_end = save_end;
13265   RExC_in_multi_char_class = 0;
13266   SvREFCNT_dec_NN(multi_char_matches);
13267   return ret;
13268  }
13269
13270  /* If the character class contains only a single element, it may be
13271  * optimizable into another node type which is smaller and runs faster.
13272  * Check if this is the case for this class */
13273  if (element_count == 1 && ! ret_invlist) {
13274   U8 op = END;
13275   U8 arg = 0;
13276
13277   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
13278            [:digit:] or \p{foo} */
13279
13280    /* All named classes are mapped into POSIXish nodes, with its FLAG
13281    * argument giving which class it is */
13282    switch ((I32)namedclass) {
13283     case ANYOF_UNIPROP:
13284      break;
13285
13286     /* These don't depend on the charset modifiers.  They always
13287     * match under /u rules */
13288     case ANYOF_NHORIZWS:
13289     case ANYOF_HORIZWS:
13290      namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13291      /* FALLTHROUGH */
13292
13293     case ANYOF_NVERTWS:
13294     case ANYOF_VERTWS:
13295      op = POSIXU;
13296      goto join_posix;
13297
13298     /* The actual POSIXish node for all the rest depends on the
13299     * charset modifier.  The ones in the first set depend only on
13300     * ASCII or, if available on this platform, locale */
13301     case ANYOF_ASCII:
13302     case ANYOF_NASCII:
13303 #ifdef HAS_ISASCII
13304      op = (LOC) ? POSIXL : POSIXA;
13305 #else
13306      op = POSIXA;
13307 #endif
13308      goto join_posix;
13309
13310     case ANYOF_NCASED:
13311     case ANYOF_LOWER:
13312     case ANYOF_NLOWER:
13313     case ANYOF_UPPER:
13314     case ANYOF_NUPPER:
13315      /* under /a could be alpha */
13316      if (FOLD) {
13317       if (ASCII_RESTRICTED) {
13318        namedclass = ANYOF_ALPHA + (namedclass % 2);
13319       }
13320       else if (! LOC) {
13321        break;
13322       }
13323      }
13324      /* FALLTHROUGH */
13325
13326     /* The rest have more possibilities depending on the charset.
13327     * We take advantage of the enum ordering of the charset
13328     * modifiers to get the exact node type, */
13329     default:
13330      op = POSIXD + get_regex_charset(RExC_flags);
13331      if (op > POSIXA) { /* /aa is same as /a */
13332       op = POSIXA;
13333      }
13334 #ifndef HAS_ISBLANK
13335      if (op == POSIXL
13336       && (namedclass == ANYOF_BLANK
13337        || namedclass == ANYOF_NBLANK))
13338      {
13339       op = POSIXA;
13340      }
13341 #endif
13342
13343     join_posix:
13344      /* The odd numbered ones are the complements of the
13345      * next-lower even number one */
13346      if (namedclass % 2 == 1) {
13347       invert = ! invert;
13348       namedclass--;
13349      }
13350      arg = namedclass_to_classnum(namedclass);
13351      break;
13352    }
13353   }
13354   else if (value == prevvalue) {
13355
13356    /* Here, the class consists of just a single code point */
13357
13358    if (invert) {
13359     if (! LOC && value == '\n') {
13360      op = REG_ANY; /* Optimize [^\n] */
13361      *flagp |= HASWIDTH|SIMPLE;
13362      RExC_naughty++;
13363     }
13364    }
13365    else if (value < 256 || UTF) {
13366
13367     /* Optimize a single value into an EXACTish node, but not if it
13368     * would require converting the pattern to UTF-8. */
13369     op = compute_EXACTish(pRExC_state);
13370    }
13371   } /* Otherwise is a range */
13372   else if (! LOC) {   /* locale could vary these */
13373    if (prevvalue == '0') {
13374     if (value == '9') {
13375      arg = _CC_DIGIT;
13376      op = POSIXA;
13377     }
13378    }
13379   }
13380
13381   /* Here, we have changed <op> away from its initial value iff we found
13382   * an optimization */
13383   if (op != END) {
13384
13385    /* Throw away this ANYOF regnode, and emit the calculated one,
13386    * which should correspond to the beginning, not current, state of
13387    * the parse */
13388    const char * cur_parse = RExC_parse;
13389    RExC_parse = (char *)orig_parse;
13390    if ( SIZE_ONLY) {
13391     if (! LOC) {
13392
13393      /* To get locale nodes to not use the full ANYOF size would
13394      * require moving the code above that writes the portions
13395      * of it that aren't in other nodes to after this point.
13396      * e.g.  ANYOF_CLASS_SET */
13397      RExC_size = orig_size;
13398     }
13399    }
13400    else {
13401     RExC_emit = (regnode *)orig_emit;
13402     if (PL_regkind[op] == POSIXD) {
13403      if (invert) {
13404       op += NPOSIXD - POSIXD;
13405      }
13406     }
13407    }
13408
13409    ret = reg_node(pRExC_state, op);
13410
13411    if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13412     if (! SIZE_ONLY) {
13413      FLAGS(ret) = arg;
13414     }
13415     *flagp |= HASWIDTH|SIMPLE;
13416    }
13417    else if (PL_regkind[op] == EXACT) {
13418     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13419    }
13420
13421    RExC_parse = (char *) cur_parse;
13422
13423    SvREFCNT_dec(posixes);
13424    SvREFCNT_dec(cp_list);
13425    return ret;
13426   }
13427  }
13428
13429  if (SIZE_ONLY)
13430   return ret;
13431  /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13432
13433  /* If folding, we calculate all characters that could fold to or from the
13434  * ones already on the list */
13435  if (FOLD && cp_list) {
13436   UV start, end; /* End points of code point ranges */
13437
13438   SV* fold_intersection = NULL;
13439
13440   /* If the highest code point is within Latin1, we can use the
13441   * compiled-in Alphas list, and not have to go out to disk.  This
13442   * yields two false positives, the masculine and feminine ordinal
13443   * indicators, which are weeded out below using the
13444   * IS_IN_SOME_FOLD_L1() macro */
13445   if (invlist_highest(cp_list) < 256) {
13446    _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13447               &fold_intersection);
13448   }
13449   else {
13450
13451    /* Here, there are non-Latin1 code points, so we will have to go
13452    * fetch the list of all the characters that participate in folds
13453    */
13454    if (! PL_utf8_foldable) {
13455     SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13456          &PL_sv_undef, 1, 0);
13457     PL_utf8_foldable = _get_swash_invlist(swash);
13458     SvREFCNT_dec_NN(swash);
13459    }
13460
13461    /* This is a hash that for a particular fold gives all characters
13462    * that are involved in it */
13463    if (! PL_utf8_foldclosures) {
13464
13465     /* If we were unable to find any folds, then we likely won't be
13466     * able to find the closures.  So just create an empty list.
13467     * Folding will effectively be restricted to the non-Unicode
13468     * rules hard-coded into Perl.  (This case happens legitimately
13469     * during compilation of Perl itself before the Unicode tables
13470     * are generated) */
13471     if (_invlist_len(PL_utf8_foldable) == 0) {
13472      PL_utf8_foldclosures = newHV();
13473     }
13474     else {
13475      /* If the folds haven't been read in, call a fold function
13476      * to force that */
13477      if (! PL_utf8_tofold) {
13478       U8 dummy[UTF8_MAXBYTES+1];
13479
13480       /* This string is just a short named one above \xff */
13481       to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13482       assert(PL_utf8_tofold); /* Verify that worked */
13483      }
13484      PL_utf8_foldclosures =
13485          _swash_inversion_hash(PL_utf8_tofold);
13486     }
13487    }
13488
13489    /* Only the characters in this class that participate in folds need
13490    * be checked.  Get the intersection of this class and all the
13491    * possible characters that are foldable.  This can quickly narrow
13492    * down a large class */
13493    _invlist_intersection(PL_utf8_foldable, cp_list,
13494         &fold_intersection);
13495   }
13496
13497   /* Now look at the foldable characters in this class individually */
13498   invlist_iterinit(fold_intersection);
13499   while (invlist_iternext(fold_intersection, &start, &end)) {
13500    UV j;
13501
13502    /* Locale folding for Latin1 characters is deferred until runtime */
13503    if (LOC && start < 256) {
13504     start = 256;
13505    }
13506
13507    /* Look at every character in the range */
13508    for (j = start; j <= end; j++) {
13509
13510     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13511     STRLEN foldlen;
13512     SV** listp;
13513
13514     if (j < 256) {
13515
13516      /* We have the latin1 folding rules hard-coded here so that
13517      * an innocent-looking character class, like /[ks]/i won't
13518      * have to go out to disk to find the possible matches.
13519      * XXX It would be better to generate these via regen, in
13520      * case a new version of the Unicode standard adds new
13521      * mappings, though that is not really likely, and may be
13522      * caught by the default: case of the switch below. */
13523
13524      if (IS_IN_SOME_FOLD_L1(j)) {
13525
13526       /* ASCII is always matched; non-ASCII is matched only
13527       * under Unicode rules */
13528       if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13529        cp_list =
13530         add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13531       }
13532       else {
13533        depends_list =
13534        add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13535       }
13536      }
13537
13538      if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13539       && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13540      {
13541       /* Certain Latin1 characters have matches outside
13542       * Latin1.  To get here, <j> is one of those
13543       * characters.   None of these matches is valid for
13544       * ASCII characters under /aa, which is why the 'if'
13545       * just above excludes those.  These matches only
13546       * happen when the target string is utf8.  The code
13547       * below adds the single fold closures for <j> to the
13548       * inversion list. */
13549       switch (j) {
13550        case 'k':
13551        case 'K':
13552         cp_list =
13553          add_cp_to_invlist(cp_list, KELVIN_SIGN);
13554         break;
13555        case 's':
13556        case 'S':
13557         cp_list = add_cp_to_invlist(cp_list,
13558              LATIN_SMALL_LETTER_LONG_S);
13559         break;
13560        case MICRO_SIGN:
13561         cp_list = add_cp_to_invlist(cp_list,
13562              GREEK_CAPITAL_LETTER_MU);
13563         cp_list = add_cp_to_invlist(cp_list,
13564              GREEK_SMALL_LETTER_MU);
13565         break;
13566        case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13567        case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13568         cp_list =
13569          add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13570         break;
13571        case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13572         cp_list = add_cp_to_invlist(cp_list,
13573           LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13574         break;
13575        case LATIN_SMALL_LETTER_SHARP_S:
13576         cp_list = add_cp_to_invlist(cp_list,
13577             LATIN_CAPITAL_LETTER_SHARP_S);
13578         break;
13579        case 'F': case 'f':
13580        case 'I': case 'i':
13581        case 'L': case 'l':
13582        case 'T': case 't':
13583        case 'A': case 'a':
13584        case 'H': case 'h':
13585        case 'J': case 'j':
13586        case 'N': case 'n':
13587        case 'W': case 'w':
13588        case 'Y': case 'y':
13589         /* These all are targets of multi-character
13590         * folds from code points that require UTF8 to
13591         * express, so they can't match unless the
13592         * target string is in UTF-8, so no action here
13593         * is necessary, as regexec.c properly handles
13594         * the general case for UTF-8 matching and
13595         * multi-char folds */
13596         break;
13597        default:
13598         /* Use deprecated warning to increase the
13599         * chances of this being output */
13600         ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13601         break;
13602       }
13603      }
13604      continue;
13605     }
13606
13607     /* Here is an above Latin1 character.  We don't have the rules
13608     * hard-coded for it.  First, get its fold.  This is the simple
13609     * fold, as the multi-character folds have been handled earlier
13610     * and separated out */
13611     _to_uni_fold_flags(j, foldbuf, &foldlen,
13612            ((LOC)
13613            ? FOLD_FLAGS_LOCALE
13614            : (ASCII_FOLD_RESTRICTED)
13615             ? FOLD_FLAGS_NOMIX_ASCII
13616             : 0));
13617
13618     /* Single character fold of above Latin1.  Add everything in
13619     * its fold closure to the list that this node should match.
13620     * The fold closures data structure is a hash with the keys
13621     * being the UTF-8 of every character that is folded to, like
13622     * 'k', and the values each an array of all code points that
13623     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
13624     * Multi-character folds are not included */
13625     if ((listp = hv_fetch(PL_utf8_foldclosures,
13626          (char *) foldbuf, foldlen, FALSE)))
13627     {
13628      AV* list = (AV*) *listp;
13629      IV k;
13630      for (k = 0; k <= av_len(list); k++) {
13631       SV** c_p = av_fetch(list, k, FALSE);
13632       UV c;
13633       if (c_p == NULL) {
13634        Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13635       }
13636       c = SvUV(*c_p);
13637
13638       /* /aa doesn't allow folds between ASCII and non-; /l
13639       * doesn't allow them between above and below 256 */
13640       if ((ASCII_FOLD_RESTRICTED
13641         && (isASCII(c) != isASCII(j)))
13642        || (LOC && c < 256)) {
13643        continue;
13644       }
13645
13646       /* Folds involving non-ascii Latin1 characters
13647       * under /d are added to a separate list */
13648       if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13649       {
13650        cp_list = add_cp_to_invlist(cp_list, c);
13651       }
13652       else {
13653       depends_list = add_cp_to_invlist(depends_list, c);
13654       }
13655      }
13656     }
13657    }
13658   }
13659   SvREFCNT_dec_NN(fold_intersection);
13660  }
13661
13662  /* And combine the result (if any) with any inversion list from posix
13663  * classes.  The lists are kept separate up to now because we don't want to
13664  * fold the classes (folding of those is automatically handled by the swash
13665  * fetching code) */
13666  if (posixes) {
13667   if (! DEPENDS_SEMANTICS) {
13668    if (cp_list) {
13669     _invlist_union(cp_list, posixes, &cp_list);
13670     SvREFCNT_dec_NN(posixes);
13671    }
13672    else {
13673     cp_list = posixes;
13674    }
13675   }
13676   else {
13677    /* Under /d, we put into a separate list the Latin1 things that
13678    * match only when the target string is utf8 */
13679    SV* nonascii_but_latin1_properties = NULL;
13680    _invlist_intersection(posixes, PL_Latin1,
13681         &nonascii_but_latin1_properties);
13682    _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13683        &nonascii_but_latin1_properties);
13684    _invlist_subtract(posixes, nonascii_but_latin1_properties,
13685        &posixes);
13686    if (cp_list) {
13687     _invlist_union(cp_list, posixes, &cp_list);
13688     SvREFCNT_dec_NN(posixes);
13689    }
13690    else {
13691     cp_list = posixes;
13692    }
13693
13694    if (depends_list) {
13695     _invlist_union(depends_list, nonascii_but_latin1_properties,
13696        &depends_list);
13697     SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13698    }
13699    else {
13700     depends_list = nonascii_but_latin1_properties;
13701    }
13702   }
13703  }
13704
13705  /* And combine the result (if any) with any inversion list from properties.
13706  * The lists are kept separate up to now so that we can distinguish the two
13707  * in regards to matching above-Unicode.  A run-time warning is generated
13708  * if a Unicode property is matched against a non-Unicode code point. But,
13709  * we allow user-defined properties to match anything, without any warning,
13710  * and we also suppress the warning if there is a portion of the character
13711  * class that isn't a Unicode property, and which matches above Unicode, \W
13712  * or [\x{110000}] for example.
13713  * (Note that in this case, unlike the Posix one above, there is no
13714  * <depends_list>, because having a Unicode property forces Unicode
13715  * semantics */
13716  if (properties) {
13717   bool warn_super = ! has_user_defined_property;
13718   if (cp_list) {
13719
13720    /* If it matters to the final outcome, see if a non-property
13721    * component of the class matches above Unicode.  If so, the
13722    * warning gets suppressed.  This is true even if just a single
13723    * such code point is specified, as though not strictly correct if
13724    * another such code point is matched against, the fact that they
13725    * are using above-Unicode code points indicates they should know
13726    * the issues involved */
13727    if (warn_super) {
13728     bool non_prop_matches_above_Unicode =
13729        runtime_posix_matches_above_Unicode
13730        | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13731     if (invert) {
13732      non_prop_matches_above_Unicode =
13733            !  non_prop_matches_above_Unicode;
13734     }
13735     warn_super = ! non_prop_matches_above_Unicode;
13736    }
13737
13738    _invlist_union(properties, cp_list, &cp_list);
13739    SvREFCNT_dec_NN(properties);
13740   }
13741   else {
13742    cp_list = properties;
13743   }
13744
13745   if (warn_super) {
13746    OP(ret) = ANYOF_WARN_SUPER;
13747   }
13748  }
13749
13750  /* Here, we have calculated what code points should be in the character
13751  * class.
13752  *
13753  * Now we can see about various optimizations.  Fold calculation (which we
13754  * did above) needs to take place before inversion.  Otherwise /[^k]/i
13755  * would invert to include K, which under /i would match k, which it
13756  * shouldn't.  Therefore we can't invert folded locale now, as it won't be
13757  * folded until runtime */
13758
13759  /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13760  * at compile time.  Besides not inverting folded locale now, we can't
13761  * invert if there are things such as \w, which aren't known until runtime
13762  * */
13763  if (invert
13764   && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13765   && ! depends_list
13766   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13767  {
13768   _invlist_invert(cp_list);
13769
13770   /* Any swash can't be used as-is, because we've inverted things */
13771   if (swash) {
13772    SvREFCNT_dec_NN(swash);
13773    swash = NULL;
13774   }
13775
13776   /* Clear the invert flag since have just done it here */
13777   invert = FALSE;
13778  }
13779
13780  if (ret_invlist) {
13781   *ret_invlist = cp_list;
13782   SvREFCNT_dec(swash);
13783
13784   /* Discard the generated node */
13785   if (SIZE_ONLY) {
13786    RExC_size = orig_size;
13787   }
13788   else {
13789    RExC_emit = orig_emit;
13790   }
13791   return orig_emit;
13792  }
13793
13794  /* If we didn't do folding, it's because some information isn't available
13795  * until runtime; set the run-time fold flag for these.  (We don't have to
13796  * worry about properties folding, as that is taken care of by the swash
13797  * fetching) */
13798  if (FOLD && LOC)
13799  {
13800  ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13801  }
13802
13803  /* Some character classes are equivalent to other nodes.  Such nodes take
13804  * up less room and generally fewer operations to execute than ANYOF nodes.
13805  * Above, we checked for and optimized into some such equivalents for
13806  * certain common classes that are easy to test.  Getting to this point in
13807  * the code means that the class didn't get optimized there.  Since this
13808  * code is only executed in Pass 2, it is too late to save space--it has
13809  * been allocated in Pass 1, and currently isn't given back.  But turning
13810  * things into an EXACTish node can allow the optimizer to join it to any
13811  * adjacent such nodes.  And if the class is equivalent to things like /./,
13812  * expensive run-time swashes can be avoided.  Now that we have more
13813  * complete information, we can find things necessarily missed by the
13814  * earlier code.  I (khw) am not sure how much to look for here.  It would
13815  * be easy, but perhaps too slow, to check any candidates against all the
13816  * node types they could possibly match using _invlistEQ(). */
13817
13818  if (cp_list
13819   && ! invert
13820   && ! depends_list
13821   && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13822   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13823  {
13824   UV start, end;
13825   U8 op = END;  /* The optimzation node-type */
13826   const char * cur_parse= RExC_parse;
13827
13828   invlist_iterinit(cp_list);
13829   if (! invlist_iternext(cp_list, &start, &end)) {
13830
13831    /* Here, the list is empty.  This happens, for example, when a
13832    * Unicode property is the only thing in the character class, and
13833    * it doesn't match anything.  (perluniprops.pod notes such
13834    * properties) */
13835    op = OPFAIL;
13836    *flagp |= HASWIDTH|SIMPLE;
13837   }
13838   else if (start == end) {    /* The range is a single code point */
13839    if (! invlist_iternext(cp_list, &start, &end)
13840
13841      /* Don't do this optimization if it would require changing
13842      * the pattern to UTF-8 */
13843     && (start < 256 || UTF))
13844    {
13845     /* Here, the list contains a single code point.  Can optimize
13846     * into an EXACT node */
13847
13848     value = start;
13849
13850     if (! FOLD) {
13851      op = EXACT;
13852     }
13853     else if (LOC) {
13854
13855      /* A locale node under folding with one code point can be
13856      * an EXACTFL, as its fold won't be calculated until
13857      * runtime */
13858      op = EXACTFL;
13859     }
13860     else {
13861
13862      /* Here, we are generally folding, but there is only one
13863      * code point to match.  If we have to, we use an EXACT
13864      * node, but it would be better for joining with adjacent
13865      * nodes in the optimization pass if we used the same
13866      * EXACTFish node that any such are likely to be.  We can
13867      * do this iff the code point doesn't participate in any
13868      * folds.  For example, an EXACTF of a colon is the same as
13869      * an EXACT one, since nothing folds to or from a colon. */
13870      if (value < 256) {
13871       if (IS_IN_SOME_FOLD_L1(value)) {
13872        op = EXACT;
13873       }
13874      }
13875      else {
13876       if (! PL_utf8_foldable) {
13877        SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13878             &PL_sv_undef, 1, 0);
13879        PL_utf8_foldable = _get_swash_invlist(swash);
13880        SvREFCNT_dec_NN(swash);
13881       }
13882       if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13883        op = EXACT;
13884       }
13885      }
13886
13887      /* If we haven't found the node type, above, it means we
13888      * can use the prevailing one */
13889      if (op == END) {
13890       op = compute_EXACTish(pRExC_state);
13891      }
13892     }
13893    }
13894   }
13895   else if (start == 0) {
13896    if (end == UV_MAX) {
13897     op = SANY;
13898     *flagp |= HASWIDTH|SIMPLE;
13899     RExC_naughty++;
13900    }
13901    else if (end == '\n' - 1
13902      && invlist_iternext(cp_list, &start, &end)
13903      && start == '\n' + 1 && end == UV_MAX)
13904    {
13905     op = REG_ANY;
13906     *flagp |= HASWIDTH|SIMPLE;
13907     RExC_naughty++;
13908    }
13909   }
13910   invlist_iterfinish(cp_list);
13911
13912   if (op != END) {
13913    RExC_parse = (char *)orig_parse;
13914    RExC_emit = (regnode *)orig_emit;
13915
13916    ret = reg_node(pRExC_state, op);
13917
13918    RExC_parse = (char *)cur_parse;
13919
13920    if (PL_regkind[op] == EXACT) {
13921     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13922    }
13923
13924    SvREFCNT_dec_NN(cp_list);
13925    return ret;
13926   }
13927  }
13928
13929  /* Here, <cp_list> contains all the code points we can determine at
13930  * compile time that match under all conditions.  Go through it, and
13931  * for things that belong in the bitmap, put them there, and delete from
13932  * <cp_list>.  While we are at it, see if everything above 255 is in the
13933  * list, and if so, set a flag to speed up execution */
13934  ANYOF_BITMAP_ZERO(ret);
13935  if (cp_list) {
13936
13937   /* This gets set if we actually need to modify things */
13938   bool change_invlist = FALSE;
13939
13940   UV start, end;
13941
13942   /* Start looking through <cp_list> */
13943   invlist_iterinit(cp_list);
13944   while (invlist_iternext(cp_list, &start, &end)) {
13945    UV high;
13946    int i;
13947
13948    if (end == UV_MAX && start <= 256) {
13949     ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13950    }
13951
13952    /* Quit if are above what we should change */
13953    if (start > 255) {
13954     break;
13955    }
13956
13957    change_invlist = TRUE;
13958
13959    /* Set all the bits in the range, up to the max that we are doing */
13960    high = (end < 255) ? end : 255;
13961    for (i = start; i <= (int) high; i++) {
13962     if (! ANYOF_BITMAP_TEST(ret, i)) {
13963      ANYOF_BITMAP_SET(ret, i);
13964     }
13965    }
13966   }
13967   invlist_iterfinish(cp_list);
13968
13969   /* Done with loop; remove any code points that are in the bitmap from
13970   * <cp_list> */
13971   if (change_invlist) {
13972    _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13973   }
13974
13975   /* If have completely emptied it, remove it completely */
13976   if (_invlist_len(cp_list) == 0) {
13977    SvREFCNT_dec_NN(cp_list);
13978    cp_list = NULL;
13979   }
13980  }
13981
13982  if (invert) {
13983   ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13984  }
13985
13986  /* Here, the bitmap has been populated with all the Latin1 code points that
13987  * always match.  Can now add to the overall list those that match only
13988  * when the target string is UTF-8 (<depends_list>). */
13989  if (depends_list) {
13990   if (cp_list) {
13991    _invlist_union(cp_list, depends_list, &cp_list);
13992    SvREFCNT_dec_NN(depends_list);
13993   }
13994   else {
13995    cp_list = depends_list;
13996   }
13997  }
13998
13999  /* If there is a swash and more than one element, we can't use the swash in
14000  * the optimization below. */
14001  if (swash && element_count > 1) {
14002   SvREFCNT_dec_NN(swash);
14003   swash = NULL;
14004  }
14005
14006  if (! cp_list
14007   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14008  {
14009   ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
14010  }
14011  else {
14012   /* av[0] stores the character class description in its textual form:
14013   *       used later (regexec.c:Perl_regclass_swash()) to initialize the
14014   *       appropriate swash, and is also useful for dumping the regnode.
14015   * av[1] if NULL, is a placeholder to later contain the swash computed
14016   *       from av[0].  But if no further computation need be done, the
14017   *       swash is stored there now.
14018   * av[2] stores the cp_list inversion list for use in addition or
14019   *       instead of av[0]; used only if av[1] is NULL
14020   * av[3] is set if any component of the class is from a user-defined
14021   *       property; used only if av[1] is NULL */
14022   AV * const av = newAV();
14023   SV *rv;
14024
14025   av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14026       ? SvREFCNT_inc(listsv) : &PL_sv_undef);
14027   if (swash) {
14028    av_store(av, 1, swash);
14029    SvREFCNT_dec_NN(cp_list);
14030   }
14031   else {
14032    av_store(av, 1, NULL);
14033    if (cp_list) {
14034     av_store(av, 2, cp_list);
14035     av_store(av, 3, newSVuv(has_user_defined_property));
14036    }
14037   }
14038
14039   rv = newRV_noinc(MUTABLE_SV(av));
14040   n = add_data(pRExC_state, 1, "s");
14041   RExC_rxi->data->data[n] = (void*)rv;
14042   ARG_SET(ret, n);
14043  }
14044
14045  *flagp |= HASWIDTH|SIMPLE;
14046  return ret;
14047 }
14048 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14049
14050
14051 /* reg_skipcomment()
14052
14053    Absorbs an /x style # comments from the input stream.
14054    Returns true if there is more text remaining in the stream.
14055    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
14056    terminates the pattern without including a newline.
14057
14058    Note its the callers responsibility to ensure that we are
14059    actually in /x mode
14060
14061 */
14062
14063 STATIC bool
14064 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14065 {
14066  bool ended = 0;
14067
14068  PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14069
14070  while (RExC_parse < RExC_end)
14071   if (*RExC_parse++ == '\n') {
14072    ended = 1;
14073    break;
14074   }
14075  if (!ended) {
14076   /* we ran off the end of the pattern without ending
14077   the comment, so we have to add an \n when wrapping */
14078   RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
14079   return 0;
14080  } else
14081   return 1;
14082 }
14083
14084 /* nextchar()
14085
14086    Advances the parse position, and optionally absorbs
14087    "whitespace" from the inputstream.
14088
14089    Without /x "whitespace" means (?#...) style comments only,
14090    with /x this means (?#...) and # comments and whitespace proper.
14091
14092    Returns the RExC_parse point from BEFORE the scan occurs.
14093
14094    This is the /x friendly way of saying RExC_parse++.
14095 */
14096
14097 STATIC char*
14098 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14099 {
14100  char* const retval = RExC_parse++;
14101
14102  PERL_ARGS_ASSERT_NEXTCHAR;
14103
14104  for (;;) {
14105   if (RExC_end - RExC_parse >= 3
14106    && *RExC_parse == '('
14107    && RExC_parse[1] == '?'
14108    && RExC_parse[2] == '#')
14109   {
14110    while (*RExC_parse != ')') {
14111     if (RExC_parse == RExC_end)
14112      FAIL("Sequence (?#... not terminated");
14113     RExC_parse++;
14114    }
14115    RExC_parse++;
14116    continue;
14117   }
14118   if (RExC_flags & RXf_PMf_EXTENDED) {
14119    if (isSPACE(*RExC_parse)) {
14120     RExC_parse++;
14121     continue;
14122    }
14123    else if (*RExC_parse == '#') {
14124     if ( reg_skipcomment( pRExC_state ) )
14125      continue;
14126    }
14127   }
14128   return retval;
14129  }
14130 }
14131
14132 /*
14133 - reg_node - emit a node
14134 */
14135 STATIC regnode *   /* Location. */
14136 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
14137 {
14138  dVAR;
14139  regnode *ptr;
14140  regnode * const ret = RExC_emit;
14141  GET_RE_DEBUG_FLAGS_DECL;
14142
14143  PERL_ARGS_ASSERT_REG_NODE;
14144
14145  if (SIZE_ONLY) {
14146   SIZE_ALIGN(RExC_size);
14147   RExC_size += 1;
14148   return(ret);
14149  }
14150  if (RExC_emit >= RExC_emit_bound)
14151   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14152     op, RExC_emit, RExC_emit_bound);
14153
14154  NODE_ALIGN_FILL(ret);
14155  ptr = ret;
14156  FILL_ADVANCE_NODE(ptr, op);
14157  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
14158 #ifdef RE_TRACK_PATTERN_OFFSETS
14159  if (RExC_offsets) {         /* MJD */
14160   MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
14161    "reg_node", __LINE__,
14162    PL_reg_name[op],
14163    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
14164     ? "Overwriting end of array!\n" : "OK",
14165    (UV)(RExC_emit - RExC_emit_start),
14166    (UV)(RExC_parse - RExC_start),
14167    (UV)RExC_offsets[0]));
14168   Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14169  }
14170 #endif
14171  RExC_emit = ptr;
14172  return(ret);
14173 }
14174
14175 /*
14176 - reganode - emit a node with an argument
14177 */
14178 STATIC regnode *   /* Location. */
14179 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14180 {
14181  dVAR;
14182  regnode *ptr;
14183  regnode * const ret = RExC_emit;
14184  GET_RE_DEBUG_FLAGS_DECL;
14185
14186  PERL_ARGS_ASSERT_REGANODE;
14187
14188  if (SIZE_ONLY) {
14189   SIZE_ALIGN(RExC_size);
14190   RExC_size += 2;
14191   /*
14192   We can't do this:
14193
14194   assert(2==regarglen[op]+1);
14195
14196   Anything larger than this has to allocate the extra amount.
14197   If we changed this to be:
14198
14199   RExC_size += (1 + regarglen[op]);
14200
14201   then it wouldn't matter. Its not clear what side effect
14202   might come from that so its not done so far.
14203   -- dmq
14204   */
14205   return(ret);
14206  }
14207  if (RExC_emit >= RExC_emit_bound)
14208   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14209     op, RExC_emit, RExC_emit_bound);
14210
14211  NODE_ALIGN_FILL(ret);
14212  ptr = ret;
14213  FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14214  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
14215 #ifdef RE_TRACK_PATTERN_OFFSETS
14216  if (RExC_offsets) {         /* MJD */
14217   MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14218    "reganode",
14219    __LINE__,
14220    PL_reg_name[op],
14221    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14222    "Overwriting end of array!\n" : "OK",
14223    (UV)(RExC_emit - RExC_emit_start),
14224    (UV)(RExC_parse - RExC_start),
14225    (UV)RExC_offsets[0]));
14226   Set_Cur_Node_Offset;
14227  }
14228 #endif
14229  RExC_emit = ptr;
14230  return(ret);
14231 }
14232
14233 /*
14234 - reguni - emit (if appropriate) a Unicode character
14235 */
14236 STATIC STRLEN
14237 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14238 {
14239  dVAR;
14240
14241  PERL_ARGS_ASSERT_REGUNI;
14242
14243  return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14244 }
14245
14246 /*
14247 - reginsert - insert an operator in front of already-emitted operand
14248 *
14249 * Means relocating the operand.
14250 */
14251 STATIC void
14252 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14253 {
14254  dVAR;
14255  regnode *src;
14256  regnode *dst;
14257  regnode *place;
14258  const int offset = regarglen[(U8)op];
14259  const int size = NODE_STEP_REGNODE + offset;
14260  GET_RE_DEBUG_FLAGS_DECL;
14261
14262  PERL_ARGS_ASSERT_REGINSERT;
14263  PERL_UNUSED_ARG(depth);
14264 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14265  DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14266  if (SIZE_ONLY) {
14267   RExC_size += size;
14268   return;
14269  }
14270
14271  src = RExC_emit;
14272  RExC_emit += size;
14273  dst = RExC_emit;
14274  if (RExC_open_parens) {
14275   int paren;
14276   /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14277   for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14278    if ( RExC_open_parens[paren] >= opnd ) {
14279     /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14280     RExC_open_parens[paren] += size;
14281    } else {
14282     /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14283    }
14284    if ( RExC_close_parens[paren] >= opnd ) {
14285     /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14286     RExC_close_parens[paren] += size;
14287    } else {
14288     /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14289    }
14290   }
14291  }
14292
14293  while (src > opnd) {
14294   StructCopy(--src, --dst, regnode);
14295 #ifdef RE_TRACK_PATTERN_OFFSETS
14296   if (RExC_offsets) {     /* MJD 20010112 */
14297    MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14298     "reg_insert",
14299     __LINE__,
14300     PL_reg_name[op],
14301     (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14302      ? "Overwriting end of array!\n" : "OK",
14303     (UV)(src - RExC_emit_start),
14304     (UV)(dst - RExC_emit_start),
14305     (UV)RExC_offsets[0]));
14306    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14307    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14308   }
14309 #endif
14310  }
14311
14312
14313  place = opnd;  /* Op node, where operand used to be. */
14314 #ifdef RE_TRACK_PATTERN_OFFSETS
14315  if (RExC_offsets) {         /* MJD */
14316   MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14317    "reginsert",
14318    __LINE__,
14319    PL_reg_name[op],
14320    (UV)(place - RExC_emit_start) > RExC_offsets[0]
14321    ? "Overwriting end of array!\n" : "OK",
14322    (UV)(place - RExC_emit_start),
14323    (UV)(RExC_parse - RExC_start),
14324    (UV)RExC_offsets[0]));
14325   Set_Node_Offset(place, RExC_parse);
14326   Set_Node_Length(place, 1);
14327  }
14328 #endif
14329  src = NEXTOPER(place);
14330  FILL_ADVANCE_NODE(place, op);
14331  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
14332  Zero(src, offset, regnode);
14333 }
14334
14335 /*
14336 - regtail - set the next-pointer at the end of a node chain of p to val.
14337 - SEE ALSO: regtail_study
14338 */
14339 /* TODO: All three parms should be const */
14340 STATIC void
14341 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14342 {
14343  dVAR;
14344  regnode *scan;
14345  GET_RE_DEBUG_FLAGS_DECL;
14346
14347  PERL_ARGS_ASSERT_REGTAIL;
14348 #ifndef DEBUGGING
14349  PERL_UNUSED_ARG(depth);
14350 #endif
14351
14352  if (SIZE_ONLY)
14353   return;
14354
14355  /* Find last node. */
14356  scan = p;
14357  for (;;) {
14358   regnode * const temp = regnext(scan);
14359   DEBUG_PARSE_r({
14360    SV * const mysv=sv_newmortal();
14361    DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14362    regprop(RExC_rx, mysv, scan);
14363    PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14364     SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14365      (temp == NULL ? "->" : ""),
14366      (temp == NULL ? PL_reg_name[OP(val)] : "")
14367    );
14368   });
14369   if (temp == NULL)
14370    break;
14371   scan = temp;
14372  }
14373
14374  if (reg_off_by_arg[OP(scan)]) {
14375   ARG_SET(scan, val - scan);
14376  }
14377  else {
14378   NEXT_OFF(scan) = val - scan;
14379  }
14380 }
14381
14382 #ifdef DEBUGGING
14383 /*
14384 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14385 - Look for optimizable sequences at the same time.
14386 - currently only looks for EXACT chains.
14387
14388 This is experimental code. The idea is to use this routine to perform
14389 in place optimizations on branches and groups as they are constructed,
14390 with the long term intention of removing optimization from study_chunk so
14391 that it is purely analytical.
14392
14393 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14394 to control which is which.
14395
14396 */
14397 /* TODO: All four parms should be const */
14398
14399 STATIC U8
14400 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14401 {
14402  dVAR;
14403  regnode *scan;
14404  U8 exact = PSEUDO;
14405 #ifdef EXPERIMENTAL_INPLACESCAN
14406  I32 min = 0;
14407 #endif
14408  GET_RE_DEBUG_FLAGS_DECL;
14409
14410  PERL_ARGS_ASSERT_REGTAIL_STUDY;
14411
14412
14413  if (SIZE_ONLY)
14414   return exact;
14415
14416  /* Find last node. */
14417
14418  scan = p;
14419  for (;;) {
14420   regnode * const temp = regnext(scan);
14421 #ifdef EXPERIMENTAL_INPLACESCAN
14422   if (PL_regkind[OP(scan)] == EXACT) {
14423    bool has_exactf_sharp_s; /* Unexamined in this routine */
14424    if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14425     return EXACT;
14426   }
14427 #endif
14428   if ( exact ) {
14429    switch (OP(scan)) {
14430     case EXACT:
14431     case EXACTF:
14432     case EXACTFA:
14433     case EXACTFU:
14434     case EXACTFU_SS:
14435     case EXACTFU_TRICKYFOLD:
14436     case EXACTFL:
14437       if( exact == PSEUDO )
14438        exact= OP(scan);
14439       else if ( exact != OP(scan) )
14440        exact= 0;
14441     case NOTHING:
14442      break;
14443     default:
14444      exact= 0;
14445    }
14446   }
14447   DEBUG_PARSE_r({
14448    SV * const mysv=sv_newmortal();
14449    DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14450    regprop(RExC_rx, mysv, scan);
14451    PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14452     SvPV_nolen_const(mysv),
14453     REG_NODE_NUM(scan),
14454     PL_reg_name[exact]);
14455   });
14456   if (temp == NULL)
14457    break;
14458   scan = temp;
14459  }
14460  DEBUG_PARSE_r({
14461   SV * const mysv_val=sv_newmortal();
14462   DEBUG_PARSE_MSG("");
14463   regprop(RExC_rx, mysv_val, val);
14464   PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14465      SvPV_nolen_const(mysv_val),
14466      (IV)REG_NODE_NUM(val),
14467      (IV)(val - scan)
14468   );
14469  });
14470  if (reg_off_by_arg[OP(scan)]) {
14471   ARG_SET(scan, val - scan);
14472  }
14473  else {
14474   NEXT_OFF(scan) = val - scan;
14475  }
14476
14477  return exact;
14478 }
14479 #endif
14480
14481 /*
14482  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14483  */
14484 #ifdef DEBUGGING
14485
14486 static void
14487 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
14488 {
14489  int bit;
14490  int set=0;
14491
14492  for (bit=0; bit<32; bit++) {
14493   if (flags & (1<<bit)) {
14494    if (!set++ && lead)
14495     PerlIO_printf(Perl_debug_log, "%s",lead);
14496    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
14497   }
14498  }
14499  if (lead)  {
14500   if (set)
14501    PerlIO_printf(Perl_debug_log, "\n");
14502   else
14503    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14504  }
14505 }
14506
14507 static void
14508 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14509 {
14510  int bit;
14511  int set=0;
14512  regex_charset cs;
14513
14514  for (bit=0; bit<32; bit++) {
14515   if (flags & (1<<bit)) {
14516    if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
14517     continue;
14518    }
14519    if (!set++ && lead)
14520     PerlIO_printf(Perl_debug_log, "%s",lead);
14521    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14522   }
14523  }
14524  if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14525    if (!set++ && lead) {
14526     PerlIO_printf(Perl_debug_log, "%s",lead);
14527    }
14528    switch (cs) {
14529     case REGEX_UNICODE_CHARSET:
14530      PerlIO_printf(Perl_debug_log, "UNICODE");
14531      break;
14532     case REGEX_LOCALE_CHARSET:
14533      PerlIO_printf(Perl_debug_log, "LOCALE");
14534      break;
14535     case REGEX_ASCII_RESTRICTED_CHARSET:
14536      PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14537      break;
14538     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14539      PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14540      break;
14541     default:
14542      PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14543      break;
14544    }
14545  }
14546  if (lead)  {
14547   if (set)
14548    PerlIO_printf(Perl_debug_log, "\n");
14549   else
14550    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14551  }
14552 }
14553 #endif
14554
14555 void
14556 Perl_regdump(pTHX_ const regexp *r)
14557 {
14558 #ifdef DEBUGGING
14559  dVAR;
14560  SV * const sv = sv_newmortal();
14561  SV *dsv= sv_newmortal();
14562  RXi_GET_DECL(r,ri);
14563  GET_RE_DEBUG_FLAGS_DECL;
14564
14565  PERL_ARGS_ASSERT_REGDUMP;
14566
14567  (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14568
14569  /* Header fields of interest. */
14570  if (r->anchored_substr) {
14571   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14572    RE_SV_DUMPLEN(r->anchored_substr), 30);
14573   PerlIO_printf(Perl_debug_log,
14574      "anchored %s%s at %"IVdf" ",
14575      s, RE_SV_TAIL(r->anchored_substr),
14576      (IV)r->anchored_offset);
14577  } else if (r->anchored_utf8) {
14578   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14579    RE_SV_DUMPLEN(r->anchored_utf8), 30);
14580   PerlIO_printf(Perl_debug_log,
14581      "anchored utf8 %s%s at %"IVdf" ",
14582      s, RE_SV_TAIL(r->anchored_utf8),
14583      (IV)r->anchored_offset);
14584  }
14585  if (r->float_substr) {
14586   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14587    RE_SV_DUMPLEN(r->float_substr), 30);
14588   PerlIO_printf(Perl_debug_log,
14589      "floating %s%s at %"IVdf"..%"UVuf" ",
14590      s, RE_SV_TAIL(r->float_substr),
14591      (IV)r->float_min_offset, (UV)r->float_max_offset);
14592  } else if (r->float_utf8) {
14593   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14594    RE_SV_DUMPLEN(r->float_utf8), 30);
14595   PerlIO_printf(Perl_debug_log,
14596      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14597      s, RE_SV_TAIL(r->float_utf8),
14598      (IV)r->float_min_offset, (UV)r->float_max_offset);
14599  }
14600  if (r->check_substr || r->check_utf8)
14601   PerlIO_printf(Perl_debug_log,
14602      (const char *)
14603      (r->check_substr == r->float_substr
14604      && r->check_utf8 == r->float_utf8
14605      ? "(checking floating" : "(checking anchored"));
14606  if (r->extflags & RXf_NOSCAN)
14607   PerlIO_printf(Perl_debug_log, " noscan");
14608  if (r->extflags & RXf_CHECK_ALL)
14609   PerlIO_printf(Perl_debug_log, " isall");
14610  if (r->check_substr || r->check_utf8)
14611   PerlIO_printf(Perl_debug_log, ") ");
14612
14613  if (ri->regstclass) {
14614   regprop(r, sv, ri->regstclass);
14615   PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14616  }
14617  if (r->extflags & RXf_ANCH) {
14618   PerlIO_printf(Perl_debug_log, "anchored");
14619   if (r->extflags & RXf_ANCH_BOL)
14620    PerlIO_printf(Perl_debug_log, "(BOL)");
14621   if (r->extflags & RXf_ANCH_MBOL)
14622    PerlIO_printf(Perl_debug_log, "(MBOL)");
14623   if (r->extflags & RXf_ANCH_SBOL)
14624    PerlIO_printf(Perl_debug_log, "(SBOL)");
14625   if (r->extflags & RXf_ANCH_GPOS)
14626    PerlIO_printf(Perl_debug_log, "(GPOS)");
14627   PerlIO_putc(Perl_debug_log, ' ');
14628  }
14629  if (r->extflags & RXf_GPOS_SEEN)
14630   PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14631  if (r->intflags & PREGf_SKIP)
14632   PerlIO_printf(Perl_debug_log, "plus ");
14633  if (r->intflags & PREGf_IMPLICIT)
14634   PerlIO_printf(Perl_debug_log, "implicit ");
14635  PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14636  if (r->extflags & RXf_EVAL_SEEN)
14637   PerlIO_printf(Perl_debug_log, "with eval ");
14638  PerlIO_printf(Perl_debug_log, "\n");
14639  DEBUG_FLAGS_r({
14640   regdump_extflags("r->extflags: ",r->extflags);
14641   regdump_intflags("r->intflags: ",r->intflags);
14642  });
14643 #else
14644  PERL_ARGS_ASSERT_REGDUMP;
14645  PERL_UNUSED_CONTEXT;
14646  PERL_UNUSED_ARG(r);
14647 #endif /* DEBUGGING */
14648 }
14649
14650 /*
14651 - regprop - printable representation of opcode
14652 */
14653 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
14654 STMT_START { \
14655   if (do_sep) {                           \
14656    Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
14657    if (flags & ANYOF_INVERT)           \
14658     /*make sure the invert info is in each */ \
14659     sv_catpvs(sv, "^");             \
14660    do_sep = 0;                         \
14661   }                                       \
14662 } STMT_END
14663
14664 void
14665 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14666 {
14667 #ifdef DEBUGGING
14668  dVAR;
14669  int k;
14670
14671  /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14672  static const char * const anyofs[] = {
14673 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14674  || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
14675  || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
14676  || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
14677  || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
14678  || _CC_VERTSPACE != 16
14679   #error Need to adjust order of anyofs[]
14680 #endif
14681   "[\\w]",
14682   "[\\W]",
14683   "[\\d]",
14684   "[\\D]",
14685   "[:alpha:]",
14686   "[:^alpha:]",
14687   "[:lower:]",
14688   "[:^lower:]",
14689   "[:upper:]",
14690   "[:^upper:]",
14691   "[:punct:]",
14692   "[:^punct:]",
14693   "[:print:]",
14694   "[:^print:]",
14695   "[:alnum:]",
14696   "[:^alnum:]",
14697   "[:graph:]",
14698   "[:^graph:]",
14699   "[:cased:]",
14700   "[:^cased:]",
14701   "[\\s]",
14702   "[\\S]",
14703   "[:blank:]",
14704   "[:^blank:]",
14705   "[:xdigit:]",
14706   "[:^xdigit:]",
14707   "[:space:]",
14708   "[:^space:]",
14709   "[:cntrl:]",
14710   "[:^cntrl:]",
14711   "[:ascii:]",
14712   "[:^ascii:]",
14713   "[\\v]",
14714   "[\\V]"
14715  };
14716  RXi_GET_DECL(prog,progi);
14717  GET_RE_DEBUG_FLAGS_DECL;
14718
14719  PERL_ARGS_ASSERT_REGPROP;
14720
14721  sv_setpvs(sv, "");
14722
14723  if (OP(o) > REGNODE_MAX)  /* regnode.type is unsigned */
14724   /* It would be nice to FAIL() here, but this may be called from
14725   regexec.c, and it would be hard to supply pRExC_state. */
14726   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14727  sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14728
14729  k = PL_regkind[OP(o)];
14730
14731  if (k == EXACT) {
14732   sv_catpvs(sv, " ");
14733   /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14734   * is a crude hack but it may be the best for now since
14735   * we have no flag "this EXACTish node was UTF-8"
14736   * --jhi */
14737   pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14738     PERL_PV_ESCAPE_UNI_DETECT |
14739     PERL_PV_ESCAPE_NONASCII   |
14740     PERL_PV_PRETTY_ELLIPSES   |
14741     PERL_PV_PRETTY_LTGT       |
14742     PERL_PV_PRETTY_NOCLEAR
14743     );
14744  } else if (k == TRIE) {
14745   /* print the details of the trie in dumpuntil instead, as
14746   * progi->data isn't available here */
14747   const char op = OP(o);
14748   const U32 n = ARG(o);
14749   const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14750    (reg_ac_data *)progi->data->data[n] :
14751    NULL;
14752   const reg_trie_data * const trie
14753    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14754
14755   Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14756   DEBUG_TRIE_COMPILE_r(
14757    Perl_sv_catpvf(aTHX_ sv,
14758     "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
14759     (UV)trie->startstate,
14760     (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14761     (UV)trie->wordcount,
14762     (UV)trie->minlen,
14763     (UV)trie->maxlen,
14764     (UV)TRIE_CHARCOUNT(trie),
14765     (UV)trie->uniquecharcount
14766    )
14767   );
14768   if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14769    sv_catpvs(sv, "[");
14770    (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
14771             ? ANYOF_BITMAP(o)
14772             : TRIE_BITMAP(trie));
14773    sv_catpvs(sv, "]");
14774   }
14775
14776  } else if (k == CURLY) {
14777   if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14778    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14779   Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14780  }
14781  else if (k == WHILEM && o->flags)   /* Ordinal/of */
14782   Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14783  else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14784   Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
14785   if ( RXp_PAREN_NAMES(prog) ) {
14786    if ( k != REF || (OP(o) < NREF)) {
14787     AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14788     SV **name= av_fetch(list, ARG(o), 0 );
14789     if (name)
14790      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14791    }
14792    else {
14793     AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14794     SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14795     I32 *nums=(I32*)SvPVX(sv_dat);
14796     SV **name= av_fetch(list, nums[0], 0 );
14797     I32 n;
14798     if (name) {
14799      for ( n=0; n<SvIVX(sv_dat); n++ ) {
14800       Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14801          (n ? "," : ""), (IV)nums[n]);
14802      }
14803      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14804     }
14805    }
14806   }
14807  } else if (k == GOSUB)
14808   Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14809  else if (k == VERB) {
14810   if (!o->flags)
14811    Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14812       SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14813  } else if (k == LOGICAL)
14814   Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
14815  else if (k == ANYOF) {
14816   const U8 flags = ANYOF_FLAGS(o);
14817   int do_sep = 0;
14818
14819
14820   if (flags & ANYOF_LOCALE)
14821    sv_catpvs(sv, "{loc}");
14822   if (flags & ANYOF_LOC_FOLD)
14823    sv_catpvs(sv, "{i}");
14824   Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14825   if (flags & ANYOF_INVERT)
14826    sv_catpvs(sv, "^");
14827
14828   /* output what the standard cp 0-255 bitmap matches */
14829   do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
14830
14831   EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14832   /* output any special charclass tests (used entirely under use locale) */
14833   if (ANYOF_CLASS_TEST_ANY_SET(o)) {
14834    int i;
14835    for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) {
14836     if (ANYOF_CLASS_TEST(o,i)) {
14837      sv_catpv(sv, anyofs[i]);
14838      do_sep = 1;
14839     }
14840    }
14841   }
14842
14843   EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14844
14845   if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14846    sv_catpvs(sv, "{non-utf8-latin1-all}");
14847   }
14848
14849   /* output information about the unicode matching */
14850   if (flags & ANYOF_UNICODE_ALL)
14851    sv_catpvs(sv, "{unicode_all}");
14852   else if (ANYOF_NONBITMAP(o)) {
14853    SV *lv; /* Set if there is something outside the bit map. */
14854    SV * sw;
14855    bool byte_output = FALSE;   /* If something in the bitmap has been
14856           output */
14857
14858    if (flags & ANYOF_NONBITMAP_NON_UTF8) {
14859     sv_catpvs(sv, "{outside bitmap}");
14860    }
14861    else {
14862     sv_catpvs(sv, "{utf8}");
14863    }
14864
14865    /* Get the stuff that wasn't in the bitmap */
14866    sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14867    if (lv && lv != &PL_sv_undef) {
14868     char *s = savesvpv(lv);
14869     char * const origs = s;
14870
14871     while (*s && *s != '\n')
14872      s++;
14873
14874     if (*s == '\n') {
14875      const char * const t = ++s;
14876
14877      if (byte_output) {
14878       sv_catpvs(sv, " ");
14879      }
14880
14881      while (*s) {
14882       if (*s == '\n') {
14883
14884        /* Truncate very long output */
14885        if (s - origs > 256) {
14886         Perl_sv_catpvf(aTHX_ sv,
14887            "%.*s...",
14888            (int) (s - origs - 1),
14889            t);
14890         goto out_dump;
14891        }
14892        *s = ' ';
14893       }
14894       else if (*s == '\t') {
14895        *s = '-';
14896       }
14897       s++;
14898      }
14899      if (s[-1] == ' ')
14900       s[-1] = 0;
14901
14902      sv_catpv(sv, t);
14903     }
14904
14905    out_dump:
14906
14907     Safefree(origs);
14908     SvREFCNT_dec_NN(lv);
14909    }
14910   }
14911
14912   Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14913  }
14914  else if (k == POSIXD || k == NPOSIXD) {
14915   U8 index = FLAGS(o) * 2;
14916   if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14917    Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14918   }
14919   else {
14920    sv_catpv(sv, anyofs[index]);
14921   }
14922  }
14923  else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14924   Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14925 #else
14926  PERL_UNUSED_CONTEXT;
14927  PERL_UNUSED_ARG(sv);
14928  PERL_UNUSED_ARG(o);
14929  PERL_UNUSED_ARG(prog);
14930 #endif /* DEBUGGING */
14931 }
14932
14933 SV *
14934 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14935 {    /* Assume that RE_INTUIT is set */
14936  dVAR;
14937  struct regexp *const prog = ReANY(r);
14938  GET_RE_DEBUG_FLAGS_DECL;
14939
14940  PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14941  PERL_UNUSED_CONTEXT;
14942
14943  DEBUG_COMPILE_r(
14944   {
14945    const char * const s = SvPV_nolen_const(prog->check_substr
14946      ? prog->check_substr : prog->check_utf8);
14947
14948    if (!PL_colorset) reginitcolors();
14949    PerlIO_printf(Perl_debug_log,
14950      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14951      PL_colors[4],
14952      prog->check_substr ? "" : "utf8 ",
14953      PL_colors[5],PL_colors[0],
14954      s,
14955      PL_colors[1],
14956      (strlen(s) > 60 ? "..." : ""));
14957   } );
14958
14959  return prog->check_substr ? prog->check_substr : prog->check_utf8;
14960 }
14961
14962 /*
14963    pregfree()
14964
14965    handles refcounting and freeing the perl core regexp structure. When
14966    it is necessary to actually free the structure the first thing it
14967    does is call the 'free' method of the regexp_engine associated to
14968    the regexp, allowing the handling of the void *pprivate; member
14969    first. (This routine is not overridable by extensions, which is why
14970    the extensions free is called first.)
14971
14972    See regdupe and regdupe_internal if you change anything here.
14973 */
14974 #ifndef PERL_IN_XSUB_RE
14975 void
14976 Perl_pregfree(pTHX_ REGEXP *r)
14977 {
14978  SvREFCNT_dec(r);
14979 }
14980
14981 void
14982 Perl_pregfree2(pTHX_ REGEXP *rx)
14983 {
14984  dVAR;
14985  struct regexp *const r = ReANY(rx);
14986  GET_RE_DEBUG_FLAGS_DECL;
14987
14988  PERL_ARGS_ASSERT_PREGFREE2;
14989
14990  if (r->mother_re) {
14991   ReREFCNT_dec(r->mother_re);
14992  } else {
14993   CALLREGFREE_PVT(rx); /* free the private data */
14994   SvREFCNT_dec(RXp_PAREN_NAMES(r));
14995   Safefree(r->xpv_len_u.xpvlenu_pv);
14996  }
14997  if (r->substrs) {
14998   SvREFCNT_dec(r->anchored_substr);
14999   SvREFCNT_dec(r->anchored_utf8);
15000   SvREFCNT_dec(r->float_substr);
15001   SvREFCNT_dec(r->float_utf8);
15002   Safefree(r->substrs);
15003  }
15004  RX_MATCH_COPY_FREE(rx);
15005 #ifdef PERL_ANY_COW
15006  SvREFCNT_dec(r->saved_copy);
15007 #endif
15008  Safefree(r->offs);
15009  SvREFCNT_dec(r->qr_anoncv);
15010  rx->sv_u.svu_rx = 0;
15011 }
15012
15013 /*  reg_temp_copy()
15014
15015  This is a hacky workaround to the structural issue of match results
15016  being stored in the regexp structure which is in turn stored in
15017  PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15018  could be PL_curpm in multiple contexts, and could require multiple
15019  result sets being associated with the pattern simultaneously, such
15020  as when doing a recursive match with (??{$qr})
15021
15022  The solution is to make a lightweight copy of the regexp structure
15023  when a qr// is returned from the code executed by (??{$qr}) this
15024  lightweight copy doesn't actually own any of its data except for
15025  the starp/end and the actual regexp structure itself.
15026
15027 */
15028
15029
15030 REGEXP *
15031 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15032 {
15033  struct regexp *ret;
15034  struct regexp *const r = ReANY(rx);
15035  const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15036
15037  PERL_ARGS_ASSERT_REG_TEMP_COPY;
15038
15039  if (!ret_x)
15040   ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
15041  else {
15042   SvOK_off((SV *)ret_x);
15043   if (islv) {
15044    /* For PVLVs, SvANY points to the xpvlv body while sv_u points
15045    to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
15046    made both spots point to the same regexp body.) */
15047    REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
15048    assert(!SvPVX(ret_x));
15049    ret_x->sv_u.svu_rx = temp->sv_any;
15050    temp->sv_any = NULL;
15051    SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
15052    SvREFCNT_dec_NN(temp);
15053    /* SvCUR still resides in the xpvlv struct, so the regexp copy-
15054    ing below will not set it. */
15055    SvCUR_set(ret_x, SvCUR(rx));
15056   }
15057  }
15058  /* This ensures that SvTHINKFIRST(sv) is true, and hence that
15059  sv_force_normal(sv) is called.  */
15060  SvFAKE_on(ret_x);
15061  ret = ReANY(ret_x);
15062
15063  SvFLAGS(ret_x) |= SvUTF8(rx);
15064  /* We share the same string buffer as the original regexp, on which we
15065  hold a reference count, incremented when mother_re is set below.
15066  The string pointer is copied here, being part of the regexp struct.
15067  */
15068  memcpy(&(ret->xpv_cur), &(r->xpv_cur),
15069   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
15070  if (r->offs) {
15071   const I32 npar = r->nparens+1;
15072   Newx(ret->offs, npar, regexp_paren_pair);
15073   Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15074  }
15075  if (r->substrs) {
15076   Newx(ret->substrs, 1, struct reg_substr_data);
15077   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15078
15079   SvREFCNT_inc_void(ret->anchored_substr);
15080   SvREFCNT_inc_void(ret->anchored_utf8);
15081   SvREFCNT_inc_void(ret->float_substr);
15082   SvREFCNT_inc_void(ret->float_utf8);
15083
15084   /* check_substr and check_utf8, if non-NULL, point to either their
15085   anchored or float namesakes, and don't hold a second reference.  */
15086  }
15087  RX_MATCH_COPIED_off(ret_x);
15088 #ifdef PERL_ANY_COW
15089  ret->saved_copy = NULL;
15090 #endif
15091  ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
15092  SvREFCNT_inc_void(ret->qr_anoncv);
15093
15094  return ret_x;
15095 }
15096 #endif
15097
15098 /* regfree_internal()
15099
15100    Free the private data in a regexp. This is overloadable by
15101    extensions. Perl takes care of the regexp structure in pregfree(),
15102    this covers the *pprivate pointer which technically perl doesn't
15103    know about, however of course we have to handle the
15104    regexp_internal structure when no extension is in use.
15105
15106    Note this is called before freeing anything in the regexp
15107    structure.
15108  */
15109
15110 void
15111 Perl_regfree_internal(pTHX_ REGEXP * const rx)
15112 {
15113  dVAR;
15114  struct regexp *const r = ReANY(rx);
15115  RXi_GET_DECL(r,ri);
15116  GET_RE_DEBUG_FLAGS_DECL;
15117
15118  PERL_ARGS_ASSERT_REGFREE_INTERNAL;
15119
15120  DEBUG_COMPILE_r({
15121   if (!PL_colorset)
15122    reginitcolors();
15123   {
15124    SV *dsv= sv_newmortal();
15125    RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
15126     dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
15127    PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
15128     PL_colors[4],PL_colors[5],s);
15129   }
15130  });
15131 #ifdef RE_TRACK_PATTERN_OFFSETS
15132  if (ri->u.offsets)
15133   Safefree(ri->u.offsets);             /* 20010421 MJD */
15134 #endif
15135  if (ri->code_blocks) {
15136   int n;
15137   for (n = 0; n < ri->num_code_blocks; n++)
15138    SvREFCNT_dec(ri->code_blocks[n].src_regex);
15139   Safefree(ri->code_blocks);
15140  }
15141
15142  if (ri->data) {
15143   int n = ri->data->count;
15144
15145   while (--n >= 0) {
15146   /* If you add a ->what type here, update the comment in regcomp.h */
15147    switch (ri->data->what[n]) {
15148    case 'a':
15149    case 'r':
15150    case 's':
15151    case 'S':
15152    case 'u':
15153     SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15154     break;
15155    case 'f':
15156     Safefree(ri->data->data[n]);
15157     break;
15158    case 'l':
15159    case 'L':
15160     break;
15161    case 'T':
15162     { /* Aho Corasick add-on structure for a trie node.
15163      Used in stclass optimization only */
15164      U32 refcount;
15165      reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15166      OP_REFCNT_LOCK;
15167      refcount = --aho->refcount;
15168      OP_REFCNT_UNLOCK;
15169      if ( !refcount ) {
15170       PerlMemShared_free(aho->states);
15171       PerlMemShared_free(aho->fail);
15172       /* do this last!!!! */
15173       PerlMemShared_free(ri->data->data[n]);
15174       PerlMemShared_free(ri->regstclass);
15175      }
15176     }
15177     break;
15178    case 't':
15179     {
15180      /* trie structure. */
15181      U32 refcount;
15182      reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15183      OP_REFCNT_LOCK;
15184      refcount = --trie->refcount;
15185      OP_REFCNT_UNLOCK;
15186      if ( !refcount ) {
15187       PerlMemShared_free(trie->charmap);
15188       PerlMemShared_free(trie->states);
15189       PerlMemShared_free(trie->trans);
15190       if (trie->bitmap)
15191        PerlMemShared_free(trie->bitmap);
15192       if (trie->jump)
15193        PerlMemShared_free(trie->jump);
15194       PerlMemShared_free(trie->wordinfo);
15195       /* do this last!!!! */
15196       PerlMemShared_free(ri->data->data[n]);
15197      }
15198     }
15199     break;
15200    default:
15201     Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15202    }
15203   }
15204   Safefree(ri->data->what);
15205   Safefree(ri->data);
15206  }
15207
15208  Safefree(ri);
15209 }
15210
15211 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15212 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15213 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
15214
15215 /*
15216    re_dup - duplicate a regexp.
15217
15218    This routine is expected to clone a given regexp structure. It is only
15219    compiled under USE_ITHREADS.
15220
15221    After all of the core data stored in struct regexp is duplicated
15222    the regexp_engine.dupe method is used to copy any private data
15223    stored in the *pprivate pointer. This allows extensions to handle
15224    any duplication it needs to do.
15225
15226    See pregfree() and regfree_internal() if you change anything here.
15227 */
15228 #if defined(USE_ITHREADS)
15229 #ifndef PERL_IN_XSUB_RE
15230 void
15231 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15232 {
15233  dVAR;
15234  I32 npar;
15235  const struct regexp *r = ReANY(sstr);
15236  struct regexp *ret = ReANY(dstr);
15237
15238  PERL_ARGS_ASSERT_RE_DUP_GUTS;
15239
15240  npar = r->nparens+1;
15241  Newx(ret->offs, npar, regexp_paren_pair);
15242  Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15243
15244  if (ret->substrs) {
15245   /* Do it this way to avoid reading from *r after the StructCopy().
15246   That way, if any of the sv_dup_inc()s dislodge *r from the L1
15247   cache, it doesn't matter.  */
15248   const bool anchored = r->check_substr
15249    ? r->check_substr == r->anchored_substr
15250    : r->check_utf8 == r->anchored_utf8;
15251   Newx(ret->substrs, 1, struct reg_substr_data);
15252   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15253
15254   ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15255   ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15256   ret->float_substr = sv_dup_inc(ret->float_substr, param);
15257   ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15258
15259   /* check_substr and check_utf8, if non-NULL, point to either their
15260   anchored or float namesakes, and don't hold a second reference.  */
15261
15262   if (ret->check_substr) {
15263    if (anchored) {
15264     assert(r->check_utf8 == r->anchored_utf8);
15265     ret->check_substr = ret->anchored_substr;
15266     ret->check_utf8 = ret->anchored_utf8;
15267    } else {
15268     assert(r->check_substr == r->float_substr);
15269     assert(r->check_utf8 == r->float_utf8);
15270     ret->check_substr = ret->float_substr;
15271     ret->check_utf8 = ret->float_utf8;
15272    }
15273   } else if (ret->check_utf8) {
15274    if (anchored) {
15275     ret->check_utf8 = ret->anchored_utf8;
15276    } else {
15277     ret->check_utf8 = ret->float_utf8;
15278    }
15279   }
15280  }
15281
15282  RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15283  ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15284
15285  if (ret->pprivate)
15286   RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15287
15288  if (RX_MATCH_COPIED(dstr))
15289   ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
15290  else
15291   ret->subbeg = NULL;
15292 #ifdef PERL_ANY_COW
15293  ret->saved_copy = NULL;
15294 #endif
15295
15296  /* Whether mother_re be set or no, we need to copy the string.  We
15297  cannot refrain from copying it when the storage points directly to
15298  our mother regexp, because that's
15299    1: a buffer in a different thread
15300    2: something we no longer hold a reference on
15301    so we need to copy it locally.  */
15302  RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15303  ret->mother_re   = NULL;
15304 }
15305 #endif /* PERL_IN_XSUB_RE */
15306
15307 /*
15308    regdupe_internal()
15309
15310    This is the internal complement to regdupe() which is used to copy
15311    the structure pointed to by the *pprivate pointer in the regexp.
15312    This is the core version of the extension overridable cloning hook.
15313    The regexp structure being duplicated will be copied by perl prior
15314    to this and will be provided as the regexp *r argument, however
15315    with the /old/ structures pprivate pointer value. Thus this routine
15316    may override any copying normally done by perl.
15317
15318    It returns a pointer to the new regexp_internal structure.
15319 */
15320
15321 void *
15322 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15323 {
15324  dVAR;
15325  struct regexp *const r = ReANY(rx);
15326  regexp_internal *reti;
15327  int len;
15328  RXi_GET_DECL(r,ri);
15329
15330  PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15331
15332  len = ProgLen(ri);
15333
15334  Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15335  Copy(ri->program, reti->program, len+1, regnode);
15336
15337  reti->num_code_blocks = ri->num_code_blocks;
15338  if (ri->code_blocks) {
15339   int n;
15340   Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15341     struct reg_code_block);
15342   Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15343     struct reg_code_block);
15344   for (n = 0; n < ri->num_code_blocks; n++)
15345    reti->code_blocks[n].src_regex = (REGEXP*)
15346      sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15347  }
15348  else
15349   reti->code_blocks = NULL;
15350
15351  reti->regstclass = NULL;
15352
15353  if (ri->data) {
15354   struct reg_data *d;
15355   const int count = ri->data->count;
15356   int i;
15357
15358   Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15359     char, struct reg_data);
15360   Newx(d->what, count, U8);
15361
15362   d->count = count;
15363   for (i = 0; i < count; i++) {
15364    d->what[i] = ri->data->what[i];
15365    switch (d->what[i]) {
15366     /* see also regcomp.h and regfree_internal() */
15367    case 'a': /* actually an AV, but the dup function is identical.  */
15368    case 'r':
15369    case 's':
15370    case 'S':
15371    case 'u': /* actually an HV, but the dup function is identical.  */
15372     d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15373     break;
15374    case 'f':
15375     /* This is cheating. */
15376     Newx(d->data[i], 1, struct regnode_charclass_class);
15377     StructCopy(ri->data->data[i], d->data[i],
15378        struct regnode_charclass_class);
15379     reti->regstclass = (regnode*)d->data[i];
15380     break;
15381    case 'T':
15382     /* Trie stclasses are readonly and can thus be shared
15383     * without duplication. We free the stclass in pregfree
15384     * when the corresponding reg_ac_data struct is freed.
15385     */
15386     reti->regstclass= ri->regstclass;
15387     /* Fall through */
15388    case 't':
15389     OP_REFCNT_LOCK;
15390     ((reg_trie_data*)ri->data->data[i])->refcount++;
15391     OP_REFCNT_UNLOCK;
15392     /* Fall through */
15393    case 'l':
15394    case 'L':
15395     d->data[i] = ri->data->data[i];
15396     break;
15397    default:
15398     Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15399    }
15400   }
15401
15402   reti->data = d;
15403  }
15404  else
15405   reti->data = NULL;
15406
15407  reti->name_list_idx = ri->name_list_idx;
15408
15409 #ifdef RE_TRACK_PATTERN_OFFSETS
15410  if (ri->u.offsets) {
15411   Newx(reti->u.offsets, 2*len+1, U32);
15412   Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15413  }
15414 #else
15415  SetProgLen(reti,len);
15416 #endif
15417
15418  return (void*)reti;
15419 }
15420
15421 #endif    /* USE_ITHREADS */
15422
15423 #ifndef PERL_IN_XSUB_RE
15424
15425 /*
15426  - regnext - dig the "next" pointer out of a node
15427  */
15428 regnode *
15429 Perl_regnext(pTHX_ regnode *p)
15430 {
15431  dVAR;
15432  I32 offset;
15433
15434  if (!p)
15435   return(NULL);
15436
15437  if (OP(p) > REGNODE_MAX) {  /* regnode.type is unsigned */
15438   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15439  }
15440
15441  offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15442  if (offset == 0)
15443   return(NULL);
15444
15445  return(p+offset);
15446 }
15447 #endif
15448
15449 STATIC void
15450 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
15451 {
15452  va_list args;
15453  STRLEN l1 = strlen(pat1);
15454  STRLEN l2 = strlen(pat2);
15455  char buf[512];
15456  SV *msv;
15457  const char *message;
15458
15459  PERL_ARGS_ASSERT_RE_CROAK2;
15460
15461  if (l1 > 510)
15462   l1 = 510;
15463  if (l1 + l2 > 510)
15464   l2 = 510 - l1;
15465  Copy(pat1, buf, l1 , char);
15466  Copy(pat2, buf + l1, l2 , char);
15467  buf[l1 + l2] = '\n';
15468  buf[l1 + l2 + 1] = '\0';
15469 #ifdef I_STDARG
15470  /* ANSI variant takes additional second argument */
15471  va_start(args, pat2);
15472 #else
15473  va_start(args);
15474 #endif
15475  msv = vmess(buf, &args);
15476  va_end(args);
15477  message = SvPV_const(msv,l1);
15478  if (l1 > 512)
15479   l1 = 512;
15480  Copy(message, buf, l1 , char);
15481  buf[l1-1] = '\0';   /* Overwrite \n */
15482  Perl_croak(aTHX_ "%s", buf);
15483 }
15484
15485 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
15486
15487 #ifndef PERL_IN_XSUB_RE
15488 void
15489 Perl_save_re_context(pTHX)
15490 {
15491  dVAR;
15492
15493  /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15494  if (PL_curpm) {
15495   const REGEXP * const rx = PM_GETRE(PL_curpm);
15496   if (rx) {
15497    U32 i;
15498    for (i = 1; i <= RX_NPARENS(rx); i++) {
15499     char digits[TYPE_CHARS(long)];
15500     const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15501     GV *const *const gvp
15502      = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15503
15504     if (gvp) {
15505      GV * const gv = *gvp;
15506      if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15507       save_scalar(gv);
15508     }
15509    }
15510   }
15511  }
15512 }
15513 #endif
15514
15515 #ifdef DEBUGGING
15516
15517 STATIC void
15518 S_put_byte(pTHX_ SV *sv, int c)
15519 {
15520  PERL_ARGS_ASSERT_PUT_BYTE;
15521
15522  /* Our definition of isPRINT() ignores locales, so only bytes that are
15523  not part of UTF-8 are considered printable. I assume that the same
15524  holds for UTF-EBCDIC.
15525  Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15526  which Wikipedia says:
15527
15528  EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15529  ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15530  identical, to the ASCII delete (DEL) or rubout control character. ...
15531  it is typically mapped to hexadecimal code 9F, in order to provide a
15532  unique character mapping in both directions)
15533
15534  So the old condition can be simplified to !isPRINT(c)  */
15535  if (!isPRINT(c)) {
15536   switch (c) {
15537    case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
15538    case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
15539    case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
15540    case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
15541    case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
15542
15543    default:
15544     Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15545     break;
15546   }
15547  }
15548  else {
15549   const char string = c;
15550   if (c == '-' || c == ']' || c == '\\' || c == '^')
15551    sv_catpvs(sv, "\\");
15552   sv_catpvn(sv, &string, 1);
15553  }
15554 }
15555
15556 STATIC bool
15557 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
15558 {
15559  /* Appends to 'sv' a displayable version of the innards of the bracketed
15560  * character class whose bitmap is 'bitmap';  Returns 'TRUE' if it actually
15561  * output anything */
15562
15563  int i;
15564  int rangestart = -1;
15565  bool has_output_anything = FALSE;
15566
15567  PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
15568
15569  for (i = 0; i <= 256; i++) {
15570   if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
15571    if (rangestart == -1)
15572     rangestart = i;
15573   } else if (rangestart != -1) {
15574    int j = i - 1;
15575    if (i <= rangestart + 3) {  /* Individual chars in short ranges */
15576     for (; rangestart < i; rangestart++)
15577      put_byte(sv, rangestart);
15578    }
15579    else if (   j > 255
15580      || ! isALPHANUMERIC(rangestart)
15581      || ! isALPHANUMERIC(j)
15582      || isDIGIT(rangestart) != isDIGIT(j)
15583      || isUPPER(rangestart) != isUPPER(j)
15584      || isLOWER(rangestart) != isLOWER(j)
15585
15586       /* This final test should get optimized out except
15587       * on EBCDIC platforms, where it causes ranges that
15588       * cross discontinuities like i/j to be shown as hex
15589       * instead of the misleading, e.g. H-K (since that
15590       * range includes more than H, I, J, K). */
15591      || (j - rangestart)
15592       != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
15593    {
15594     Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
15595        rangestart,
15596        (j < 256) ? j : 255);
15597    }
15598    else { /* Here, the ends of the range are both digits, or both
15599      uppercase, or both lowercase; and there's no
15600      discontinuity in the range (which could happen on EBCDIC
15601      platforms) */
15602     put_byte(sv, rangestart);
15603     sv_catpvs(sv, "-");
15604     put_byte(sv, j);
15605    }
15606    rangestart = -1;
15607    has_output_anything = TRUE;
15608   }
15609  }
15610
15611  return has_output_anything;
15612 }
15613
15614 #define CLEAR_OPTSTART \
15615  if (optstart) STMT_START { \
15616    DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
15617    optstart=NULL; \
15618  } STMT_END
15619
15620 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
15621
15622 STATIC const regnode *
15623 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15624    const regnode *last, const regnode *plast,
15625    SV* sv, I32 indent, U32 depth)
15626 {
15627  dVAR;
15628  U8 op = PSEUDO; /* Arbitrary non-END op. */
15629  const regnode *next;
15630  const regnode *optstart= NULL;
15631
15632  RXi_GET_DECL(r,ri);
15633  GET_RE_DEBUG_FLAGS_DECL;
15634
15635  PERL_ARGS_ASSERT_DUMPUNTIL;
15636
15637 #ifdef DEBUG_DUMPUNTIL
15638  PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15639   last ? last-start : 0,plast ? plast-start : 0);
15640 #endif
15641
15642  if (plast && plast < last)
15643   last= plast;
15644
15645  while (PL_regkind[op] != END && (!last || node < last)) {
15646   /* While that wasn't END last time... */
15647   NODE_ALIGN(node);
15648   op = OP(node);
15649   if (op == CLOSE || op == WHILEM)
15650    indent--;
15651   next = regnext((regnode *)node);
15652
15653   /* Where, what. */
15654   if (OP(node) == OPTIMIZED) {
15655    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15656     optstart = node;
15657    else
15658     goto after_print;
15659   } else
15660    CLEAR_OPTSTART;
15661
15662   regprop(r, sv, node);
15663   PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15664      (int)(2*indent + 1), "", SvPVX_const(sv));
15665
15666   if (OP(node) != OPTIMIZED) {
15667    if (next == NULL)  /* Next ptr. */
15668     PerlIO_printf(Perl_debug_log, " (0)");
15669    else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15670     PerlIO_printf(Perl_debug_log, " (FAIL)");
15671    else
15672     PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15673    (void)PerlIO_putc(Perl_debug_log, '\n');
15674   }
15675
15676  after_print:
15677   if (PL_regkind[(U8)op] == BRANCHJ) {
15678    assert(next);
15679    {
15680     const regnode *nnode = (OP(next) == LONGJMP
15681          ? regnext((regnode *)next)
15682          : next);
15683     if (last && nnode > last)
15684      nnode = last;
15685     DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15686    }
15687   }
15688   else if (PL_regkind[(U8)op] == BRANCH) {
15689    assert(next);
15690    DUMPUNTIL(NEXTOPER(node), next);
15691   }
15692   else if ( PL_regkind[(U8)op]  == TRIE ) {
15693    const regnode *this_trie = node;
15694    const char op = OP(node);
15695    const U32 n = ARG(node);
15696    const reg_ac_data * const ac = op>=AHOCORASICK ?
15697    (reg_ac_data *)ri->data->data[n] :
15698    NULL;
15699    const reg_trie_data * const trie =
15700     (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
15701 #ifdef DEBUGGING
15702    AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15703 #endif
15704    const regnode *nextbranch= NULL;
15705    I32 word_idx;
15706    sv_setpvs(sv, "");
15707    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15708     SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15709
15710     PerlIO_printf(Perl_debug_log, "%*s%s ",
15711     (int)(2*(indent+3)), "",
15712      elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15713        PL_colors[0], PL_colors[1],
15714        (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15715        PERL_PV_PRETTY_ELLIPSES    |
15716        PERL_PV_PRETTY_LTGT
15717        )
15718        : "???"
15719     );
15720     if (trie->jump) {
15721      U16 dist= trie->jump[word_idx+1];
15722      PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15723         (UV)((dist ? this_trie + dist : next) - start));
15724      if (dist) {
15725       if (!nextbranch)
15726        nextbranch= this_trie + trie->jump[0];
15727       DUMPUNTIL(this_trie + dist, nextbranch);
15728      }
15729      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15730       nextbranch= regnext((regnode *)nextbranch);
15731     } else {
15732      PerlIO_printf(Perl_debug_log, "\n");
15733     }
15734    }
15735    if (last && next > last)
15736     node= last;
15737    else
15738     node= next;
15739   }
15740   else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
15741    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15742      NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15743   }
15744   else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15745    assert(next);
15746    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15747   }
15748   else if ( op == PLUS || op == STAR) {
15749    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15750   }
15751   else if (PL_regkind[(U8)op] == ANYOF) {
15752    /* arglen 1 + class block */
15753    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15754      ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15755    node = NEXTOPER(node);
15756   }
15757   else if (PL_regkind[(U8)op] == EXACT) {
15758    /* Literal string, where present. */
15759    node += NODE_SZ_STR(node) - 1;
15760    node = NEXTOPER(node);
15761   }
15762   else {
15763    node = NEXTOPER(node);
15764    node += regarglen[(U8)op];
15765   }
15766   if (op == CURLYX || op == OPEN)
15767    indent++;
15768  }
15769  CLEAR_OPTSTART;
15770 #ifdef DEBUG_DUMPUNTIL
15771  PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15772 #endif
15773  return node;
15774 }
15775
15776 #endif /* DEBUGGING */
15777
15778 /*
15779  * Local variables:
15780  * c-indentation-style: bsd
15781  * c-basic-offset: 4
15782  * indent-tabs-mode: nil
15783  * End:
15784  *
15785  * ex: set ts=8 sts=4 sw=4 et:
15786  */