src/5018001/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  * Copyright (c) 1986 by University of Toronto.
  42  * Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  * Permission is granted to anyone to use this software for any
  45  * purpose on any computer system, and to redistribute it freely,
  46  * subject to the following restrictions:
  47  *
  48  * 1. The author is not responsible for the consequences of use of
  49  *  this software, no matter how awful, even if they arise
  50  *  from defects in it.
  51  *
  52  * 2. The origin of this software must not be misrepresented, either
  53  *  by explicit claim or by omission.
  54  *
  55  * 3. Altered versions must be plainly marked as such, and must not
  56  *  be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #undef PERL_IN_XSUB_RE
  76 #define PERL_IN_XSUB_RE 1
  77 #include "perl.h"
  78 #undef PERL_IN_XSUB_RE
  79
  80 #ifndef PERL_IN_XSUB_RE
  81 #include "re_defs.h"
  82 #endif
  83
  84 #define REG_COMP_C
  85 #ifdef PERL_IN_XSUB_RE
  86 #  include "re_comp.h"
  87 extern const struct regexp_engine my_reg_engine;
  88 #else
  89 #  include "regcomp.h"
  90 #endif
  91
  92 #include "dquote_static.c"
  93 #include "charclass_invlists.h"
  94 #include "inline_invlist.c"
  95 #include "unicode_constants.h"
  96
  97 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 100
 101 #ifdef op
 102 #undef op
 103 #endif /* op */
 104
 105 #ifdef MSDOS
 106 #  if defined(BUGGY_MSC6)
 107  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 108 #    pragma optimize("a",off)
 109  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 110 #    pragma optimize("w",on )
 111 #  endif /* BUGGY_MSC6 */
 112 #endif /* MSDOS */
 113
 114 #ifndef STATIC
 115 #define STATIC static
 116 #endif
 117
 118
 119 typedef struct RExC_state_t {
 120  U32  flags;   /* RXf_* are we folding, multilining? */
 121  U32  pm_flags;  /* PMf_* stuff from the calling PMOP */
 122  char *precomp;  /* uncompiled string. */
 123  REGEXP *rx_sv;   /* The SV that is the regexp. */
 124  regexp *rx;                    /* perl core regexp structure */
 125  regexp_internal *rxi;           /* internal data for regexp object pprivate field */
 126  char *start;   /* Start of input for compile */
 127  char *end;   /* End of input for compile */
 128  char *parse;   /* Input-scan pointer. */
 129  I32  whilem_seen;  /* number of WHILEM in this expr */
 130  regnode *emit_start;  /* Start of emitted-code area */
 131  regnode *emit_bound;  /* First regnode outside of the allocated space */
 132  regnode *emit;   /* Code-emit pointer; &regdummy = don't = compiling */
 133  I32  naughty;  /* How bad is this pattern? */
 134  I32  sawback;  /* Did we see \1, ...? */
 135  U32  seen;
 136  I32  size;   /* Code size. */
 137  I32  npar;   /* Capture buffer count, (OPEN). */
 138  I32  cpar;   /* Capture buffer count, (CLOSE). */
 139  I32  nestroot;  /* root parens we are in - used by accept */
 140  I32  extralen;
 141  I32  seen_zerolen;
 142  regnode **open_parens;  /* pointers to open parens */
 143  regnode **close_parens;  /* pointers to close parens */
 144  regnode *opend;   /* END node in program */
 145  I32  utf8;  /* whether the pattern is utf8 or not */
 146  I32  orig_utf8; /* whether the pattern was originally in utf8 */
 147         /* XXX use this for future optimisation of case
 148         * where pattern must be upgraded to utf8. */
 149  I32  uni_semantics; /* If a d charset modifier should use unicode
 150         rules, even if the pattern is not in
 151         utf8 */
 152  HV  *paren_names;  /* Paren names */
 153
 154  regnode **recurse;  /* Recurse regops */
 155  I32  recurse_count;  /* Number of recurse regops */
 156  I32  in_lookbehind;
 157  I32  contains_locale;
 158  I32  override_recoding;
 159  I32  in_multi_char_class;
 160  struct reg_code_block *code_blocks; /* positions of literal (?{})
 161            within pattern */
 162  int  num_code_blocks; /* size of code_blocks[] */
 163  int  code_index;  /* next code_blocks[] slot */
 164 #if ADD_TO_REGEXEC
 165  char  *starttry;  /* -Dr: where regtry was called. */
 166 #define RExC_starttry (pRExC_state->starttry)
 167 #endif
 168  SV  *runtime_code_qr; /* qr with the runtime code blocks */
 169 #ifdef DEBUGGING
 170  const char  *lastparse;
 171  I32         lastnum;
 172  AV          *paren_name_list;       /* idx -> name */
 173 #define RExC_lastparse (pRExC_state->lastparse)
 174 #define RExC_lastnum (pRExC_state->lastnum)
 175 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 176 #endif
 177 } RExC_state_t;
 178
 179 #define RExC_flags (pRExC_state->flags)
 180 #define RExC_pm_flags (pRExC_state->pm_flags)
 181 #define RExC_precomp (pRExC_state->precomp)
 182 #define RExC_rx_sv (pRExC_state->rx_sv)
 183 #define RExC_rx  (pRExC_state->rx)
 184 #define RExC_rxi (pRExC_state->rxi)
 185 #define RExC_start (pRExC_state->start)
 186 #define RExC_end (pRExC_state->end)
 187 #define RExC_parse (pRExC_state->parse)
 188 #define RExC_whilem_seen (pRExC_state->whilem_seen)
 189 #ifdef RE_TRACK_PATTERN_OFFSETS
 190 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
 191 #endif
 192 #define RExC_emit (pRExC_state->emit)
 193 #define RExC_emit_start (pRExC_state->emit_start)
 194 #define RExC_emit_bound (pRExC_state->emit_bound)
 195 #define RExC_naughty (pRExC_state->naughty)
 196 #define RExC_sawback (pRExC_state->sawback)
 197 #define RExC_seen (pRExC_state->seen)
 198 #define RExC_size (pRExC_state->size)
 199 #define RExC_npar (pRExC_state->npar)
 200 #define RExC_nestroot   (pRExC_state->nestroot)
 201 #define RExC_extralen (pRExC_state->extralen)
 202 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
 203 #define RExC_utf8 (pRExC_state->utf8)
 204 #define RExC_uni_semantics (pRExC_state->uni_semantics)
 205 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
 206 #define RExC_open_parens (pRExC_state->open_parens)
 207 #define RExC_close_parens (pRExC_state->close_parens)
 208 #define RExC_opend (pRExC_state->opend)
 209 #define RExC_paren_names (pRExC_state->paren_names)
 210 #define RExC_recurse (pRExC_state->recurse)
 211 #define RExC_recurse_count (pRExC_state->recurse_count)
 212 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
 213 #define RExC_contains_locale (pRExC_state->contains_locale)
 214 #define RExC_override_recoding (pRExC_state->override_recoding)
 215 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 216
 217
 218 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
 219 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 220   ((*s) == '{' && regcurly(s, FALSE)))
 221
 222 #ifdef SPSTART
 223 #undef SPSTART  /* dratted cpp namespace... */
 224 #endif
 225 /*
 226  * Flags to be passed up and down.
 227  */
 228 #define WORST  0 /* Worst case. */
 229 #define HASWIDTH 0x01 /* Known to match non-null strings. */
 230
 231 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 232  * character.  (There needs to be a case: in the switch statement in regexec.c
 233  * for any node marked SIMPLE.)  Note that this is not the same thing as
 234  * REGNODE_SIMPLE */
 235 #define SIMPLE  0x02
 236 #define SPSTART  0x04 /* Starts with * or + */
 237 #define POSTPONED 0x08    /* (?1),(?&name), (??{...}) or similar */
 238 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
 239 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 240
 241 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 242
 243 /* whether trie related optimizations are enabled */
 244 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 245 #define TRIE_STUDY_OPT
 246 #define FULL_TRIE_STUDY
 247 #define TRIE_STCLASS
 248 #endif
 249
 250
 251
 252 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 253 #define PBITVAL(paren) (1 << ((paren) & 7))
 254 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 255 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 256 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 257
 258 #define REQUIRE_UTF8 STMT_START {                                       \
 259          if (!UTF) {                           \
 260           *flagp = RESTART_UTF8;            \
 261           return NULL;                      \
 262          }                                     \
 263       } STMT_END
 264
 265 /* This converts the named class defined in regcomp.h to its equivalent class
 266  * number defined in handy.h. */
 267 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 268 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 269
 270 /* About scan_data_t.
 271
 272   During optimisation we recurse through the regexp program performing
 273   various inplace (keyhole style) optimisations. In addition study_chunk
 274   and scan_commit populate this data structure with information about
 275   what strings MUST appear in the pattern. We look for the longest
 276   string that must appear at a fixed location, and we look for the
 277   longest string that may appear at a floating location. So for instance
 278   in the pattern:
 279
 280  /FOO[xX]A.*B[xX]BAR/
 281
 282   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 283   strings (because they follow a .* construct). study_chunk will identify
 284   both FOO and BAR as being the longest fixed and floating strings respectively.
 285
 286   The strings can be composites, for instance
 287
 288  /(f)(o)(o)/
 289
 290   will result in a composite fixed substring 'foo'.
 291
 292   For each string some basic information is maintained:
 293
 294   - offset or min_offset
 295  This is the position the string must appear at, or not before.
 296  It also implicitly (when combined with minlenp) tells us how many
 297  characters must match before the string we are searching for.
 298  Likewise when combined with minlenp and the length of the string it
 299  tells us how many characters must appear after the string we have
 300  found.
 301
 302   - max_offset
 303  Only used for floating strings. This is the rightmost point that
 304  the string can appear at. If set to I32 max it indicates that the
 305  string can occur infinitely far to the right.
 306
 307   - minlenp
 308  A pointer to the minimum number of characters of the pattern that the
 309  string was found inside. This is important as in the case of positive
 310  lookahead or positive lookbehind we can have multiple patterns
 311  involved. Consider
 312
 313  /(?=FOO).*F/
 314
 315  The minimum length of the pattern overall is 3, the minimum length
 316  of the lookahead part is 3, but the minimum length of the part that
 317  will actually match is 1. So 'FOO's minimum length is 3, but the
 318  minimum length for the F is 1. This is important as the minimum length
 319  is used to determine offsets in front of and behind the string being
 320  looked for.  Since strings can be composites this is the length of the
 321  pattern at the time it was committed with a scan_commit. Note that
 322  the length is calculated by study_chunk, so that the minimum lengths
 323  are not known until the full pattern has been compiled, thus the
 324  pointer to the value.
 325
 326   - lookbehind
 327
 328  In the case of lookbehind the string being searched for can be
 329  offset past the start point of the final matching string.
 330  If this value was just blithely removed from the min_offset it would
 331  invalidate some of the calculations for how many chars must match
 332  before or after (as they are derived from min_offset and minlen and
 333  the length of the string being searched for).
 334  When the final pattern is compiled and the data is moved from the
 335  scan_data_t structure into the regexp structure the information
 336  about lookbehind is factored in, with the information that would
 337  have been lost precalculated in the end_shift field for the
 338  associated string.
 339
 340   The fields pos_min and pos_delta are used to store the minimum offset
 341   and the delta to the maximum offset at the current point in the pattern.
 342
 343 */
 344
 345 typedef struct scan_data_t {
 346  /*I32 len_min;      unused */
 347  /*I32 len_delta;    unused */
 348  I32 pos_min;
 349  I32 pos_delta;
 350  SV *last_found;
 351  I32 last_end;     /* min value, <0 unless valid. */
 352  I32 last_start_min;
 353  I32 last_start_max;
 354  SV **longest;     /* Either &l_fixed, or &l_float. */
 355  SV *longest_fixed;      /* longest fixed string found in pattern */
 356  I32 offset_fixed;       /* offset where it starts */
 357  I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 358  I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 359  SV *longest_float;      /* longest floating string found in pattern */
 360  I32 offset_float_min;   /* earliest point in string it can appear */
 361  I32 offset_float_max;   /* latest point in string it can appear */
 362  I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 363  I32 lookbehind_float;   /* is the position of the string modified by LB */
 364  I32 flags;
 365  I32 whilem_c;
 366  I32 *last_closep;
 367  struct regnode_charclass_class *start_class;
 368 } scan_data_t;
 369
 370 /*
 371  * Forward declarations for pregcomp()'s friends.
 372  */
 373
 374 static const scan_data_t zero_scan_data =
 375   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 376
 377 #define SF_BEFORE_EOL  (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 378 #define SF_BEFORE_SEOL  0x0001
 379 #define SF_BEFORE_MEOL  0x0002
 380 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 381 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 382
 383 #ifdef NO_UNARY_PLUS
 384 #  define SF_FIX_SHIFT_EOL (0+2)
 385 #  define SF_FL_SHIFT_EOL  (0+4)
 386 #else
 387 #  define SF_FIX_SHIFT_EOL (+2)
 388 #  define SF_FL_SHIFT_EOL  (+4)
 389 #endif
 390
 391 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 392 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 393
 394 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 395 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 396 #define SF_IS_INF  0x0040
 397 #define SF_HAS_PAR  0x0080
 398 #define SF_IN_PAR  0x0100
 399 #define SF_HAS_EVAL  0x0200
 400 #define SCF_DO_SUBSTR  0x0400
 401 #define SCF_DO_STCLASS_AND 0x0800
 402 #define SCF_DO_STCLASS_OR 0x1000
 403 #define SCF_DO_STCLASS  (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 404 #define SCF_WHILEM_VISITED_POS 0x2000
 405
 406 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 407 #define SCF_SEEN_ACCEPT         0x8000
 408
 409 #define UTF cBOOL(RExC_utf8)
 410
 411 /* The enums for all these are ordered so things work out correctly */
 412 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 413 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 414 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 415 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 416 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 417 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 418 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 419
 420 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 421
 422 #define OOB_NAMEDCLASS  -1
 423
 424 /* There is no code point that is out-of-bounds, so this is problematic.  But
 425  * its only current use is to initialize a variable that is always set before
 426  * looked at. */
 427 #define OOB_UNICODE  0xDEADBEEF
 428
 429 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 430 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 431
 432
 433 /* length of regex to show in messages that don't mark a position within */
 434 #define RegexLengthToShowInErrorMessages 127
 435
 436 /*
 437  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 438  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 439  * op/pragma/warn/regcomp.
 440  */
 441 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 442 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 443
 444 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 445
 446 /*
 447  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 448  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 449  * "...".
 450  */
 451 #define _FAIL(code) STMT_START {     \
 452  const char *ellipses = "";      \
 453  IV len = RExC_end - RExC_precomp;     \
 454                   \
 455  if (!SIZE_ONLY)       \
 456   SAVEFREESV(RExC_rx_sv);      \
 457  if (len > RegexLengthToShowInErrorMessages) {   \
 458   /* chop 10 shorter than the max, to ensure meaning of "..." */ \
 459   len = RegexLengthToShowInErrorMessages - 10;   \
 460   ellipses = "...";      \
 461  }         \
 462  code;                                                               \
 463 } STMT_END
 464
 465 #define FAIL(msg) _FAIL(       \
 466  Perl_croak(aTHX_ "%s in regex m/%.*s%s/",     \
 467    msg, (int)len, RExC_precomp, ellipses))
 468
 469 #define FAIL2(msg,arg) _FAIL(       \
 470  Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 471    arg, (int)len, RExC_precomp, ellipses))
 472
 473 /*
 474  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 475  */
 476 #define Simple_vFAIL(m) STMT_START {     \
 477  const IV offset = RExC_parse - RExC_precomp;   \
 478  Perl_croak(aTHX_ "%s" REPORT_LOCATION,    \
 479    m, (int)offset, RExC_precomp, RExC_precomp + offset); \
 480 } STMT_END
 481
 482 /*
 483  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 484  */
 485 #define vFAIL(m) STMT_START {    \
 486  if (!SIZE_ONLY)     \
 487   SAVEFREESV(RExC_rx_sv);    \
 488  Simple_vFAIL(m);     \
 489 } STMT_END
 490
 491 /*
 492  * Like Simple_vFAIL(), but accepts two arguments.
 493  */
 494 #define Simple_vFAIL2(m,a1) STMT_START {   \
 495  const IV offset = RExC_parse - RExC_precomp;   \
 496  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,   \
 497    (int)offset, RExC_precomp, RExC_precomp + offset); \
 498 } STMT_END
 499
 500 /*
 501  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 502  */
 503 #define vFAIL2(m,a1) STMT_START {   \
 504  if (!SIZE_ONLY)     \
 505   SAVEFREESV(RExC_rx_sv);    \
 506  Simple_vFAIL2(m, a1);    \
 507 } STMT_END
 508
 509
 510 /*
 511  * Like Simple_vFAIL(), but accepts three arguments.
 512  */
 513 #define Simple_vFAIL3(m, a1, a2) STMT_START {   \
 514  const IV offset = RExC_parse - RExC_precomp;  \
 515  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,  \
 516    (int)offset, RExC_precomp, RExC_precomp + offset); \
 517 } STMT_END
 518
 519 /*
 520  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 521  */
 522 #define vFAIL3(m,a1,a2) STMT_START {   \
 523  if (!SIZE_ONLY)     \
 524   SAVEFREESV(RExC_rx_sv);    \
 525  Simple_vFAIL3(m, a1, a2);    \
 526 } STMT_END
 527
 528 /*
 529  * Like Simple_vFAIL(), but accepts four arguments.
 530  */
 531 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {  \
 532  const IV offset = RExC_parse - RExC_precomp;  \
 533  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,  \
 534    (int)offset, RExC_precomp, RExC_precomp + offset); \
 535 } STMT_END
 536
 537 #define vFAIL4(m,a1,a2,a3) STMT_START {   \
 538  if (!SIZE_ONLY)     \
 539   SAVEFREESV(RExC_rx_sv);    \
 540  Simple_vFAIL4(m, a1, a2, a3);   \
 541 } STMT_END
 542
 543 /* m is not necessarily a "literal string", in this macro */
 544 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 545  const IV offset = loc - RExC_precomp;                               \
 546  Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 547    m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 548 } STMT_END
 549
 550 #define ckWARNreg(loc,m) STMT_START {     \
 551  const IV offset = loc - RExC_precomp;    \
 552  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 553    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 554 } STMT_END
 555
 556 #define vWARN_dep(loc, m) STMT_START {            \
 557  const IV offset = loc - RExC_precomp;    \
 558  Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
 559    (int)offset, RExC_precomp, RExC_precomp + offset);         \
 560 } STMT_END
 561
 562 #define ckWARNdep(loc,m) STMT_START {            \
 563  const IV offset = loc - RExC_precomp;    \
 564  Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                 \
 565    m REPORT_LOCATION,      \
 566    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 567 } STMT_END
 568
 569 #define ckWARNregdep(loc,m) STMT_START {    \
 570  const IV offset = loc - RExC_precomp;    \
 571  Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 572    m REPORT_LOCATION,      \
 573    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 574 } STMT_END
 575
 576 #define ckWARN2regdep(loc,m, a1) STMT_START {    \
 577  const IV offset = loc - RExC_precomp;    \
 578  Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 579    m REPORT_LOCATION,      \
 580    a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
 581 } STMT_END
 582
 583 #define ckWARN2reg(loc, m, a1) STMT_START {    \
 584  const IV offset = loc - RExC_precomp;    \
 585  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 586    a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
 587 } STMT_END
 588
 589 #define vWARN3(loc, m, a1, a2) STMT_START {    \
 590  const IV offset = loc - RExC_precomp;    \
 591  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 592    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
 593 } STMT_END
 594
 595 #define ckWARN3reg(loc, m, a1, a2) STMT_START {    \
 596  const IV offset = loc - RExC_precomp;    \
 597  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 598    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
 599 } STMT_END
 600
 601 #define vWARN4(loc, m, a1, a2, a3) STMT_START {    \
 602  const IV offset = loc - RExC_precomp;    \
 603  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 604    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 605 } STMT_END
 606
 607 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {   \
 608  const IV offset = loc - RExC_precomp;    \
 609  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 610    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 611 } STMT_END
 612
 613 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {   \
 614  const IV offset = loc - RExC_precomp;    \
 615  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 616    a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 617 } STMT_END
 618
 619
 620 /* Allow for side effects in s */
 621 #define REGC(c,s) STMT_START {   \
 622  if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 623 } STMT_END
 624
 625 /* Macros for recording node offsets.   20001227 mjd@plover.com
 626  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 627  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 628  * Element 0 holds the number n.
 629  * Position is 1 indexed.
 630  */
 631 #ifndef RE_TRACK_PATTERN_OFFSETS
 632 #define Set_Node_Offset_To_R(node,byte)
 633 #define Set_Node_Offset(node,byte)
 634 #define Set_Cur_Node_Offset
 635 #define Set_Node_Length_To_R(node,len)
 636 #define Set_Node_Length(node,len)
 637 #define Set_Node_Cur_Length(node)
 638 #define Node_Offset(n)
 639 #define Node_Length(n)
 640 #define Set_Node_Offset_Length(node,offset,len)
 641 #define ProgLen(ri) ri->u.proglen
 642 #define SetProgLen(ri,x) ri->u.proglen = x
 643 #else
 644 #define ProgLen(ri) ri->u.offsets[0]
 645 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 646 #define Set_Node_Offset_To_R(node,byte) STMT_START {   \
 647  if (! SIZE_ONLY) {       \
 648   MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",  \
 649      __LINE__, (int)(node), (int)(byte)));  \
 650   if((node) < 0) {      \
 651    Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 652   } else {       \
 653    RExC_offsets[2*(node)-1] = (byte);    \
 654   }        \
 655  }         \
 656 } STMT_END
 657
 658 #define Set_Node_Offset(node,byte) \
 659  Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 660 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 661
 662 #define Set_Node_Length_To_R(node,len) STMT_START {   \
 663  if (! SIZE_ONLY) {       \
 664   MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",  \
 665     __LINE__, (int)(node), (int)(len)));   \
 666   if((node) < 0) {      \
 667    Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 668   } else {       \
 669    RExC_offsets[2*(node)] = (len);    \
 670   }        \
 671  }         \
 672 } STMT_END
 673
 674 #define Set_Node_Length(node,len) \
 675  Set_Node_Length_To_R((node)-RExC_emit_start, len)
 676 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 677 #define Set_Node_Cur_Length(node) \
 678  Set_Node_Length(node, RExC_parse - parse_start)
 679
 680 /* Get offsets and lengths */
 681 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 682 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 683
 684 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
 685  Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
 686  Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
 687 } STMT_END
 688 #endif
 689
 690 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 691 #define EXPERIMENTAL_INPLACESCAN
 692 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 693
 694 #define DEBUG_STUDYDATA(str,data,depth)                              \
 695 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 696  PerlIO_printf(Perl_debug_log,                                    \
 697   "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 698   " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 699   (int)(depth)*2, "",                                          \
 700   (IV)((data)->pos_min),                                       \
 701   (IV)((data)->pos_delta),                                     \
 702   (UV)((data)->flags),                                         \
 703   (IV)((data)->whilem_c),                                      \
 704   (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 705   is_inf ? "INF " : ""                                         \
 706  );                                                               \
 707  if ((data)->last_found)                                          \
 708   PerlIO_printf(Perl_debug_log,                                \
 709    "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 710    " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 711    SvPVX_const((data)->last_found),                         \
 712    (IV)((data)->last_end),                                  \
 713    (IV)((data)->last_start_min),                            \
 714    (IV)((data)->last_start_max),                            \
 715    ((data)->longest &&                                      \
 716    (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 717    SvPVX_const((data)->longest_fixed),                      \
 718    (IV)((data)->offset_fixed),                              \
 719    ((data)->longest &&                                      \
 720    (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 721    SvPVX_const((data)->longest_float),                      \
 722    (IV)((data)->offset_float_min),                          \
 723    (IV)((data)->offset_float_max)                           \
 724   );                                                           \
 725  PerlIO_printf(Perl_debug_log,"\n");                              \
 726 });
 727
 728 /* Mark that we cannot extend a found fixed substring at this point.
 729    Update the longest found anchored substring and the longest found
 730    floating substrings if needed. */
 731
 732 STATIC void
 733 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 734 {
 735  const STRLEN l = CHR_SVLEN(data->last_found);
 736  const STRLEN old_l = CHR_SVLEN(*data->longest);
 737  GET_RE_DEBUG_FLAGS_DECL;
 738
 739  PERL_ARGS_ASSERT_SCAN_COMMIT;
 740
 741  if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 742   SvSetMagicSV(*data->longest, data->last_found);
 743   if (*data->longest == data->longest_fixed) {
 744    data->offset_fixed = l ? data->last_start_min : data->pos_min;
 745    if (data->flags & SF_BEFORE_EOL)
 746     data->flags
 747      |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 748    else
 749     data->flags &= ~SF_FIX_BEFORE_EOL;
 750    data->minlen_fixed=minlenp;
 751    data->lookbehind_fixed=0;
 752   }
 753   else { /* *data->longest == data->longest_float */
 754    data->offset_float_min = l ? data->last_start_min : data->pos_min;
 755    data->offset_float_max = (l
 756          ? data->last_start_max
 757          : (data->pos_delta == I32_MAX ? I32_MAX : data->pos_min + data->pos_delta));
 758    if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 759     data->offset_float_max = I32_MAX;
 760    if (data->flags & SF_BEFORE_EOL)
 761     data->flags
 762      |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 763    else
 764     data->flags &= ~SF_FL_BEFORE_EOL;
 765    data->minlen_float=minlenp;
 766    data->lookbehind_float=0;
 767   }
 768  }
 769  SvCUR_set(data->last_found, 0);
 770  {
 771   SV * const sv = data->last_found;
 772   if (SvUTF8(sv) && SvMAGICAL(sv)) {
 773    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 774    if (mg)
 775     mg->mg_len = 0;
 776   }
 777  }
 778  data->last_end = -1;
 779  data->flags &= ~SF_BEFORE_EOL;
 780  DEBUG_STUDYDATA("commit: ",data,0);
 781 }
 782
 783 /* These macros set, clear and test whether the synthetic start class ('ssc',
 784  * given by the parameter) matches an empty string (EOS).  This uses the
 785  * 'next_off' field in the node, to save a bit in the flags field.  The ssc
 786  * stands alone, so there is never a next_off, so this field is otherwise
 787  * unused.  The EOS information is used only for compilation, but theoretically
 788  * it could be passed on to the execution code.  This could be used to store
 789  * more than one bit of information, but only this one is currently used. */
 790 #define SET_SSC_EOS(node)   STMT_START { (node)->next_off = TRUE; } STMT_END
 791 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
 792 #define TEST_SSC_EOS(node)  cBOOL((node)->next_off)
 793
 794 /* Can match anything (initialization) */
 795 STATIC void
 796 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 797 {
 798  PERL_ARGS_ASSERT_CL_ANYTHING;
 799
 800  ANYOF_BITMAP_SETALL(cl);
 801  cl->flags = ANYOF_UNICODE_ALL;
 802  SET_SSC_EOS(cl);
 803
 804  /* If any portion of the regex is to operate under locale rules,
 805  * initialization includes it.  The reason this isn't done for all regexes
 806  * is that the optimizer was written under the assumption that locale was
 807  * all-or-nothing.  Given the complexity and lack of documentation in the
 808  * optimizer, and that there are inadequate test cases for locale, so many
 809  * parts of it may not work properly, it is safest to avoid locale unless
 810  * necessary. */
 811  if (RExC_contains_locale) {
 812   ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 813   cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
 814  }
 815  else {
 816   ANYOF_CLASS_ZERO(cl);     /* Only /l uses class now */
 817  }
 818 }
 819
 820 /* Can match anything (initialization) */
 821 STATIC int
 822 S_cl_is_anything(const struct regnode_charclass_class *cl)
 823 {
 824  int value;
 825
 826  PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 827
 828  for (value = 0; value < ANYOF_MAX; value += 2)
 829   if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 830    return 1;
 831  if (!(cl->flags & ANYOF_UNICODE_ALL))
 832   return 0;
 833  if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 834   return 0;
 835  return 1;
 836 }
 837
 838 /* Can match anything (initialization) */
 839 STATIC void
 840 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 841 {
 842  PERL_ARGS_ASSERT_CL_INIT;
 843
 844  Zero(cl, 1, struct regnode_charclass_class);
 845  cl->type = ANYOF;
 846  cl_anything(pRExC_state, cl);
 847  ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 848 }
 849
 850 /* These two functions currently do the exact same thing */
 851 #define cl_init_zero  S_cl_init
 852
 853 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 854  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 855  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 856 STATIC void
 857 S_cl_and(struct regnode_charclass_class *cl,
 858   const struct regnode_charclass_class *and_with)
 859 {
 860  PERL_ARGS_ASSERT_CL_AND;
 861
 862  assert(PL_regkind[and_with->type] == ANYOF);
 863
 864  /* I (khw) am not sure all these restrictions are necessary XXX */
 865  if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 866   && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 867   && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 868   && !(and_with->flags & ANYOF_LOC_FOLD)
 869   && !(cl->flags & ANYOF_LOC_FOLD)) {
 870   int i;
 871
 872   if (and_with->flags & ANYOF_INVERT)
 873    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 874     cl->bitmap[i] &= ~and_with->bitmap[i];
 875   else
 876    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 877     cl->bitmap[i] &= and_with->bitmap[i];
 878  } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 879
 880  if (and_with->flags & ANYOF_INVERT) {
 881
 882   /* Here, the and'ed node is inverted.  Get the AND of the flags that
 883   * aren't affected by the inversion.  Those that are affected are
 884   * handled individually below */
 885   U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 886   cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 887   cl->flags |= affected_flags;
 888
 889   /* We currently don't know how to deal with things that aren't in the
 890   * bitmap, but we know that the intersection is no greater than what
 891   * is already in cl, so let there be false positives that get sorted
 892   * out after the synthetic start class succeeds, and the node is
 893   * matched for real. */
 894
 895   /* The inversion of these two flags indicate that the resulting
 896   * intersection doesn't have them */
 897   if (and_with->flags & ANYOF_UNICODE_ALL) {
 898    cl->flags &= ~ANYOF_UNICODE_ALL;
 899   }
 900   if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 901    cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 902   }
 903  }
 904  else {   /* and'd node is not inverted */
 905   U8 outside_bitmap_but_not_utf8; /* Temp variable */
 906
 907   if (! ANYOF_NONBITMAP(and_with)) {
 908
 909    /* Here 'and_with' doesn't match anything outside the bitmap
 910    * (except possibly ANYOF_UNICODE_ALL), which means the
 911    * intersection can't either, except for ANYOF_UNICODE_ALL, in
 912    * which case we don't know what the intersection is, but it's no
 913    * greater than what cl already has, so can just leave it alone,
 914    * with possible false positives */
 915    if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 916     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 917     cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 918    }
 919   }
 920   else if (! ANYOF_NONBITMAP(cl)) {
 921
 922    /* Here, 'and_with' does match something outside the bitmap, and cl
 923    * doesn't have a list of things to match outside the bitmap.  If
 924    * cl can match all code points above 255, the intersection will
 925    * be those above-255 code points that 'and_with' matches.  If cl
 926    * can't match all Unicode code points, it means that it can't
 927    * match anything outside the bitmap (since the 'if' that got us
 928    * into this block tested for that), so we leave the bitmap empty.
 929    */
 930    if (cl->flags & ANYOF_UNICODE_ALL) {
 931     ARG_SET(cl, ARG(and_with));
 932
 933     /* and_with's ARG may match things that don't require UTF8.
 934     * And now cl's will too, in spite of this being an 'and'.  See
 935     * the comments below about the kludge */
 936     cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 937    }
 938   }
 939   else {
 940    /* Here, both 'and_with' and cl match something outside the
 941    * bitmap.  Currently we do not do the intersection, so just match
 942    * whatever cl had at the beginning.  */
 943   }
 944
 945
 946   /* Take the intersection of the two sets of flags.  However, the
 947   * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 948   * kludge around the fact that this flag is not treated like the others
 949   * which are initialized in cl_anything().  The way the optimizer works
 950   * is that the synthetic start class (SSC) is initialized to match
 951   * anything, and then the first time a real node is encountered, its
 952   * values are AND'd with the SSC's with the result being the values of
 953   * the real node.  However, there are paths through the optimizer where
 954   * the AND never gets called, so those initialized bits are set
 955   * inappropriately, which is not usually a big deal, as they just cause
 956   * false positives in the SSC, which will just mean a probably
 957   * imperceptible slow down in execution.  However this bit has a
 958   * higher false positive consequence in that it can cause utf8.pm,
 959   * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 960   * bigger slowdown and also causes significant extra memory to be used.
 961   * In order to prevent this, the code now takes a different tack.  The
 962   * bit isn't set unless some part of the regular expression needs it,
 963   * but once set it won't get cleared.  This means that these extra
 964   * modules won't get loaded unless there was some path through the
 965   * pattern that would have required them anyway, and  so any false
 966   * positives that occur by not ANDing them out when they could be
 967   * aren't as severe as they would be if we treated this bit like all
 968   * the others */
 969   outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 970          & ANYOF_NONBITMAP_NON_UTF8;
 971   cl->flags &= and_with->flags;
 972   cl->flags |= outside_bitmap_but_not_utf8;
 973  }
 974 }
 975
 976 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 977  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 978  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 979 STATIC void
 980 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 981 {
 982  PERL_ARGS_ASSERT_CL_OR;
 983
 984  if (or_with->flags & ANYOF_INVERT) {
 985
 986   /* Here, the or'd node is to be inverted.  This means we take the
 987   * complement of everything not in the bitmap, but currently we don't
 988   * know what that is, so give up and match anything */
 989   if (ANYOF_NONBITMAP(or_with)) {
 990    cl_anything(pRExC_state, cl);
 991   }
 992   /* We do not use
 993   * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 994   *   <= (B1 | !B2) | (CL1 | !CL2)
 995   * which is wasteful if CL2 is small, but we ignore CL2:
 996   *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 997   * XXXX Can we handle case-fold?  Unclear:
 998   *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 999   *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
1000   */
1001   else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1002    && !(or_with->flags & ANYOF_LOC_FOLD)
1003    && !(cl->flags & ANYOF_LOC_FOLD) ) {
1004    int i;
1005
1006    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1007     cl->bitmap[i] |= ~or_with->bitmap[i];
1008   } /* XXXX: logic is complicated otherwise */
1009   else {
1010    cl_anything(pRExC_state, cl);
1011   }
1012
1013   /* And, we can just take the union of the flags that aren't affected
1014   * by the inversion */
1015   cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
1016
1017   /* For the remaining flags:
1018    ANYOF_UNICODE_ALL and inverted means to not match anything above
1019      255, which means that the union with cl should just be
1020      what cl has in it, so can ignore this flag
1021    ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1022      is 127-255 to match them, but then invert that, so the
1023      union with cl should just be what cl has in it, so can
1024      ignore this flag
1025   */
1026  } else {    /* 'or_with' is not inverted */
1027   /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1028   if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1029    && (!(or_with->flags & ANYOF_LOC_FOLD)
1030     || (cl->flags & ANYOF_LOC_FOLD)) ) {
1031    int i;
1032
1033    /* OR char bitmap and class bitmap separately */
1034    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1035     cl->bitmap[i] |= or_with->bitmap[i];
1036    if (or_with->flags & ANYOF_CLASS) {
1037     ANYOF_CLASS_OR(or_with, cl);
1038    }
1039   }
1040   else { /* XXXX: logic is complicated, leave it along for a moment. */
1041    cl_anything(pRExC_state, cl);
1042   }
1043
1044   if (ANYOF_NONBITMAP(or_with)) {
1045
1046    /* Use the added node's outside-the-bit-map match if there isn't a
1047    * conflict.  If there is a conflict (both nodes match something
1048    * outside the bitmap, but what they match outside is not the same
1049    * pointer, and hence not easily compared until XXX we extend
1050    * inversion lists this far), give up and allow the start class to
1051    * match everything outside the bitmap.  If that stuff is all above
1052    * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1053    if (! ANYOF_NONBITMAP(cl)) {
1054     ARG_SET(cl, ARG(or_with));
1055    }
1056    else if (ARG(cl) != ARG(or_with)) {
1057
1058     if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1059      cl_anything(pRExC_state, cl);
1060     }
1061     else {
1062      cl->flags |= ANYOF_UNICODE_ALL;
1063     }
1064    }
1065   }
1066
1067   /* Take the union */
1068   cl->flags |= or_with->flags;
1069  }
1070 }
1071
1072 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1073 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1074 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1075 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1076
1077
1078 #ifdef DEBUGGING
1079 /*
1080    dump_trie(trie,widecharmap,revcharmap)
1081    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1082    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1083
1084    These routines dump out a trie in a somewhat readable format.
1085    The _interim_ variants are used for debugging the interim
1086    tables that are used to generate the final compressed
1087    representation which is what dump_trie expects.
1088
1089    Part of the reason for their existence is to provide a form
1090    of documentation as to how the different representations function.
1091
1092 */
1093
1094 /*
1095   Dumps the final compressed table form of the trie to Perl_debug_log.
1096   Used for debugging make_trie().
1097 */
1098
1099 STATIC void
1100 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1101    AV *revcharmap, U32 depth)
1102 {
1103  U32 state;
1104  SV *sv=sv_newmortal();
1105  int colwidth= widecharmap ? 6 : 4;
1106  U16 word;
1107  GET_RE_DEBUG_FLAGS_DECL;
1108
1109  PERL_ARGS_ASSERT_DUMP_TRIE;
1110
1111  PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1112   (int)depth * 2 + 2,"",
1113   "Match","Base","Ofs" );
1114
1115  for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1116   SV ** const tmp = av_fetch( revcharmap, state, 0);
1117   if ( tmp ) {
1118    PerlIO_printf( Perl_debug_log, "%*s",
1119     colwidth,
1120     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1121        PL_colors[0], PL_colors[1],
1122        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1123        PERL_PV_ESCAPE_FIRSTCHAR
1124     )
1125    );
1126   }
1127  }
1128  PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1129   (int)depth * 2 + 2,"");
1130
1131  for( state = 0 ; state < trie->uniquecharcount ; state++ )
1132   PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1133  PerlIO_printf( Perl_debug_log, "\n");
1134
1135  for( state = 1 ; state < trie->statecount ; state++ ) {
1136   const U32 base = trie->states[ state ].trans.base;
1137
1138   PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1139
1140   if ( trie->states[ state ].wordnum ) {
1141    PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1142   } else {
1143    PerlIO_printf( Perl_debug_log, "%6s", "" );
1144   }
1145
1146   PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1147
1148   if ( base ) {
1149    U32 ofs = 0;
1150
1151    while( ( base + ofs  < trie->uniquecharcount ) ||
1152     ( base + ofs - trie->uniquecharcount < trie->lasttrans
1153      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1154      ofs++;
1155
1156    PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1157
1158    for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1159     if ( ( base + ofs >= trie->uniquecharcount ) &&
1160      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1161      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1162     {
1163     PerlIO_printf( Perl_debug_log, "%*"UVXf,
1164      colwidth,
1165      (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1166     } else {
1167      PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1168     }
1169    }
1170
1171    PerlIO_printf( Perl_debug_log, "]");
1172
1173   }
1174   PerlIO_printf( Perl_debug_log, "\n" );
1175  }
1176  PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1177  for (word=1; word <= trie->wordcount; word++) {
1178   PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1179    (int)word, (int)(trie->wordinfo[word].prev),
1180    (int)(trie->wordinfo[word].len));
1181  }
1182  PerlIO_printf(Perl_debug_log, "\n" );
1183 }
1184 /*
1185   Dumps a fully constructed but uncompressed trie in list form.
1186   List tries normally only are used for construction when the number of
1187   possible chars (trie->uniquecharcount) is very high.
1188   Used for debugging make_trie().
1189 */
1190 STATIC void
1191 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1192       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1193       U32 depth)
1194 {
1195  U32 state;
1196  SV *sv=sv_newmortal();
1197  int colwidth= widecharmap ? 6 : 4;
1198  GET_RE_DEBUG_FLAGS_DECL;
1199
1200  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1201
1202  /* print out the table precompression.  */
1203  PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1204   (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1205   "------:-----+-----------------\n" );
1206
1207  for( state=1 ; state < next_alloc ; state ++ ) {
1208   U16 charid;
1209
1210   PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1211    (int)depth * 2 + 2,"", (UV)state  );
1212   if ( ! trie->states[ state ].wordnum ) {
1213    PerlIO_printf( Perl_debug_log, "%5s| ","");
1214   } else {
1215    PerlIO_printf( Perl_debug_log, "W%4x| ",
1216     trie->states[ state ].wordnum
1217    );
1218   }
1219   for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1220    SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1221    if ( tmp ) {
1222     PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1223      colwidth,
1224      pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1225        PL_colors[0], PL_colors[1],
1226        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1227        PERL_PV_ESCAPE_FIRSTCHAR
1228      ) ,
1229      TRIE_LIST_ITEM(state,charid).forid,
1230      (UV)TRIE_LIST_ITEM(state,charid).newstate
1231     );
1232     if (!(charid % 10))
1233      PerlIO_printf(Perl_debug_log, "\n%*s| ",
1234       (int)((depth * 2) + 14), "");
1235    }
1236   }
1237   PerlIO_printf( Perl_debug_log, "\n");
1238  }
1239 }
1240
1241 /*
1242   Dumps a fully constructed but uncompressed trie in table form.
1243   This is the normal DFA style state transition table, with a few
1244   twists to facilitate compression later.
1245   Used for debugging make_trie().
1246 */
1247 STATIC void
1248 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1249       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1250       U32 depth)
1251 {
1252  U32 state;
1253  U16 charid;
1254  SV *sv=sv_newmortal();
1255  int colwidth= widecharmap ? 6 : 4;
1256  GET_RE_DEBUG_FLAGS_DECL;
1257
1258  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1259
1260  /*
1261  print out the table precompression so that we can do a visual check
1262  that they are identical.
1263  */
1264
1265  PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1266
1267  for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1268   SV ** const tmp = av_fetch( revcharmap, charid, 0);
1269   if ( tmp ) {
1270    PerlIO_printf( Perl_debug_log, "%*s",
1271     colwidth,
1272     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1273        PL_colors[0], PL_colors[1],
1274        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1275        PERL_PV_ESCAPE_FIRSTCHAR
1276     )
1277    );
1278   }
1279  }
1280
1281  PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1282
1283  for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1284   PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1285  }
1286
1287  PerlIO_printf( Perl_debug_log, "\n" );
1288
1289  for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1290
1291   PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1292    (int)depth * 2 + 2,"",
1293    (UV)TRIE_NODENUM( state ) );
1294
1295   for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1296    UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1297    if (v)
1298     PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1299    else
1300     PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1301   }
1302   if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1303    PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1304   } else {
1305    PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1306    trie->states[ TRIE_NODENUM( state ) ].wordnum );
1307   }
1308  }
1309 }
1310
1311 #endif
1312
1313
1314 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1315   startbranch: the first branch in the whole branch sequence
1316   first      : start branch of sequence of branch-exact nodes.
1317    May be the same as startbranch
1318   last       : Thing following the last branch.
1319    May be the same as tail.
1320   tail       : item following the branch sequence
1321   count      : words in the sequence
1322   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1323   depth      : indent depth
1324
1325 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1326
1327 A trie is an N'ary tree where the branches are determined by digital
1328 decomposition of the key. IE, at the root node you look up the 1st character and
1329 follow that branch repeat until you find the end of the branches. Nodes can be
1330 marked as "accepting" meaning they represent a complete word. Eg:
1331
1332   /he|she|his|hers/
1333
1334 would convert into the following structure. Numbers represent states, letters
1335 following numbers represent valid transitions on the letter from that state, if
1336 the number is in square brackets it represents an accepting state, otherwise it
1337 will be in parenthesis.
1338
1339  +-h->+-e->[3]-+-r->(8)-+-s->[9]
1340  |    |
1341  |   (2)
1342  |    |
1343  (1)   +-i->(6)-+-s->[7]
1344  |
1345  +-s->(3)-+-h->(4)-+-e->[5]
1346
1347  Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1348
1349 This shows that when matching against the string 'hers' we will begin at state 1
1350 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1351 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1352 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1353 single traverse. We store a mapping from accepting to state to which word was
1354 matched, and then when we have multiple possibilities we try to complete the
1355 rest of the regex in the order in which they occured in the alternation.
1356
1357 The only prior NFA like behaviour that would be changed by the TRIE support is
1358 the silent ignoring of duplicate alternations which are of the form:
1359
1360  / (DUPE|DUPE) X? (?{ ... }) Y /x
1361
1362 Thus EVAL blocks following a trie may be called a different number of times with
1363 and without the optimisation. With the optimisations dupes will be silently
1364 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1365 the following demonstrates:
1366
1367  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1368
1369 which prints out 'word' three times, but
1370
1371  'words'=~/(word|word|word)(?{ print $1 })S/
1372
1373 which doesnt print it out at all. This is due to other optimisations kicking in.
1374
1375 Example of what happens on a structural level:
1376
1377 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1378
1379    1: CURLYM[1] {1,32767}(18)
1380    5:   BRANCH(8)
1381    6:     EXACT <ac>(16)
1382    8:   BRANCH(11)
1383    9:     EXACT <ad>(16)
1384   11:   BRANCH(14)
1385   12:     EXACT <ab>(16)
1386   16:   SUCCEED(0)
1387   17:   NOTHING(18)
1388   18: END(0)
1389
1390 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1391 and should turn into:
1392
1393    1: CURLYM[1] {1,32767}(18)
1394    5:   TRIE(16)
1395   [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1396   <ac>
1397   <ad>
1398   <ab>
1399   16:   SUCCEED(0)
1400   17:   NOTHING(18)
1401   18: END(0)
1402
1403 Cases where tail != last would be like /(?foo|bar)baz/:
1404
1405    1: BRANCH(4)
1406    2:   EXACT <foo>(8)
1407    4: BRANCH(7)
1408    5:   EXACT <bar>(8)
1409    7: TAIL(8)
1410    8: EXACT <baz>(10)
1411   10: END(0)
1412
1413 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1414 and would end up looking like:
1415
1416  1: TRIE(8)
1417  [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1418   <foo>
1419   <bar>
1420    7: TAIL(8)
1421    8: EXACT <baz>(10)
1422   10: END(0)
1423
1424  d = uvuni_to_utf8_flags(d, uv, 0);
1425
1426 is the recommended Unicode-aware way of saying
1427
1428  *(d++) = uv;
1429 */
1430
1431 #define TRIE_STORE_REVCHAR(val)                                            \
1432  STMT_START {                                                           \
1433   if (UTF) {          \
1434    SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1435    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);    \
1436    unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1437    SvCUR_set(zlopp, kapow - flrbbbbb);       \
1438    SvPOK_on(zlopp);         \
1439    SvUTF8_on(zlopp);         \
1440    av_push(revcharmap, zlopp);        \
1441   } else {          \
1442    char ooooff = (char)val;                                           \
1443    av_push(revcharmap, newSVpvn(&ooooff, 1));      \
1444   }           \
1445   } STMT_END
1446
1447 #define TRIE_READ_CHAR STMT_START {                                                     \
1448  wordlen++;                                                                          \
1449  if ( UTF ) {                                                                        \
1450   /* if it is UTF then it is either already folded, or does not need folding */   \
1451   uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1452  }                                                                                   \
1453  else if (folder == PL_fold_latin1) {                                                \
1454   /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1455   if ( foldlen > 0 ) {                                                            \
1456   uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1457   foldlen -= len;                                                              \
1458   scan += len;                                                                 \
1459   len = 0;                                                                     \
1460   } else {                                                                        \
1461    len = 1;                                                                    \
1462    uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1463    skiplen = UNISKIP(uvc);                                                     \
1464    foldlen -= skiplen;                                                         \
1465    scan = foldbuf + skiplen;                                                   \
1466   }                                                                               \
1467  } else {                                                                            \
1468   /* raw data, will be folded later if needed */                                  \
1469   uvc = (U32)*uc;                                                                 \
1470   len = 1;                                                                        \
1471  }                                                                                   \
1472 } STMT_END
1473
1474
1475
1476 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1477  if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1478   U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1479   Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1480  }                                                           \
1481  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1482  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1483  TRIE_LIST_CUR( state )++;                                   \
1484 } STMT_END
1485
1486 #define TRIE_LIST_NEW(state) STMT_START {                       \
1487  Newxz( trie->states[ state ].trans.list,               \
1488   4, reg_trie_trans_le );                                 \
1489  TRIE_LIST_CUR( state ) = 1;                                \
1490  TRIE_LIST_LEN( state ) = 4;                                \
1491 } STMT_END
1492
1493 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1494  U16 dupe= trie->states[ state ].wordnum;                    \
1495  regnode * const noper_next = regnext( noper );              \
1496                 \
1497  DEBUG_r({                                                   \
1498   /* store the word for dumping */                        \
1499   SV* tmp;                                                \
1500   if (OP(noper) != NOTHING)                               \
1501    tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1502   else                                                    \
1503    tmp = newSVpvn_utf8( "", 0, UTF );   \
1504   av_push( trie_words, tmp );                             \
1505  });                                                         \
1506                 \
1507  curword++;                                                  \
1508  trie->wordinfo[curword].prev   = 0;                         \
1509  trie->wordinfo[curword].len    = wordlen;                   \
1510  trie->wordinfo[curword].accept = state;                     \
1511                 \
1512  if ( noper_next < tail ) {                                  \
1513   if (!trie->jump)                                        \
1514    trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1515   trie->jump[curword] = (U16)(noper_next - convert);      \
1516   if (!jumper)                                            \
1517    jumper = noper_next;                                \
1518   if (!nextbranch)                                        \
1519    nextbranch= regnext(cur);                           \
1520  }                                                           \
1521                 \
1522  if ( dupe ) {                                               \
1523   /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1524   /* chain, so that when the bits of chain are later    */\
1525   /* linked together, the dups appear in the chain      */\
1526   trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1527   trie->wordinfo[dupe].prev = curword;                    \
1528  } else {                                                    \
1529   /* we haven't inserted this word yet.                */ \
1530   trie->states[ state ].wordnum = curword;                \
1531  }                                                           \
1532 } STMT_END
1533
1534
1535 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)  \
1536  ( ( base + charid >=  ucharcount     \
1537   && base + charid < ubound     \
1538   && state == trie->trans[ base - ucharcount + charid ].check \
1539   && trie->trans[ base - ucharcount + charid ].next )  \
1540   ? trie->trans[ base - ucharcount + charid ].next  \
1541   : ( state==1 ? special : 0 )     \
1542  )
1543
1544 #define MADE_TRIE       1
1545 #define MADE_JUMP_TRIE  2
1546 #define MADE_EXACT_TRIE 4
1547
1548 STATIC I32
1549 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1550 {
1551  dVAR;
1552  /* first pass, loop through and scan words */
1553  reg_trie_data *trie;
1554  HV *widecharmap = NULL;
1555  AV *revcharmap = newAV();
1556  regnode *cur;
1557  const U32 uniflags = UTF8_ALLOW_DEFAULT;
1558  STRLEN len = 0;
1559  UV uvc = 0;
1560  U16 curword = 0;
1561  U32 next_alloc = 0;
1562  regnode *jumper = NULL;
1563  regnode *nextbranch = NULL;
1564  regnode *convert = NULL;
1565  U32 *prev_states; /* temp array mapping each state to previous one */
1566  /* we just use folder as a flag in utf8 */
1567  const U8 * folder = NULL;
1568
1569 #ifdef DEBUGGING
1570  const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1571  AV *trie_words = NULL;
1572  /* along with revcharmap, this only used during construction but both are
1573  * useful during debugging so we store them in the struct when debugging.
1574  */
1575 #else
1576  const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1577  STRLEN trie_charcount=0;
1578 #endif
1579  SV *re_trie_maxbuff;
1580  GET_RE_DEBUG_FLAGS_DECL;
1581
1582  PERL_ARGS_ASSERT_MAKE_TRIE;
1583 #ifndef DEBUGGING
1584  PERL_UNUSED_ARG(depth);
1585 #endif
1586
1587  switch (flags) {
1588   case EXACT: break;
1589   case EXACTFA:
1590   case EXACTFU_SS:
1591   case EXACTFU_TRICKYFOLD:
1592   case EXACTFU: folder = PL_fold_latin1; break;
1593   case EXACTF:  folder = PL_fold; break;
1594   case EXACTFL: folder = PL_fold_locale; break;
1595   default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1596  }
1597
1598  trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1599  trie->refcount = 1;
1600  trie->startstate = 1;
1601  trie->wordcount = word_count;
1602  RExC_rxi->data->data[ data_slot ] = (void*)trie;
1603  trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1604  if (flags == EXACT)
1605   trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1606  trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1607      trie->wordcount+1, sizeof(reg_trie_wordinfo));
1608
1609  DEBUG_r({
1610   trie_words = newAV();
1611  });
1612
1613  re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1614  if (!SvIOK(re_trie_maxbuff)) {
1615   sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1616  }
1617  DEBUG_TRIE_COMPILE_r({
1618     PerlIO_printf( Perl_debug_log,
1619     "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1620     (int)depth * 2 + 2, "",
1621     REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1622     REG_NODE_NUM(last), REG_NODE_NUM(tail),
1623     (int)depth);
1624  });
1625
1626    /* Find the node we are going to overwrite */
1627  if ( first == startbranch && OP( last ) != BRANCH ) {
1628   /* whole branch chain */
1629   convert = first;
1630  } else {
1631   /* branch sub-chain */
1632   convert = NEXTOPER( first );
1633  }
1634
1635  /*  -- First loop and Setup --
1636
1637  We first traverse the branches and scan each word to determine if it
1638  contains widechars, and how many unique chars there are, this is
1639  important as we have to build a table with at least as many columns as we
1640  have unique chars.
1641
1642  We use an array of integers to represent the character codes 0..255
1643  (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1644  native representation of the character value as the key and IV's for the
1645  coded index.
1646
1647  *TODO* If we keep track of how many times each character is used we can
1648  remap the columns so that the table compression later on is more
1649  efficient in terms of memory by ensuring the most common value is in the
1650  middle and the least common are on the outside.  IMO this would be better
1651  than a most to least common mapping as theres a decent chance the most
1652  common letter will share a node with the least common, meaning the node
1653  will not be compressible. With a middle is most common approach the worst
1654  case is when we have the least common nodes twice.
1655
1656  */
1657
1658  for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1659   regnode *noper = NEXTOPER( cur );
1660   const U8 *uc = (U8*)STRING( noper );
1661   const U8 *e  = uc + STR_LEN( noper );
1662   STRLEN foldlen = 0;
1663   U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1664   STRLEN skiplen = 0;
1665   const U8 *scan = (U8*)NULL;
1666   U32 wordlen      = 0;         /* required init */
1667   STRLEN chars = 0;
1668   bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1669
1670   if (OP(noper) == NOTHING) {
1671    regnode *noper_next= regnext(noper);
1672    if (noper_next != tail && OP(noper_next) == flags) {
1673     noper = noper_next;
1674     uc= (U8*)STRING(noper);
1675     e= uc + STR_LEN(noper);
1676     trie->minlen= STR_LEN(noper);
1677    } else {
1678     trie->minlen= 0;
1679     continue;
1680    }
1681   }
1682
1683   if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1684    TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1685           regardless of encoding */
1686    if (OP( noper ) == EXACTFU_SS) {
1687     /* false positives are ok, so just set this */
1688     TRIE_BITMAP_SET(trie,0xDF);
1689    }
1690   }
1691   for ( ; uc < e ; uc += len ) {
1692    TRIE_CHARCOUNT(trie)++;
1693    TRIE_READ_CHAR;
1694    chars++;
1695    if ( uvc < 256 ) {
1696     if ( folder ) {
1697      U8 folded= folder[ (U8) uvc ];
1698      if ( !trie->charmap[ folded ] ) {
1699       trie->charmap[ folded ]=( ++trie->uniquecharcount );
1700       TRIE_STORE_REVCHAR( folded );
1701      }
1702     }
1703     if ( !trie->charmap[ uvc ] ) {
1704      trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1705      TRIE_STORE_REVCHAR( uvc );
1706     }
1707     if ( set_bit ) {
1708      /* store the codepoint in the bitmap, and its folded
1709      * equivalent. */
1710      TRIE_BITMAP_SET(trie, uvc);
1711
1712      /* store the folded codepoint */
1713      if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1714
1715      if ( !UTF ) {
1716       /* store first byte of utf8 representation of
1717       variant codepoints */
1718       if (! UNI_IS_INVARIANT(uvc)) {
1719        TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1720       }
1721      }
1722      set_bit = 0; /* We've done our bit :-) */
1723     }
1724    } else {
1725     SV** svpp;
1726     if ( !widecharmap )
1727      widecharmap = newHV();
1728
1729     svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1730
1731     if ( !svpp )
1732      Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1733
1734     if ( !SvTRUE( *svpp ) ) {
1735      sv_setiv( *svpp, ++trie->uniquecharcount );
1736      TRIE_STORE_REVCHAR(uvc);
1737     }
1738    }
1739   }
1740   if( cur == first ) {
1741    trie->minlen = chars;
1742    trie->maxlen = chars;
1743   } else if (chars < trie->minlen) {
1744    trie->minlen = chars;
1745   } else if (chars > trie->maxlen) {
1746    trie->maxlen = chars;
1747   }
1748   if (OP( noper ) == EXACTFU_SS) {
1749    /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1750    if (trie->minlen > 1)
1751     trie->minlen= 1;
1752   }
1753   if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1754    /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1755    *        - We assume that any such sequence might match a 2 byte string */
1756    if (trie->minlen > 2 )
1757     trie->minlen= 2;
1758   }
1759
1760  } /* end first pass */
1761  DEBUG_TRIE_COMPILE_r(
1762   PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1763     (int)depth * 2 + 2,"",
1764     ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1765     (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1766     (int)trie->minlen, (int)trie->maxlen )
1767  );
1768
1769  /*
1770   We now know what we are dealing with in terms of unique chars and
1771   string sizes so we can calculate how much memory a naive
1772   representation using a flat table  will take. If it's over a reasonable
1773   limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1774   conservative but potentially much slower representation using an array
1775   of lists.
1776
1777   At the end we convert both representations into the same compressed
1778   form that will be used in regexec.c for matching with. The latter
1779   is a form that cannot be used to construct with but has memory
1780   properties similar to the list form and access properties similar
1781   to the table form making it both suitable for fast searches and
1782   small enough that its feasable to store for the duration of a program.
1783
1784   See the comment in the code where the compressed table is produced
1785   inplace from the flat tabe representation for an explanation of how
1786   the compression works.
1787
1788  */
1789
1790
1791  Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1792  prev_states[1] = 0;
1793
1794  if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1795   /*
1796    Second Pass -- Array Of Lists Representation
1797
1798    Each state will be represented by a list of charid:state records
1799    (reg_trie_trans_le) the first such element holds the CUR and LEN
1800    points of the allocated array. (See defines above).
1801
1802    We build the initial structure using the lists, and then convert
1803    it into the compressed table form which allows faster lookups
1804    (but cant be modified once converted).
1805   */
1806
1807   STRLEN transcount = 1;
1808
1809   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1810    "%*sCompiling trie using list compiler\n",
1811    (int)depth * 2 + 2, ""));
1812
1813   trie->states = (reg_trie_state *)
1814    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1815         sizeof(reg_trie_state) );
1816   TRIE_LIST_NEW(1);
1817   next_alloc = 2;
1818
1819   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1820
1821    regnode *noper   = NEXTOPER( cur );
1822    U8 *uc           = (U8*)STRING( noper );
1823    const U8 *e      = uc + STR_LEN( noper );
1824    U32 state        = 1;         /* required init */
1825    U16 charid       = 0;         /* sanity init */
1826    U8 *scan         = (U8*)NULL; /* sanity init */
1827    STRLEN foldlen   = 0;         /* required init */
1828    U32 wordlen      = 0;         /* required init */
1829    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1830    STRLEN skiplen   = 0;
1831
1832    if (OP(noper) == NOTHING) {
1833     regnode *noper_next= regnext(noper);
1834     if (noper_next != tail && OP(noper_next) == flags) {
1835      noper = noper_next;
1836      uc= (U8*)STRING(noper);
1837      e= uc + STR_LEN(noper);
1838     }
1839    }
1840
1841    if (OP(noper) != NOTHING) {
1842     for ( ; uc < e ; uc += len ) {
1843
1844      TRIE_READ_CHAR;
1845
1846      if ( uvc < 256 ) {
1847       charid = trie->charmap[ uvc ];
1848      } else {
1849       SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1850       if ( !svpp ) {
1851        charid = 0;
1852       } else {
1853        charid=(U16)SvIV( *svpp );
1854       }
1855      }
1856      /* charid is now 0 if we dont know the char read, or nonzero if we do */
1857      if ( charid ) {
1858
1859       U16 check;
1860       U32 newstate = 0;
1861
1862       charid--;
1863       if ( !trie->states[ state ].trans.list ) {
1864        TRIE_LIST_NEW( state );
1865       }
1866       for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1867        if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1868         newstate = TRIE_LIST_ITEM( state, check ).newstate;
1869         break;
1870        }
1871       }
1872       if ( ! newstate ) {
1873        newstate = next_alloc++;
1874        prev_states[newstate] = state;
1875        TRIE_LIST_PUSH( state, charid, newstate );
1876        transcount++;
1877       }
1878       state = newstate;
1879      } else {
1880       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1881      }
1882     }
1883    }
1884    TRIE_HANDLE_WORD(state);
1885
1886   } /* end second pass */
1887
1888   /* next alloc is the NEXT state to be allocated */
1889   trie->statecount = next_alloc;
1890   trie->states = (reg_trie_state *)
1891    PerlMemShared_realloc( trie->states,
1892         next_alloc
1893         * sizeof(reg_trie_state) );
1894
1895   /* and now dump it out before we compress it */
1896   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1897               revcharmap, next_alloc,
1898               depth+1)
1899   );
1900
1901   trie->trans = (reg_trie_trans *)
1902    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1903   {
1904    U32 state;
1905    U32 tp = 0;
1906    U32 zp = 0;
1907
1908
1909    for( state=1 ; state < next_alloc ; state ++ ) {
1910     U32 base=0;
1911
1912     /*
1913     DEBUG_TRIE_COMPILE_MORE_r(
1914      PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1915     );
1916     */
1917
1918     if (trie->states[state].trans.list) {
1919      U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1920      U16 maxid=minid;
1921      U16 idx;
1922
1923      for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1924       const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1925       if ( forid < minid ) {
1926        minid=forid;
1927       } else if ( forid > maxid ) {
1928        maxid=forid;
1929       }
1930      }
1931      if ( transcount < tp + maxid - minid + 1) {
1932       transcount *= 2;
1933       trie->trans = (reg_trie_trans *)
1934        PerlMemShared_realloc( trie->trans,
1935              transcount
1936              * sizeof(reg_trie_trans) );
1937       Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1938      }
1939      base = trie->uniquecharcount + tp - minid;
1940      if ( maxid == minid ) {
1941       U32 set = 0;
1942       for ( ; zp < tp ; zp++ ) {
1943        if ( ! trie->trans[ zp ].next ) {
1944         base = trie->uniquecharcount + zp - minid;
1945         trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1946         trie->trans[ zp ].check = state;
1947         set = 1;
1948         break;
1949        }
1950       }
1951       if ( !set ) {
1952        trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1953        trie->trans[ tp ].check = state;
1954        tp++;
1955        zp = tp;
1956       }
1957      } else {
1958       for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1959        const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1960        trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1961        trie->trans[ tid ].check = state;
1962       }
1963       tp += ( maxid - minid + 1 );
1964      }
1965      Safefree(trie->states[ state ].trans.list);
1966     }
1967     /*
1968     DEBUG_TRIE_COMPILE_MORE_r(
1969      PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1970     );
1971     */
1972     trie->states[ state ].trans.base=base;
1973    }
1974    trie->lasttrans = tp + 1;
1975   }
1976  } else {
1977   /*
1978   Second Pass -- Flat Table Representation.
1979
1980   we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1981   We know that we will need Charcount+1 trans at most to store the data
1982   (one row per char at worst case) So we preallocate both structures
1983   assuming worst case.
1984
1985   We then construct the trie using only the .next slots of the entry
1986   structs.
1987
1988   We use the .check field of the first entry of the node temporarily to
1989   make compression both faster and easier by keeping track of how many non
1990   zero fields are in the node.
1991
1992   Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1993   transition.
1994
1995   There are two terms at use here: state as a TRIE_NODEIDX() which is a
1996   number representing the first entry of the node, and state as a
1997   TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1998   TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1999   are 2 entrys per node. eg:
2000
2001    A B       A B
2002   1. 2 4    1. 3 7
2003   2. 0 3    3. 0 5
2004   3. 0 0    5. 0 0
2005   4. 0 0    7. 0 0
2006
2007   The table is internally in the right hand, idx form. However as we also
2008   have to deal with the states array which is indexed by nodenum we have to
2009   use TRIE_NODENUM() to convert.
2010
2011   */
2012   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2013    "%*sCompiling trie using table compiler\n",
2014    (int)depth * 2 + 2, ""));
2015
2016   trie->trans = (reg_trie_trans *)
2017    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2018         * trie->uniquecharcount + 1,
2019         sizeof(reg_trie_trans) );
2020   trie->states = (reg_trie_state *)
2021    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2022         sizeof(reg_trie_state) );
2023   next_alloc = trie->uniquecharcount + 1;
2024
2025
2026   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2027
2028    regnode *noper   = NEXTOPER( cur );
2029    const U8 *uc     = (U8*)STRING( noper );
2030    const U8 *e      = uc + STR_LEN( noper );
2031
2032    U32 state        = 1;         /* required init */
2033
2034    U16 charid       = 0;         /* sanity init */
2035    U32 accept_state = 0;         /* sanity init */
2036    U8 *scan         = (U8*)NULL; /* sanity init */
2037
2038    STRLEN foldlen   = 0;         /* required init */
2039    U32 wordlen      = 0;         /* required init */
2040    STRLEN skiplen   = 0;
2041    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2042
2043    if (OP(noper) == NOTHING) {
2044     regnode *noper_next= regnext(noper);
2045     if (noper_next != tail && OP(noper_next) == flags) {
2046      noper = noper_next;
2047      uc= (U8*)STRING(noper);
2048      e= uc + STR_LEN(noper);
2049     }
2050    }
2051
2052    if ( OP(noper) != NOTHING ) {
2053     for ( ; uc < e ; uc += len ) {
2054
2055      TRIE_READ_CHAR;
2056
2057      if ( uvc < 256 ) {
2058       charid = trie->charmap[ uvc ];
2059      } else {
2060       SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2061       charid = svpp ? (U16)SvIV(*svpp) : 0;
2062      }
2063      if ( charid ) {
2064       charid--;
2065       if ( !trie->trans[ state + charid ].next ) {
2066        trie->trans[ state + charid ].next = next_alloc;
2067        trie->trans[ state ].check++;
2068        prev_states[TRIE_NODENUM(next_alloc)]
2069          = TRIE_NODENUM(state);
2070        next_alloc += trie->uniquecharcount;
2071       }
2072       state = trie->trans[ state + charid ].next;
2073      } else {
2074       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2075      }
2076      /* charid is now 0 if we dont know the char read, or nonzero if we do */
2077     }
2078    }
2079    accept_state = TRIE_NODENUM( state );
2080    TRIE_HANDLE_WORD(accept_state);
2081
2082   } /* end second pass */
2083
2084   /* and now dump it out before we compress it */
2085   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2086               revcharmap,
2087               next_alloc, depth+1));
2088
2089   {
2090   /*
2091   * Inplace compress the table.*
2092
2093   For sparse data sets the table constructed by the trie algorithm will
2094   be mostly 0/FAIL transitions or to put it another way mostly empty.
2095   (Note that leaf nodes will not contain any transitions.)
2096
2097   This algorithm compresses the tables by eliminating most such
2098   transitions, at the cost of a modest bit of extra work during lookup:
2099
2100   - Each states[] entry contains a .base field which indicates the
2101   index in the state[] array wheres its transition data is stored.
2102
2103   - If .base is 0 there are no valid transitions from that node.
2104
2105   - If .base is nonzero then charid is added to it to find an entry in
2106   the trans array.
2107
2108   -If trans[states[state].base+charid].check!=state then the
2109   transition is taken to be a 0/Fail transition. Thus if there are fail
2110   transitions at the front of the node then the .base offset will point
2111   somewhere inside the previous nodes data (or maybe even into a node
2112   even earlier), but the .check field determines if the transition is
2113   valid.
2114
2115   XXX - wrong maybe?
2116   The following process inplace converts the table to the compressed
2117   table: We first do not compress the root node 1,and mark all its
2118   .check pointers as 1 and set its .base pointer as 1 as well. This
2119   allows us to do a DFA construction from the compressed table later,
2120   and ensures that any .base pointers we calculate later are greater
2121   than 0.
2122
2123   - We set 'pos' to indicate the first entry of the second node.
2124
2125   - We then iterate over the columns of the node, finding the first and
2126   last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2127   and set the .check pointers accordingly, and advance pos
2128   appropriately and repreat for the next node. Note that when we copy
2129   the next pointers we have to convert them from the original
2130   NODEIDX form to NODENUM form as the former is not valid post
2131   compression.
2132
2133   - If a node has no transitions used we mark its base as 0 and do not
2134   advance the pos pointer.
2135
2136   - If a node only has one transition we use a second pointer into the
2137   structure to fill in allocated fail transitions from other states.
2138   This pointer is independent of the main pointer and scans forward
2139   looking for null transitions that are allocated to a state. When it
2140   finds one it writes the single transition into the "hole".  If the
2141   pointer doesnt find one the single transition is appended as normal.
2142
2143   - Once compressed we can Renew/realloc the structures to release the
2144   excess space.
2145
2146   See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2147   specifically Fig 3.47 and the associated pseudocode.
2148
2149   demq
2150   */
2151   const U32 laststate = TRIE_NODENUM( next_alloc );
2152   U32 state, charid;
2153   U32 pos = 0, zp=0;
2154   trie->statecount = laststate;
2155
2156   for ( state = 1 ; state < laststate ; state++ ) {
2157    U8 flag = 0;
2158    const U32 stateidx = TRIE_NODEIDX( state );
2159    const U32 o_used = trie->trans[ stateidx ].check;
2160    U32 used = trie->trans[ stateidx ].check;
2161    trie->trans[ stateidx ].check = 0;
2162
2163    for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2164     if ( flag || trie->trans[ stateidx + charid ].next ) {
2165      if ( trie->trans[ stateidx + charid ].next ) {
2166       if (o_used == 1) {
2167        for ( ; zp < pos ; zp++ ) {
2168         if ( ! trie->trans[ zp ].next ) {
2169          break;
2170         }
2171        }
2172        trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2173        trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2174        trie->trans[ zp ].check = state;
2175        if ( ++zp > pos ) pos = zp;
2176        break;
2177       }
2178       used--;
2179      }
2180      if ( !flag ) {
2181       flag = 1;
2182       trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2183      }
2184      trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2185      trie->trans[ pos ].check = state;
2186      pos++;
2187     }
2188    }
2189   }
2190   trie->lasttrans = pos + 1;
2191   trie->states = (reg_trie_state *)
2192    PerlMemShared_realloc( trie->states, laststate
2193         * sizeof(reg_trie_state) );
2194   DEBUG_TRIE_COMPILE_MORE_r(
2195     PerlIO_printf( Perl_debug_log,
2196      "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2197      (int)depth * 2 + 2,"",
2198      (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2199      (IV)next_alloc,
2200      (IV)pos,
2201      ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2202    );
2203
2204   } /* end table compress */
2205  }
2206  DEBUG_TRIE_COMPILE_MORE_r(
2207    PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2208     (int)depth * 2 + 2, "",
2209     (UV)trie->statecount,
2210     (UV)trie->lasttrans)
2211  );
2212  /* resize the trans array to remove unused space */
2213  trie->trans = (reg_trie_trans *)
2214   PerlMemShared_realloc( trie->trans, trie->lasttrans
2215        * sizeof(reg_trie_trans) );
2216
2217  {   /* Modify the program and insert the new TRIE node */
2218   U8 nodetype =(U8)(flags & 0xFF);
2219   char *str=NULL;
2220
2221 #ifdef DEBUGGING
2222   regnode *optimize = NULL;
2223 #ifdef RE_TRACK_PATTERN_OFFSETS
2224
2225   U32 mjd_offset = 0;
2226   U32 mjd_nodelen = 0;
2227 #endif /* RE_TRACK_PATTERN_OFFSETS */
2228 #endif /* DEBUGGING */
2229   /*
2230   This means we convert either the first branch or the first Exact,
2231   depending on whether the thing following (in 'last') is a branch
2232   or not and whther first is the startbranch (ie is it a sub part of
2233   the alternation or is it the whole thing.)
2234   Assuming its a sub part we convert the EXACT otherwise we convert
2235   the whole branch sequence, including the first.
2236   */
2237   /* Find the node we are going to overwrite */
2238   if ( first != startbranch || OP( last ) == BRANCH ) {
2239    /* branch sub-chain */
2240    NEXT_OFF( first ) = (U16)(last - first);
2241 #ifdef RE_TRACK_PATTERN_OFFSETS
2242    DEBUG_r({
2243     mjd_offset= Node_Offset((convert));
2244     mjd_nodelen= Node_Length((convert));
2245    });
2246 #endif
2247    /* whole branch chain */
2248   }
2249 #ifdef RE_TRACK_PATTERN_OFFSETS
2250   else {
2251    DEBUG_r({
2252     const  regnode *nop = NEXTOPER( convert );
2253     mjd_offset= Node_Offset((nop));
2254     mjd_nodelen= Node_Length((nop));
2255    });
2256   }
2257   DEBUG_OPTIMISE_r(
2258    PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2259     (int)depth * 2 + 2, "",
2260     (UV)mjd_offset, (UV)mjd_nodelen)
2261   );
2262 #endif
2263   /* But first we check to see if there is a common prefix we can
2264   split out as an EXACT and put in front of the TRIE node.  */
2265   trie->startstate= 1;
2266   if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2267    U32 state;
2268    for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2269     U32 ofs = 0;
2270     I32 idx = -1;
2271     U32 count = 0;
2272     const U32 base = trie->states[ state ].trans.base;
2273
2274     if ( trie->states[state].wordnum )
2275       count = 1;
2276
2277     for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2278      if ( ( base + ofs >= trie->uniquecharcount ) &&
2279       ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2280       trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2281      {
2282       if ( ++count > 1 ) {
2283        SV **tmp = av_fetch( revcharmap, ofs, 0);
2284        const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2285        if ( state == 1 ) break;
2286        if ( count == 2 ) {
2287         Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2288         DEBUG_OPTIMISE_r(
2289          PerlIO_printf(Perl_debug_log,
2290           "%*sNew Start State=%"UVuf" Class: [",
2291           (int)depth * 2 + 2, "",
2292           (UV)state));
2293         if (idx >= 0) {
2294          SV ** const tmp = av_fetch( revcharmap, idx, 0);
2295          const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2296
2297          TRIE_BITMAP_SET(trie,*ch);
2298          if ( folder )
2299           TRIE_BITMAP_SET(trie, folder[ *ch ]);
2300          DEBUG_OPTIMISE_r(
2301           PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2302          );
2303         }
2304        }
2305        TRIE_BITMAP_SET(trie,*ch);
2306        if ( folder )
2307         TRIE_BITMAP_SET(trie,folder[ *ch ]);
2308        DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2309       }
2310       idx = ofs;
2311      }
2312     }
2313     if ( count == 1 ) {
2314      SV **tmp = av_fetch( revcharmap, idx, 0);
2315      STRLEN len;
2316      char *ch = SvPV( *tmp, len );
2317      DEBUG_OPTIMISE_r({
2318       SV *sv=sv_newmortal();
2319       PerlIO_printf( Perl_debug_log,
2320        "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2321        (int)depth * 2 + 2, "",
2322        (UV)state, (UV)idx,
2323        pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2324         PL_colors[0], PL_colors[1],
2325         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2326         PERL_PV_ESCAPE_FIRSTCHAR
2327        )
2328       );
2329      });
2330      if ( state==1 ) {
2331       OP( convert ) = nodetype;
2332       str=STRING(convert);
2333       STR_LEN(convert)=0;
2334      }
2335      STR_LEN(convert) += len;
2336      while (len--)
2337       *str++ = *ch++;
2338     } else {
2339 #ifdef DEBUGGING
2340      if (state>1)
2341       DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2342 #endif
2343      break;
2344     }
2345    }
2346    trie->prefixlen = (state-1);
2347    if (str) {
2348     regnode *n = convert+NODE_SZ_STR(convert);
2349     NEXT_OFF(convert) = NODE_SZ_STR(convert);
2350     trie->startstate = state;
2351     trie->minlen -= (state - 1);
2352     trie->maxlen -= (state - 1);
2353 #ifdef DEBUGGING
2354    /* At least the UNICOS C compiler choked on this
2355     * being argument to DEBUG_r(), so let's just have
2356     * it right here. */
2357    if (
2358 #ifdef PERL_EXT_RE_BUILD
2359     1
2360 #else
2361     DEBUG_r_TEST
2362 #endif
2363     ) {
2364     regnode *fix = convert;
2365     U32 word = trie->wordcount;
2366     mjd_nodelen++;
2367     Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2368     while( ++fix < n ) {
2369      Set_Node_Offset_Length(fix, 0, 0);
2370     }
2371     while (word--) {
2372      SV ** const tmp = av_fetch( trie_words, word, 0 );
2373      if (tmp) {
2374       if ( STR_LEN(convert) <= SvCUR(*tmp) )
2375        sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2376       else
2377        sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2378      }
2379     }
2380    }
2381 #endif
2382     if (trie->maxlen) {
2383      convert = n;
2384     } else {
2385      NEXT_OFF(convert) = (U16)(tail - convert);
2386      DEBUG_r(optimize= n);
2387     }
2388    }
2389   }
2390   if (!jumper)
2391    jumper = last;
2392   if ( trie->maxlen ) {
2393    NEXT_OFF( convert ) = (U16)(tail - convert);
2394    ARG_SET( convert, data_slot );
2395    /* Store the offset to the first unabsorbed branch in
2396    jump[0], which is otherwise unused by the jump logic.
2397    We use this when dumping a trie and during optimisation. */
2398    if (trie->jump)
2399     trie->jump[0] = (U16)(nextbranch - convert);
2400
2401    /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2402    *   and there is a bitmap
2403    *   and the first "jump target" node we found leaves enough room
2404    * then convert the TRIE node into a TRIEC node, with the bitmap
2405    * embedded inline in the opcode - this is hypothetically faster.
2406    */
2407    if ( !trie->states[trie->startstate].wordnum
2408     && trie->bitmap
2409     && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2410    {
2411     OP( convert ) = TRIEC;
2412     Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2413     PerlMemShared_free(trie->bitmap);
2414     trie->bitmap= NULL;
2415    } else
2416     OP( convert ) = TRIE;
2417
2418    /* store the type in the flags */
2419    convert->flags = nodetype;
2420    DEBUG_r({
2421    optimize = convert
2422      + NODE_STEP_REGNODE
2423      + regarglen[ OP( convert ) ];
2424    });
2425    /* XXX We really should free up the resource in trie now,
2426     as we won't use them - (which resources?) dmq */
2427   }
2428   /* needed for dumping*/
2429   DEBUG_r(if (optimize) {
2430    regnode *opt = convert;
2431
2432    while ( ++opt < optimize) {
2433     Set_Node_Offset_Length(opt,0,0);
2434    }
2435    /*
2436     Try to clean up some of the debris left after the
2437     optimisation.
2438    */
2439    while( optimize < jumper ) {
2440     mjd_nodelen += Node_Length((optimize));
2441     OP( optimize ) = OPTIMIZED;
2442     Set_Node_Offset_Length(optimize,0,0);
2443     optimize++;
2444    }
2445    Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2446   });
2447  } /* end node insert */
2448  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2449
2450  /*  Finish populating the prev field of the wordinfo array.  Walk back
2451  *  from each accept state until we find another accept state, and if
2452  *  so, point the first word's .prev field at the second word. If the
2453  *  second already has a .prev field set, stop now. This will be the
2454  *  case either if we've already processed that word's accept state,
2455  *  or that state had multiple words, and the overspill words were
2456  *  already linked up earlier.
2457  */
2458  {
2459   U16 word;
2460   U32 state;
2461   U16 prev;
2462
2463   for (word=1; word <= trie->wordcount; word++) {
2464    prev = 0;
2465    if (trie->wordinfo[word].prev)
2466     continue;
2467    state = trie->wordinfo[word].accept;
2468    while (state) {
2469     state = prev_states[state];
2470     if (!state)
2471      break;
2472     prev = trie->states[state].wordnum;
2473     if (prev)
2474      break;
2475    }
2476    trie->wordinfo[word].prev = prev;
2477   }
2478   Safefree(prev_states);
2479  }
2480
2481
2482  /* and now dump out the compressed format */
2483  DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2484
2485  RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2486 #ifdef DEBUGGING
2487  RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2488  RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2489 #else
2490  SvREFCNT_dec_NN(revcharmap);
2491 #endif
2492  return trie->jump
2493   ? MADE_JUMP_TRIE
2494   : trie->startstate>1
2495    ? MADE_EXACT_TRIE
2496    : MADE_TRIE;
2497 }
2498
2499 STATIC void
2500 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2501 {
2502 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2503
2504    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2505    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2506    ISBN 0-201-10088-6
2507
2508    We find the fail state for each state in the trie, this state is the longest proper
2509    suffix of the current state's 'word' that is also a proper prefix of another word in our
2510    trie. State 1 represents the word '' and is thus the default fail state. This allows
2511    the DFA not to have to restart after its tried and failed a word at a given point, it
2512    simply continues as though it had been matching the other word in the first place.
2513    Consider
2514  'abcdgu'=~/abcdefg|cdgu/
2515    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2516    fail, which would bring us to the state representing 'd' in the second word where we would
2517    try 'g' and succeed, proceeding to match 'cdgu'.
2518  */
2519  /* add a fail transition */
2520  const U32 trie_offset = ARG(source);
2521  reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2522  U32 *q;
2523  const U32 ucharcount = trie->uniquecharcount;
2524  const U32 numstates = trie->statecount;
2525  const U32 ubound = trie->lasttrans + ucharcount;
2526  U32 q_read = 0;
2527  U32 q_write = 0;
2528  U32 charid;
2529  U32 base = trie->states[ 1 ].trans.base;
2530  U32 *fail;
2531  reg_ac_data *aho;
2532  const U32 data_slot = add_data( pRExC_state, 1, "T" );
2533  GET_RE_DEBUG_FLAGS_DECL;
2534
2535  PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2536 #ifndef DEBUGGING
2537  PERL_UNUSED_ARG(depth);
2538 #endif
2539
2540
2541  ARG_SET( stclass, data_slot );
2542  aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2543  RExC_rxi->data->data[ data_slot ] = (void*)aho;
2544  aho->trie=trie_offset;
2545  aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2546  Copy( trie->states, aho->states, numstates, reg_trie_state );
2547  Newxz( q, numstates, U32);
2548  aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2549  aho->refcount = 1;
2550  fail = aho->fail;
2551  /* initialize fail[0..1] to be 1 so that we always have
2552  a valid final fail state */
2553  fail[ 0 ] = fail[ 1 ] = 1;
2554
2555  for ( charid = 0; charid < ucharcount ; charid++ ) {
2556   const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2557   if ( newstate ) {
2558    q[ q_write ] = newstate;
2559    /* set to point at the root */
2560    fail[ q[ q_write++ ] ]=1;
2561   }
2562  }
2563  while ( q_read < q_write) {
2564   const U32 cur = q[ q_read++ % numstates ];
2565   base = trie->states[ cur ].trans.base;
2566
2567   for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2568    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2569    if (ch_state) {
2570     U32 fail_state = cur;
2571     U32 fail_base;
2572     do {
2573      fail_state = fail[ fail_state ];
2574      fail_base = aho->states[ fail_state ].trans.base;
2575     } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2576
2577     fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2578     fail[ ch_state ] = fail_state;
2579     if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2580     {
2581       aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2582     }
2583     q[ q_write++ % numstates] = ch_state;
2584    }
2585   }
2586  }
2587  /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2588  when we fail in state 1, this allows us to use the
2589  charclass scan to find a valid start char. This is based on the principle
2590  that theres a good chance the string being searched contains lots of stuff
2591  that cant be a start char.
2592  */
2593  fail[ 0 ] = fail[ 1 ] = 0;
2594  DEBUG_TRIE_COMPILE_r({
2595   PerlIO_printf(Perl_debug_log,
2596      "%*sStclass Failtable (%"UVuf" states): 0",
2597      (int)(depth * 2), "", (UV)numstates
2598   );
2599   for( q_read=1; q_read<numstates; q_read++ ) {
2600    PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2601   }
2602   PerlIO_printf(Perl_debug_log, "\n");
2603  });
2604  Safefree(q);
2605  /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2606 }
2607
2608
2609 /*
2610  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2611  * These need to be revisited when a newer toolchain becomes available.
2612  */
2613 #if defined(__sparc64__) && defined(__GNUC__)
2614 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2615 #       undef  SPARC64_GCC_WORKAROUND
2616 #       define SPARC64_GCC_WORKAROUND 1
2617 #   endif
2618 #endif
2619
2620 #define DEBUG_PEEP(str,scan,depth) \
2621  DEBUG_OPTIMISE_r({if (scan){ \
2622  SV * const mysv=sv_newmortal(); \
2623  regnode *Next = regnext(scan); \
2624  regprop(RExC_rx, mysv, scan); \
2625  PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2626  (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2627  Next ? (REG_NODE_NUM(Next)) : 0 ); \
2628    }});
2629
2630
2631 /* The below joins as many adjacent EXACTish nodes as possible into a single
2632  * one.  The regop may be changed if the node(s) contain certain sequences that
2633  * require special handling.  The joining is only done if:
2634  * 1) there is room in the current conglomerated node to entirely contain the
2635  *    next one.
2636  * 2) they are the exact same node type
2637  *
2638  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2639  * these get optimized out
2640  *
2641  * If a node is to match under /i (folded), the number of characters it matches
2642  * can be different than its character length if it contains a multi-character
2643  * fold.  *min_subtract is set to the total delta of the input nodes.
2644  *
2645  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2646  * and contains LATIN SMALL LETTER SHARP S
2647  *
2648  * This is as good a place as any to discuss the design of handling these
2649  * multi-character fold sequences.  It's been wrong in Perl for a very long
2650  * time.  There are three code points in Unicode whose multi-character folds
2651  * were long ago discovered to mess things up.  The previous designs for
2652  * dealing with these involved assigning a special node for them.  This
2653  * approach doesn't work, as evidenced by this example:
2654  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2655  * Both these fold to "sss", but if the pattern is parsed to create a node that
2656  * would match just the \xDF, it won't be able to handle the case where a
2657  * successful match would have to cross the node's boundary.  The new approach
2658  * that hopefully generally solves the problem generates an EXACTFU_SS node
2659  * that is "sss".
2660  *
2661  * It turns out that there are problems with all multi-character folds, and not
2662  * just these three.  Now the code is general, for all such cases, but the
2663  * three still have some special handling.  The approach taken is:
2664  * 1)   This routine examines each EXACTFish node that could contain multi-
2665  *      character fold sequences.  It returns in *min_subtract how much to
2666  *      subtract from the the actual length of the string to get a real minimum
2667  *      match length; it is 0 if there are no multi-char folds.  This delta is
2668  *      used by the caller to adjust the min length of the match, and the delta
2669  *      between min and max, so that the optimizer doesn't reject these
2670  *      possibilities based on size constraints.
2671  * 2)   Certain of these sequences require special handling by the trie code,
2672  *      so, if found, this code changes the joined node type to special ops:
2673  *      EXACTFU_TRICKYFOLD and EXACTFU_SS.
2674  * 3)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2675  *      is used for an EXACTFU node that contains at least one "ss" sequence in
2676  *      it.  For non-UTF-8 patterns and strings, this is the only case where
2677  *      there is a possible fold length change.  That means that a regular
2678  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
2679  *      with length changes, and so can be processed faster.  regexec.c takes
2680  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
2681  *      pre-folded by regcomp.c.  This saves effort in regex matching.
2682  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2683  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2684  *      down by forcing the pattern into UTF8 unless necessary.  Also what
2685  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
2686  *      possibilities for the non-UTF8 patterns are quite simple, except for
2687  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
2688  *      members of a fold-pair, and arrays are set up for all of them so that
2689  *      the other member of the pair can be found quickly.  Code elsewhere in
2690  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2691  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
2692  *      described in the next item.
2693  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2694  *      'ss' or not is not knowable at compile time.  It will match iff the
2695  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2696  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2697  *      it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2698  *      described in item 3).  An assumption that the optimizer part of
2699  *      regexec.c (probably unwittingly) makes is that a character in the
2700  *      pattern corresponds to at most a single character in the target string.
2701  *      (And I do mean character, and not byte here, unlike other parts of the
2702  *      documentation that have never been updated to account for multibyte
2703  *      Unicode.)  This assumption is wrong only in this case, as all other
2704  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2705  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2706  *      reluctant to try to change this assumption, so instead the code punts.
2707  *      This routine examines EXACTF nodes for the sharp s, and returns a
2708  *      boolean indicating whether or not the node is an EXACTF node that
2709  *      contains a sharp s.  When it is true, the caller sets a flag that later
2710  *      causes the optimizer in this file to not set values for the floating
2711  *      and fixed string lengths, and thus avoids the optimizer code in
2712  *      regexec.c that makes the invalid assumption.  Thus, there is no
2713  *      optimization based on string lengths for EXACTF nodes that contain the
2714  *      sharp s.  This only happens for /id rules (which means the pattern
2715  *      isn't in UTF-8).
2716  */
2717
2718 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2719  if (PL_regkind[OP(scan)] == EXACT) \
2720   join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2721
2722 STATIC U32
2723 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2724  /* Merge several consecutive EXACTish nodes into one. */
2725  regnode *n = regnext(scan);
2726  U32 stringok = 1;
2727  regnode *next = scan + NODE_SZ_STR(scan);
2728  U32 merged = 0;
2729  U32 stopnow = 0;
2730 #ifdef DEBUGGING
2731  regnode *stop = scan;
2732  GET_RE_DEBUG_FLAGS_DECL;
2733 #else
2734  PERL_UNUSED_ARG(depth);
2735 #endif
2736
2737  PERL_ARGS_ASSERT_JOIN_EXACT;
2738 #ifndef EXPERIMENTAL_INPLACESCAN
2739  PERL_UNUSED_ARG(flags);
2740  PERL_UNUSED_ARG(val);
2741 #endif
2742  DEBUG_PEEP("join",scan,depth);
2743
2744  /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2745  * EXACT ones that are mergeable to the current one. */
2746  while (n
2747   && (PL_regkind[OP(n)] == NOTHING
2748    || (stringok && OP(n) == OP(scan)))
2749   && NEXT_OFF(n)
2750   && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2751  {
2752
2753   if (OP(n) == TAIL || n > next)
2754    stringok = 0;
2755   if (PL_regkind[OP(n)] == NOTHING) {
2756    DEBUG_PEEP("skip:",n,depth);
2757    NEXT_OFF(scan) += NEXT_OFF(n);
2758    next = n + NODE_STEP_REGNODE;
2759 #ifdef DEBUGGING
2760    if (stringok)
2761     stop = n;
2762 #endif
2763    n = regnext(n);
2764   }
2765   else if (stringok) {
2766    const unsigned int oldl = STR_LEN(scan);
2767    regnode * const nnext = regnext(n);
2768
2769    /* XXX I (khw) kind of doubt that this works on platforms where
2770    * U8_MAX is above 255 because of lots of other assumptions */
2771    /* Don't join if the sum can't fit into a single node */
2772    if (oldl + STR_LEN(n) > U8_MAX)
2773     break;
2774
2775    DEBUG_PEEP("merg",n,depth);
2776    merged++;
2777
2778    NEXT_OFF(scan) += NEXT_OFF(n);
2779    STR_LEN(scan) += STR_LEN(n);
2780    next = n + NODE_SZ_STR(n);
2781    /* Now we can overwrite *n : */
2782    Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2783 #ifdef DEBUGGING
2784    stop = next - 1;
2785 #endif
2786    n = nnext;
2787    if (stopnow) break;
2788   }
2789
2790 #ifdef EXPERIMENTAL_INPLACESCAN
2791   if (flags && !NEXT_OFF(n)) {
2792    DEBUG_PEEP("atch", val, depth);
2793    if (reg_off_by_arg[OP(n)]) {
2794     ARG_SET(n, val - n);
2795    }
2796    else {
2797     NEXT_OFF(n) = val - n;
2798    }
2799    stopnow = 1;
2800   }
2801 #endif
2802  }
2803
2804  *min_subtract = 0;
2805  *has_exactf_sharp_s = FALSE;
2806
2807  /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2808  * can now analyze for sequences of problematic code points.  (Prior to
2809  * this final joining, sequences could have been split over boundaries, and
2810  * hence missed).  The sequences only happen in folding, hence for any
2811  * non-EXACT EXACTish node */
2812  if (OP(scan) != EXACT) {
2813   const U8 * const s0 = (U8*) STRING(scan);
2814   const U8 * s = s0;
2815   const U8 * const s_end = s0 + STR_LEN(scan);
2816
2817   /* One pass is made over the node's string looking for all the
2818   * possibilities.  to avoid some tests in the loop, there are two main
2819   * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2820   * non-UTF-8 */
2821   if (UTF) {
2822
2823    /* Examine the string for a multi-character fold sequence.  UTF-8
2824    * patterns have all characters pre-folded by the time this code is
2825    * executed */
2826    while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2827          length sequence we are looking for is 2 */
2828    {
2829     int count = 0;
2830     int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2831     if (! len) {    /* Not a multi-char fold: get next char */
2832      s += UTF8SKIP(s);
2833      continue;
2834     }
2835
2836     /* Nodes with 'ss' require special handling, except for EXACTFL
2837     * and EXACTFA for which there is no multi-char fold to this */
2838     if (len == 2 && *s == 's' && *(s+1) == 's'
2839      && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2840     {
2841      count = 2;
2842      OP(scan) = EXACTFU_SS;
2843      s += 2;
2844     }
2845     else if (len == 6   /* len is the same in both ASCII and EBCDIC for these */
2846       && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2847          COMBINING_DIAERESIS_UTF8
2848          COMBINING_ACUTE_ACCENT_UTF8,
2849         6)
2850        || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2851           COMBINING_DIAERESIS_UTF8
2852           COMBINING_ACUTE_ACCENT_UTF8,
2853          6)))
2854     {
2855      count = 3;
2856
2857      /* These two folds require special handling by trie's, so
2858      * change the node type to indicate this.  If EXACTFA and
2859      * EXACTFL were ever to be handled by trie's, this would
2860      * have to be changed.  If this node has already been
2861      * changed to EXACTFU_SS in this loop, leave it as is.  (I
2862      * (khw) think it doesn't matter in regexec.c for UTF
2863      * patterns, but no need to change it */
2864      if (OP(scan) == EXACTFU) {
2865       OP(scan) = EXACTFU_TRICKYFOLD;
2866      }
2867      s += 6;
2868     }
2869     else { /* Here is a generic multi-char fold. */
2870      const U8* multi_end  = s + len;
2871
2872      /* Count how many characters in it.  In the case of /l and
2873      * /aa, no folds which contain ASCII code points are
2874      * allowed, so check for those, and skip if found.  (In
2875      * EXACTFL, no folds are allowed to any Latin1 code point,
2876      * not just ASCII.  But there aren't any of these
2877      * currently, nor ever likely, so don't take the time to
2878      * test for them.  The code that generates the
2879      * is_MULTI_foo() macros croaks should one actually get put
2880      * into Unicode .) */
2881      if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2882       count = utf8_length(s, multi_end);
2883       s = multi_end;
2884      }
2885      else {
2886       while (s < multi_end) {
2887        if (isASCII(*s)) {
2888         s++;
2889         goto next_iteration;
2890        }
2891        else {
2892         s += UTF8SKIP(s);
2893        }
2894        count++;
2895       }
2896      }
2897     }
2898
2899     /* The delta is how long the sequence is minus 1 (1 is how long
2900     * the character that folds to the sequence is) */
2901     *min_subtract += count - 1;
2902    next_iteration: ;
2903    }
2904   }
2905   else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2906
2907    /* Here, the pattern is not UTF-8.  Look for the multi-char folds
2908    * that are all ASCII.  As in the above case, EXACTFL and EXACTFA
2909    * nodes can't have multi-char folds to this range (and there are
2910    * no existing ones in the upper latin1 range).  In the EXACTF
2911    * case we look also for the sharp s, which can be in the final
2912    * position.  Otherwise we can stop looking 1 byte earlier because
2913    * have to find at least two characters for a multi-fold */
2914    const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2915
2916    /* The below is perhaps overboard, but this allows us to save a
2917    * test each time through the loop at the expense of a mask.  This
2918    * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2919    * by a single bit.  On ASCII they are 32 apart; on EBCDIC, they
2920    * are 64.  This uses an exclusive 'or' to find that bit and then
2921    * inverts it to form a mask, with just a single 0, in the bit
2922    * position where 'S' and 's' differ. */
2923    const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2924    const U8 s_masked = 's' & S_or_s_mask;
2925
2926    while (s < upper) {
2927     int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2928     if (! len) {    /* Not a multi-char fold. */
2929      if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2930      {
2931       *has_exactf_sharp_s = TRUE;
2932      }
2933      s++;
2934      continue;
2935     }
2936
2937     if (len == 2
2938      && ((*s & S_or_s_mask) == s_masked)
2939      && ((*(s+1) & S_or_s_mask) == s_masked))
2940     {
2941
2942      /* EXACTF nodes need to know that the minimum length
2943      * changed so that a sharp s in the string can match this
2944      * ss in the pattern, but they remain EXACTF nodes, as they
2945      * won't match this unless the target string is is UTF-8,
2946      * which we don't know until runtime */
2947      if (OP(scan) != EXACTF) {
2948       OP(scan) = EXACTFU_SS;
2949      }
2950     }
2951
2952     *min_subtract += len - 1;
2953     s += len;
2954    }
2955   }
2956  }
2957
2958 #ifdef DEBUGGING
2959  /* Allow dumping but overwriting the collection of skipped
2960  * ops and/or strings with fake optimized ops */
2961  n = scan + NODE_SZ_STR(scan);
2962  while (n <= stop) {
2963   OP(n) = OPTIMIZED;
2964   FLAGS(n) = 0;
2965   NEXT_OFF(n) = 0;
2966   n++;
2967  }
2968 #endif
2969  DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2970  return stopnow;
2971 }
2972
2973 /* REx optimizer.  Converts nodes into quicker variants "in place".
2974    Finds fixed substrings.  */
2975
2976 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2977    to the position after last scanned or to NULL. */
2978
2979 #define INIT_AND_WITHP \
2980  assert(!and_withp); \
2981  Newx(and_withp,1,struct regnode_charclass_class); \
2982  SAVEFREEPV(and_withp)
2983
2984 /* this is a chain of data about sub patterns we are processing that
2985    need to be handled separately/specially in study_chunk. Its so
2986    we can simulate recursion without losing state.  */
2987 struct scan_frame;
2988 typedef struct scan_frame {
2989  regnode *last;  /* last node to process in this frame */
2990  regnode *next;  /* next node to process when last is reached */
2991  struct scan_frame *prev; /*previous frame*/
2992  I32 stop; /* what stopparen do we use */
2993 } scan_frame;
2994
2995
2996 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2997
2998 STATIC I32
2999 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3000       I32 *minlenp, I32 *deltap,
3001       regnode *last,
3002       scan_data_t *data,
3003       I32 stopparen,
3004       U8* recursed,
3005       struct regnode_charclass_class *and_withp,
3006       U32 flags, U32 depth)
3007       /* scanp: Start here (read-write). */
3008       /* deltap: Write maxlen-minlen here. */
3009       /* last: Stop before this one. */
3010       /* data: string data about the pattern */
3011       /* stopparen: treat close N as END */
3012       /* recursed: which subroutines have we recursed into */
3013       /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3014 {
3015  dVAR;
3016  I32 min = 0;    /* There must be at least this number of characters to match */
3017  I32 pars = 0, code;
3018  regnode *scan = *scanp, *next;
3019  I32 delta = 0;
3020  int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3021  int is_inf_internal = 0;  /* The studied chunk is infinite */
3022  I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3023  scan_data_t data_fake;
3024  SV *re_trie_maxbuff = NULL;
3025  regnode *first_non_open = scan;
3026  I32 stopmin = I32_MAX;
3027  scan_frame *frame = NULL;
3028  GET_RE_DEBUG_FLAGS_DECL;
3029
3030  PERL_ARGS_ASSERT_STUDY_CHUNK;
3031
3032 #ifdef DEBUGGING
3033  StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3034 #endif
3035
3036  if ( depth == 0 ) {
3037   while (first_non_open && OP(first_non_open) == OPEN)
3038    first_non_open=regnext(first_non_open);
3039  }
3040
3041
3042   fake_study_recurse:
3043  while ( scan && OP(scan) != END && scan < last ){
3044   UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3045         node length to get a real minimum (because
3046         the folded version may be shorter) */
3047   bool has_exactf_sharp_s = FALSE;
3048   /* Peephole optimizer: */
3049   DEBUG_STUDYDATA("Peep:", data,depth);
3050   DEBUG_PEEP("Peep",scan,depth);
3051
3052   /* Its not clear to khw or hv why this is done here, and not in the
3053   * clauses that deal with EXACT nodes.  khw's guess is that it's
3054   * because of a previous design */
3055   JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3056
3057   /* Follow the next-chain of the current node and optimize
3058   away all the NOTHINGs from it.  */
3059   if (OP(scan) != CURLYX) {
3060    const int max = (reg_off_by_arg[OP(scan)]
3061      ? I32_MAX
3062      /* I32 may be smaller than U16 on CRAYs! */
3063      : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3064    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3065    int noff;
3066    regnode *n = scan;
3067
3068    /* Skip NOTHING and LONGJMP. */
3069    while ((n = regnext(n))
3070     && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3071      || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3072     && off + noff < max)
3073     off += noff;
3074    if (reg_off_by_arg[OP(scan)])
3075     ARG(scan) = off;
3076    else
3077     NEXT_OFF(scan) = off;
3078   }
3079
3080
3081
3082   /* The principal pseudo-switch.  Cannot be a switch, since we
3083   look into several different things.  */
3084   if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3085     || OP(scan) == IFTHEN) {
3086    next = regnext(scan);
3087    code = OP(scan);
3088    /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3089
3090    if (OP(next) == code || code == IFTHEN) {
3091     /* NOTE - There is similar code to this block below for handling
3092     TRIE nodes on a re-study.  If you change stuff here check there
3093     too. */
3094     I32 max1 = 0, min1 = I32_MAX, num = 0;
3095     struct regnode_charclass_class accum;
3096     regnode * const startbranch=scan;
3097
3098     if (flags & SCF_DO_SUBSTR)
3099      SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3100     if (flags & SCF_DO_STCLASS)
3101      cl_init_zero(pRExC_state, &accum);
3102
3103     while (OP(scan) == code) {
3104      I32 deltanext, minnext, f = 0, fake;
3105      struct regnode_charclass_class this_class;
3106
3107      num++;
3108      data_fake.flags = 0;
3109      if (data) {
3110       data_fake.whilem_c = data->whilem_c;
3111       data_fake.last_closep = data->last_closep;
3112      }
3113      else
3114       data_fake.last_closep = &fake;
3115
3116      data_fake.pos_delta = delta;
3117      next = regnext(scan);
3118      scan = NEXTOPER(scan);
3119      if (code != BRANCH)
3120       scan = NEXTOPER(scan);
3121      if (flags & SCF_DO_STCLASS) {
3122       cl_init(pRExC_state, &this_class);
3123       data_fake.start_class = &this_class;
3124       f = SCF_DO_STCLASS_AND;
3125      }
3126      if (flags & SCF_WHILEM_VISITED_POS)
3127       f |= SCF_WHILEM_VISITED_POS;
3128
3129      /* we suppose the run is continuous, last=next...*/
3130      minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3131           next, &data_fake,
3132           stopparen, recursed, NULL, f,depth+1);
3133      if (min1 > minnext)
3134       min1 = minnext;
3135      if (deltanext == I32_MAX) {
3136       is_inf = is_inf_internal = 1;
3137       max1 = I32_MAX;
3138      } else if (max1 < minnext + deltanext)
3139       max1 = minnext + deltanext;
3140      scan = next;
3141      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3142       pars++;
3143      if (data_fake.flags & SCF_SEEN_ACCEPT) {
3144       if ( stopmin > minnext)
3145        stopmin = min + min1;
3146       flags &= ~SCF_DO_SUBSTR;
3147       if (data)
3148        data->flags |= SCF_SEEN_ACCEPT;
3149      }
3150      if (data) {
3151       if (data_fake.flags & SF_HAS_EVAL)
3152        data->flags |= SF_HAS_EVAL;
3153       data->whilem_c = data_fake.whilem_c;
3154      }
3155      if (flags & SCF_DO_STCLASS)
3156       cl_or(pRExC_state, &accum, &this_class);
3157     }
3158     if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3159      min1 = 0;
3160     if (flags & SCF_DO_SUBSTR) {
3161      data->pos_min += min1;
3162      if (data->pos_delta >= I32_MAX - (max1 - min1))
3163       data->pos_delta = I32_MAX;
3164      else
3165       data->pos_delta += max1 - min1;
3166      if (max1 != min1 || is_inf)
3167       data->longest = &(data->longest_float);
3168     }
3169     min += min1;
3170     if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0)
3171      delta = I32_MAX;
3172     else
3173      delta += max1 - min1;
3174     if (flags & SCF_DO_STCLASS_OR) {
3175      cl_or(pRExC_state, data->start_class, &accum);
3176      if (min1) {
3177       cl_and(data->start_class, and_withp);
3178       flags &= ~SCF_DO_STCLASS;
3179      }
3180     }
3181     else if (flags & SCF_DO_STCLASS_AND) {
3182      if (min1) {
3183       cl_and(data->start_class, &accum);
3184       flags &= ~SCF_DO_STCLASS;
3185      }
3186      else {
3187       /* Switch to OR mode: cache the old value of
3188       * data->start_class */
3189       INIT_AND_WITHP;
3190       StructCopy(data->start_class, and_withp,
3191         struct regnode_charclass_class);
3192       flags &= ~SCF_DO_STCLASS_AND;
3193       StructCopy(&accum, data->start_class,
3194         struct regnode_charclass_class);
3195       flags |= SCF_DO_STCLASS_OR;
3196       SET_SSC_EOS(data->start_class);
3197      }
3198     }
3199
3200     if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3201     /* demq.
3202
3203     Assuming this was/is a branch we are dealing with: 'scan' now
3204     points at the item that follows the branch sequence, whatever
3205     it is. We now start at the beginning of the sequence and look
3206     for subsequences of
3207
3208     BRANCH->EXACT=>x1
3209     BRANCH->EXACT=>x2
3210     tail
3211
3212     which would be constructed from a pattern like /A|LIST|OF|WORDS/
3213
3214     If we can find such a subsequence we need to turn the first
3215     element into a trie and then add the subsequent branch exact
3216     strings to the trie.
3217
3218     We have two cases
3219
3220      1. patterns where the whole set of branches can be converted.
3221
3222      2. patterns where only a subset can be converted.
3223
3224     In case 1 we can replace the whole set with a single regop
3225     for the trie. In case 2 we need to keep the start and end
3226     branches so
3227
3228      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3229      becomes BRANCH TRIE; BRANCH X;
3230
3231     There is an additional case, that being where there is a
3232     common prefix, which gets split out into an EXACT like node
3233     preceding the TRIE node.
3234
3235     If x(1..n)==tail then we can do a simple trie, if not we make
3236     a "jump" trie, such that when we match the appropriate word
3237     we "jump" to the appropriate tail node. Essentially we turn
3238     a nested if into a case structure of sorts.
3239
3240     */
3241
3242      int made=0;
3243      if (!re_trie_maxbuff) {
3244       re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3245       if (!SvIOK(re_trie_maxbuff))
3246        sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3247      }
3248      if ( SvIV(re_trie_maxbuff)>=0  ) {
3249       regnode *cur;
3250       regnode *first = (regnode *)NULL;
3251       regnode *last = (regnode *)NULL;
3252       regnode *tail = scan;
3253       U8 trietype = 0;
3254       U32 count=0;
3255
3256 #ifdef DEBUGGING
3257       SV * const mysv = sv_newmortal();       /* for dumping */
3258 #endif
3259       /* var tail is used because there may be a TAIL
3260       regop in the way. Ie, the exacts will point to the
3261       thing following the TAIL, but the last branch will
3262       point at the TAIL. So we advance tail. If we
3263       have nested (?:) we may have to move through several
3264       tails.
3265       */
3266
3267       while ( OP( tail ) == TAIL ) {
3268        /* this is the TAIL generated by (?:) */
3269        tail = regnext( tail );
3270       }
3271
3272
3273       DEBUG_TRIE_COMPILE_r({
3274        regprop(RExC_rx, mysv, tail );
3275        PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3276         (int)depth * 2 + 2, "",
3277         "Looking for TRIE'able sequences. Tail node is: ",
3278         SvPV_nolen_const( mysv )
3279        );
3280       });
3281
3282       /*
3283
3284        Step through the branches
3285         cur represents each branch,
3286         noper is the first thing to be matched as part of that branch
3287         noper_next is the regnext() of that node.
3288
3289        We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3290        via a "jump trie" but we also support building with NOJUMPTRIE,
3291        which restricts the trie logic to structures like /FOO|BAR/.
3292
3293        If noper is a trieable nodetype then the branch is a possible optimization
3294        target. If we are building under NOJUMPTRIE then we require that noper_next
3295        is the same as scan (our current position in the regex program).
3296
3297        Once we have two or more consecutive such branches we can create a
3298        trie of the EXACT's contents and stitch it in place into the program.
3299
3300        If the sequence represents all of the branches in the alternation we
3301        replace the entire thing with a single TRIE node.
3302
3303        Otherwise when it is a subsequence we need to stitch it in place and
3304        replace only the relevant branches. This means the first branch has
3305        to remain as it is used by the alternation logic, and its next pointer,
3306        and needs to be repointed at the item on the branch chain following
3307        the last branch we have optimized away.
3308
3309        This could be either a BRANCH, in which case the subsequence is internal,
3310        or it could be the item following the branch sequence in which case the
3311        subsequence is at the end (which does not necessarily mean the first node
3312        is the start of the alternation).
3313
3314        TRIE_TYPE(X) is a define which maps the optype to a trietype.
3315
3316         optype          |  trietype
3317         ----------------+-----------
3318         NOTHING         | NOTHING
3319         EXACT           | EXACT
3320         EXACTFU         | EXACTFU
3321         EXACTFU_SS      | EXACTFU
3322         EXACTFU_TRICKYFOLD | EXACTFU
3323         EXACTFA         | 0
3324
3325
3326       */
3327 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3328      ( EXACT == (X) )   ? EXACT :        \
3329      ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3330      0 )
3331
3332       /* dont use tail as the end marker for this traverse */
3333       for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3334        regnode * const noper = NEXTOPER( cur );
3335        U8 noper_type = OP( noper );
3336        U8 noper_trietype = TRIE_TYPE( noper_type );
3337 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3338        regnode * const noper_next = regnext( noper );
3339        U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3340        U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3341 #endif
3342
3343        DEBUG_TRIE_COMPILE_r({
3344         regprop(RExC_rx, mysv, cur);
3345         PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3346         (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3347
3348         regprop(RExC_rx, mysv, noper);
3349         PerlIO_printf( Perl_debug_log, " -> %s",
3350          SvPV_nolen_const(mysv));
3351
3352         if ( noper_next ) {
3353         regprop(RExC_rx, mysv, noper_next );
3354         PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3355          SvPV_nolen_const(mysv));
3356         }
3357         PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3358         REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3359         PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3360         );
3361        });
3362
3363        /* Is noper a trieable nodetype that can be merged with the
3364        * current trie (if there is one)? */
3365        if ( noper_trietype
3366         &&
3367         (
3368           ( noper_trietype == NOTHING)
3369           || ( trietype == NOTHING )
3370           || ( trietype == noper_trietype )
3371         )
3372 #ifdef NOJUMPTRIE
3373         && noper_next == tail
3374 #endif
3375         && count < U16_MAX)
3376        {
3377         /* Handle mergable triable node
3378         * Either we are the first node in a new trieable sequence,
3379         * in which case we do some bookkeeping, otherwise we update
3380         * the end pointer. */
3381         if ( !first ) {
3382          first = cur;
3383          if ( noper_trietype == NOTHING ) {
3384 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3385           regnode * const noper_next = regnext( noper );
3386           U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3387           U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3388 #endif
3389
3390           if ( noper_next_trietype ) {
3391            trietype = noper_next_trietype;
3392           } else if (noper_next_type)  {
3393            /* a NOTHING regop is 1 regop wide. We need at least two
3394            * for a trie so we can't merge this in */
3395            first = NULL;
3396           }
3397          } else {
3398           trietype = noper_trietype;
3399          }
3400         } else {
3401          if ( trietype == NOTHING )
3402           trietype = noper_trietype;
3403          last = cur;
3404         }
3405         if (first)
3406          count++;
3407        } /* end handle mergable triable node */
3408        else {
3409         /* handle unmergable node -
3410         * noper may either be a triable node which can not be tried
3411         * together with the current trie, or a non triable node */
3412         if ( last ) {
3413          /* If last is set and trietype is not NOTHING then we have found
3414          * at least two triable branch sequences in a row of a similar
3415          * trietype so we can turn them into a trie. If/when we
3416          * allow NOTHING to start a trie sequence this condition will be
3417          * required, and it isn't expensive so we leave it in for now. */
3418          if ( trietype && trietype != NOTHING )
3419           make_trie( pRExC_state,
3420             startbranch, first, cur, tail, count,
3421             trietype, depth+1 );
3422          last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3423         }
3424         if ( noper_trietype
3425 #ifdef NOJUMPTRIE
3426          && noper_next == tail
3427 #endif
3428         ){
3429          /* noper is triable, so we can start a new trie sequence */
3430          count = 1;
3431          first = cur;
3432          trietype = noper_trietype;
3433         } else if (first) {
3434          /* if we already saw a first but the current node is not triable then we have
3435          * to reset the first information. */
3436          count = 0;
3437          first = NULL;
3438          trietype = 0;
3439         }
3440        } /* end handle unmergable node */
3441       } /* loop over branches */
3442       DEBUG_TRIE_COMPILE_r({
3443        regprop(RExC_rx, mysv, cur);
3444        PerlIO_printf( Perl_debug_log,
3445        "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3446        "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3447
3448       });
3449       if ( last && trietype ) {
3450        if ( trietype != NOTHING ) {
3451         /* the last branch of the sequence was part of a trie,
3452         * so we have to construct it here outside of the loop
3453         */
3454         made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3455 #ifdef TRIE_STUDY_OPT
3456         if ( ((made == MADE_EXACT_TRIE &&
3457          startbranch == first)
3458          || ( first_non_open == first )) &&
3459          depth==0 ) {
3460          flags |= SCF_TRIE_RESTUDY;
3461          if ( startbranch == first
3462           && scan == tail )
3463          {
3464           RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3465          }
3466         }
3467 #endif
3468        } else {
3469         /* at this point we know whatever we have is a NOTHING sequence/branch
3470         * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3471         */
3472         if ( startbranch == first ) {
3473          regnode *opt;
3474          /* the entire thing is a NOTHING sequence, something like this:
3475          * (?:|) So we can turn it into a plain NOTHING op. */
3476          DEBUG_TRIE_COMPILE_r({
3477           regprop(RExC_rx, mysv, cur);
3478           PerlIO_printf( Perl_debug_log,
3479           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3480           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3481
3482          });
3483          OP(startbranch)= NOTHING;
3484          NEXT_OFF(startbranch)= tail - startbranch;
3485          for ( opt= startbranch + 1; opt < tail ; opt++ )
3486           OP(opt)= OPTIMIZED;
3487         }
3488        }
3489       } /* end if ( last) */
3490      } /* TRIE_MAXBUF is non zero */
3491
3492     } /* do trie */
3493
3494    }
3495    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3496     scan = NEXTOPER(NEXTOPER(scan));
3497    } else   /* single branch is optimized. */
3498     scan = NEXTOPER(scan);
3499    continue;
3500   } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3501    scan_frame *newframe = NULL;
3502    I32 paren;
3503    regnode *start;
3504    regnode *end;
3505
3506    if (OP(scan) != SUSPEND) {
3507    /* set the pointer */
3508     if (OP(scan) == GOSUB) {
3509      paren = ARG(scan);
3510      RExC_recurse[ARG2L(scan)] = scan;
3511      start = RExC_open_parens[paren-1];
3512      end   = RExC_close_parens[paren-1];
3513     } else {
3514      paren = 0;
3515      start = RExC_rxi->program + 1;
3516      end   = RExC_opend;
3517     }
3518     if (!recursed) {
3519      Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3520      SAVEFREEPV(recursed);
3521     }
3522     if (!PAREN_TEST(recursed,paren+1)) {
3523      PAREN_SET(recursed,paren+1);
3524      Newx(newframe,1,scan_frame);
3525     } else {
3526      if (flags & SCF_DO_SUBSTR) {
3527       SCAN_COMMIT(pRExC_state,data,minlenp);
3528       data->longest = &(data->longest_float);
3529      }
3530      is_inf = is_inf_internal = 1;
3531      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3532       cl_anything(pRExC_state, data->start_class);
3533      flags &= ~SCF_DO_STCLASS;
3534     }
3535    } else {
3536     Newx(newframe,1,scan_frame);
3537     paren = stopparen;
3538     start = scan+2;
3539     end = regnext(scan);
3540    }
3541    if (newframe) {
3542     assert(start);
3543     assert(end);
3544     SAVEFREEPV(newframe);
3545     newframe->next = regnext(scan);
3546     newframe->last = last;
3547     newframe->stop = stopparen;
3548     newframe->prev = frame;
3549
3550     frame = newframe;
3551     scan =  start;
3552     stopparen = paren;
3553     last = end;
3554
3555     continue;
3556    }
3557   }
3558   else if (OP(scan) == EXACT) {
3559    I32 l = STR_LEN(scan);
3560    UV uc;
3561    if (UTF) {
3562     const U8 * const s = (U8*)STRING(scan);
3563     uc = utf8_to_uvchr_buf(s, s + l, NULL);
3564     l = utf8_length(s, s + l);
3565    } else {
3566     uc = *((U8*)STRING(scan));
3567    }
3568    min += l;
3569    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3570     /* The code below prefers earlier match for fixed
3571     offset, later match for variable offset.  */
3572     if (data->last_end == -1) { /* Update the start info. */
3573      data->last_start_min = data->pos_min;
3574      data->last_start_max = is_inf
3575       ? I32_MAX : data->pos_min + data->pos_delta;
3576     }
3577     sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3578     if (UTF)
3579      SvUTF8_on(data->last_found);
3580     {
3581      SV * const sv = data->last_found;
3582      MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3583       mg_find(sv, PERL_MAGIC_utf8) : NULL;
3584      if (mg && mg->mg_len >= 0)
3585       mg->mg_len += utf8_length((U8*)STRING(scan),
3586             (U8*)STRING(scan)+STR_LEN(scan));
3587     }
3588     data->last_end = data->pos_min + l;
3589     data->pos_min += l; /* As in the first entry. */
3590     data->flags &= ~SF_BEFORE_EOL;
3591    }
3592    if (flags & SCF_DO_STCLASS_AND) {
3593     /* Check whether it is compatible with what we know already! */
3594     int compat = 1;
3595
3596
3597     /* If compatible, we or it in below.  It is compatible if is
3598     * in the bitmp and either 1) its bit or its fold is set, or 2)
3599     * it's for a locale.  Even if there isn't unicode semantics
3600     * here, at runtime there may be because of matching against a
3601     * utf8 string, so accept a possible false positive for
3602     * latin1-range folds */
3603     if (uc >= 0x100 ||
3604      (!(data->start_class->flags & ANYOF_LOCALE)
3605      && !ANYOF_BITMAP_TEST(data->start_class, uc)
3606      && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3607       || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3608      )
3609     {
3610      compat = 0;
3611     }
3612     ANYOF_CLASS_ZERO(data->start_class);
3613     ANYOF_BITMAP_ZERO(data->start_class);
3614     if (compat)
3615      ANYOF_BITMAP_SET(data->start_class, uc);
3616     else if (uc >= 0x100) {
3617      int i;
3618
3619      /* Some Unicode code points fold to the Latin1 range; as
3620      * XXX temporary code, instead of figuring out if this is
3621      * one, just assume it is and set all the start class bits
3622      * that could be some such above 255 code point's fold
3623      * which will generate fals positives.  As the code
3624      * elsewhere that does compute the fold settles down, it
3625      * can be extracted out and re-used here */
3626      for (i = 0; i < 256; i++){
3627       if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3628        ANYOF_BITMAP_SET(data->start_class, i);
3629       }
3630      }
3631     }
3632     CLEAR_SSC_EOS(data->start_class);
3633     if (uc < 0x100)
3634     data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3635    }
3636    else if (flags & SCF_DO_STCLASS_OR) {
3637     /* false positive possible if the class is case-folded */
3638     if (uc < 0x100)
3639      ANYOF_BITMAP_SET(data->start_class, uc);
3640     else
3641      data->start_class->flags |= ANYOF_UNICODE_ALL;
3642     CLEAR_SSC_EOS(data->start_class);
3643     cl_and(data->start_class, and_withp);
3644    }
3645    flags &= ~SCF_DO_STCLASS;
3646   }
3647   else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3648    I32 l = STR_LEN(scan);
3649    UV uc = *((U8*)STRING(scan));
3650
3651    /* Search for fixed substrings supports EXACT only. */
3652    if (flags & SCF_DO_SUBSTR) {
3653     assert(data);
3654     SCAN_COMMIT(pRExC_state, data, minlenp);
3655    }
3656    if (UTF) {
3657     const U8 * const s = (U8 *)STRING(scan);
3658     uc = utf8_to_uvchr_buf(s, s + l, NULL);
3659     l = utf8_length(s, s + l);
3660    }
3661    if (has_exactf_sharp_s) {
3662     RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3663    }
3664    min += l - min_subtract;
3665    assert (min >= 0);
3666    delta += min_subtract;
3667    if (flags & SCF_DO_SUBSTR) {
3668     data->pos_min += l - min_subtract;
3669     if (data->pos_min < 0) {
3670      data->pos_min = 0;
3671     }
3672     data->pos_delta += min_subtract;
3673     if (min_subtract) {
3674      data->longest = &(data->longest_float);
3675     }
3676    }
3677    if (flags & SCF_DO_STCLASS_AND) {
3678     /* Check whether it is compatible with what we know already! */
3679     int compat = 1;
3680     if (uc >= 0x100 ||
3681     (!(data->start_class->flags & ANYOF_LOCALE)
3682     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3683     && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3684     {
3685      compat = 0;
3686     }
3687     ANYOF_CLASS_ZERO(data->start_class);
3688     ANYOF_BITMAP_ZERO(data->start_class);
3689     if (compat) {
3690      ANYOF_BITMAP_SET(data->start_class, uc);
3691      CLEAR_SSC_EOS(data->start_class);
3692      if (OP(scan) == EXACTFL) {
3693       /* XXX This set is probably no longer necessary, and
3694       * probably wrong as LOCALE now is on in the initial
3695       * state */
3696       data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3697      }
3698      else {
3699
3700       /* Also set the other member of the fold pair.  In case
3701       * that unicode semantics is called for at runtime, use
3702       * the full latin1 fold.  (Can't do this for locale,
3703       * because not known until runtime) */
3704       ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3705
3706       /* All other (EXACTFL handled above) folds except under
3707       * /iaa that include s, S, and sharp_s also may include
3708       * the others */
3709       if (OP(scan) != EXACTFA) {
3710        if (uc == 's' || uc == 'S') {
3711         ANYOF_BITMAP_SET(data->start_class,
3712             LATIN_SMALL_LETTER_SHARP_S);
3713        }
3714        else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3715         ANYOF_BITMAP_SET(data->start_class, 's');
3716         ANYOF_BITMAP_SET(data->start_class, 'S');
3717        }
3718       }
3719      }
3720     }
3721     else if (uc >= 0x100) {
3722      int i;
3723      for (i = 0; i < 256; i++){
3724       if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3725        ANYOF_BITMAP_SET(data->start_class, i);
3726       }
3727      }
3728     }
3729    }
3730    else if (flags & SCF_DO_STCLASS_OR) {
3731     if (data->start_class->flags & ANYOF_LOC_FOLD) {
3732      /* false positive possible if the class is case-folded.
3733      Assume that the locale settings are the same... */
3734      if (uc < 0x100) {
3735       ANYOF_BITMAP_SET(data->start_class, uc);
3736       if (OP(scan) != EXACTFL) {
3737
3738        /* And set the other member of the fold pair, but
3739        * can't do that in locale because not known until
3740        * run-time */
3741        ANYOF_BITMAP_SET(data->start_class,
3742            PL_fold_latin1[uc]);
3743
3744        /* All folds except under /iaa that include s, S,
3745        * and sharp_s also may include the others */
3746        if (OP(scan) != EXACTFA) {
3747         if (uc == 's' || uc == 'S') {
3748          ANYOF_BITMAP_SET(data->start_class,
3749             LATIN_SMALL_LETTER_SHARP_S);
3750         }
3751         else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3752          ANYOF_BITMAP_SET(data->start_class, 's');
3753          ANYOF_BITMAP_SET(data->start_class, 'S');
3754         }
3755        }
3756       }
3757      }
3758      CLEAR_SSC_EOS(data->start_class);
3759     }
3760     cl_and(data->start_class, and_withp);
3761    }
3762    flags &= ~SCF_DO_STCLASS;
3763   }
3764   else if (REGNODE_VARIES(OP(scan))) {
3765    I32 mincount, maxcount, minnext, deltanext, fl = 0;
3766    I32 f = flags, pos_before = 0;
3767    regnode * const oscan = scan;
3768    struct regnode_charclass_class this_class;
3769    struct regnode_charclass_class *oclass = NULL;
3770    I32 next_is_eval = 0;
3771
3772    switch (PL_regkind[OP(scan)]) {
3773    case WHILEM:  /* End of (?:...)* . */
3774     scan = NEXTOPER(scan);
3775     goto finish;
3776    case PLUS:
3777     if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3778      next = NEXTOPER(scan);
3779      if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3780       mincount = 1;
3781       maxcount = REG_INFTY;
3782       next = regnext(scan);
3783       scan = NEXTOPER(scan);
3784       goto do_curly;
3785      }
3786     }
3787     if (flags & SCF_DO_SUBSTR)
3788      data->pos_min++;
3789     min++;
3790     /* Fall through. */
3791    case STAR:
3792     if (flags & SCF_DO_STCLASS) {
3793      mincount = 0;
3794      maxcount = REG_INFTY;
3795      next = regnext(scan);
3796      scan = NEXTOPER(scan);
3797      goto do_curly;
3798     }
3799     is_inf = is_inf_internal = 1;
3800     scan = regnext(scan);
3801     if (flags & SCF_DO_SUBSTR) {
3802      SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3803      data->longest = &(data->longest_float);
3804     }
3805     goto optimize_curly_tail;
3806    case CURLY:
3807     if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3808      && (scan->flags == stopparen))
3809     {
3810      mincount = 1;
3811      maxcount = 1;
3812     } else {
3813      mincount = ARG1(scan);
3814      maxcount = ARG2(scan);
3815     }
3816     next = regnext(scan);
3817     if (OP(scan) == CURLYX) {
3818      I32 lp = (data ? *(data->last_closep) : 0);
3819      scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3820     }
3821     scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3822     next_is_eval = (OP(scan) == EVAL);
3823    do_curly:
3824     if (flags & SCF_DO_SUBSTR) {
3825      if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3826      pos_before = data->pos_min;
3827     }
3828     if (data) {
3829      fl = data->flags;
3830      data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3831      if (is_inf)
3832       data->flags |= SF_IS_INF;
3833     }
3834     if (flags & SCF_DO_STCLASS) {
3835      cl_init(pRExC_state, &this_class);
3836      oclass = data->start_class;
3837      data->start_class = &this_class;
3838      f |= SCF_DO_STCLASS_AND;
3839      f &= ~SCF_DO_STCLASS_OR;
3840     }
3841     /* Exclude from super-linear cache processing any {n,m}
3842     regops for which the combination of input pos and regex
3843     pos is not enough information to determine if a match
3844     will be possible.
3845
3846     For example, in the regex /foo(bar\s*){4,8}baz/ with the
3847     regex pos at the \s*, the prospects for a match depend not
3848     only on the input position but also on how many (bar\s*)
3849     repeats into the {4,8} we are. */
3850    if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3851      f &= ~SCF_WHILEM_VISITED_POS;
3852
3853     /* This will finish on WHILEM, setting scan, or on NULL: */
3854     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3855          last, data, stopparen, recursed, NULL,
3856          (mincount == 0
3857           ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3858
3859     if (flags & SCF_DO_STCLASS)
3860      data->start_class = oclass;
3861     if (mincount == 0 || minnext == 0) {
3862      if (flags & SCF_DO_STCLASS_OR) {
3863       cl_or(pRExC_state, data->start_class, &this_class);
3864      }
3865      else if (flags & SCF_DO_STCLASS_AND) {
3866       /* Switch to OR mode: cache the old value of
3867       * data->start_class */
3868       INIT_AND_WITHP;
3869       StructCopy(data->start_class, and_withp,
3870         struct regnode_charclass_class);
3871       flags &= ~SCF_DO_STCLASS_AND;
3872       StructCopy(&this_class, data->start_class,
3873         struct regnode_charclass_class);
3874       flags |= SCF_DO_STCLASS_OR;
3875       SET_SSC_EOS(data->start_class);
3876      }
3877     } else {  /* Non-zero len */
3878      if (flags & SCF_DO_STCLASS_OR) {
3879       cl_or(pRExC_state, data->start_class, &this_class);
3880       cl_and(data->start_class, and_withp);
3881      }
3882      else if (flags & SCF_DO_STCLASS_AND)
3883       cl_and(data->start_class, &this_class);
3884      flags &= ~SCF_DO_STCLASS;
3885     }
3886     if (!scan)   /* It was not CURLYX, but CURLY. */
3887      scan = next;
3888     if ( /* ? quantifier ok, except for (?{ ... }) */
3889      (next_is_eval || !(mincount == 0 && maxcount == 1))
3890      && (minnext == 0) && (deltanext == 0)
3891      && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3892      && maxcount <= REG_INFTY/3) /* Complement check for big count */
3893     {
3894      /* Fatal warnings may leak the regexp without this: */
3895      SAVEFREESV(RExC_rx_sv);
3896      ckWARNreg(RExC_parse,
3897        "Quantifier unexpected on zero-length expression");
3898      (void)ReREFCNT_inc(RExC_rx_sv);
3899     }
3900
3901     min += minnext * mincount;
3902     is_inf_internal |= deltanext == I32_MAX
3903          || (maxcount == REG_INFTY && minnext + deltanext > 0);
3904     is_inf |= is_inf_internal;
3905     if (is_inf)
3906      delta = I32_MAX;
3907     else
3908      delta += (minnext + deltanext) * maxcount - minnext * mincount;
3909
3910     /* Try powerful optimization CURLYX => CURLYN. */
3911     if (  OP(oscan) == CURLYX && data
3912      && data->flags & SF_IN_PAR
3913      && !(data->flags & SF_HAS_EVAL)
3914      && !deltanext && minnext == 1 ) {
3915      /* Try to optimize to CURLYN.  */
3916      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3917      regnode * const nxt1 = nxt;
3918 #ifdef DEBUGGING
3919      regnode *nxt2;
3920 #endif
3921
3922      /* Skip open. */
3923      nxt = regnext(nxt);
3924      if (!REGNODE_SIMPLE(OP(nxt))
3925       && !(PL_regkind[OP(nxt)] == EXACT
3926        && STR_LEN(nxt) == 1))
3927       goto nogo;
3928 #ifdef DEBUGGING
3929      nxt2 = nxt;
3930 #endif
3931      nxt = regnext(nxt);
3932      if (OP(nxt) != CLOSE)
3933       goto nogo;
3934      if (RExC_open_parens) {
3935       RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3936       RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3937      }
3938      /* Now we know that nxt2 is the only contents: */
3939      oscan->flags = (U8)ARG(nxt);
3940      OP(oscan) = CURLYN;
3941      OP(nxt1) = NOTHING; /* was OPEN. */
3942
3943 #ifdef DEBUGGING
3944      OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3945      NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3946      NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3947      OP(nxt) = OPTIMIZED; /* was CLOSE. */
3948      OP(nxt + 1) = OPTIMIZED; /* was count. */
3949      NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3950 #endif
3951     }
3952    nogo:
3953
3954     /* Try optimization CURLYX => CURLYM. */
3955     if (  OP(oscan) == CURLYX && data
3956      && !(data->flags & SF_HAS_PAR)
3957      && !(data->flags & SF_HAS_EVAL)
3958      && !deltanext /* atom is fixed width */
3959      && minnext != 0 /* CURLYM can't handle zero width */
3960      && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3961     ) {
3962      /* XXXX How to optimize if data == 0? */
3963      /* Optimize to a simpler form.  */
3964      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3965      regnode *nxt2;
3966
3967      OP(oscan) = CURLYM;
3968      while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3969        && (OP(nxt2) != WHILEM))
3970       nxt = nxt2;
3971      OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3972      /* Need to optimize away parenths. */
3973      if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3974       /* Set the parenth number.  */
3975       regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3976
3977       oscan->flags = (U8)ARG(nxt);
3978       if (RExC_open_parens) {
3979        RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3980        RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3981       }
3982       OP(nxt1) = OPTIMIZED; /* was OPEN. */
3983       OP(nxt) = OPTIMIZED; /* was CLOSE. */
3984
3985 #ifdef DEBUGGING
3986       OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3987       OP(nxt + 1) = OPTIMIZED; /* was count. */
3988       NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3989       NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3990 #endif
3991 #if 0
3992       while ( nxt1 && (OP(nxt1) != WHILEM)) {
3993        regnode *nnxt = regnext(nxt1);
3994        if (nnxt == nxt) {
3995         if (reg_off_by_arg[OP(nxt1)])
3996          ARG_SET(nxt1, nxt2 - nxt1);
3997         else if (nxt2 - nxt1 < U16_MAX)
3998          NEXT_OFF(nxt1) = nxt2 - nxt1;
3999         else
4000          OP(nxt) = NOTHING; /* Cannot beautify */
4001        }
4002        nxt1 = nnxt;
4003       }
4004 #endif
4005       /* Optimize again: */
4006       study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4007          NULL, stopparen, recursed, NULL, 0,depth+1);
4008      }
4009      else
4010       oscan->flags = 0;
4011     }
4012     else if ((OP(oscan) == CURLYX)
4013       && (flags & SCF_WHILEM_VISITED_POS)
4014       /* See the comment on a similar expression above.
4015        However, this time it's not a subexpression
4016        we care about, but the expression itself. */
4017       && (maxcount == REG_INFTY)
4018       && data && ++data->whilem_c < 16) {
4019      /* This stays as CURLYX, we can put the count/of pair. */
4020      /* Find WHILEM (as in regexec.c) */
4021      regnode *nxt = oscan + NEXT_OFF(oscan);
4022
4023      if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4024       nxt += ARG(nxt);
4025      PREVOPER(nxt)->flags = (U8)(data->whilem_c
4026       | (RExC_whilem_seen << 4)); /* On WHILEM */
4027     }
4028     if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4029      pars++;
4030     if (flags & SCF_DO_SUBSTR) {
4031      SV *last_str = NULL;
4032      int counted = mincount != 0;
4033
4034      if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4035 #if defined(SPARC64_GCC_WORKAROUND)
4036       I32 b = 0;
4037       STRLEN l = 0;
4038       const char *s = NULL;
4039       I32 old = 0;
4040
4041       if (pos_before >= data->last_start_min)
4042        b = pos_before;
4043       else
4044        b = data->last_start_min;
4045
4046       l = 0;
4047       s = SvPV_const(data->last_found, l);
4048       old = b - data->last_start_min;
4049
4050 #else
4051       I32 b = pos_before >= data->last_start_min
4052        ? pos_before : data->last_start_min;
4053       STRLEN l;
4054       const char * const s = SvPV_const(data->last_found, l);
4055       I32 old = b - data->last_start_min;
4056 #endif
4057
4058       if (UTF)
4059        old = utf8_hop((U8*)s, old) - (U8*)s;
4060       l -= old;
4061       /* Get the added string: */
4062       last_str = newSVpvn_utf8(s  + old, l, UTF);
4063       if (deltanext == 0 && pos_before == b) {
4064        /* What was added is a constant string */
4065        if (mincount > 1) {
4066         SvGROW(last_str, (mincount * l) + 1);
4067         repeatcpy(SvPVX(last_str) + l,
4068           SvPVX_const(last_str), l, mincount - 1);
4069         SvCUR_set(last_str, SvCUR(last_str) * mincount);
4070         /* Add additional parts. */
4071         SvCUR_set(data->last_found,
4072           SvCUR(data->last_found) - l);
4073         sv_catsv(data->last_found, last_str);
4074         {
4075          SV * sv = data->last_found;
4076          MAGIC *mg =
4077           SvUTF8(sv) && SvMAGICAL(sv) ?
4078           mg_find(sv, PERL_MAGIC_utf8) : NULL;
4079          if (mg && mg->mg_len >= 0)
4080           mg->mg_len += CHR_SVLEN(last_str) - l;
4081         }
4082         data->last_end += l * (mincount - 1);
4083        }
4084       } else {
4085        /* start offset must point into the last copy */
4086        data->last_start_min += minnext * (mincount - 1);
4087        data->last_start_max += is_inf ? I32_MAX
4088         : (maxcount - 1) * (minnext + data->pos_delta);
4089       }
4090      }
4091      /* It is counted once already... */
4092      data->pos_min += minnext * (mincount - counted);
4093 #if 0
4094 PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n",
4095  counted, deltanext, I32_MAX, minnext, maxcount, mincount);
4096 if (deltanext != I32_MAX)
4097 PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta);
4098 #endif
4099      if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta)
4100       data->pos_delta = I32_MAX;
4101      else
4102       data->pos_delta += - counted * deltanext +
4103       (minnext + deltanext) * maxcount - minnext * mincount;
4104      if (mincount != maxcount) {
4105       /* Cannot extend fixed substrings found inside
4106        the group.  */
4107       SCAN_COMMIT(pRExC_state,data,minlenp);
4108       if (mincount && last_str) {
4109        SV * const sv = data->last_found;
4110        MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4111         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4112
4113        if (mg)
4114         mg->mg_len = -1;
4115        sv_setsv(sv, last_str);
4116        data->last_end = data->pos_min;
4117        data->last_start_min =
4118         data->pos_min - CHR_SVLEN(last_str);
4119        data->last_start_max = is_inf
4120         ? I32_MAX
4121         : data->pos_min + data->pos_delta
4122         - CHR_SVLEN(last_str);
4123       }
4124       data->longest = &(data->longest_float);
4125      }
4126      SvREFCNT_dec(last_str);
4127     }
4128     if (data && (fl & SF_HAS_EVAL))
4129      data->flags |= SF_HAS_EVAL;
4130    optimize_curly_tail:
4131     if (OP(oscan) != CURLYX) {
4132      while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4133       && NEXT_OFF(next))
4134       NEXT_OFF(oscan) += NEXT_OFF(next);
4135     }
4136     continue;
4137    default:   /* REF, and CLUMP only? */
4138     if (flags & SCF_DO_SUBSTR) {
4139      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4140      data->longest = &(data->longest_float);
4141     }
4142     is_inf = is_inf_internal = 1;
4143     if (flags & SCF_DO_STCLASS_OR)
4144      cl_anything(pRExC_state, data->start_class);
4145     flags &= ~SCF_DO_STCLASS;
4146     break;
4147    }
4148   }
4149   else if (OP(scan) == LNBREAK) {
4150    if (flags & SCF_DO_STCLASS) {
4151     int value = 0;
4152     CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4153      if (flags & SCF_DO_STCLASS_AND) {
4154      for (value = 0; value < 256; value++)
4155       if (!is_VERTWS_cp(value))
4156        ANYOF_BITMAP_CLEAR(data->start_class, value);
4157     }
4158     else {
4159      for (value = 0; value < 256; value++)
4160       if (is_VERTWS_cp(value))
4161        ANYOF_BITMAP_SET(data->start_class, value);
4162     }
4163     if (flags & SCF_DO_STCLASS_OR)
4164      cl_and(data->start_class, and_withp);
4165     flags &= ~SCF_DO_STCLASS;
4166    }
4167    min++;
4168    delta++;    /* Because of the 2 char string cr-lf */
4169    if (flags & SCF_DO_SUBSTR) {
4170      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4171      data->pos_min += 1;
4172     data->pos_delta += 1;
4173     data->longest = &(data->longest_float);
4174     }
4175   }
4176   else if (REGNODE_SIMPLE(OP(scan))) {
4177    int value = 0;
4178
4179    if (flags & SCF_DO_SUBSTR) {
4180     SCAN_COMMIT(pRExC_state,data,minlenp);
4181     data->pos_min++;
4182    }
4183    min++;
4184    if (flags & SCF_DO_STCLASS) {
4185     int loop_max = 256;
4186     CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4187
4188     /* Some of the logic below assumes that switching
4189     locale on will only add false positives. */
4190     switch (PL_regkind[OP(scan)]) {
4191      U8 classnum;
4192
4193     case SANY:
4194     default:
4195 #ifdef DEBUGGING
4196     Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4197 #endif
4198     do_default:
4199      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4200       cl_anything(pRExC_state, data->start_class);
4201      break;
4202     case REG_ANY:
4203      if (OP(scan) == SANY)
4204       goto do_default;
4205      if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4206       value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4207         || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4208       cl_anything(pRExC_state, data->start_class);
4209      }
4210      if (flags & SCF_DO_STCLASS_AND || !value)
4211       ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4212      break;
4213     case ANYOF:
4214      if (flags & SCF_DO_STCLASS_AND)
4215       cl_and(data->start_class,
4216        (struct regnode_charclass_class*)scan);
4217      else
4218       cl_or(pRExC_state, data->start_class,
4219        (struct regnode_charclass_class*)scan);
4220      break;
4221     case POSIXA:
4222      loop_max = 128;
4223      /* FALL THROUGH */
4224     case POSIXL:
4225     case POSIXD:
4226     case POSIXU:
4227      classnum = FLAGS(scan);
4228      if (flags & SCF_DO_STCLASS_AND) {
4229       if (!(data->start_class->flags & ANYOF_LOCALE)) {
4230        ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4231        for (value = 0; value < loop_max; value++) {
4232         if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4233          ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4234         }
4235        }
4236       }
4237      }
4238      else {
4239       if (data->start_class->flags & ANYOF_LOCALE) {
4240        ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4241       }
4242       else {
4243
4244       /* Even if under locale, set the bits for non-locale
4245       * in case it isn't a true locale-node.  This will
4246       * create false positives if it truly is locale */
4247       for (value = 0; value < loop_max; value++) {
4248        if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4249         ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4250        }
4251       }
4252       }
4253      }
4254      break;
4255     case NPOSIXA:
4256      loop_max = 128;
4257      /* FALL THROUGH */
4258     case NPOSIXL:
4259     case NPOSIXU:
4260     case NPOSIXD:
4261      classnum = FLAGS(scan);
4262      if (flags & SCF_DO_STCLASS_AND) {
4263       if (!(data->start_class->flags & ANYOF_LOCALE)) {
4264        ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4265        for (value = 0; value < loop_max; value++) {
4266         if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4267          ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4268         }
4269        }
4270       }
4271      }
4272      else {
4273       if (data->start_class->flags & ANYOF_LOCALE) {
4274        ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4275       }
4276       else {
4277
4278       /* Even if under locale, set the bits for non-locale in
4279       * case it isn't a true locale-node.  This will create
4280       * false positives if it truly is locale */
4281       for (value = 0; value < loop_max; value++) {
4282        if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4283         ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4284        }
4285       }
4286       if (PL_regkind[OP(scan)] == NPOSIXD) {
4287        data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4288       }
4289       }
4290      }
4291      break;
4292     }
4293     if (flags & SCF_DO_STCLASS_OR)
4294      cl_and(data->start_class, and_withp);
4295     flags &= ~SCF_DO_STCLASS;
4296    }
4297   }
4298   else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4299    data->flags |= (OP(scan) == MEOL
4300        ? SF_BEFORE_MEOL
4301        : SF_BEFORE_SEOL);
4302    SCAN_COMMIT(pRExC_state, data, minlenp);
4303
4304   }
4305   else if (  PL_regkind[OP(scan)] == BRANCHJ
4306     /* Lookbehind, or need to calculate parens/evals/stclass: */
4307     && (scan->flags || data || (flags & SCF_DO_STCLASS))
4308     && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4309    if ( OP(scan) == UNLESSM &&
4310     scan->flags == 0 &&
4311     OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4312     OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4313    ) {
4314     regnode *opt;
4315     regnode *upto= regnext(scan);
4316     DEBUG_PARSE_r({
4317      SV * const mysv_val=sv_newmortal();
4318      DEBUG_STUDYDATA("OPFAIL",data,depth);
4319
4320      /*DEBUG_PARSE_MSG("opfail");*/
4321      regprop(RExC_rx, mysv_val, upto);
4322      PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4323         SvPV_nolen_const(mysv_val),
4324         (IV)REG_NODE_NUM(upto),
4325         (IV)(upto - scan)
4326      );
4327     });
4328     OP(scan) = OPFAIL;
4329     NEXT_OFF(scan) = upto - scan;
4330     for (opt= scan + 1; opt < upto ; opt++)
4331      OP(opt) = OPTIMIZED;
4332     scan= upto;
4333     continue;
4334    }
4335    if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4336     || OP(scan) == UNLESSM )
4337    {
4338     /* Negative Lookahead/lookbehind
4339     In this case we can't do fixed string optimisation.
4340     */
4341
4342     I32 deltanext, minnext, fake = 0;
4343     regnode *nscan;
4344     struct regnode_charclass_class intrnl;
4345     int f = 0;
4346
4347     data_fake.flags = 0;
4348     if (data) {
4349      data_fake.whilem_c = data->whilem_c;
4350      data_fake.last_closep = data->last_closep;
4351     }
4352     else
4353      data_fake.last_closep = &fake;
4354     data_fake.pos_delta = delta;
4355     if ( flags & SCF_DO_STCLASS && !scan->flags
4356      && OP(scan) == IFMATCH ) { /* Lookahead */
4357      cl_init(pRExC_state, &intrnl);
4358      data_fake.start_class = &intrnl;
4359      f |= SCF_DO_STCLASS_AND;
4360     }
4361     if (flags & SCF_WHILEM_VISITED_POS)
4362      f |= SCF_WHILEM_VISITED_POS;
4363     next = regnext(scan);
4364     nscan = NEXTOPER(NEXTOPER(scan));
4365     minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4366      last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4367     if (scan->flags) {
4368      if (deltanext) {
4369       FAIL("Variable length lookbehind not implemented");
4370      }
4371      else if (minnext > (I32)U8_MAX) {
4372       FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4373      }
4374      scan->flags = (U8)minnext;
4375     }
4376     if (data) {
4377      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4378       pars++;
4379      if (data_fake.flags & SF_HAS_EVAL)
4380       data->flags |= SF_HAS_EVAL;
4381      data->whilem_c = data_fake.whilem_c;
4382     }
4383     if (f & SCF_DO_STCLASS_AND) {
4384      if (flags & SCF_DO_STCLASS_OR) {
4385       /* OR before, AND after: ideally we would recurse with
4386       * data_fake to get the AND applied by study of the
4387       * remainder of the pattern, and then derecurse;
4388       * *** HACK *** for now just treat as "no information".
4389       * See [perl #56690].
4390       */
4391       cl_init(pRExC_state, data->start_class);
4392      }  else {
4393       /* AND before and after: combine and continue */
4394       const int was = TEST_SSC_EOS(data->start_class);
4395
4396       cl_and(data->start_class, &intrnl);
4397       if (was)
4398        SET_SSC_EOS(data->start_class);
4399      }
4400     }
4401    }
4402 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4403    else {
4404     /* Positive Lookahead/lookbehind
4405     In this case we can do fixed string optimisation,
4406     but we must be careful about it. Note in the case of
4407     lookbehind the positions will be offset by the minimum
4408     length of the pattern, something we won't know about
4409     until after the recurse.
4410     */
4411     I32 deltanext, fake = 0;
4412     regnode *nscan;
4413     struct regnode_charclass_class intrnl;
4414     int f = 0;
4415     /* We use SAVEFREEPV so that when the full compile
4416      is finished perl will clean up the allocated
4417      minlens when it's all done. This way we don't
4418      have to worry about freeing them when we know
4419      they wont be used, which would be a pain.
4420     */
4421     I32 *minnextp;
4422     Newx( minnextp, 1, I32 );
4423     SAVEFREEPV(minnextp);
4424
4425     if (data) {
4426      StructCopy(data, &data_fake, scan_data_t);
4427      if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4428       f |= SCF_DO_SUBSTR;
4429       if (scan->flags)
4430        SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4431       data_fake.last_found=newSVsv(data->last_found);
4432      }
4433     }
4434     else
4435      data_fake.last_closep = &fake;
4436     data_fake.flags = 0;
4437     data_fake.pos_delta = delta;
4438     if (is_inf)
4439      data_fake.flags |= SF_IS_INF;
4440     if ( flags & SCF_DO_STCLASS && !scan->flags
4441      && OP(scan) == IFMATCH ) { /* Lookahead */
4442      cl_init(pRExC_state, &intrnl);
4443      data_fake.start_class = &intrnl;
4444      f |= SCF_DO_STCLASS_AND;
4445     }
4446     if (flags & SCF_WHILEM_VISITED_POS)
4447      f |= SCF_WHILEM_VISITED_POS;
4448     next = regnext(scan);
4449     nscan = NEXTOPER(NEXTOPER(scan));
4450
4451     *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4452      last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4453     if (scan->flags) {
4454      if (deltanext) {
4455       FAIL("Variable length lookbehind not implemented");
4456      }
4457      else if (*minnextp > (I32)U8_MAX) {
4458       FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4459      }
4460      scan->flags = (U8)*minnextp;
4461     }
4462
4463     *minnextp += min;
4464
4465     if (f & SCF_DO_STCLASS_AND) {
4466      const int was = TEST_SSC_EOS(data.start_class);
4467
4468      cl_and(data->start_class, &intrnl);
4469      if (was)
4470       SET_SSC_EOS(data->start_class);
4471     }
4472     if (data) {
4473      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4474       pars++;
4475      if (data_fake.flags & SF_HAS_EVAL)
4476       data->flags |= SF_HAS_EVAL;
4477      data->whilem_c = data_fake.whilem_c;
4478      if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4479       if (RExC_rx->minlen<*minnextp)
4480        RExC_rx->minlen=*minnextp;
4481       SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4482       SvREFCNT_dec_NN(data_fake.last_found);
4483
4484       if ( data_fake.minlen_fixed != minlenp )
4485       {
4486        data->offset_fixed= data_fake.offset_fixed;
4487        data->minlen_fixed= data_fake.minlen_fixed;
4488        data->lookbehind_fixed+= scan->flags;
4489       }
4490       if ( data_fake.minlen_float != minlenp )
4491       {
4492        data->minlen_float= data_fake.minlen_float;
4493        data->offset_float_min=data_fake.offset_float_min;
4494        data->offset_float_max=data_fake.offset_float_max;
4495        data->lookbehind_float+= scan->flags;
4496       }
4497      }
4498     }
4499    }
4500 #endif
4501   }
4502   else if (OP(scan) == OPEN) {
4503    if (stopparen != (I32)ARG(scan))
4504     pars++;
4505   }
4506   else if (OP(scan) == CLOSE) {
4507    if (stopparen == (I32)ARG(scan)) {
4508     break;
4509    }
4510    if ((I32)ARG(scan) == is_par) {
4511     next = regnext(scan);
4512
4513     if ( next && (OP(next) != WHILEM) && next < last)
4514      is_par = 0;  /* Disable optimization */
4515    }
4516    if (data)
4517     *(data->last_closep) = ARG(scan);
4518   }
4519   else if (OP(scan) == EVAL) {
4520     if (data)
4521      data->flags |= SF_HAS_EVAL;
4522   }
4523   else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4524    if (flags & SCF_DO_SUBSTR) {
4525     SCAN_COMMIT(pRExC_state,data,minlenp);
4526     flags &= ~SCF_DO_SUBSTR;
4527    }
4528    if (data && OP(scan)==ACCEPT) {
4529     data->flags |= SCF_SEEN_ACCEPT;
4530     if (stopmin > min)
4531      stopmin = min;
4532    }
4533   }
4534   else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4535   {
4536     if (flags & SCF_DO_SUBSTR) {
4537      SCAN_COMMIT(pRExC_state,data,minlenp);
4538      data->longest = &(data->longest_float);
4539     }
4540     is_inf = is_inf_internal = 1;
4541     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4542      cl_anything(pRExC_state, data->start_class);
4543     flags &= ~SCF_DO_STCLASS;
4544   }
4545   else if (OP(scan) == GPOS) {
4546    if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4547     !(delta || is_inf || (data && data->pos_delta)))
4548    {
4549     if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4550      RExC_rx->extflags |= RXf_ANCH_GPOS;
4551     if (RExC_rx->gofs < (U32)min)
4552      RExC_rx->gofs = min;
4553    } else {
4554     RExC_rx->extflags |= RXf_GPOS_FLOAT;
4555     RExC_rx->gofs = 0;
4556    }
4557   }
4558 #ifdef TRIE_STUDY_OPT
4559 #ifdef FULL_TRIE_STUDY
4560   else if (PL_regkind[OP(scan)] == TRIE) {
4561    /* NOTE - There is similar code to this block above for handling
4562    BRANCH nodes on the initial study.  If you change stuff here
4563    check there too. */
4564    regnode *trie_node= scan;
4565    regnode *tail= regnext(scan);
4566    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4567    I32 max1 = 0, min1 = I32_MAX;
4568    struct regnode_charclass_class accum;
4569
4570    if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4571     SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4572    if (flags & SCF_DO_STCLASS)
4573     cl_init_zero(pRExC_state, &accum);
4574
4575    if (!trie->jump) {
4576     min1= trie->minlen;
4577     max1= trie->maxlen;
4578    } else {
4579     const regnode *nextbranch= NULL;
4580     U32 word;
4581
4582     for ( word=1 ; word <= trie->wordcount ; word++)
4583     {
4584      I32 deltanext=0, minnext=0, f = 0, fake;
4585      struct regnode_charclass_class this_class;
4586
4587      data_fake.flags = 0;
4588      if (data) {
4589       data_fake.whilem_c = data->whilem_c;
4590       data_fake.last_closep = data->last_closep;
4591      }
4592      else
4593       data_fake.last_closep = &fake;
4594      data_fake.pos_delta = delta;
4595      if (flags & SCF_DO_STCLASS) {
4596       cl_init(pRExC_state, &this_class);
4597       data_fake.start_class = &this_class;
4598       f = SCF_DO_STCLASS_AND;
4599      }
4600      if (flags & SCF_WHILEM_VISITED_POS)
4601       f |= SCF_WHILEM_VISITED_POS;
4602
4603      if (trie->jump[word]) {
4604       if (!nextbranch)
4605        nextbranch = trie_node + trie->jump[0];
4606       scan= trie_node + trie->jump[word];
4607       /* We go from the jump point to the branch that follows
4608       it. Note this means we need the vestigal unused branches
4609       even though they arent otherwise used.
4610       */
4611       minnext = study_chunk(pRExC_state, &scan, minlenp,
4612        &deltanext, (regnode *)nextbranch, &data_fake,
4613        stopparen, recursed, NULL, f,depth+1);
4614      }
4615      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4616       nextbranch= regnext((regnode*)nextbranch);
4617
4618      if (min1 > (I32)(minnext + trie->minlen))
4619       min1 = minnext + trie->minlen;
4620      if (deltanext == I32_MAX) {
4621       is_inf = is_inf_internal = 1;
4622       max1 = I32_MAX;
4623      } else if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4624       max1 = minnext + deltanext + trie->maxlen;
4625
4626      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4627       pars++;
4628      if (data_fake.flags & SCF_SEEN_ACCEPT) {
4629       if ( stopmin > min + min1)
4630        stopmin = min + min1;
4631       flags &= ~SCF_DO_SUBSTR;
4632       if (data)
4633        data->flags |= SCF_SEEN_ACCEPT;
4634      }
4635      if (data) {
4636       if (data_fake.flags & SF_HAS_EVAL)
4637        data->flags |= SF_HAS_EVAL;
4638       data->whilem_c = data_fake.whilem_c;
4639      }
4640      if (flags & SCF_DO_STCLASS)
4641       cl_or(pRExC_state, &accum, &this_class);
4642     }
4643    }
4644    if (flags & SCF_DO_SUBSTR) {
4645     data->pos_min += min1;
4646     data->pos_delta += max1 - min1;
4647     if (max1 != min1 || is_inf)
4648      data->longest = &(data->longest_float);
4649    }
4650    min += min1;
4651    delta += max1 - min1;
4652    if (flags & SCF_DO_STCLASS_OR) {
4653     cl_or(pRExC_state, data->start_class, &accum);
4654     if (min1) {
4655      cl_and(data->start_class, and_withp);
4656      flags &= ~SCF_DO_STCLASS;
4657     }
4658    }
4659    else if (flags & SCF_DO_STCLASS_AND) {
4660     if (min1) {
4661      cl_and(data->start_class, &accum);
4662      flags &= ~SCF_DO_STCLASS;
4663     }
4664     else {
4665      /* Switch to OR mode: cache the old value of
4666      * data->start_class */
4667      INIT_AND_WITHP;
4668      StructCopy(data->start_class, and_withp,
4669        struct regnode_charclass_class);
4670      flags &= ~SCF_DO_STCLASS_AND;
4671      StructCopy(&accum, data->start_class,
4672        struct regnode_charclass_class);
4673      flags |= SCF_DO_STCLASS_OR;
4674      SET_SSC_EOS(data->start_class);
4675     }
4676    }
4677    scan= tail;
4678    continue;
4679   }
4680 #else
4681   else if (PL_regkind[OP(scan)] == TRIE) {
4682    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4683    U8*bang=NULL;
4684
4685    min += trie->minlen;
4686    delta += (trie->maxlen - trie->minlen);
4687    flags &= ~SCF_DO_STCLASS; /* xxx */
4688    if (flags & SCF_DO_SUBSTR) {
4689      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4690      data->pos_min += trie->minlen;
4691      data->pos_delta += (trie->maxlen - trie->minlen);
4692     if (trie->maxlen != trie->minlen)
4693      data->longest = &(data->longest_float);
4694     }
4695     if (trie->jump) /* no more substrings -- for now /grr*/
4696      flags &= ~SCF_DO_SUBSTR;
4697   }
4698 #endif /* old or new */
4699 #endif /* TRIE_STUDY_OPT */
4700
4701   /* Else: zero-length, ignore. */
4702   scan = regnext(scan);
4703  }
4704  if (frame) {
4705   last = frame->last;
4706   scan = frame->next;
4707   stopparen = frame->stop;
4708   frame = frame->prev;
4709   goto fake_study_recurse;
4710  }
4711
4712   finish:
4713  assert(!frame);
4714  DEBUG_STUDYDATA("pre-fin:",data,depth);
4715
4716  *scanp = scan;
4717  *deltap = is_inf_internal ? I32_MAX : delta;
4718  if (flags & SCF_DO_SUBSTR && is_inf)
4719   data->pos_delta = I32_MAX - data->pos_min;
4720  if (is_par > (I32)U8_MAX)
4721   is_par = 0;
4722  if (is_par && pars==1 && data) {
4723   data->flags |= SF_IN_PAR;
4724   data->flags &= ~SF_HAS_PAR;
4725  }
4726  else if (pars && data) {
4727   data->flags |= SF_HAS_PAR;
4728   data->flags &= ~SF_IN_PAR;
4729  }
4730  if (flags & SCF_DO_STCLASS_OR)
4731   cl_and(data->start_class, and_withp);
4732  if (flags & SCF_TRIE_RESTUDY)
4733   data->flags |=  SCF_TRIE_RESTUDY;
4734
4735  DEBUG_STUDYDATA("post-fin:",data,depth);
4736
4737  return min < stopmin ? min : stopmin;
4738 }
4739
4740 STATIC U32
4741 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4742 {
4743  U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4744
4745  PERL_ARGS_ASSERT_ADD_DATA;
4746
4747  Renewc(RExC_rxi->data,
4748   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4749   char, struct reg_data);
4750  if(count)
4751   Renew(RExC_rxi->data->what, count + n, U8);
4752  else
4753   Newx(RExC_rxi->data->what, n, U8);
4754  RExC_rxi->data->count = count + n;
4755  Copy(s, RExC_rxi->data->what + count, n, U8);
4756  return count;
4757 }
4758
4759 /*XXX: todo make this not included in a non debugging perl */
4760 #ifndef PERL_IN_XSUB_RE
4761 void
4762 Perl_reginitcolors(pTHX)
4763 {
4764  dVAR;
4765  const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4766  if (s) {
4767   char *t = savepv(s);
4768   int i = 0;
4769   PL_colors[0] = t;
4770   while (++i < 6) {
4771    t = strchr(t, '\t');
4772    if (t) {
4773     *t = '\0';
4774     PL_colors[i] = ++t;
4775    }
4776    else
4777     PL_colors[i] = t = (char *)"";
4778   }
4779  } else {
4780   int i = 0;
4781   while (i < 6)
4782    PL_colors[i++] = (char *)"";
4783  }
4784  PL_colorset = 1;
4785 }
4786 #endif
4787
4788
4789 #ifdef TRIE_STUDY_OPT
4790 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
4791  STMT_START {                                            \
4792   if (                                                \
4793    (data.flags & SCF_TRIE_RESTUDY)               \
4794    && ! restudied++                              \
4795   ) {                                                 \
4796    dOsomething;                                    \
4797    goto reStudy;                                   \
4798   }                                                   \
4799  } STMT_END
4800 #else
4801 #define CHECK_RESTUDY_GOTO_butfirst
4802 #endif
4803
4804 /*
4805  * pregcomp - compile a regular expression into internal code
4806  *
4807  * Decides which engine's compiler to call based on the hint currently in
4808  * scope
4809  */
4810
4811 #ifndef PERL_IN_XSUB_RE
4812
4813 /* return the currently in-scope regex engine (or the default if none)  */
4814
4815 regexp_engine const *
4816 Perl_current_re_engine(pTHX)
4817 {
4818  dVAR;
4819
4820  if (IN_PERL_COMPILETIME) {
4821   HV * const table = GvHV(PL_hintgv);
4822   SV **ptr;
4823
4824   if (!table)
4825    return &reh_regexp_engine;
4826   ptr = hv_fetchs(table, "regcomp", FALSE);
4827   if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4828    return &reh_regexp_engine;
4829   return INT2PTR(regexp_engine*,SvIV(*ptr));
4830  }
4831  else {
4832   SV *ptr;
4833   if (!PL_curcop->cop_hints_hash)
4834    return &reh_regexp_engine;
4835   ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4836   if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4837    return &reh_regexp_engine;
4838   return INT2PTR(regexp_engine*,SvIV(ptr));
4839  }
4840 }
4841
4842
4843 REGEXP *
4844 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4845 {
4846  dVAR;
4847  regexp_engine const *eng = current_re_engine();
4848  GET_RE_DEBUG_FLAGS_DECL;
4849
4850  PERL_ARGS_ASSERT_PREGCOMP;
4851
4852  /* Dispatch a request to compile a regexp to correct regexp engine. */
4853  DEBUG_COMPILE_r({
4854   PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4855       PTR2UV(eng));
4856  });
4857  return CALLREGCOMP_ENG(eng, pattern, flags);
4858 }
4859 #endif
4860
4861 /* public(ish) entry point for the perl core's own regex compiling code.
4862  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4863  * pattern rather than a list of OPs, and uses the internal engine rather
4864  * than the current one */
4865
4866 REGEXP *
4867 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4868 {
4869  SV *pat = pattern; /* defeat constness! */
4870  PERL_ARGS_ASSERT_RE_COMPILE;
4871  return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4872 #ifdef PERL_IN_XSUB_RE
4873         &my_reg_engine,
4874 #else
4875         &reh_regexp_engine,
4876 #endif
4877         NULL, NULL, rx_flags, 0);
4878 }
4879
4880
4881 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
4882  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
4883  * point to the realloced string and length.
4884  *
4885  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
4886  * stuff added */
4887
4888 static void
4889 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
4890      char **pat_p, STRLEN *plen_p, int num_code_blocks)
4891 {
4892  U8 *const src = (U8*)*pat_p;
4893  U8 *dst;
4894  int n=0;
4895  STRLEN s = 0, d = 0;
4896  bool do_end = 0;
4897  GET_RE_DEBUG_FLAGS_DECL;
4898
4899  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4900   "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4901
4902  Newx(dst, *plen_p * 2 + 1, U8);
4903
4904  while (s < *plen_p) {
4905   const UV uv = NATIVE_TO_ASCII(src[s]);
4906   if (UNI_IS_INVARIANT(uv))
4907    dst[d]   = (U8)UTF_TO_NATIVE(uv);
4908   else {
4909    dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
4910    dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
4911   }
4912   if (n < num_code_blocks) {
4913    if (!do_end && pRExC_state->code_blocks[n].start == s) {
4914     pRExC_state->code_blocks[n].start = d;
4915     assert(dst[d] == '(');
4916     do_end = 1;
4917    }
4918    else if (do_end && pRExC_state->code_blocks[n].end == s) {
4919     pRExC_state->code_blocks[n].end = d;
4920     assert(dst[d] == ')');
4921     do_end = 0;
4922     n++;
4923    }
4924   }
4925   s++;
4926   d++;
4927  }
4928  dst[d] = '\0';
4929  *plen_p = d;
4930  *pat_p = (char*) dst;
4931  SAVEFREEPV(*pat_p);
4932  RExC_orig_utf8 = RExC_utf8 = 1;
4933 }
4934
4935
4936
4937 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
4938  * while recording any code block indices, and handling overloading,
4939  * nested qr// objects etc.  If pat is null, it will allocate a new
4940  * string, or just return the first arg, if there's only one.
4941  *
4942  * Returns the malloced/updated pat.
4943  * patternp and pat_count is the array of SVs to be concatted;
4944  * oplist is the optional list of ops that generated the SVs;
4945  * recompile_p is a pointer to a boolean that will be set if
4946  *   the regex will need to be recompiled.
4947  * delim, if non-null is an SV that will be inserted between each element
4948  */
4949
4950 static SV*
4951 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
4952     SV *pat, SV ** const patternp, int pat_count,
4953     OP *oplist, bool *recompile_p, SV *delim)
4954 {
4955  SV **svp;
4956  int n = 0;
4957  bool use_delim = FALSE;
4958  bool alloced = FALSE;
4959
4960  /* if we know we have at least two args, create an empty string,
4961  * then concatenate args to that. For no args, return an empty string */
4962  if (!pat && pat_count != 1) {
4963   pat = newSVpvn("", 0);
4964   SAVEFREESV(pat);
4965   alloced = TRUE;
4966  }
4967
4968  for (svp = patternp; svp < patternp + pat_count; svp++) {
4969   SV *sv;
4970   SV *rx  = NULL;
4971   STRLEN orig_patlen = 0;
4972   bool code = 0;
4973   SV *msv = use_delim ? delim : *svp;
4974
4975   /* if we've got a delimiter, we go round the loop twice for each
4976   * svp slot (except the last), using the delimiter the second
4977   * time round */
4978   if (use_delim) {
4979    svp--;
4980    use_delim = FALSE;
4981   }
4982   else if (delim)
4983    use_delim = TRUE;
4984
4985   if (SvTYPE(msv) == SVt_PVAV) {
4986    /* we've encountered an interpolated array within
4987    * the pattern, e.g. /...@a..../. Expand the list of elements,
4988    * then recursively append elements.
4989    * The code in this block is based on S_pushav() */
4990
4991    AV *const av = (AV*)msv;
4992    const I32 maxarg = AvFILL(av) + 1;
4993    SV **array;
4994
4995    if (oplist) {
4996     assert(oplist->op_type == OP_PADAV
4997      || oplist->op_type == OP_RV2AV);
4998     oplist = oplist->op_sibling;;
4999    }
5000
5001    if (SvRMAGICAL(av)) {
5002     U32 i;
5003
5004     Newx(array, maxarg, SV*);
5005     SAVEFREEPV(array);
5006     for (i=0; i < (U32)maxarg; i++) {
5007      SV ** const svp = av_fetch(av, i, FALSE);
5008      array[i] = svp ? *svp : &PL_sv_undef;
5009     }
5010    }
5011    else
5012     array = AvARRAY(av);
5013
5014    pat = S_concat_pat(aTHX_ pRExC_state, pat,
5015         array, maxarg, NULL, recompile_p,
5016         /* $" */
5017         GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5018
5019    continue;
5020   }
5021
5022
5023   /* we make the assumption here that each op in the list of
5024   * op_siblings maps to one SV pushed onto the stack,
5025   * except for code blocks, with have both an OP_NULL and
5026   * and OP_CONST.
5027   * This allows us to match up the list of SVs against the
5028   * list of OPs to find the next code block.
5029   *
5030   * Note that       PUSHMARK PADSV PADSV ..
5031   * is optimised to
5032   *                 PADRANGE PADSV  PADSV  ..
5033   * so the alignment still works. */
5034
5035   if (oplist) {
5036    if (oplist->op_type == OP_NULL
5037     && (oplist->op_flags & OPf_SPECIAL))
5038    {
5039     assert(n < pRExC_state->num_code_blocks);
5040     pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5041     pRExC_state->code_blocks[n].block = oplist;
5042     pRExC_state->code_blocks[n].src_regex = NULL;
5043     n++;
5044     code = 1;
5045     oplist = oplist->op_sibling; /* skip CONST */
5046     assert(oplist);
5047    }
5048    oplist = oplist->op_sibling;;
5049   }
5050
5051   /* apply magic and QR overloading to arg */
5052
5053   SvGETMAGIC(msv);
5054   if (SvROK(msv) && SvAMAGIC(msv)) {
5055    SV *sv = AMG_CALLunary(msv, regexp_amg);
5056    if (sv) {
5057     if (SvROK(sv))
5058      sv = SvRV(sv);
5059     if (SvTYPE(sv) != SVt_REGEXP)
5060      Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5061     msv = sv;
5062    }
5063   }
5064
5065   /* try concatenation overload ... */
5066   if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5067     (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5068   {
5069    sv_setsv(pat, sv);
5070    /* overloading involved: all bets are off over literal
5071    * code. Pretend we haven't seen it */
5072    pRExC_state->num_code_blocks -= n;
5073    n = 0;
5074   }
5075   else  {
5076    /* ... or failing that, try "" overload */
5077    while (SvAMAGIC(msv)
5078      && (sv = AMG_CALLunary(msv, string_amg))
5079      && sv != msv
5080      &&  !(   SvROK(msv)
5081       && SvROK(sv)
5082       && SvRV(msv) == SvRV(sv))
5083    ) {
5084     msv = sv;
5085     SvGETMAGIC(msv);
5086    }
5087    if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5088     msv = SvRV(msv);
5089
5090    if (pat) {
5091     /* this is a partially unrolled
5092     *     sv_catsv_nomg(pat, msv);
5093     * that allows us to adjust code block indices if
5094     * needed */
5095     STRLEN dlen;
5096     char *dst = SvPV_force_nomg(pat, dlen);
5097     orig_patlen = dlen;
5098     if (SvUTF8(msv) && !SvUTF8(pat)) {
5099      S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5100      sv_setpvn(pat, dst, dlen);
5101      SvUTF8_on(pat);
5102     }
5103     sv_catsv_nomg(pat, msv);
5104     rx = msv;
5105    }
5106    else
5107     pat = msv;
5108
5109    if (code)
5110     pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5111   }
5112
5113   /* extract any code blocks within any embedded qr//'s */
5114   if (rx && SvTYPE(rx) == SVt_REGEXP
5115    && RX_ENGINE((REGEXP*)rx)->op_comp)
5116   {
5117
5118    RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5119    if (ri->num_code_blocks) {
5120     int i;
5121     /* the presence of an embedded qr// with code means
5122     * we should always recompile: the text of the
5123     * qr// may not have changed, but it may be a
5124     * different closure than last time */
5125     *recompile_p = 1;
5126     Renew(pRExC_state->code_blocks,
5127      pRExC_state->num_code_blocks + ri->num_code_blocks,
5128      struct reg_code_block);
5129     pRExC_state->num_code_blocks += ri->num_code_blocks;
5130
5131     for (i=0; i < ri->num_code_blocks; i++) {
5132      struct reg_code_block *src, *dst;
5133      STRLEN offset =  orig_patlen
5134       + ReANY((REGEXP *)rx)->pre_prefix;
5135      assert(n < pRExC_state->num_code_blocks);
5136      src = &ri->code_blocks[i];
5137      dst = &pRExC_state->code_blocks[n];
5138      dst->start     = src->start + offset;
5139      dst->end     = src->end   + offset;
5140      dst->block     = src->block;
5141      dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5142            src->src_regex
5143             ? src->src_regex
5144             : (REGEXP*)rx);
5145      n++;
5146     }
5147    }
5148   }
5149  }
5150  /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5151  if (alloced)
5152   SvSETMAGIC(pat);
5153
5154  return pat;
5155 }
5156
5157
5158
5159 /* see if there are any run-time code blocks in the pattern.
5160  * False positives are allowed */
5161
5162 static bool
5163 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5164      char *pat, STRLEN plen)
5165 {
5166  int n = 0;
5167  STRLEN s;
5168
5169  for (s = 0; s < plen; s++) {
5170   if (n < pRExC_state->num_code_blocks
5171    && s == pRExC_state->code_blocks[n].start)
5172   {
5173    s = pRExC_state->code_blocks[n].end;
5174    n++;
5175    continue;
5176   }
5177   /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5178   * positives here */
5179   if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5180    (pat[s+2] == '{'
5181     || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5182   )
5183    return 1;
5184  }
5185  return 0;
5186 }
5187
5188 /* Handle run-time code blocks. We will already have compiled any direct
5189  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5190  * copy of it, but with any literal code blocks blanked out and
5191  * appropriate chars escaped; then feed it into
5192  *
5193  *    eval "qr'modified_pattern'"
5194  *
5195  * For example,
5196  *
5197  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5198  *
5199  * becomes
5200  *
5201  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5202  *
5203  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5204  * and merge them with any code blocks of the original regexp.
5205  *
5206  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5207  * instead, just save the qr and return FALSE; this tells our caller that
5208  * the original pattern needs upgrading to utf8.
5209  */
5210
5211 static bool
5212 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5213  char *pat, STRLEN plen)
5214 {
5215  SV *qr;
5216
5217  GET_RE_DEBUG_FLAGS_DECL;
5218
5219  if (pRExC_state->runtime_code_qr) {
5220   /* this is the second time we've been called; this should
5221   * only happen if the main pattern got upgraded to utf8
5222   * during compilation; re-use the qr we compiled first time
5223   * round (which should be utf8 too)
5224   */
5225   qr = pRExC_state->runtime_code_qr;
5226   pRExC_state->runtime_code_qr = NULL;
5227   assert(RExC_utf8 && SvUTF8(qr));
5228  }
5229  else {
5230   int n = 0;
5231   STRLEN s;
5232   char *p, *newpat;
5233   int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5234   SV *sv, *qr_ref;
5235   dSP;
5236
5237   /* determine how many extra chars we need for ' and \ escaping */
5238   for (s = 0; s < plen; s++) {
5239    if (pat[s] == '\'' || pat[s] == '\\')
5240     newlen++;
5241   }
5242
5243   Newx(newpat, newlen, char);
5244   p = newpat;
5245   *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5246
5247   for (s = 0; s < plen; s++) {
5248    if (n < pRExC_state->num_code_blocks
5249     && s == pRExC_state->code_blocks[n].start)
5250    {
5251     /* blank out literal code block */
5252     assert(pat[s] == '(');
5253     while (s <= pRExC_state->code_blocks[n].end) {
5254      *p++ = '_';
5255      s++;
5256     }
5257     s--;
5258     n++;
5259     continue;
5260    }
5261    if (pat[s] == '\'' || pat[s] == '\\')
5262     *p++ = '\\';
5263    *p++ = pat[s];
5264   }
5265   *p++ = '\'';
5266   if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5267    *p++ = 'x';
5268   *p++ = '\0';
5269   DEBUG_COMPILE_r({
5270    PerlIO_printf(Perl_debug_log,
5271     "%sre-parsing pattern for runtime code:%s %s\n",
5272     PL_colors[4],PL_colors[5],newpat);
5273   });
5274
5275   sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5276   Safefree(newpat);
5277
5278   ENTER;
5279   SAVETMPS;
5280   save_re_context();
5281   PUSHSTACKi(PERLSI_REQUIRE);
5282   /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5283   * parsing qr''; normally only q'' does this. It also alters
5284   * hints handling */
5285   eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5286   SvREFCNT_dec_NN(sv);
5287   SPAGAIN;
5288   qr_ref = POPs;
5289   PUTBACK;
5290   {
5291    SV * const errsv = ERRSV;
5292    if (SvTRUE_NN(errsv))
5293    {
5294     Safefree(pRExC_state->code_blocks);
5295     /* use croak_sv ? */
5296     Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
5297    }
5298   }
5299   assert(SvROK(qr_ref));
5300   qr = SvRV(qr_ref);
5301   assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5302   /* the leaving below frees the tmp qr_ref.
5303   * Give qr a life of its own */
5304   SvREFCNT_inc(qr);
5305   POPSTACK;
5306   FREETMPS;
5307   LEAVE;
5308
5309  }
5310
5311  if (!RExC_utf8 && SvUTF8(qr)) {
5312   /* first time through; the pattern got upgraded; save the
5313   * qr for the next time through */
5314   assert(!pRExC_state->runtime_code_qr);
5315   pRExC_state->runtime_code_qr = qr;
5316   return 0;
5317  }
5318
5319
5320  /* extract any code blocks within the returned qr//  */
5321
5322
5323  /* merge the main (r1) and run-time (r2) code blocks into one */
5324  {
5325   RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5326   struct reg_code_block *new_block, *dst;
5327   RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5328   int i1 = 0, i2 = 0;
5329
5330   if (!r2->num_code_blocks) /* we guessed wrong */
5331   {
5332    SvREFCNT_dec_NN(qr);
5333    return 1;
5334   }
5335
5336   Newx(new_block,
5337    r1->num_code_blocks + r2->num_code_blocks,
5338    struct reg_code_block);
5339   dst = new_block;
5340
5341   while (    i1 < r1->num_code_blocks
5342     || i2 < r2->num_code_blocks)
5343   {
5344    struct reg_code_block *src;
5345    bool is_qr = 0;
5346
5347    if (i1 == r1->num_code_blocks) {
5348     src = &r2->code_blocks[i2++];
5349     is_qr = 1;
5350    }
5351    else if (i2 == r2->num_code_blocks)
5352     src = &r1->code_blocks[i1++];
5353    else if (  r1->code_blocks[i1].start
5354      < r2->code_blocks[i2].start)
5355    {
5356     src = &r1->code_blocks[i1++];
5357     assert(src->end < r2->code_blocks[i2].start);
5358    }
5359    else {
5360     assert(  r1->code_blocks[i1].start
5361      > r2->code_blocks[i2].start);
5362     src = &r2->code_blocks[i2++];
5363     is_qr = 1;
5364     assert(src->end < r1->code_blocks[i1].start);
5365    }
5366
5367    assert(pat[src->start] == '(');
5368    assert(pat[src->end]   == ')');
5369    dst->start     = src->start;
5370    dst->end     = src->end;
5371    dst->block     = src->block;
5372    dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5373          : src->src_regex;
5374    dst++;
5375   }
5376   r1->num_code_blocks += r2->num_code_blocks;
5377   Safefree(r1->code_blocks);
5378   r1->code_blocks = new_block;
5379  }
5380
5381  SvREFCNT_dec_NN(qr);
5382  return 1;
5383 }
5384
5385
5386 STATIC bool
5387 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5388 {
5389  /* This is the common code for setting up the floating and fixed length
5390  * string data extracted from Perl_re_op_compile() below.  Returns a boolean
5391  * as to whether succeeded or not */
5392
5393  I32 t,ml;
5394
5395  if (! (longest_length
5396   || (eol /* Can't have SEOL and MULTI */
5397    && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5398   )
5399    /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5400   || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5401  {
5402   return FALSE;
5403  }
5404
5405  /* copy the information about the longest from the reg_scan_data
5406   over to the program. */
5407  if (SvUTF8(sv_longest)) {
5408   *rx_utf8 = sv_longest;
5409   *rx_substr = NULL;
5410  } else {
5411   *rx_substr = sv_longest;
5412   *rx_utf8 = NULL;
5413  }
5414  /* end_shift is how many chars that must be matched that
5415   follow this item. We calculate it ahead of time as once the
5416   lookbehind offset is added in we lose the ability to correctly
5417   calculate it.*/
5418  ml = minlen ? *(minlen) : (I32)longest_length;
5419  *rx_end_shift = ml - offset
5420   - longest_length + (SvTAIL(sv_longest) != 0)
5421   + lookbehind;
5422
5423  t = (eol/* Can't have SEOL and MULTI */
5424   && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5425  fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5426
5427  return TRUE;
5428 }
5429
5430 /*
5431  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5432  * regular expression into internal code.
5433  * The pattern may be passed either as:
5434  *    a list of SVs (patternp plus pat_count)
5435  *    a list of OPs (expr)
5436  * If both are passed, the SV list is used, but the OP list indicates
5437  * which SVs are actually pre-compiled code blocks
5438  *
5439  * The SVs in the list have magic and qr overloading applied to them (and
5440  * the list may be modified in-place with replacement SVs in the latter
5441  * case).
5442  *
5443  * If the pattern hasn't changed from old_re, then old_re will be
5444  * returned.
5445  *
5446  * eng is the current engine. If that engine has an op_comp method, then
5447  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5448  * do the initial concatenation of arguments and pass on to the external
5449  * engine.
5450  *
5451  * If is_bare_re is not null, set it to a boolean indicating whether the
5452  * arg list reduced (after overloading) to a single bare regex which has
5453  * been returned (i.e. /$qr/).
5454  *
5455  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5456  *
5457  * pm_flags contains the PMf_* flags, typically based on those from the
5458  * pm_flags field of the related PMOP. Currently we're only interested in
5459  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5460  *
5461  * We can't allocate space until we know how big the compiled form will be,
5462  * but we can't compile it (and thus know how big it is) until we've got a
5463  * place to put the code.  So we cheat:  we compile it twice, once with code
5464  * generation turned off and size counting turned on, and once "for real".
5465  * This also means that we don't allocate space until we are sure that the
5466  * thing really will compile successfully, and we never have to move the
5467  * code and thus invalidate pointers into it.  (Note that it has to be in
5468  * one piece because free() must be able to free it all.) [NB: not true in perl]
5469  *
5470  * Beware that the optimization-preparation code in here knows about some
5471  * of the structure of the compiled regexp.  [I'll say.]
5472  */
5473
5474 REGEXP *
5475 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5476      OP *expr, const regexp_engine* eng, REGEXP *old_re,
5477      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5478 {
5479  dVAR;
5480  REGEXP *rx;
5481  struct regexp *r;
5482  regexp_internal *ri;
5483  STRLEN plen;
5484  char *exp;
5485  regnode *scan;
5486  I32 flags;
5487  I32 minlen = 0;
5488  U32 rx_flags;
5489  SV *pat;
5490  SV *code_blocksv = NULL;
5491  SV** new_patternp = patternp;
5492
5493  /* these are all flags - maybe they should be turned
5494  * into a single int with different bit masks */
5495  I32 sawlookahead = 0;
5496  I32 sawplus = 0;
5497  I32 sawopen = 0;
5498  regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5499  bool recompile = 0;
5500  bool runtime_code = 0;
5501  scan_data_t data;
5502  RExC_state_t RExC_state;
5503  RExC_state_t * const pRExC_state = &RExC_state;
5504 #ifdef TRIE_STUDY_OPT
5505  int restudied = 0;
5506  RExC_state_t copyRExC_state;
5507 #endif
5508  GET_RE_DEBUG_FLAGS_DECL;
5509
5510  PERL_ARGS_ASSERT_RE_OP_COMPILE;
5511
5512  DEBUG_r(if (!PL_colorset) reginitcolors());
5513
5514 #ifndef PERL_IN_XSUB_RE
5515  /* Initialize these here instead of as-needed, as is quick and avoids
5516  * having to test them each time otherwise */
5517  if (! PL_AboveLatin1) {
5518   PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5519   PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5520   PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5521
5522   PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5523         = _new_invlist_C_array(L1PosixAlnum_invlist);
5524   PL_Posix_ptrs[_CC_ALPHANUMERIC]
5525         = _new_invlist_C_array(PosixAlnum_invlist);
5526
5527   PL_L1Posix_ptrs[_CC_ALPHA]
5528         = _new_invlist_C_array(L1PosixAlpha_invlist);
5529   PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5530
5531   PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5532   PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5533
5534   /* Cased is the same as Alpha in the ASCII range */
5535   PL_L1Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(L1Cased_invlist);
5536   PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PosixAlpha_invlist);
5537
5538   PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5539   PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5540
5541   PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5542   PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5543
5544   PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5545   PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5546
5547   PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5548   PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5549
5550   PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5551   PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5552
5553   PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5554   PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5555
5556   PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5557   PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5558   PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5559   PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5560
5561   PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5562   PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5563
5564   PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5565
5566   PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5567   PL_L1Posix_ptrs[_CC_WORDCHAR]
5568         = _new_invlist_C_array(L1PosixWord_invlist);
5569
5570   PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5571   PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5572
5573   PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5574  }
5575 #endif
5576
5577  pRExC_state->code_blocks = NULL;
5578  pRExC_state->num_code_blocks = 0;
5579
5580  if (is_bare_re)
5581   *is_bare_re = FALSE;
5582
5583  if (expr && (expr->op_type == OP_LIST ||
5584     (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5585   /* allocate code_blocks if needed */
5586   OP *o;
5587   int ncode = 0;
5588
5589   for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
5590    if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5591     ncode++; /* count of DO blocks */
5592   if (ncode) {
5593    pRExC_state->num_code_blocks = ncode;
5594    Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5595   }
5596  }
5597
5598  if (!pat_count) {
5599   /* compile-time pattern with just OP_CONSTs and DO blocks */
5600
5601   int n;
5602   OP *o;
5603
5604   /* find how many CONSTs there are */
5605   assert(expr);
5606   n = 0;
5607   if (expr->op_type == OP_CONST)
5608    n = 1;
5609   else
5610    for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5611     if (o->op_type == OP_CONST)
5612      n++;
5613    }
5614
5615   /* fake up an SV array */
5616
5617   assert(!new_patternp);
5618   Newx(new_patternp, n, SV*);
5619   SAVEFREEPV(new_patternp);
5620   pat_count = n;
5621
5622   n = 0;
5623   if (expr->op_type == OP_CONST)
5624    new_patternp[n] = cSVOPx_sv(expr);
5625   else
5626    for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5627     if (o->op_type == OP_CONST)
5628      new_patternp[n++] = cSVOPo_sv;
5629    }
5630
5631  }
5632
5633  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5634   "Assembling pattern from %d elements%s\n", pat_count,
5635    orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5636
5637  /* set expr to the first arg op */
5638
5639  if (pRExC_state->num_code_blocks
5640   && expr->op_type != OP_CONST)
5641  {
5642    expr = cLISTOPx(expr)->op_first;
5643    assert(   expr->op_type == OP_PUSHMARK
5644     || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
5645     || expr->op_type == OP_PADRANGE);
5646    expr = expr->op_sibling;
5647  }
5648
5649  pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
5650       expr, &recompile, NULL);
5651
5652  /* handle bare (possibly after overloading) regex: foo =~ $re */
5653  {
5654   SV *re = pat;
5655   if (SvROK(re))
5656    re = SvRV(re);
5657   if (SvTYPE(re) == SVt_REGEXP) {
5658    if (is_bare_re)
5659     *is_bare_re = TRUE;
5660    SvREFCNT_inc(re);
5661    Safefree(pRExC_state->code_blocks);
5662    DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5663     "Precompiled pattern%s\n",
5664      orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5665
5666    return (REGEXP*)re;
5667   }
5668  }
5669
5670  exp = SvPV_nomg(pat, plen);
5671
5672  if (!eng->op_comp) {
5673   if ((SvUTF8(pat) && IN_BYTES)
5674     || SvGMAGICAL(pat) || SvAMAGIC(pat))
5675   {
5676    /* make a temporary copy; either to convert to bytes,
5677    * or to avoid repeating get-magic / overloaded stringify */
5678    pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5679           (IN_BYTES ? 0 : SvUTF8(pat)));
5680   }
5681   Safefree(pRExC_state->code_blocks);
5682   return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5683  }
5684
5685  /* ignore the utf8ness if the pattern is 0 length */
5686  RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5687  RExC_uni_semantics = 0;
5688  RExC_contains_locale = 0;
5689  pRExC_state->runtime_code_qr = NULL;
5690
5691  DEBUG_COMPILE_r({
5692    SV *dsv= sv_newmortal();
5693    RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
5694    PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5695       PL_colors[4],PL_colors[5],s);
5696   });
5697
5698   redo_first_pass:
5699  /* we jump here if we upgrade the pattern to utf8 and have to
5700  * recompile */
5701
5702  if ((pm_flags & PMf_USE_RE_EVAL)
5703     /* this second condition covers the non-regex literal case,
5704     * i.e.  $foo =~ '(?{})'. */
5705     || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
5706  )
5707   runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
5708
5709  /* return old regex if pattern hasn't changed */
5710  /* XXX: note in the below we have to check the flags as well as the pattern.
5711  *
5712  * Things get a touch tricky as we have to compare the utf8 flag independently
5713  * from the compile flags.
5714  */
5715
5716  if (   old_re
5717   && !recompile
5718   && !!RX_UTF8(old_re) == !!RExC_utf8
5719   && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
5720   && RX_PRECOMP(old_re)
5721   && RX_PRELEN(old_re) == plen
5722   && memEQ(RX_PRECOMP(old_re), exp, plen)
5723   && !runtime_code /* with runtime code, always recompile */ )
5724  {
5725   Safefree(pRExC_state->code_blocks);
5726   return old_re;
5727  }
5728
5729  rx_flags = orig_rx_flags;
5730
5731  if (initial_charset == REGEX_LOCALE_CHARSET) {
5732   RExC_contains_locale = 1;
5733  }
5734  else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5735
5736   /* Set to use unicode semantics if the pattern is in utf8 and has the
5737   * 'depends' charset specified, as it means unicode when utf8  */
5738   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5739  }
5740
5741  RExC_precomp = exp;
5742  RExC_flags = rx_flags;
5743  RExC_pm_flags = pm_flags;
5744
5745  if (runtime_code) {
5746   if (TAINTING_get && TAINT_get)
5747    Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5748
5749   if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5750    /* whoops, we have a non-utf8 pattern, whilst run-time code
5751    * got compiled as utf8. Try again with a utf8 pattern */
5752    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5753          pRExC_state->num_code_blocks);
5754    goto redo_first_pass;
5755   }
5756  }
5757  assert(!pRExC_state->runtime_code_qr);
5758
5759  RExC_sawback = 0;
5760
5761  RExC_seen = 0;
5762  RExC_in_lookbehind = 0;
5763  RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5764  RExC_extralen = 0;
5765  RExC_override_recoding = 0;
5766  RExC_in_multi_char_class = 0;
5767
5768  /* First pass: determine size, legality. */
5769  RExC_parse = exp;
5770  RExC_start = exp;
5771  RExC_end = exp + plen;
5772  RExC_naughty = 0;
5773  RExC_npar = 1;
5774  RExC_nestroot = 0;
5775  RExC_size = 0L;
5776  RExC_emit = &PL_regdummy;
5777  RExC_whilem_seen = 0;
5778  RExC_open_parens = NULL;
5779  RExC_close_parens = NULL;
5780  RExC_opend = NULL;
5781  RExC_paren_names = NULL;
5782 #ifdef DEBUGGING
5783  RExC_paren_name_list = NULL;
5784 #endif
5785  RExC_recurse = NULL;
5786  RExC_recurse_count = 0;
5787  pRExC_state->code_index = 0;
5788
5789 #if 0 /* REGC() is (currently) a NOP at the first pass.
5790  * Clever compilers notice this and complain. --jhi */
5791  REGC((U8)REG_MAGIC, (char*)RExC_emit);
5792 #endif
5793  DEBUG_PARSE_r(
5794   PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5795   RExC_lastnum=0;
5796   RExC_lastparse=NULL;
5797  );
5798  /* reg may croak on us, not giving us a chance to free
5799  pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
5800  need it to survive as long as the regexp (qr/(?{})/).
5801  We must check that code_blocksv is not already set, because we may
5802  have jumped back to restart the sizing pass. */
5803  if (pRExC_state->code_blocks && !code_blocksv) {
5804   code_blocksv = newSV_type(SVt_PV);
5805   SAVEFREESV(code_blocksv);
5806   SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5807   SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5808  }
5809  if (reg(pRExC_state, 0, &flags,1) == NULL) {
5810   /* It's possible to write a regexp in ascii that represents Unicode
5811   codepoints outside of the byte range, such as via \x{100}. If we
5812   detect such a sequence we have to convert the entire pattern to utf8
5813   and then recompile, as our sizing calculation will have been based
5814   on 1 byte == 1 character, but we will need to use utf8 to encode
5815   at least some part of the pattern, and therefore must convert the whole
5816   thing.
5817   -- dmq */
5818   if (flags & RESTART_UTF8) {
5819    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5820          pRExC_state->num_code_blocks);
5821    goto redo_first_pass;
5822   }
5823   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#X", flags);
5824  }
5825  if (code_blocksv)
5826   SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5827
5828  DEBUG_PARSE_r({
5829   PerlIO_printf(Perl_debug_log,
5830    "Required size %"IVdf" nodes\n"
5831    "Starting second pass (creation)\n",
5832    (IV)RExC_size);
5833   RExC_lastnum=0;
5834   RExC_lastparse=NULL;
5835  });
5836
5837  /* The first pass could have found things that force Unicode semantics */
5838  if ((RExC_utf8 || RExC_uni_semantics)
5839   && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5840  {
5841   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5842  }
5843
5844  /* Small enough for pointer-storage convention?
5845  If extralen==0, this means that we will not need long jumps. */
5846  if (RExC_size >= 0x10000L && RExC_extralen)
5847   RExC_size += RExC_extralen;
5848  else
5849   RExC_extralen = 0;
5850  if (RExC_whilem_seen > 15)
5851   RExC_whilem_seen = 15;
5852
5853  /* Allocate space and zero-initialize. Note, the two step process
5854  of zeroing when in debug mode, thus anything assigned has to
5855  happen after that */
5856  rx = (REGEXP*) newSV_type(SVt_REGEXP);
5857  r = ReANY(rx);
5858  Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5859   char, regexp_internal);
5860  if ( r == NULL || ri == NULL )
5861   FAIL("Regexp out of space");
5862 #ifdef DEBUGGING
5863  /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5864  Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5865 #else
5866  /* bulk initialize base fields with 0. */
5867  Zero(ri, sizeof(regexp_internal), char);
5868 #endif
5869
5870  /* non-zero initialization begins here */
5871  RXi_SET( r, ri );
5872  r->engine= eng;
5873  r->extflags = rx_flags;
5874  RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
5875
5876  if (pm_flags & PMf_IS_QR) {
5877   ri->code_blocks = pRExC_state->code_blocks;
5878   ri->num_code_blocks = pRExC_state->num_code_blocks;
5879  }
5880  else
5881  {
5882   int n;
5883   for (n = 0; n < pRExC_state->num_code_blocks; n++)
5884    if (pRExC_state->code_blocks[n].src_regex)
5885     SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5886   SAVEFREEPV(pRExC_state->code_blocks);
5887  }
5888
5889  {
5890   bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5891   bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5892
5893   /* The caret is output if there are any defaults: if not all the STD
5894   * flags are set, or if no character set specifier is needed */
5895   bool has_default =
5896      (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5897      || ! has_charset);
5898   bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5899   U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5900        >> RXf_PMf_STD_PMMOD_SHIFT);
5901   const char *fptr = STD_PAT_MODS;        /*"msix"*/
5902   char *p;
5903   /* Allocate for the worst case, which is all the std flags are turned
5904   * on.  If more precision is desired, we could do a population count of
5905   * the flags set.  This could be done with a small lookup table, or by
5906   * shifting, masking and adding, or even, when available, assembly
5907   * language for a machine-language population count.
5908   * We never output a minus, as all those are defaults, so are
5909   * covered by the caret */
5910   const STRLEN wraplen = plen + has_p + has_runon
5911    + has_default       /* If needs a caret */
5912
5913     /* If needs a character set specifier */
5914    + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5915    + (sizeof(STD_PAT_MODS) - 1)
5916    + (sizeof("(?:)") - 1);
5917
5918   Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5919   r->xpv_len_u.xpvlenu_pv = p;
5920   if (RExC_utf8)
5921    SvFLAGS(rx) |= SVf_UTF8;
5922   *p++='('; *p++='?';
5923
5924   /* If a default, cover it using the caret */
5925   if (has_default) {
5926    *p++= DEFAULT_PAT_MOD;
5927   }
5928   if (has_charset) {
5929    STRLEN len;
5930    const char* const name = get_regex_charset_name(r->extflags, &len);
5931    Copy(name, p, len, char);
5932    p += len;
5933   }
5934   if (has_p)
5935    *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5936   {
5937    char ch;
5938    while((ch = *fptr++)) {
5939     if(reganch & 1)
5940      *p++ = ch;
5941     reganch >>= 1;
5942    }
5943   }
5944
5945   *p++ = ':';
5946   Copy(RExC_precomp, p, plen, char);
5947   assert ((RX_WRAPPED(rx) - p) < 16);
5948   r->pre_prefix = p - RX_WRAPPED(rx);
5949   p += plen;
5950   if (has_runon)
5951    *p++ = '\n';
5952   *p++ = ')';
5953   *p = 0;
5954   SvCUR_set(rx, p - RX_WRAPPED(rx));
5955  }
5956
5957  r->intflags = 0;
5958  r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5959
5960  if (RExC_seen & REG_SEEN_RECURSE) {
5961   Newxz(RExC_open_parens, RExC_npar,regnode *);
5962   SAVEFREEPV(RExC_open_parens);
5963   Newxz(RExC_close_parens,RExC_npar,regnode *);
5964   SAVEFREEPV(RExC_close_parens);
5965  }
5966
5967  /* Useful during FAIL. */
5968 #ifdef RE_TRACK_PATTERN_OFFSETS
5969  Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5970  DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5971       "%s %"UVuf" bytes for offset annotations.\n",
5972       ri->u.offsets ? "Got" : "Couldn't get",
5973       (UV)((2*RExC_size+1) * sizeof(U32))));
5974 #endif
5975  SetProgLen(ri,RExC_size);
5976  RExC_rx_sv = rx;
5977  RExC_rx = r;
5978  RExC_rxi = ri;
5979  REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
5980
5981  /* Second pass: emit code. */
5982  RExC_flags = rx_flags; /* don't let top level (?i) bleed */
5983  RExC_pm_flags = pm_flags;
5984  RExC_parse = exp;
5985  RExC_end = exp + plen;
5986  RExC_naughty = 0;
5987  RExC_npar = 1;
5988  RExC_emit_start = ri->program;
5989  RExC_emit = ri->program;
5990  RExC_emit_bound = ri->program + RExC_size + 1;
5991  pRExC_state->code_index = 0;
5992
5993  REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5994  if (reg(pRExC_state, 0, &flags,1) == NULL) {
5995   ReREFCNT_dec(rx);
5996   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#X", flags);
5997  }
5998  /* XXXX To minimize changes to RE engine we always allocate
5999  3-units-long substrs field. */
6000  Newx(r->substrs, 1, struct reg_substr_data);
6001  if (RExC_recurse_count) {
6002   Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6003   SAVEFREEPV(RExC_recurse);
6004  }
6005
6006 reStudy:
6007  r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
6008  Zero(r->substrs, 1, struct reg_substr_data);
6009
6010 #ifdef TRIE_STUDY_OPT
6011  if (!restudied) {
6012   StructCopy(&zero_scan_data, &data, scan_data_t);
6013   copyRExC_state = RExC_state;
6014  } else {
6015   U32 seen=RExC_seen;
6016   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6017
6018   RExC_state = copyRExC_state;
6019   if (seen & REG_TOP_LEVEL_BRANCHES)
6020    RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6021   else
6022    RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6023   StructCopy(&zero_scan_data, &data, scan_data_t);
6024  }
6025 #else
6026  StructCopy(&zero_scan_data, &data, scan_data_t);
6027 #endif
6028
6029  /* Dig out information for optimizations. */
6030  r->extflags = RExC_flags; /* was pm_op */
6031  /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6032
6033  if (UTF)
6034   SvUTF8_on(rx); /* Unicode in it? */
6035  ri->regstclass = NULL;
6036  if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6037   r->intflags |= PREGf_NAUGHTY;
6038  scan = ri->program + 1;  /* First BRANCH. */
6039
6040  /* testing for BRANCH here tells us whether there is "must appear"
6041  data in the pattern. If there is then we can use it for optimisations */
6042  if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
6043   I32 fake;
6044   STRLEN longest_float_length, longest_fixed_length;
6045   struct regnode_charclass_class ch_class; /* pointed to by data */
6046   int stclass_flag;
6047   I32 last_close = 0; /* pointed to by data */
6048   regnode *first= scan;
6049   regnode *first_next= regnext(first);
6050   /*
6051   * Skip introductions and multiplicators >= 1
6052   * so that we can extract the 'meat' of the pattern that must
6053   * match in the large if() sequence following.
6054   * NOTE that EXACT is NOT covered here, as it is normally
6055   * picked up by the optimiser separately.
6056   *
6057   * This is unfortunate as the optimiser isnt handling lookahead
6058   * properly currently.
6059   *
6060   */
6061   while ((OP(first) == OPEN && (sawopen = 1)) ||
6062    /* An OR of *one* alternative - should not happen now. */
6063    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6064    /* for now we can't handle lookbehind IFMATCH*/
6065    (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6066    (OP(first) == PLUS) ||
6067    (OP(first) == MINMOD) ||
6068    /* An {n,m} with n>0 */
6069    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6070    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6071   {
6072     /*
6073     * the only op that could be a regnode is PLUS, all the rest
6074     * will be regnode_1 or regnode_2.
6075     *
6076     */
6077     if (OP(first) == PLUS)
6078      sawplus = 1;
6079     else
6080      first += regarglen[OP(first)];
6081
6082     first = NEXTOPER(first);
6083     first_next= regnext(first);
6084   }
6085
6086   /* Starting-point info. */
6087  again:
6088   DEBUG_PEEP("first:",first,0);
6089   /* Ignore EXACT as we deal with it later. */
6090   if (PL_regkind[OP(first)] == EXACT) {
6091    if (OP(first) == EXACT)
6092     NOOP; /* Empty, get anchored substr later. */
6093    else
6094     ri->regstclass = first;
6095   }
6096 #ifdef TRIE_STCLASS
6097   else if (PL_regkind[OP(first)] == TRIE &&
6098     ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6099   {
6100    regnode *trie_op;
6101    /* this can happen only on restudy */
6102    if ( OP(first) == TRIE ) {
6103     struct regnode_1 *trieop = (struct regnode_1 *)
6104      PerlMemShared_calloc(1, sizeof(struct regnode_1));
6105     StructCopy(first,trieop,struct regnode_1);
6106     trie_op=(regnode *)trieop;
6107    } else {
6108     struct regnode_charclass *trieop = (struct regnode_charclass *)
6109      PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6110     StructCopy(first,trieop,struct regnode_charclass);
6111     trie_op=(regnode *)trieop;
6112    }
6113    OP(trie_op)+=2;
6114    make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6115    ri->regstclass = trie_op;
6116   }
6117 #endif
6118   else if (REGNODE_SIMPLE(OP(first)))
6119    ri->regstclass = first;
6120   else if (PL_regkind[OP(first)] == BOUND ||
6121     PL_regkind[OP(first)] == NBOUND)
6122    ri->regstclass = first;
6123   else if (PL_regkind[OP(first)] == BOL) {
6124    r->extflags |= (OP(first) == MBOL
6125       ? RXf_ANCH_MBOL
6126       : (OP(first) == SBOL
6127        ? RXf_ANCH_SBOL
6128        : RXf_ANCH_BOL));
6129    first = NEXTOPER(first);
6130    goto again;
6131   }
6132   else if (OP(first) == GPOS) {
6133    r->extflags |= RXf_ANCH_GPOS;
6134    first = NEXTOPER(first);
6135    goto again;
6136   }
6137   else if ((!sawopen || !RExC_sawback) &&
6138    (OP(first) == STAR &&
6139    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6140    !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6141   {
6142    /* turn .* into ^.* with an implied $*=1 */
6143    const int type =
6144     (OP(NEXTOPER(first)) == REG_ANY)
6145      ? RXf_ANCH_MBOL
6146      : RXf_ANCH_SBOL;
6147    r->extflags |= type;
6148    r->intflags |= PREGf_IMPLICIT;
6149    first = NEXTOPER(first);
6150    goto again;
6151   }
6152   if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6153    && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6154    /* x+ must match at the 1st pos of run of x's */
6155    r->intflags |= PREGf_SKIP;
6156
6157   /* Scan is after the zeroth branch, first is atomic matcher. */
6158 #ifdef TRIE_STUDY_OPT
6159   DEBUG_PARSE_r(
6160    if (!restudied)
6161     PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6162        (IV)(first - scan + 1))
6163   );
6164 #else
6165   DEBUG_PARSE_r(
6166    PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6167     (IV)(first - scan + 1))
6168   );
6169 #endif
6170
6171
6172   /*
6173   * If there's something expensive in the r.e., find the
6174   * longest literal string that must appear and make it the
6175   * regmust.  Resolve ties in favor of later strings, since
6176   * the regstart check works with the beginning of the r.e.
6177   * and avoiding duplication strengthens checking.  Not a
6178   * strong reason, but sufficient in the absence of others.
6179   * [Now we resolve ties in favor of the earlier string if
6180   * it happens that c_offset_min has been invalidated, since the
6181   * earlier string may buy us something the later one won't.]
6182   */
6183
6184   data.longest_fixed = newSVpvs("");
6185   data.longest_float = newSVpvs("");
6186   data.last_found = newSVpvs("");
6187   data.longest = &(data.longest_fixed);
6188   ENTER_with_name("study_chunk");
6189   SAVEFREESV(data.longest_fixed);
6190   SAVEFREESV(data.longest_float);
6191   SAVEFREESV(data.last_found);
6192   first = scan;
6193   if (!ri->regstclass) {
6194    cl_init(pRExC_state, &ch_class);
6195    data.start_class = &ch_class;
6196    stclass_flag = SCF_DO_STCLASS_AND;
6197   } else    /* XXXX Check for BOUND? */
6198    stclass_flag = 0;
6199   data.last_closep = &last_close;
6200
6201   minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6202    &data, -1, NULL, NULL,
6203    SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6204
6205
6206   CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6207
6208
6209   if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6210    && data.last_start_min == 0 && data.last_end > 0
6211    && !RExC_seen_zerolen
6212    && !(RExC_seen & REG_SEEN_VERBARG)
6213    && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6214    r->extflags |= RXf_CHECK_ALL;
6215   scan_commit(pRExC_state, &data,&minlen,0);
6216
6217   longest_float_length = CHR_SVLEN(data.longest_float);
6218
6219   if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6220     && data.offset_fixed == data.offset_float_min
6221     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6222    && S_setup_longest (aTHX_ pRExC_state,
6223          data.longest_float,
6224          &(r->float_utf8),
6225          &(r->float_substr),
6226          &(r->float_end_shift),
6227          data.lookbehind_float,
6228          data.offset_float_min,
6229          data.minlen_float,
6230          longest_float_length,
6231          cBOOL(data.flags & SF_FL_BEFORE_EOL),
6232          cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6233   {
6234    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6235    r->float_max_offset = data.offset_float_max;
6236    if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6237     r->float_max_offset -= data.lookbehind_float;
6238    SvREFCNT_inc_simple_void_NN(data.longest_float);
6239   }
6240   else {
6241    r->float_substr = r->float_utf8 = NULL;
6242    longest_float_length = 0;
6243   }
6244
6245   longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6246
6247   if (S_setup_longest (aTHX_ pRExC_state,
6248         data.longest_fixed,
6249         &(r->anchored_utf8),
6250         &(r->anchored_substr),
6251         &(r->anchored_end_shift),
6252         data.lookbehind_fixed,
6253         data.offset_fixed,
6254         data.minlen_fixed,
6255         longest_fixed_length,
6256         cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6257         cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6258   {
6259    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6260    SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6261   }
6262   else {
6263    r->anchored_substr = r->anchored_utf8 = NULL;
6264    longest_fixed_length = 0;
6265   }
6266   LEAVE_with_name("study_chunk");
6267
6268   if (ri->regstclass
6269    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6270    ri->regstclass = NULL;
6271
6272   if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6273    && stclass_flag
6274    && ! TEST_SSC_EOS(data.start_class)
6275    && !cl_is_anything(data.start_class))
6276   {
6277    const U32 n = add_data(pRExC_state, 1, "f");
6278    OP(data.start_class) = ANYOF_SYNTHETIC;
6279
6280    Newx(RExC_rxi->data->data[n], 1,
6281     struct regnode_charclass_class);
6282    StructCopy(data.start_class,
6283      (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6284      struct regnode_charclass_class);
6285    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6286    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6287    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6288      regprop(r, sv, (regnode*)data.start_class);
6289      PerlIO_printf(Perl_debug_log,
6290          "synthetic stclass \"%s\".\n",
6291          SvPVX_const(sv));});
6292   }
6293
6294   /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6295   if (longest_fixed_length > longest_float_length) {
6296    r->check_end_shift = r->anchored_end_shift;
6297    r->check_substr = r->anchored_substr;
6298    r->check_utf8 = r->anchored_utf8;
6299    r->check_offset_min = r->check_offset_max = r->anchored_offset;
6300    if (r->extflags & RXf_ANCH_SINGLE)
6301     r->extflags |= RXf_NOSCAN;
6302   }
6303   else {
6304    r->check_end_shift = r->float_end_shift;
6305    r->check_substr = r->float_substr;
6306    r->check_utf8 = r->float_utf8;
6307    r->check_offset_min = r->float_min_offset;
6308    r->check_offset_max = r->float_max_offset;
6309   }
6310   /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6311   This should be changed ASAP!  */
6312   if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6313    r->extflags |= RXf_USE_INTUIT;
6314    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6315     r->extflags |= RXf_INTUIT_TAIL;
6316   }
6317   /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6318   if ( (STRLEN)minlen < longest_float_length )
6319    minlen= longest_float_length;
6320   if ( (STRLEN)minlen < longest_fixed_length )
6321    minlen= longest_fixed_length;
6322   */
6323  }
6324  else {
6325   /* Several toplevels. Best we can is to set minlen. */
6326   I32 fake;
6327   struct regnode_charclass_class ch_class;
6328   I32 last_close = 0;
6329
6330   DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6331
6332   scan = ri->program + 1;
6333   cl_init(pRExC_state, &ch_class);
6334   data.start_class = &ch_class;
6335   data.last_closep = &last_close;
6336
6337
6338   minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6339    &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6340
6341   CHECK_RESTUDY_GOTO_butfirst(NOOP);
6342
6343   r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6344     = r->float_substr = r->float_utf8 = NULL;
6345
6346   if (! TEST_SSC_EOS(data.start_class)
6347    && !cl_is_anything(data.start_class))
6348   {
6349    const U32 n = add_data(pRExC_state, 1, "f");
6350    OP(data.start_class) = ANYOF_SYNTHETIC;
6351
6352    Newx(RExC_rxi->data->data[n], 1,
6353     struct regnode_charclass_class);
6354    StructCopy(data.start_class,
6355      (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6356      struct regnode_charclass_class);
6357    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6358    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6359    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6360      regprop(r, sv, (regnode*)data.start_class);
6361      PerlIO_printf(Perl_debug_log,
6362          "synthetic stclass \"%s\".\n",
6363          SvPVX_const(sv));});
6364   }
6365  }
6366
6367  /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6368  the "real" pattern. */
6369  DEBUG_OPTIMISE_r({
6370   PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6371      (IV)minlen, (IV)r->minlen);
6372  });
6373  r->minlenret = minlen;
6374  if (r->minlen < minlen)
6375   r->minlen = minlen;
6376
6377  if (RExC_seen & REG_SEEN_GPOS)
6378   r->extflags |= RXf_GPOS_SEEN;
6379  if (RExC_seen & REG_SEEN_LOOKBEHIND)
6380   r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
6381  if (pRExC_state->num_code_blocks)
6382   r->extflags |= RXf_EVAL_SEEN;
6383  if (RExC_seen & REG_SEEN_CANY)
6384   r->extflags |= RXf_CANY_SEEN;
6385  if (RExC_seen & REG_SEEN_VERBARG)
6386  {
6387   r->intflags |= PREGf_VERBARG_SEEN;
6388   r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
6389  }
6390  if (RExC_seen & REG_SEEN_CUTGROUP)
6391   r->intflags |= PREGf_CUTGROUP_SEEN;
6392  if (pm_flags & PMf_USE_RE_EVAL)
6393   r->intflags |= PREGf_USE_RE_EVAL;
6394  if (RExC_paren_names)
6395   RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6396  else
6397   RXp_PAREN_NAMES(r) = NULL;
6398
6399  {
6400   regnode *first = ri->program + 1;
6401   U8 fop = OP(first);
6402   regnode *next = NEXTOPER(first);
6403   U8 nop = OP(next);
6404
6405   if (PL_regkind[fop] == NOTHING && nop == END)
6406    r->extflags |= RXf_NULL;
6407   else if (PL_regkind[fop] == BOL && nop == END)
6408    r->extflags |= RXf_START_ONLY;
6409   else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
6410    r->extflags |= RXf_WHITE;
6411   else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
6412    r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6413
6414  }
6415 #ifdef DEBUGGING
6416  if (RExC_paren_names) {
6417   ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6418   ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6419  } else
6420 #endif
6421   ri->name_list_idx = 0;
6422
6423  if (RExC_recurse_count) {
6424   for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6425    const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6426    ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6427   }
6428  }
6429  Newxz(r->offs, RExC_npar, regexp_paren_pair);
6430  /* assume we don't need to swap parens around before we match */
6431
6432  DEBUG_DUMP_r({
6433   PerlIO_printf(Perl_debug_log,"Final program:\n");
6434   regdump(r);
6435  });
6436 #ifdef RE_TRACK_PATTERN_OFFSETS
6437  DEBUG_OFFSETS_r(if (ri->u.offsets) {
6438   const U32 len = ri->u.offsets[0];
6439   U32 i;
6440   GET_RE_DEBUG_FLAGS_DECL;
6441   PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6442   for (i = 1; i <= len; i++) {
6443    if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6444     PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6445     (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6446    }
6447   PerlIO_printf(Perl_debug_log, "\n");
6448  });
6449 #endif
6450
6451 #ifdef USE_ITHREADS
6452  /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6453  * by setting the regexp SV to readonly-only instead. If the
6454  * pattern's been recompiled, the USEDness should remain. */
6455  if (old_re && SvREADONLY(old_re))
6456   SvREADONLY_on(rx);
6457 #endif
6458  return rx;
6459 }
6460
6461
6462 SV*
6463 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6464      const U32 flags)
6465 {
6466  PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6467
6468  PERL_UNUSED_ARG(value);
6469
6470  if (flags & RXapif_FETCH) {
6471   return reg_named_buff_fetch(rx, key, flags);
6472  } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6473   Perl_croak_no_modify();
6474   return NULL;
6475  } else if (flags & RXapif_EXISTS) {
6476   return reg_named_buff_exists(rx, key, flags)
6477    ? &PL_sv_yes
6478    : &PL_sv_no;
6479  } else if (flags & RXapif_REGNAMES) {
6480   return reg_named_buff_all(rx, flags);
6481  } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6482   return reg_named_buff_scalar(rx, flags);
6483  } else {
6484   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6485   return NULL;
6486  }
6487 }
6488
6489 SV*
6490 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6491       const U32 flags)
6492 {
6493  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6494  PERL_UNUSED_ARG(lastkey);
6495
6496  if (flags & RXapif_FIRSTKEY)
6497   return reg_named_buff_firstkey(rx, flags);
6498  else if (flags & RXapif_NEXTKEY)
6499   return reg_named_buff_nextkey(rx, flags);
6500  else {
6501   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6502   return NULL;
6503  }
6504 }
6505
6506 SV*
6507 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6508       const U32 flags)
6509 {
6510  AV *retarray = NULL;
6511  SV *ret;
6512  struct regexp *const rx = ReANY(r);
6513
6514  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6515
6516  if (flags & RXapif_ALL)
6517   retarray=newAV();
6518
6519  if (rx && RXp_PAREN_NAMES(rx)) {
6520   HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6521   if (he_str) {
6522    IV i;
6523    SV* sv_dat=HeVAL(he_str);
6524    I32 *nums=(I32*)SvPVX(sv_dat);
6525    for ( i=0; i<SvIVX(sv_dat); i++ ) {
6526     if ((I32)(rx->nparens) >= nums[i]
6527      && rx->offs[nums[i]].start != -1
6528      && rx->offs[nums[i]].end != -1)
6529     {
6530      ret = newSVpvs("");
6531      CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6532      if (!retarray)
6533       return ret;
6534     } else {
6535      if (retarray)
6536       ret = newSVsv(&PL_sv_undef);
6537     }
6538     if (retarray)
6539      av_push(retarray, ret);
6540    }
6541    if (retarray)
6542     return newRV_noinc(MUTABLE_SV(retarray));
6543   }
6544  }
6545  return NULL;
6546 }
6547
6548 bool
6549 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6550       const U32 flags)
6551 {
6552  struct regexp *const rx = ReANY(r);
6553
6554  PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6555
6556  if (rx && RXp_PAREN_NAMES(rx)) {
6557   if (flags & RXapif_ALL) {
6558    return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6559   } else {
6560    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6561    if (sv) {
6562     SvREFCNT_dec_NN(sv);
6563     return TRUE;
6564    } else {
6565     return FALSE;
6566    }
6567   }
6568  } else {
6569   return FALSE;
6570  }
6571 }
6572
6573 SV*
6574 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6575 {
6576  struct regexp *const rx = ReANY(r);
6577
6578  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6579
6580  if ( rx && RXp_PAREN_NAMES(rx) ) {
6581   (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6582
6583   return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6584  } else {
6585   return FALSE;
6586  }
6587 }
6588
6589 SV*
6590 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6591 {
6592  struct regexp *const rx = ReANY(r);
6593  GET_RE_DEBUG_FLAGS_DECL;
6594
6595  PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6596
6597  if (rx && RXp_PAREN_NAMES(rx)) {
6598   HV *hv = RXp_PAREN_NAMES(rx);
6599   HE *temphe;
6600   while ( (temphe = hv_iternext_flags(hv,0)) ) {
6601    IV i;
6602    IV parno = 0;
6603    SV* sv_dat = HeVAL(temphe);
6604    I32 *nums = (I32*)SvPVX(sv_dat);
6605    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6606     if ((I32)(rx->lastparen) >= nums[i] &&
6607      rx->offs[nums[i]].start != -1 &&
6608      rx->offs[nums[i]].end != -1)
6609     {
6610      parno = nums[i];
6611      break;
6612     }
6613    }
6614    if (parno || flags & RXapif_ALL) {
6615     return newSVhek(HeKEY_hek(temphe));
6616    }
6617   }
6618  }
6619  return NULL;
6620 }
6621
6622 SV*
6623 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6624 {
6625  SV *ret;
6626  AV *av;
6627  I32 length;
6628  struct regexp *const rx = ReANY(r);
6629
6630  PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6631
6632  if (rx && RXp_PAREN_NAMES(rx)) {
6633   if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6634    return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6635   } else if (flags & RXapif_ONE) {
6636    ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6637    av = MUTABLE_AV(SvRV(ret));
6638    length = av_len(av);
6639    SvREFCNT_dec_NN(ret);
6640    return newSViv(length + 1);
6641   } else {
6642    Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6643    return NULL;
6644   }
6645  }
6646  return &PL_sv_undef;
6647 }
6648
6649 SV*
6650 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6651 {
6652  struct regexp *const rx = ReANY(r);
6653  AV *av = newAV();
6654
6655  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6656
6657  if (rx && RXp_PAREN_NAMES(rx)) {
6658   HV *hv= RXp_PAREN_NAMES(rx);
6659   HE *temphe;
6660   (void)hv_iterinit(hv);
6661   while ( (temphe = hv_iternext_flags(hv,0)) ) {
6662    IV i;
6663    IV parno = 0;
6664    SV* sv_dat = HeVAL(temphe);
6665    I32 *nums = (I32*)SvPVX(sv_dat);
6666    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6667     if ((I32)(rx->lastparen) >= nums[i] &&
6668      rx->offs[nums[i]].start != -1 &&
6669      rx->offs[nums[i]].end != -1)
6670     {
6671      parno = nums[i];
6672      break;
6673     }
6674    }
6675    if (parno || flags & RXapif_ALL) {
6676     av_push(av, newSVhek(HeKEY_hek(temphe)));
6677    }
6678   }
6679  }
6680
6681  return newRV_noinc(MUTABLE_SV(av));
6682 }
6683
6684 void
6685 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6686        SV * const sv)
6687 {
6688  struct regexp *const rx = ReANY(r);
6689  char *s = NULL;
6690  I32 i = 0;
6691  I32 s1, t1;
6692  I32 n = paren;
6693
6694  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6695
6696  if ( (    n == RX_BUFF_IDX_CARET_PREMATCH
6697   || n == RX_BUFF_IDX_CARET_FULLMATCH
6698   || n == RX_BUFF_IDX_CARET_POSTMATCH
6699   )
6700   && !(rx->extflags & RXf_PMf_KEEPCOPY)
6701  )
6702   goto ret_undef;
6703
6704  if (!rx->subbeg)
6705   goto ret_undef;
6706
6707  if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6708   /* no need to distinguish between them any more */
6709   n = RX_BUFF_IDX_FULLMATCH;
6710
6711  if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6712   && rx->offs[0].start != -1)
6713  {
6714   /* $`, ${^PREMATCH} */
6715   i = rx->offs[0].start;
6716   s = rx->subbeg;
6717  }
6718  else
6719  if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6720   && rx->offs[0].end != -1)
6721  {
6722   /* $', ${^POSTMATCH} */
6723   s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6724   i = rx->sublen + rx->suboffset - rx->offs[0].end;
6725  }
6726  else
6727  if ( 0 <= n && n <= (I32)rx->nparens &&
6728   (s1 = rx->offs[n].start) != -1 &&
6729   (t1 = rx->offs[n].end) != -1)
6730  {
6731   /* $&, ${^MATCH},  $1 ... */
6732   i = t1 - s1;
6733   s = rx->subbeg + s1 - rx->suboffset;
6734  } else {
6735   goto ret_undef;
6736  }
6737
6738  assert(s >= rx->subbeg);
6739  assert(rx->sublen >= (s - rx->subbeg) + i );
6740  if (i >= 0) {
6741 #if NO_TAINT_SUPPORT
6742   sv_setpvn(sv, s, i);
6743 #else
6744   const int oldtainted = TAINT_get;
6745   TAINT_NOT;
6746   sv_setpvn(sv, s, i);
6747   TAINT_set(oldtainted);
6748 #endif
6749   if ( (rx->extflags & RXf_CANY_SEEN)
6750    ? (RXp_MATCH_UTF8(rx)
6751       && (!i || is_utf8_string((U8*)s, i)))
6752    : (RXp_MATCH_UTF8(rx)) )
6753   {
6754    SvUTF8_on(sv);
6755   }
6756   else
6757    SvUTF8_off(sv);
6758   if (TAINTING_get) {
6759    if (RXp_MATCH_TAINTED(rx)) {
6760     if (SvTYPE(sv) >= SVt_PVMG) {
6761      MAGIC* const mg = SvMAGIC(sv);
6762      MAGIC* mgt;
6763      TAINT;
6764      SvMAGIC_set(sv, mg->mg_moremagic);
6765      SvTAINT(sv);
6766      if ((mgt = SvMAGIC(sv))) {
6767       mg->mg_moremagic = mgt;
6768       SvMAGIC_set(sv, mg);
6769      }
6770     } else {
6771      TAINT;
6772      SvTAINT(sv);
6773     }
6774    } else
6775     SvTAINTED_off(sv);
6776   }
6777  } else {
6778  ret_undef:
6779   sv_setsv(sv,&PL_sv_undef);
6780   return;
6781  }
6782 }
6783
6784 void
6785 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6786               SV const * const value)
6787 {
6788  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6789
6790  PERL_UNUSED_ARG(rx);
6791  PERL_UNUSED_ARG(paren);
6792  PERL_UNUSED_ARG(value);
6793
6794  if (!PL_localizing)
6795   Perl_croak_no_modify();
6796 }
6797
6798 I32
6799 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6800        const I32 paren)
6801 {
6802  struct regexp *const rx = ReANY(r);
6803  I32 i;
6804  I32 s1, t1;
6805
6806  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6807
6808  /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6809  switch (paren) {
6810  case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6811   if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6812    goto warn_undef;
6813   /*FALLTHROUGH*/
6814
6815  case RX_BUFF_IDX_PREMATCH:       /* $` */
6816   if (rx->offs[0].start != -1) {
6817       i = rx->offs[0].start;
6818       if (i > 0) {
6819         s1 = 0;
6820         t1 = i;
6821         goto getlen;
6822       }
6823    }
6824   return 0;
6825
6826  case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6827   if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6828    goto warn_undef;
6829  case RX_BUFF_IDX_POSTMATCH:       /* $' */
6830    if (rx->offs[0].end != -1) {
6831       i = rx->sublen - rx->offs[0].end;
6832       if (i > 0) {
6833         s1 = rx->offs[0].end;
6834         t1 = rx->sublen;
6835         goto getlen;
6836       }
6837    }
6838   return 0;
6839
6840  case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6841   if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6842    goto warn_undef;
6843   /*FALLTHROUGH*/
6844
6845  /* $& / ${^MATCH}, $1, $2, ... */
6846  default:
6847    if (paren <= (I32)rx->nparens &&
6848    (s1 = rx->offs[paren].start) != -1 &&
6849    (t1 = rx->offs[paren].end) != -1)
6850    {
6851    i = t1 - s1;
6852    goto getlen;
6853   } else {
6854   warn_undef:
6855    if (ckWARN(WARN_UNINITIALIZED))
6856     report_uninit((const SV *)sv);
6857    return 0;
6858   }
6859  }
6860   getlen:
6861  if (i > 0 && RXp_MATCH_UTF8(rx)) {
6862   const char * const s = rx->subbeg - rx->suboffset + s1;
6863   const U8 *ep;
6864   STRLEN el;
6865
6866   i = t1 - s1;
6867   if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6868       i = el;
6869  }
6870  return i;
6871 }
6872
6873 SV*
6874 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6875 {
6876  PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6877   PERL_UNUSED_ARG(rx);
6878   if (0)
6879    return NULL;
6880   else
6881    return newSVpvs("Regexp");
6882 }
6883
6884 /* Scans the name of a named buffer from the pattern.
6885  * If flags is REG_RSN_RETURN_NULL returns null.
6886  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6887  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6888  * to the parsed name as looked up in the RExC_paren_names hash.
6889  * If there is an error throws a vFAIL().. type exception.
6890  */
6891
6892 #define REG_RSN_RETURN_NULL    0
6893 #define REG_RSN_RETURN_NAME    1
6894 #define REG_RSN_RETURN_DATA    2
6895
6896 STATIC SV*
6897 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6898 {
6899  char *name_start = RExC_parse;
6900
6901  PERL_ARGS_ASSERT_REG_SCAN_NAME;
6902
6903  if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6904   /* skip IDFIRST by using do...while */
6905   if (UTF)
6906    do {
6907     RExC_parse += UTF8SKIP(RExC_parse);
6908    } while (isWORDCHAR_utf8((U8*)RExC_parse));
6909   else
6910    do {
6911     RExC_parse++;
6912    } while (isWORDCHAR(*RExC_parse));
6913  } else {
6914   RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6915   vFAIL("Group name must start with a non-digit word character");
6916  }
6917  if ( flags ) {
6918   SV* sv_name
6919    = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6920        SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6921   if ( flags == REG_RSN_RETURN_NAME)
6922    return sv_name;
6923   else if (flags==REG_RSN_RETURN_DATA) {
6924    HE *he_str = NULL;
6925    SV *sv_dat = NULL;
6926    if ( ! sv_name )      /* should not happen*/
6927     Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6928    if (RExC_paren_names)
6929     he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6930    if ( he_str )
6931     sv_dat = HeVAL(he_str);
6932    if ( ! sv_dat )
6933     vFAIL("Reference to nonexistent named group");
6934    return sv_dat;
6935   }
6936   else {
6937    Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6938      (unsigned long) flags);
6939   }
6940   assert(0); /* NOT REACHED */
6941  }
6942  return NULL;
6943 }
6944
6945 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6946  int rem=(int)(RExC_end - RExC_parse);                       \
6947  int cut;                                                    \
6948  int num;                                                    \
6949  int iscut=0;                                                \
6950  if (rem>10) {                                               \
6951   rem=10;                                                 \
6952   iscut=1;                                                \
6953  }                                                           \
6954  cut=10-rem;                                                 \
6955  if (RExC_lastparse!=RExC_parse)                             \
6956   PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6957    rem, RExC_parse,                                    \
6958    cut + 4,                                            \
6959    iscut ? "..." : "<"                                 \
6960   );                                                      \
6961  else                                                        \
6962   PerlIO_printf(Perl_debug_log,"%16s","");                \
6963                 \
6964  if (SIZE_ONLY)                                              \
6965  num = RExC_size + 1;                                     \
6966  else                                                        \
6967  num=REG_NODE_NUM(RExC_emit);                             \
6968  if (RExC_lastnum!=num)                                      \
6969  PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6970  else                                                        \
6971  PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6972  PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6973   (int)((depth*2)), "",                                   \
6974   (funcname)                                              \
6975  );                                                          \
6976  RExC_lastnum=num;                                           \
6977  RExC_lastparse=RExC_parse;                                  \
6978 })
6979
6980
6981
6982 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6983  DEBUG_PARSE_MSG((funcname));                            \
6984  PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6985 })
6986 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
6987  DEBUG_PARSE_MSG((funcname));                            \
6988  PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
6989 })
6990
6991 /* This section of code defines the inversion list object and its methods.  The
6992  * interfaces are highly subject to change, so as much as possible is static to
6993  * this file.  An inversion list is here implemented as a malloc'd C UV array
6994  * with some added info that is placed as UVs at the beginning in a header
6995  * portion.  An inversion list for Unicode is an array of code points, sorted
6996  * by ordinal number.  The zeroth element is the first code point in the list.
6997  * The 1th element is the first element beyond that not in the list.  In other
6998  * words, the first range is
6999  *  invlist[0]..(invlist[1]-1)
7000  * The other ranges follow.  Thus every element whose index is divisible by two
7001  * marks the beginning of a range that is in the list, and every element not
7002  * divisible by two marks the beginning of a range not in the list.  A single
7003  * element inversion list that contains the single code point N generally
7004  * consists of two elements
7005  *  invlist[0] == N
7006  *  invlist[1] == N+1
7007  * (The exception is when N is the highest representable value on the
7008  * machine, in which case the list containing just it would be a single
7009  * element, itself.  By extension, if the last range in the list extends to
7010  * infinity, then the first element of that range will be in the inversion list
7011  * at a position that is divisible by two, and is the final element in the
7012  * list.)
7013  * Taking the complement (inverting) an inversion list is quite simple, if the
7014  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7015  * This implementation reserves an element at the beginning of each inversion
7016  * list to contain 0 when the list contains 0, and contains 1 otherwise.  The
7017  * actual beginning of the list is either that element if 0, or the next one if
7018  * 1.
7019  *
7020  * More about inversion lists can be found in "Unicode Demystified"
7021  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7022  * More will be coming when functionality is added later.
7023  *
7024  * The inversion list data structure is currently implemented as an SV pointing
7025  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7026  * array of UV whose memory management is automatically handled by the existing
7027  * facilities for SV's.
7028  *
7029  * Some of the methods should always be private to the implementation, and some
7030  * should eventually be made public */
7031
7032 /* The header definitions are in F<inline_invlist.c> */
7033 #define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV))
7034 #define FROM_INTERNAL_SIZE(x) (((x)/ sizeof(UV)) - HEADER_LENGTH)
7035
7036 #define INVLIST_INITIAL_LEN 10
7037
7038 PERL_STATIC_INLINE UV*
7039 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7040 {
7041  /* Returns a pointer to the first element in the inversion list's array.
7042  * This is called upon initialization of an inversion list.  Where the
7043  * array begins depends on whether the list has the code point U+0000
7044  * in it or not.  The other parameter tells it whether the code that
7045  * follows this call is about to put a 0 in the inversion list or not.
7046  * The first element is either the element with 0, if 0, or the next one,
7047  * if 1 */
7048
7049  UV* zero = get_invlist_zero_addr(invlist);
7050
7051  PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7052
7053  /* Must be empty */
7054  assert(! *_get_invlist_len_addr(invlist));
7055
7056  /* 1^1 = 0; 1^0 = 1 */
7057  *zero = 1 ^ will_have_0;
7058  return zero + *zero;
7059 }
7060
7061 PERL_STATIC_INLINE UV*
7062 S_invlist_array(pTHX_ SV* const invlist)
7063 {
7064  /* Returns the pointer to the inversion list's array.  Every time the
7065  * length changes, this needs to be called in case malloc or realloc moved
7066  * it */
7067
7068  PERL_ARGS_ASSERT_INVLIST_ARRAY;
7069
7070  /* Must not be empty.  If these fail, you probably didn't check for <len>
7071  * being non-zero before trying to get the array */
7072  assert(*_get_invlist_len_addr(invlist));
7073  assert(*get_invlist_zero_addr(invlist) == 0
7074   || *get_invlist_zero_addr(invlist) == 1);
7075
7076  /* The array begins either at the element reserved for zero if the
7077  * list contains 0 (that element will be set to 0), or otherwise the next
7078  * element (in which case the reserved element will be set to 1). */
7079  return (UV *) (get_invlist_zero_addr(invlist)
7080     + *get_invlist_zero_addr(invlist));
7081 }
7082
7083 PERL_STATIC_INLINE void
7084 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7085 {
7086  /* Sets the current number of elements stored in the inversion list */
7087
7088  PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7089
7090  *_get_invlist_len_addr(invlist) = len;
7091
7092  assert(len <= SvLEN(invlist));
7093
7094  SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7095  /* If the list contains U+0000, that element is part of the header,
7096  * and should not be counted as part of the array.  It will contain
7097  * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
7098  * subtract:
7099  * SvCUR_set(invlist,
7100  *    TO_INTERNAL_SIZE(len
7101  *       - (*get_invlist_zero_addr(inv_list) ^ 1)));
7102  * But, this is only valid if len is not 0.  The consequences of not doing
7103  * this is that the memory allocation code may think that 1 more UV is
7104  * being used than actually is, and so might do an unnecessary grow.  That
7105  * seems worth not bothering to make this the precise amount.
7106  *
7107  * Note that when inverting, SvCUR shouldn't change */
7108 }
7109
7110 PERL_STATIC_INLINE IV*
7111 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7112 {
7113  /* Return the address of the UV that is reserved to hold the cached index
7114  * */
7115
7116  PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7117
7118  return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7119 }
7120
7121 PERL_STATIC_INLINE IV
7122 S_invlist_previous_index(pTHX_ SV* const invlist)
7123 {
7124  /* Returns cached index of previous search */
7125
7126  PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7127
7128  return *get_invlist_previous_index_addr(invlist);
7129 }
7130
7131 PERL_STATIC_INLINE void
7132 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7133 {
7134  /* Caches <index> for later retrieval */
7135
7136  PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7137
7138  assert(index == 0 || index < (int) _invlist_len(invlist));
7139
7140  *get_invlist_previous_index_addr(invlist) = index;
7141 }
7142
7143 PERL_STATIC_INLINE UV
7144 S_invlist_max(pTHX_ SV* const invlist)
7145 {
7146  /* Returns the maximum number of elements storable in the inversion list's
7147  * array, without having to realloc() */
7148
7149  PERL_ARGS_ASSERT_INVLIST_MAX;
7150
7151  return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7152   ? _invlist_len(invlist)
7153   : FROM_INTERNAL_SIZE(SvLEN(invlist));
7154 }
7155
7156 PERL_STATIC_INLINE UV*
7157 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7158 {
7159  /* Return the address of the UV that is reserved to hold 0 if the inversion
7160  * list contains 0.  This has to be the last element of the heading, as the
7161  * list proper starts with either it if 0, or the next element if not.
7162  * (But we force it to contain either 0 or 1) */
7163
7164  PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7165
7166  return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7167 }
7168
7169 #ifndef PERL_IN_XSUB_RE
7170 SV*
7171 Perl__new_invlist(pTHX_ IV initial_size)
7172 {
7173
7174  /* Return a pointer to a newly constructed inversion list, with enough
7175  * space to store 'initial_size' elements.  If that number is negative, a
7176  * system default is used instead */
7177
7178  SV* new_list;
7179
7180  if (initial_size < 0) {
7181   initial_size = INVLIST_INITIAL_LEN;
7182  }
7183
7184  /* Allocate the initial space */
7185  new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7186  invlist_set_len(new_list, 0);
7187
7188  /* Force iterinit() to be used to get iteration to work */
7189  *get_invlist_iter_addr(new_list) = UV_MAX;
7190
7191  /* This should force a segfault if a method doesn't initialize this
7192  * properly */
7193  *get_invlist_zero_addr(new_list) = UV_MAX;
7194
7195  *get_invlist_previous_index_addr(new_list) = 0;
7196  *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7197 #if HEADER_LENGTH != 5
7198 #   error Need to regenerate INVLIST_VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7199 #endif
7200
7201  return new_list;
7202 }
7203 #endif
7204
7205 STATIC SV*
7206 S__new_invlist_C_array(pTHX_ UV* list)
7207 {
7208  /* Return a pointer to a newly constructed inversion list, initialized to
7209  * point to <list>, which has to be in the exact correct inversion list
7210  * form, including internal fields.  Thus this is a dangerous routine that
7211  * should not be used in the wrong hands */
7212
7213  SV* invlist = newSV_type(SVt_PV);
7214
7215  PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7216
7217  SvPV_set(invlist, (char *) list);
7218  SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7219        shouldn't touch it */
7220  SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7221
7222  if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7223   Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7224  }
7225
7226  /* Initialize the iteration pointer.
7227  * XXX This could be done at compile time in charclass_invlists.h, but I
7228  * (khw) am not confident that the suffixes for specifying the C constant
7229  * UV_MAX are portable, e.g.  'ull' on a 32 bit machine that is configured
7230  * to use 64 bits; might need a Configure probe */
7231  invlist_iterfinish(invlist);
7232
7233  return invlist;
7234 }
7235
7236 STATIC void
7237 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7238 {
7239  /* Grow the maximum size of an inversion list */
7240
7241  PERL_ARGS_ASSERT_INVLIST_EXTEND;
7242
7243  SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7244 }
7245
7246 PERL_STATIC_INLINE void
7247 S_invlist_trim(pTHX_ SV* const invlist)
7248 {
7249  PERL_ARGS_ASSERT_INVLIST_TRIM;
7250
7251  /* Change the length of the inversion list to how many entries it currently
7252  * has */
7253
7254  SvPV_shrink_to_cur((SV *) invlist);
7255 }
7256
7257 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7258
7259 STATIC void
7260 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7261 {
7262    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7263  * the end of the inversion list.  The range must be above any existing
7264  * ones. */
7265
7266  UV* array;
7267  UV max = invlist_max(invlist);
7268  UV len = _invlist_len(invlist);
7269
7270  PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7271
7272  if (len == 0) { /* Empty lists must be initialized */
7273   array = _invlist_array_init(invlist, start == 0);
7274  }
7275  else {
7276   /* Here, the existing list is non-empty. The current max entry in the
7277   * list is generally the first value not in the set, except when the
7278   * set extends to the end of permissible values, in which case it is
7279   * the first entry in that final set, and so this call is an attempt to
7280   * append out-of-order */
7281
7282   UV final_element = len - 1;
7283   array = invlist_array(invlist);
7284   if (array[final_element] > start
7285    || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7286   {
7287    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7288      array[final_element], start,
7289      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7290   }
7291
7292   /* Here, it is a legal append.  If the new range begins with the first
7293   * value not in the set, it is extending the set, so the new first
7294   * value not in the set is one greater than the newly extended range.
7295   * */
7296   if (array[final_element] == start) {
7297    if (end != UV_MAX) {
7298     array[final_element] = end + 1;
7299    }
7300    else {
7301     /* But if the end is the maximum representable on the machine,
7302     * just let the range that this would extend to have no end */
7303     invlist_set_len(invlist, len - 1);
7304    }
7305    return;
7306   }
7307  }
7308
7309  /* Here the new range doesn't extend any existing set.  Add it */
7310
7311  len += 2; /* Includes an element each for the start and end of range */
7312
7313  /* If overflows the existing space, extend, which may cause the array to be
7314  * moved */
7315  if (max < len) {
7316   invlist_extend(invlist, len);
7317   invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7318           failure in invlist_array() */
7319   array = invlist_array(invlist);
7320  }
7321  else {
7322   invlist_set_len(invlist, len);
7323  }
7324
7325  /* The next item on the list starts the range, the one after that is
7326  * one past the new range.  */
7327  array[len - 2] = start;
7328  if (end != UV_MAX) {
7329   array[len - 1] = end + 1;
7330  }
7331  else {
7332   /* But if the end is the maximum representable on the machine, just let
7333   * the range have no end */
7334   invlist_set_len(invlist, len - 1);
7335  }
7336 }
7337
7338 #ifndef PERL_IN_XSUB_RE
7339
7340 IV
7341 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7342 {
7343  /* Searches the inversion list for the entry that contains the input code
7344  * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7345  * return value is the index into the list's array of the range that
7346  * contains <cp> */
7347
7348  IV low = 0;
7349  IV mid;
7350  IV high = _invlist_len(invlist);
7351  const IV highest_element = high - 1;
7352  const UV* array;
7353
7354  PERL_ARGS_ASSERT__INVLIST_SEARCH;
7355
7356  /* If list is empty, return failure. */
7357  if (high == 0) {
7358   return -1;
7359  }
7360
7361  /* (We can't get the array unless we know the list is non-empty) */
7362  array = invlist_array(invlist);
7363
7364  mid = invlist_previous_index(invlist);
7365  assert(mid >=0 && mid <= highest_element);
7366
7367  /* <mid> contains the cache of the result of the previous call to this
7368  * function (0 the first time).  See if this call is for the same result,
7369  * or if it is for mid-1.  This is under the theory that calls to this
7370  * function will often be for related code points that are near each other.
7371  * And benchmarks show that caching gives better results.  We also test
7372  * here if the code point is within the bounds of the list.  These tests
7373  * replace others that would have had to be made anyway to make sure that
7374  * the array bounds were not exceeded, and these give us extra information
7375  * at the same time */
7376  if (cp >= array[mid]) {
7377   if (cp >= array[highest_element]) {
7378    return highest_element;
7379   }
7380
7381   /* Here, array[mid] <= cp < array[highest_element].  This means that
7382   * the final element is not the answer, so can exclude it; it also
7383   * means that <mid> is not the final element, so can refer to 'mid + 1'
7384   * safely */
7385   if (cp < array[mid + 1]) {
7386    return mid;
7387   }
7388   high--;
7389   low = mid + 1;
7390  }
7391  else { /* cp < aray[mid] */
7392   if (cp < array[0]) { /* Fail if outside the array */
7393    return -1;
7394   }
7395   high = mid;
7396   if (cp >= array[mid - 1]) {
7397    goto found_entry;
7398   }
7399  }
7400
7401  /* Binary search.  What we are looking for is <i> such that
7402  * array[i] <= cp < array[i+1]
7403  * The loop below converges on the i+1.  Note that there may not be an
7404  * (i+1)th element in the array, and things work nonetheless */
7405  while (low < high) {
7406   mid = (low + high) / 2;
7407   assert(mid <= highest_element);
7408   if (array[mid] <= cp) { /* cp >= array[mid] */
7409    low = mid + 1;
7410
7411    /* We could do this extra test to exit the loop early.
7412    if (cp < array[low]) {
7413     return mid;
7414    }
7415    */
7416   }
7417   else { /* cp < array[mid] */
7418    high = mid;
7419   }
7420  }
7421
7422   found_entry:
7423  high--;
7424  invlist_set_previous_index(invlist, high);
7425  return high;
7426 }
7427
7428 void
7429 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7430 {
7431  /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7432  * but is used when the swash has an inversion list.  This makes this much
7433  * faster, as it uses a binary search instead of a linear one.  This is
7434  * intimately tied to that function, and perhaps should be in utf8.c,
7435  * except it is intimately tied to inversion lists as well.  It assumes
7436  * that <swatch> is all 0's on input */
7437
7438  UV current = start;
7439  const IV len = _invlist_len(invlist);
7440  IV i;
7441  const UV * array;
7442
7443  PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7444
7445  if (len == 0) { /* Empty inversion list */
7446   return;
7447  }
7448
7449  array = invlist_array(invlist);
7450
7451  /* Find which element it is */
7452  i = _invlist_search(invlist, start);
7453
7454  /* We populate from <start> to <end> */
7455  while (current < end) {
7456   UV upper;
7457
7458   /* The inversion list gives the results for every possible code point
7459   * after the first one in the list.  Only those ranges whose index is
7460   * even are ones that the inversion list matches.  For the odd ones,
7461   * and if the initial code point is not in the list, we have to skip
7462   * forward to the next element */
7463   if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7464    i++;
7465    if (i >= len) { /* Finished if beyond the end of the array */
7466     return;
7467    }
7468    current = array[i];
7469    if (current >= end) {   /* Finished if beyond the end of what we
7470          are populating */
7471     if (LIKELY(end < UV_MAX)) {
7472      return;
7473     }
7474
7475     /* We get here when the upper bound is the maximum
7476     * representable on the machine, and we are looking for just
7477     * that code point.  Have to special case it */
7478     i = len;
7479     goto join_end_of_list;
7480    }
7481   }
7482   assert(current >= start);
7483
7484   /* The current range ends one below the next one, except don't go past
7485   * <end> */
7486   i++;
7487   upper = (i < len && array[i] < end) ? array[i] : end;
7488
7489   /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7490   * for each code point in it */
7491   for (; current < upper; current++) {
7492    const STRLEN offset = (STRLEN)(current - start);
7493    swatch[offset >> 3] |= 1 << (offset & 7);
7494   }
7495
7496  join_end_of_list:
7497
7498   /* Quit if at the end of the list */
7499   if (i >= len) {
7500
7501    /* But first, have to deal with the highest possible code point on
7502    * the platform.  The previous code assumes that <end> is one
7503    * beyond where we want to populate, but that is impossible at the
7504    * platform's infinity, so have to handle it specially */
7505    if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7506    {
7507     const STRLEN offset = (STRLEN)(end - start);
7508     swatch[offset >> 3] |= 1 << (offset & 7);
7509    }
7510    return;
7511   }
7512
7513   /* Advance to the next range, which will be for code points not in the
7514   * inversion list */
7515   current = array[i];
7516  }
7517
7518  return;
7519 }
7520
7521 void
7522 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7523 {
7524  /* Take the union of two inversion lists and point <output> to it.  *output
7525  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7526  * the reference count to that list will be decremented.  The first list,
7527  * <a>, may be NULL, in which case a copy of the second list is returned.
7528  * If <complement_b> is TRUE, the union is taken of the complement
7529  * (inversion) of <b> instead of b itself.
7530  *
7531  * The basis for this comes from "Unicode Demystified" Chapter 13 by
7532  * Richard Gillam, published by Addison-Wesley, and explained at some
7533  * length there.  The preface says to incorporate its examples into your
7534  * code at your own risk.
7535  *
7536  * The algorithm is like a merge sort.
7537  *
7538  * XXX A potential performance improvement is to keep track as we go along
7539  * if only one of the inputs contributes to the result, meaning the other
7540  * is a subset of that one.  In that case, we can skip the final copy and
7541  * return the larger of the input lists, but then outside code might need
7542  * to keep track of whether to free the input list or not */
7543
7544  UV* array_a;    /* a's array */
7545  UV* array_b;
7546  UV len_a;     /* length of a's array */
7547  UV len_b;
7548
7549  SV* u;   /* the resulting union */
7550  UV* array_u;
7551  UV len_u;
7552
7553  UV i_a = 0;      /* current index into a's array */
7554  UV i_b = 0;
7555  UV i_u = 0;
7556
7557  /* running count, as explained in the algorithm source book; items are
7558  * stopped accumulating and are output when the count changes to/from 0.
7559  * The count is incremented when we start a range that's in the set, and
7560  * decremented when we start a range that's not in the set.  So its range
7561  * is 0 to 2.  Only when the count is zero is something not in the set.
7562  */
7563  UV count = 0;
7564
7565  PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7566  assert(a != b);
7567
7568  /* If either one is empty, the union is the other one */
7569  if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7570   if (*output == a) {
7571    if (a != NULL) {
7572     SvREFCNT_dec_NN(a);
7573    }
7574   }
7575   if (*output != b) {
7576    *output = invlist_clone(b);
7577    if (complement_b) {
7578     _invlist_invert(*output);
7579    }
7580   } /* else *output already = b; */
7581   return;
7582  }
7583  else if ((len_b = _invlist_len(b)) == 0) {
7584   if (*output == b) {
7585    SvREFCNT_dec_NN(b);
7586   }
7587
7588   /* The complement of an empty list is a list that has everything in it,
7589   * so the union with <a> includes everything too */
7590   if (complement_b) {
7591    if (a == *output) {
7592     SvREFCNT_dec_NN(a);
7593    }
7594    *output = _new_invlist(1);
7595    _append_range_to_invlist(*output, 0, UV_MAX);
7596   }
7597   else if (*output != a) {
7598    *output = invlist_clone(a);
7599   }
7600   /* else *output already = a; */
7601   return;
7602  }
7603
7604  /* Here both lists exist and are non-empty */
7605  array_a = invlist_array(a);
7606  array_b = invlist_array(b);
7607
7608  /* If are to take the union of 'a' with the complement of b, set it
7609  * up so are looking at b's complement. */
7610  if (complement_b) {
7611
7612   /* To complement, we invert: if the first element is 0, remove it.  To
7613   * do this, we just pretend the array starts one later, and clear the
7614   * flag as we don't have to do anything else later */
7615   if (array_b[0] == 0) {
7616    array_b++;
7617    len_b--;
7618    complement_b = FALSE;
7619   }
7620   else {
7621
7622    /* But if the first element is not zero, we unshift a 0 before the
7623    * array.  The data structure reserves a space for that 0 (which
7624    * should be a '1' right now), so physical shifting is unneeded,
7625    * but temporarily change that element to 0.  Before exiting the
7626    * routine, we must restore the element to '1' */
7627    array_b--;
7628    len_b++;
7629    array_b[0] = 0;
7630   }
7631  }
7632
7633  /* Size the union for the worst case: that the sets are completely
7634  * disjoint */
7635  u = _new_invlist(len_a + len_b);
7636
7637  /* Will contain U+0000 if either component does */
7638  array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7639          || (len_b > 0 && array_b[0] == 0));
7640
7641  /* Go through each list item by item, stopping when exhausted one of
7642  * them */
7643  while (i_a < len_a && i_b < len_b) {
7644   UV cp;     /* The element to potentially add to the union's array */
7645   bool cp_in_set;   /* is it in the the input list's set or not */
7646
7647   /* We need to take one or the other of the two inputs for the union.
7648   * Since we are merging two sorted lists, we take the smaller of the
7649   * next items.  In case of a tie, we take the one that is in its set
7650   * first.  If we took one not in the set first, it would decrement the
7651   * count, possibly to 0 which would cause it to be output as ending the
7652   * range, and the next time through we would take the same number, and
7653   * output it again as beginning the next range.  By doing it the
7654   * opposite way, there is no possibility that the count will be
7655   * momentarily decremented to 0, and thus the two adjoining ranges will
7656   * be seamlessly merged.  (In a tie and both are in the set or both not
7657   * in the set, it doesn't matter which we take first.) */
7658   if (array_a[i_a] < array_b[i_b]
7659    || (array_a[i_a] == array_b[i_b]
7660     && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7661   {
7662    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7663    cp= array_a[i_a++];
7664   }
7665   else {
7666    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7667    cp = array_b[i_b++];
7668   }
7669
7670   /* Here, have chosen which of the two inputs to look at.  Only output
7671   * if the running count changes to/from 0, which marks the
7672   * beginning/end of a range in that's in the set */
7673   if (cp_in_set) {
7674    if (count == 0) {
7675     array_u[i_u++] = cp;
7676    }
7677    count++;
7678   }
7679   else {
7680    count--;
7681    if (count == 0) {
7682     array_u[i_u++] = cp;
7683    }
7684   }
7685  }
7686
7687  /* Here, we are finished going through at least one of the lists, which
7688  * means there is something remaining in at most one.  We check if the list
7689  * that hasn't been exhausted is positioned such that we are in the middle
7690  * of a range in its set or not.  (i_a and i_b point to the element beyond
7691  * the one we care about.) If in the set, we decrement 'count'; if 0, there
7692  * is potentially more to output.
7693  * There are four cases:
7694  * 1) Both weren't in their sets, count is 0, and remains 0.  What's left
7695  *    in the union is entirely from the non-exhausted set.
7696  * 2) Both were in their sets, count is 2.  Nothing further should
7697  *    be output, as everything that remains will be in the exhausted
7698  *    list's set, hence in the union; decrementing to 1 but not 0 insures
7699  *    that
7700  * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7701  *    Nothing further should be output because the union includes
7702  *    everything from the exhausted set.  Not decrementing ensures that.
7703  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7704  *    decrementing to 0 insures that we look at the remainder of the
7705  *    non-exhausted set */
7706  if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7707   || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7708  {
7709   count--;
7710  }
7711
7712  /* The final length is what we've output so far, plus what else is about to
7713  * be output.  (If 'count' is non-zero, then the input list we exhausted
7714  * has everything remaining up to the machine's limit in its set, and hence
7715  * in the union, so there will be no further output. */
7716  len_u = i_u;
7717  if (count == 0) {
7718   /* At most one of the subexpressions will be non-zero */
7719   len_u += (len_a - i_a) + (len_b - i_b);
7720  }
7721
7722  /* Set result to final length, which can change the pointer to array_u, so
7723  * re-find it */
7724  if (len_u != _invlist_len(u)) {
7725   invlist_set_len(u, len_u);
7726   invlist_trim(u);
7727   array_u = invlist_array(u);
7728  }
7729
7730  /* When 'count' is 0, the list that was exhausted (if one was shorter than
7731  * the other) ended with everything above it not in its set.  That means
7732  * that the remaining part of the union is precisely the same as the
7733  * non-exhausted list, so can just copy it unchanged.  (If both list were
7734  * exhausted at the same time, then the operations below will be both 0.)
7735  */
7736  if (count == 0) {
7737   IV copy_count; /* At most one will have a non-zero copy count */
7738   if ((copy_count = len_a - i_a) > 0) {
7739    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7740   }
7741   else if ((copy_count = len_b - i_b) > 0) {
7742    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7743   }
7744  }
7745
7746  /* If we've changed b, restore it */
7747  if (complement_b) {
7748   array_b[0] = 1;
7749  }
7750
7751  /*  We may be removing a reference to one of the inputs */
7752  if (a == *output || b == *output) {
7753   assert(! invlist_is_iterating(*output));
7754   SvREFCNT_dec_NN(*output);
7755  }
7756
7757  *output = u;
7758  return;
7759 }
7760
7761 void
7762 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7763 {
7764  /* Take the intersection of two inversion lists and point <i> to it.  *i
7765  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7766  * the reference count to that list will be decremented.
7767  * If <complement_b> is TRUE, the result will be the intersection of <a>
7768  * and the complement (or inversion) of <b> instead of <b> directly.
7769  *
7770  * The basis for this comes from "Unicode Demystified" Chapter 13 by
7771  * Richard Gillam, published by Addison-Wesley, and explained at some
7772  * length there.  The preface says to incorporate its examples into your
7773  * code at your own risk.  In fact, it had bugs
7774  *
7775  * The algorithm is like a merge sort, and is essentially the same as the
7776  * union above
7777  */
7778
7779  UV* array_a;  /* a's array */
7780  UV* array_b;
7781  UV len_a; /* length of a's array */
7782  UV len_b;
7783
7784  SV* r;       /* the resulting intersection */
7785  UV* array_r;
7786  UV len_r;
7787
7788  UV i_a = 0;      /* current index into a's array */
7789  UV i_b = 0;
7790  UV i_r = 0;
7791
7792  /* running count, as explained in the algorithm source book; items are
7793  * stopped accumulating and are output when the count changes to/from 2.
7794  * The count is incremented when we start a range that's in the set, and
7795  * decremented when we start a range that's not in the set.  So its range
7796  * is 0 to 2.  Only when the count is 2 is something in the intersection.
7797  */
7798  UV count = 0;
7799
7800  PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7801  assert(a != b);
7802
7803  /* Special case if either one is empty */
7804  len_a = _invlist_len(a);
7805  if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7806
7807   if (len_a != 0 && complement_b) {
7808
7809    /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7810    * be empty.  Here, also we are using 'b's complement, which hence
7811    * must be every possible code point.  Thus the intersection is
7812    * simply 'a'. */
7813    if (*i != a) {
7814     *i = invlist_clone(a);
7815
7816     if (*i == b) {
7817      SvREFCNT_dec_NN(b);
7818     }
7819    }
7820    /* else *i is already 'a' */
7821    return;
7822   }
7823
7824   /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7825   * intersection must be empty */
7826   if (*i == a) {
7827    SvREFCNT_dec_NN(a);
7828   }
7829   else if (*i == b) {
7830    SvREFCNT_dec_NN(b);
7831   }
7832   *i = _new_invlist(0);
7833   return;
7834  }
7835
7836  /* Here both lists exist and are non-empty */
7837  array_a = invlist_array(a);
7838  array_b = invlist_array(b);
7839
7840  /* If are to take the intersection of 'a' with the complement of b, set it
7841  * up so are looking at b's complement. */
7842  if (complement_b) {
7843
7844   /* To complement, we invert: if the first element is 0, remove it.  To
7845   * do this, we just pretend the array starts one later, and clear the
7846   * flag as we don't have to do anything else later */
7847   if (array_b[0] == 0) {
7848    array_b++;
7849    len_b--;
7850    complement_b = FALSE;
7851   }
7852   else {
7853
7854    /* But if the first element is not zero, we unshift a 0 before the
7855    * array.  The data structure reserves a space for that 0 (which
7856    * should be a '1' right now), so physical shifting is unneeded,
7857    * but temporarily change that element to 0.  Before exiting the
7858    * routine, we must restore the element to '1' */
7859    array_b--;
7860    len_b++;
7861    array_b[0] = 0;
7862   }
7863  }
7864
7865  /* Size the intersection for the worst case: that the intersection ends up
7866  * fragmenting everything to be completely disjoint */
7867  r= _new_invlist(len_a + len_b);
7868
7869  /* Will contain U+0000 iff both components do */
7870  array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7871          && len_b > 0 && array_b[0] == 0);
7872
7873  /* Go through each list item by item, stopping when exhausted one of
7874  * them */
7875  while (i_a < len_a && i_b < len_b) {
7876   UV cp;     /* The element to potentially add to the intersection's
7877      array */
7878   bool cp_in_set; /* Is it in the input list's set or not */
7879
7880   /* We need to take one or the other of the two inputs for the
7881   * intersection.  Since we are merging two sorted lists, we take the
7882   * smaller of the next items.  In case of a tie, we take the one that
7883   * is not in its set first (a difference from the union algorithm).  If
7884   * we took one in the set first, it would increment the count, possibly
7885   * to 2 which would cause it to be output as starting a range in the
7886   * intersection, and the next time through we would take that same
7887   * number, and output it again as ending the set.  By doing it the
7888   * opposite of this, there is no possibility that the count will be
7889   * momentarily incremented to 2.  (In a tie and both are in the set or
7890   * both not in the set, it doesn't matter which we take first.) */
7891   if (array_a[i_a] < array_b[i_b]
7892    || (array_a[i_a] == array_b[i_b]
7893     && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7894   {
7895    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7896    cp= array_a[i_a++];
7897   }
7898   else {
7899    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7900    cp= array_b[i_b++];
7901   }
7902
7903   /* Here, have chosen which of the two inputs to look at.  Only output
7904   * if the running count changes to/from 2, which marks the
7905   * beginning/end of a range that's in the intersection */
7906   if (cp_in_set) {
7907    count++;
7908    if (count == 2) {
7909     array_r[i_r++] = cp;
7910    }
7911   }
7912   else {
7913    if (count == 2) {
7914     array_r[i_r++] = cp;
7915    }
7916    count--;
7917   }
7918  }
7919
7920  /* Here, we are finished going through at least one of the lists, which
7921  * means there is something remaining in at most one.  We check if the list
7922  * that has been exhausted is positioned such that we are in the middle
7923  * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7924  * the ones we care about.)  There are four cases:
7925  * 1) Both weren't in their sets, count is 0, and remains 0.  There's
7926  *    nothing left in the intersection.
7927  * 2) Both were in their sets, count is 2 and perhaps is incremented to
7928  *    above 2.  What should be output is exactly that which is in the
7929  *    non-exhausted set, as everything it has is also in the intersection
7930  *    set, and everything it doesn't have can't be in the intersection
7931  * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7932  *    gets incremented to 2.  Like the previous case, the intersection is
7933  *    everything that remains in the non-exhausted set.
7934  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7935  *    remains 1.  And the intersection has nothing more. */
7936  if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7937   || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7938  {
7939   count++;
7940  }
7941
7942  /* The final length is what we've output so far plus what else is in the
7943  * intersection.  At most one of the subexpressions below will be non-zero */
7944  len_r = i_r;
7945  if (count >= 2) {
7946   len_r += (len_a - i_a) + (len_b - i_b);
7947  }
7948
7949  /* Set result to final length, which can change the pointer to array_r, so
7950  * re-find it */
7951  if (len_r != _invlist_len(r)) {
7952   invlist_set_len(r, len_r);
7953   invlist_trim(r);
7954   array_r = invlist_array(r);
7955  }
7956
7957  /* Finish outputting any remaining */
7958  if (count >= 2) { /* At most one will have a non-zero copy count */
7959   IV copy_count;
7960   if ((copy_count = len_a - i_a) > 0) {
7961    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7962   }
7963   else if ((copy_count = len_b - i_b) > 0) {
7964    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7965   }
7966  }
7967
7968  /* If we've changed b, restore it */
7969  if (complement_b) {
7970   array_b[0] = 1;
7971  }
7972
7973  /*  We may be removing a reference to one of the inputs */
7974  if (a == *i || b == *i) {
7975   assert(! invlist_is_iterating(*i));
7976   SvREFCNT_dec_NN(*i);
7977  }
7978
7979  *i = r;
7980  return;
7981 }
7982
7983 SV*
7984 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7985 {
7986  /* Add the range from 'start' to 'end' inclusive to the inversion list's
7987  * set.  A pointer to the inversion list is returned.  This may actually be
7988  * a new list, in which case the passed in one has been destroyed.  The
7989  * passed in inversion list can be NULL, in which case a new one is created
7990  * with just the one range in it */
7991
7992  SV* range_invlist;
7993  UV len;
7994
7995  if (invlist == NULL) {
7996   invlist = _new_invlist(2);
7997   len = 0;
7998  }
7999  else {
8000   len = _invlist_len(invlist);
8001  }
8002
8003  /* If comes after the final entry actually in the list, can just append it
8004  * to the end, */
8005  if (len == 0
8006   || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8007    && start >= invlist_array(invlist)[len - 1]))
8008  {
8009   _append_range_to_invlist(invlist, start, end);
8010   return invlist;
8011  }
8012
8013  /* Here, can't just append things, create and return a new inversion list
8014  * which is the union of this range and the existing inversion list */
8015  range_invlist = _new_invlist(2);
8016  _append_range_to_invlist(range_invlist, start, end);
8017
8018  _invlist_union(invlist, range_invlist, &invlist);
8019
8020  /* The temporary can be freed */
8021  SvREFCNT_dec_NN(range_invlist);
8022
8023  return invlist;
8024 }
8025
8026 #endif
8027
8028 PERL_STATIC_INLINE SV*
8029 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8030  return _add_range_to_invlist(invlist, cp, cp);
8031 }
8032
8033 #ifndef PERL_IN_XSUB_RE
8034 void
8035 Perl__invlist_invert(pTHX_ SV* const invlist)
8036 {
8037  /* Complement the input inversion list.  This adds a 0 if the list didn't
8038  * have a zero; removes it otherwise.  As described above, the data
8039  * structure is set up so that this is very efficient */
8040
8041  UV* len_pos = _get_invlist_len_addr(invlist);
8042
8043  PERL_ARGS_ASSERT__INVLIST_INVERT;
8044
8045  assert(! invlist_is_iterating(invlist));
8046
8047  /* The inverse of matching nothing is matching everything */
8048  if (*len_pos == 0) {
8049   _append_range_to_invlist(invlist, 0, UV_MAX);
8050   return;
8051  }
8052
8053  /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
8054  * zero element was a 0, so it is being removed, so the length decrements
8055  * by 1; and vice-versa.  SvCUR is unaffected */
8056  if (*get_invlist_zero_addr(invlist) ^= 1) {
8057   (*len_pos)--;
8058  }
8059  else {
8060   (*len_pos)++;
8061  }
8062 }
8063
8064 void
8065 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8066 {
8067  /* Complement the input inversion list (which must be a Unicode property,
8068  * all of which don't match above the Unicode maximum code point.)  And
8069  * Perl has chosen to not have the inversion match above that either.  This
8070  * adds a 0x110000 if the list didn't end with it, and removes it if it did
8071  */
8072
8073  UV len;
8074  UV* array;
8075
8076  PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8077
8078  _invlist_invert(invlist);
8079
8080  len = _invlist_len(invlist);
8081
8082  if (len != 0) { /* If empty do nothing */
8083   array = invlist_array(invlist);
8084   if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8085    /* Add 0x110000.  First, grow if necessary */
8086    len++;
8087    if (invlist_max(invlist) < len) {
8088     invlist_extend(invlist, len);
8089     array = invlist_array(invlist);
8090    }
8091    invlist_set_len(invlist, len);
8092    array[len - 1] = PERL_UNICODE_MAX + 1;
8093   }
8094   else {  /* Remove the 0x110000 */
8095    invlist_set_len(invlist, len - 1);
8096   }
8097  }
8098
8099  return;
8100 }
8101 #endif
8102
8103 PERL_STATIC_INLINE SV*
8104 S_invlist_clone(pTHX_ SV* const invlist)
8105 {
8106
8107  /* Return a new inversion list that is a copy of the input one, which is
8108  * unchanged */
8109
8110  /* Need to allocate extra space to accommodate Perl's addition of a
8111  * trailing NUL to SvPV's, since it thinks they are always strings */
8112  SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8113  STRLEN length = SvCUR(invlist);
8114
8115  PERL_ARGS_ASSERT_INVLIST_CLONE;
8116
8117  SvCUR_set(new_invlist, length); /* This isn't done automatically */
8118  Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8119
8120  return new_invlist;
8121 }
8122
8123 PERL_STATIC_INLINE UV*
8124 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8125 {
8126  /* Return the address of the UV that contains the current iteration
8127  * position */
8128
8129  PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8130
8131  return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8132 }
8133
8134 PERL_STATIC_INLINE UV*
8135 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8136 {
8137  /* Return the address of the UV that contains the version id. */
8138
8139  PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8140
8141  return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8142 }
8143
8144 PERL_STATIC_INLINE void
8145 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8146 {
8147  PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8148
8149  *get_invlist_iter_addr(invlist) = 0;
8150 }
8151
8152 PERL_STATIC_INLINE void
8153 S_invlist_iterfinish(pTHX_ SV* invlist)
8154 {
8155  /* Terminate iterator for invlist.  This is to catch development errors.
8156  * Any iteration that is interrupted before completed should call this
8157  * function.  Functions that add code points anywhere else but to the end
8158  * of an inversion list assert that they are not in the middle of an
8159  * iteration.  If they were, the addition would make the iteration
8160  * problematical: if the iteration hadn't reached the place where things
8161  * were being added, it would be ok */
8162
8163  PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8164
8165  *get_invlist_iter_addr(invlist) = UV_MAX;
8166 }
8167
8168 STATIC bool
8169 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8170 {
8171  /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8172  * This call sets in <*start> and <*end>, the next range in <invlist>.
8173  * Returns <TRUE> if successful and the next call will return the next
8174  * range; <FALSE> if was already at the end of the list.  If the latter,
8175  * <*start> and <*end> are unchanged, and the next call to this function
8176  * will start over at the beginning of the list */
8177
8178  UV* pos = get_invlist_iter_addr(invlist);
8179  UV len = _invlist_len(invlist);
8180  UV *array;
8181
8182  PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8183
8184  if (*pos >= len) {
8185   *pos = UV_MAX; /* Force iterinit() to be required next time */
8186   return FALSE;
8187  }
8188
8189  array = invlist_array(invlist);
8190
8191  *start = array[(*pos)++];
8192
8193  if (*pos >= len) {
8194   *end = UV_MAX;
8195  }
8196  else {
8197   *end = array[(*pos)++] - 1;
8198  }
8199
8200  return TRUE;
8201 }
8202
8203 PERL_STATIC_INLINE bool
8204 S_invlist_is_iterating(pTHX_ SV* const invlist)
8205 {
8206  PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8207
8208  return *(get_invlist_iter_addr(invlist)) < UV_MAX;
8209 }
8210
8211 PERL_STATIC_INLINE UV
8212 S_invlist_highest(pTHX_ SV* const invlist)
8213 {
8214  /* Returns the highest code point that matches an inversion list.  This API
8215  * has an ambiguity, as it returns 0 under either the highest is actually
8216  * 0, or if the list is empty.  If this distinction matters to you, check
8217  * for emptiness before calling this function */
8218
8219  UV len = _invlist_len(invlist);
8220  UV *array;
8221
8222  PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8223
8224  if (len == 0) {
8225   return 0;
8226  }
8227
8228  array = invlist_array(invlist);
8229
8230  /* The last element in the array in the inversion list always starts a
8231  * range that goes to infinity.  That range may be for code points that are
8232  * matched in the inversion list, or it may be for ones that aren't
8233  * matched.  In the latter case, the highest code point in the set is one
8234  * less than the beginning of this range; otherwise it is the final element
8235  * of this range: infinity */
8236  return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8237   ? UV_MAX
8238   : array[len - 1] - 1;
8239 }
8240
8241 #ifndef PERL_IN_XSUB_RE
8242 SV *
8243 Perl__invlist_contents(pTHX_ SV* const invlist)
8244 {
8245  /* Get the contents of an inversion list into a string SV so that they can
8246  * be printed out.  It uses the format traditionally done for debug tracing
8247  */
8248
8249  UV start, end;
8250  SV* output = newSVpvs("\n");
8251
8252  PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8253
8254  assert(! invlist_is_iterating(invlist));
8255
8256  invlist_iterinit(invlist);
8257  while (invlist_iternext(invlist, &start, &end)) {
8258   if (end == UV_MAX) {
8259    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8260   }
8261   else if (end != start) {
8262    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8263      start,       end);
8264   }
8265   else {
8266    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8267   }
8268  }
8269
8270  return output;
8271 }
8272 #endif
8273
8274 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8275 void
8276 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8277 {
8278  /* Dumps out the ranges in an inversion list.  The string 'header'
8279  * if present is output on a line before the first range */
8280
8281  UV start, end;
8282
8283  PERL_ARGS_ASSERT__INVLIST_DUMP;
8284
8285  if (header && strlen(header)) {
8286   PerlIO_printf(Perl_debug_log, "%s\n", header);
8287  }
8288  if (invlist_is_iterating(invlist)) {
8289   PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
8290   return;
8291  }
8292
8293  invlist_iterinit(invlist);
8294  while (invlist_iternext(invlist, &start, &end)) {
8295   if (end == UV_MAX) {
8296    PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8297   }
8298   else if (end != start) {
8299    PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8300             start,         end);
8301   }
8302   else {
8303    PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8304   }
8305  }
8306 }
8307 #endif
8308
8309 #if 0
8310 bool
8311 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8312 {
8313  /* Return a boolean as to if the two passed in inversion lists are
8314  * identical.  The final argument, if TRUE, says to take the complement of
8315  * the second inversion list before doing the comparison */
8316
8317  UV* array_a = invlist_array(a);
8318  UV* array_b = invlist_array(b);
8319  UV len_a = _invlist_len(a);
8320  UV len_b = _invlist_len(b);
8321
8322  UV i = 0;      /* current index into the arrays */
8323  bool retval = TRUE;     /* Assume are identical until proven otherwise */
8324
8325  PERL_ARGS_ASSERT__INVLISTEQ;
8326
8327  /* If are to compare 'a' with the complement of b, set it
8328  * up so are looking at b's complement. */
8329  if (complement_b) {
8330
8331   /* The complement of nothing is everything, so <a> would have to have
8332   * just one element, starting at zero (ending at infinity) */
8333   if (len_b == 0) {
8334    return (len_a == 1 && array_a[0] == 0);
8335   }
8336   else if (array_b[0] == 0) {
8337
8338    /* Otherwise, to complement, we invert.  Here, the first element is
8339    * 0, just remove it.  To do this, we just pretend the array starts
8340    * one later, and clear the flag as we don't have to do anything
8341    * else later */
8342
8343    array_b++;
8344    len_b--;
8345    complement_b = FALSE;
8346   }
8347   else {
8348
8349    /* But if the first element is not zero, we unshift a 0 before the
8350    * array.  The data structure reserves a space for that 0 (which
8351    * should be a '1' right now), so physical shifting is unneeded,
8352    * but temporarily change that element to 0.  Before exiting the
8353    * routine, we must restore the element to '1' */
8354    array_b--;
8355    len_b++;
8356    array_b[0] = 0;
8357   }
8358  }
8359
8360  /* Make sure that the lengths are the same, as well as the final element
8361  * before looping through the remainder.  (Thus we test the length, final,
8362  * and first elements right off the bat) */
8363  if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8364   retval = FALSE;
8365  }
8366  else for (i = 0; i < len_a - 1; i++) {
8367   if (array_a[i] != array_b[i]) {
8368    retval = FALSE;
8369    break;
8370   }
8371  }
8372
8373  if (complement_b) {
8374   array_b[0] = 1;
8375  }
8376  return retval;
8377 }
8378 #endif
8379
8380 #undef HEADER_LENGTH
8381 #undef INVLIST_INITIAL_LENGTH
8382 #undef TO_INTERNAL_SIZE
8383 #undef FROM_INTERNAL_SIZE
8384 #undef INVLIST_LEN_OFFSET
8385 #undef INVLIST_ZERO_OFFSET
8386 #undef INVLIST_ITER_OFFSET
8387 #undef INVLIST_VERSION_ID
8388 #undef INVLIST_PREVIOUS_INDEX_OFFSET
8389
8390 /* End of inversion list object */
8391
8392 STATIC void
8393 S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
8394 {
8395  /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8396  * constructs, and updates RExC_flags with them.  On input, RExC_parse
8397  * should point to the first flag; it is updated on output to point to the
8398  * final ')' or ':'.  There needs to be at least one flag, or this will
8399  * abort */
8400
8401  /* for (?g), (?gc), and (?o) warnings; warning
8402  about (?c) will warn about (?g) -- japhy    */
8403
8404 #define WASTED_O  0x01
8405 #define WASTED_G  0x02
8406 #define WASTED_C  0x04
8407 #define WASTED_GC (0x02|0x04)
8408  I32 wastedflags = 0x00;
8409  U32 posflags = 0, negflags = 0;
8410  U32 *flagsp = &posflags;
8411  char has_charset_modifier = '\0';
8412  regex_charset cs;
8413  bool has_use_defaults = FALSE;
8414  const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8415
8416  PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8417
8418  /* '^' as an initial flag sets certain defaults */
8419  if (UCHARAT(RExC_parse) == '^') {
8420   RExC_parse++;
8421   has_use_defaults = TRUE;
8422   STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8423   set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8424           ? REGEX_UNICODE_CHARSET
8425           : REGEX_DEPENDS_CHARSET);
8426  }
8427
8428  cs = get_regex_charset(RExC_flags);
8429  if (cs == REGEX_DEPENDS_CHARSET
8430   && (RExC_utf8 || RExC_uni_semantics))
8431  {
8432   cs = REGEX_UNICODE_CHARSET;
8433  }
8434
8435  while (*RExC_parse) {
8436   /* && strchr("iogcmsx", *RExC_parse) */
8437   /* (?g), (?gc) and (?o) are useless here
8438   and must be globally applied -- japhy */
8439   switch (*RExC_parse) {
8440
8441    /* Code for the imsx flags */
8442    CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8443
8444    case LOCALE_PAT_MOD:
8445     if (has_charset_modifier) {
8446      goto excess_modifier;
8447     }
8448     else if (flagsp == &negflags) {
8449      goto neg_modifier;
8450     }
8451     cs = REGEX_LOCALE_CHARSET;
8452     has_charset_modifier = LOCALE_PAT_MOD;
8453     RExC_contains_locale = 1;
8454     break;
8455    case UNICODE_PAT_MOD:
8456     if (has_charset_modifier) {
8457      goto excess_modifier;
8458     }
8459     else if (flagsp == &negflags) {
8460      goto neg_modifier;
8461     }
8462     cs = REGEX_UNICODE_CHARSET;
8463     has_charset_modifier = UNICODE_PAT_MOD;
8464     break;
8465    case ASCII_RESTRICT_PAT_MOD:
8466     if (flagsp == &negflags) {
8467      goto neg_modifier;
8468     }
8469     if (has_charset_modifier) {
8470      if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8471       goto excess_modifier;
8472      }
8473      /* Doubled modifier implies more restricted */
8474      cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8475     }
8476     else {
8477      cs = REGEX_ASCII_RESTRICTED_CHARSET;
8478     }
8479     has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8480     break;
8481    case DEPENDS_PAT_MOD:
8482     if (has_use_defaults) {
8483      goto fail_modifiers;
8484     }
8485     else if (flagsp == &negflags) {
8486      goto neg_modifier;
8487     }
8488     else if (has_charset_modifier) {
8489      goto excess_modifier;
8490     }
8491
8492     /* The dual charset means unicode semantics if the
8493     * pattern (or target, not known until runtime) are
8494     * utf8, or something in the pattern indicates unicode
8495     * semantics */
8496     cs = (RExC_utf8 || RExC_uni_semantics)
8497      ? REGEX_UNICODE_CHARSET
8498      : REGEX_DEPENDS_CHARSET;
8499     has_charset_modifier = DEPENDS_PAT_MOD;
8500     break;
8501    excess_modifier:
8502     RExC_parse++;
8503     if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8504      vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8505     }
8506     else if (has_charset_modifier == *(RExC_parse - 1)) {
8507      vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8508     }
8509     else {
8510      vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8511     }
8512     /*NOTREACHED*/
8513    neg_modifier:
8514     RExC_parse++;
8515     vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8516     /*NOTREACHED*/
8517    case ONCE_PAT_MOD: /* 'o' */
8518    case GLOBAL_PAT_MOD: /* 'g' */
8519     if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8520      const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8521      if (! (wastedflags & wflagbit) ) {
8522       wastedflags |= wflagbit;
8523       vWARN5(
8524        RExC_parse + 1,
8525        "Useless (%s%c) - %suse /%c modifier",
8526        flagsp == &negflags ? "?-" : "?",
8527        *RExC_parse,
8528        flagsp == &negflags ? "don't " : "",
8529        *RExC_parse
8530       );
8531      }
8532     }
8533     break;
8534
8535    case CONTINUE_PAT_MOD: /* 'c' */
8536     if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8537      if (! (wastedflags & WASTED_C) ) {
8538       wastedflags |= WASTED_GC;
8539       vWARN3(
8540        RExC_parse + 1,
8541        "Useless (%sc) - %suse /gc modifier",
8542        flagsp == &negflags ? "?-" : "?",
8543        flagsp == &negflags ? "don't " : ""
8544       );
8545      }
8546     }
8547     break;
8548    case KEEPCOPY_PAT_MOD: /* 'p' */
8549     if (flagsp == &negflags) {
8550      if (SIZE_ONLY)
8551       ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8552     } else {
8553      *flagsp |= RXf_PMf_KEEPCOPY;
8554     }
8555     break;
8556    case '-':
8557     /* A flag is a default iff it is following a minus, so
8558     * if there is a minus, it means will be trying to
8559     * re-specify a default which is an error */
8560     if (has_use_defaults || flagsp == &negflags) {
8561      goto fail_modifiers;
8562     }
8563     flagsp = &negflags;
8564     wastedflags = 0;  /* reset so (?g-c) warns twice */
8565     break;
8566    case ':':
8567    case ')':
8568     RExC_flags |= posflags;
8569     RExC_flags &= ~negflags;
8570     set_regex_charset(&RExC_flags, cs);
8571     return;
8572     /*NOTREACHED*/
8573    default:
8574    fail_modifiers:
8575     RExC_parse++;
8576     vFAIL3("Sequence (%.*s...) not recognized",
8577      RExC_parse-seqstart, seqstart);
8578     /*NOTREACHED*/
8579   }
8580
8581   ++RExC_parse;
8582  }
8583 }
8584
8585 /*
8586  - reg - regular expression, i.e. main body or parenthesized thing
8587  *
8588  * Caller must absorb opening parenthesis.
8589  *
8590  * Combining parenthesis handling with the base level of regular expression
8591  * is a trifle forced, but the need to tie the tails of the branches to what
8592  * follows makes it hard to avoid.
8593  */
8594 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8595 #ifdef DEBUGGING
8596 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8597 #else
8598 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8599 #endif
8600
8601 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
8602    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
8603    needs to be restarted.
8604    Otherwise would only return NULL if regbranch() returns NULL, which
8605    cannot happen.  */
8606 STATIC regnode *
8607 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8608  /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
8609  * 2 is like 1, but indicates that nextchar() has been called to advance
8610  * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
8611  * this flag alerts us to the need to check for that */
8612 {
8613  dVAR;
8614  regnode *ret;  /* Will be the head of the group. */
8615  regnode *br;
8616  regnode *lastbr;
8617  regnode *ender = NULL;
8618  I32 parno = 0;
8619  I32 flags;
8620  U32 oregflags = RExC_flags;
8621  bool have_branch = 0;
8622  bool is_open = 0;
8623  I32 freeze_paren = 0;
8624  I32 after_freeze = 0;
8625
8626  char * parse_start = RExC_parse; /* MJD */
8627  char * const oregcomp_parse = RExC_parse;
8628
8629  GET_RE_DEBUG_FLAGS_DECL;
8630
8631  PERL_ARGS_ASSERT_REG;
8632  DEBUG_PARSE("reg ");
8633
8634  *flagp = 0;    /* Tentatively. */
8635
8636
8637  /* Make an OPEN node, if parenthesized. */
8638  if (paren) {
8639
8640   /* Under /x, space and comments can be gobbled up between the '(' and
8641   * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
8642   * intervening space, as the sequence is a token, and a token should be
8643   * indivisible */
8644   bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
8645
8646   if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8647    char *start_verb = RExC_parse;
8648    STRLEN verb_len = 0;
8649    char *start_arg = NULL;
8650    unsigned char op = 0;
8651    int argok = 1;
8652    int internal_argval = 0; /* internal_argval is only useful if !argok */
8653
8654    if (has_intervening_patws && SIZE_ONLY) {
8655     ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
8656    }
8657    while ( *RExC_parse && *RExC_parse != ')' ) {
8658     if ( *RExC_parse == ':' ) {
8659      start_arg = RExC_parse + 1;
8660      break;
8661     }
8662     RExC_parse++;
8663    }
8664    ++start_verb;
8665    verb_len = RExC_parse - start_verb;
8666    if ( start_arg ) {
8667     RExC_parse++;
8668     while ( *RExC_parse && *RExC_parse != ')' )
8669      RExC_parse++;
8670     if ( *RExC_parse != ')' )
8671      vFAIL("Unterminated verb pattern argument");
8672     if ( RExC_parse == start_arg )
8673      start_arg = NULL;
8674    } else {
8675     if ( *RExC_parse != ')' )
8676      vFAIL("Unterminated verb pattern");
8677    }
8678
8679    switch ( *start_verb ) {
8680    case 'A':  /* (*ACCEPT) */
8681     if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8682      op = ACCEPT;
8683      internal_argval = RExC_nestroot;
8684     }
8685     break;
8686    case 'C':  /* (*COMMIT) */
8687     if ( memEQs(start_verb,verb_len,"COMMIT") )
8688      op = COMMIT;
8689     break;
8690    case 'F':  /* (*FAIL) */
8691     if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8692      op = OPFAIL;
8693      argok = 0;
8694     }
8695     break;
8696    case ':':  /* (*:NAME) */
8697    case 'M':  /* (*MARK:NAME) */
8698     if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8699      op = MARKPOINT;
8700      argok = -1;
8701     }
8702     break;
8703    case 'P':  /* (*PRUNE) */
8704     if ( memEQs(start_verb,verb_len,"PRUNE") )
8705      op = PRUNE;
8706     break;
8707    case 'S':   /* (*SKIP) */
8708     if ( memEQs(start_verb,verb_len,"SKIP") )
8709      op = SKIP;
8710     break;
8711    case 'T':  /* (*THEN) */
8712     /* [19:06] <TimToady> :: is then */
8713     if ( memEQs(start_verb,verb_len,"THEN") ) {
8714      op = CUTGROUP;
8715      RExC_seen |= REG_SEEN_CUTGROUP;
8716     }
8717     break;
8718    }
8719    if ( ! op ) {
8720     RExC_parse++;
8721     vFAIL3("Unknown verb pattern '%.*s'",
8722      verb_len, start_verb);
8723    }
8724    if ( argok ) {
8725     if ( start_arg && internal_argval ) {
8726      vFAIL3("Verb pattern '%.*s' may not have an argument",
8727       verb_len, start_verb);
8728     } else if ( argok < 0 && !start_arg ) {
8729      vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8730       verb_len, start_verb);
8731     } else {
8732      ret = reganode(pRExC_state, op, internal_argval);
8733      if ( ! internal_argval && ! SIZE_ONLY ) {
8734       if (start_arg) {
8735        SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8736        ARG(ret) = add_data( pRExC_state, 1, "S" );
8737        RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8738        ret->flags = 0;
8739       } else {
8740        ret->flags = 1;
8741       }
8742      }
8743     }
8744     if (!internal_argval)
8745      RExC_seen |= REG_SEEN_VERBARG;
8746    } else if ( start_arg ) {
8747     vFAIL3("Verb pattern '%.*s' may not have an argument",
8748       verb_len, start_verb);
8749    } else {
8750     ret = reg_node(pRExC_state, op);
8751    }
8752    nextchar(pRExC_state);
8753    return ret;
8754   } else
8755   if (*RExC_parse == '?') { /* (?...) */
8756    bool is_logical = 0;
8757    const char * const seqstart = RExC_parse;
8758    if (has_intervening_patws && SIZE_ONLY) {
8759     ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
8760    }
8761
8762    RExC_parse++;
8763    paren = *RExC_parse++;
8764    ret = NULL;   /* For look-ahead/behind. */
8765    switch (paren) {
8766
8767    case 'P': /* (?P...) variants for those used to PCRE/Python */
8768     paren = *RExC_parse++;
8769     if ( paren == '<')         /* (?P<...>) named capture */
8770      goto named_capture;
8771     else if (paren == '>') {   /* (?P>name) named recursion */
8772      goto named_recursion;
8773     }
8774     else if (paren == '=') {   /* (?P=...)  named backref */
8775      /* this pretty much dupes the code for \k<NAME> in regatom(), if
8776      you change this make sure you change that */
8777      char* name_start = RExC_parse;
8778      U32 num = 0;
8779      SV *sv_dat = reg_scan_name(pRExC_state,
8780       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8781      if (RExC_parse == name_start || *RExC_parse != ')')
8782       vFAIL2("Sequence %.3s... not terminated",parse_start);
8783
8784      if (!SIZE_ONLY) {
8785       num = add_data( pRExC_state, 1, "S" );
8786       RExC_rxi->data->data[num]=(void*)sv_dat;
8787       SvREFCNT_inc_simple_void(sv_dat);
8788      }
8789      RExC_sawback = 1;
8790      ret = reganode(pRExC_state,
8791         ((! FOLD)
8792          ? NREF
8793          : (ASCII_FOLD_RESTRICTED)
8794          ? NREFFA
8795          : (AT_LEAST_UNI_SEMANTICS)
8796           ? NREFFU
8797           : (LOC)
8798           ? NREFFL
8799           : NREFF),
8800          num);
8801      *flagp |= HASWIDTH;
8802
8803      Set_Node_Offset(ret, parse_start+1);
8804      Set_Node_Cur_Length(ret); /* MJD */
8805
8806      nextchar(pRExC_state);
8807      return ret;
8808     }
8809     RExC_parse++;
8810     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8811     /*NOTREACHED*/
8812    case '<':           /* (?<...) */
8813     if (*RExC_parse == '!')
8814      paren = ',';
8815     else if (*RExC_parse != '=')
8816    named_capture:
8817     {               /* (?<...>) */
8818      char *name_start;
8819      SV *svname;
8820      paren= '>';
8821    case '\'':          /* (?'...') */
8822       name_start= RExC_parse;
8823       svname = reg_scan_name(pRExC_state,
8824        SIZE_ONLY ?  /* reverse test from the others */
8825        REG_RSN_RETURN_NAME :
8826        REG_RSN_RETURN_NULL);
8827      if (RExC_parse == name_start) {
8828       RExC_parse++;
8829       vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8830       /*NOTREACHED*/
8831      }
8832      if (*RExC_parse != paren)
8833       vFAIL2("Sequence (?%c... not terminated",
8834        paren=='>' ? '<' : paren);
8835      if (SIZE_ONLY) {
8836       HE *he_str;
8837       SV *sv_dat = NULL;
8838       if (!svname) /* shouldn't happen */
8839        Perl_croak(aTHX_
8840         "panic: reg_scan_name returned NULL");
8841       if (!RExC_paren_names) {
8842        RExC_paren_names= newHV();
8843        sv_2mortal(MUTABLE_SV(RExC_paren_names));
8844 #ifdef DEBUGGING
8845        RExC_paren_name_list= newAV();
8846        sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8847 #endif
8848       }
8849       he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8850       if ( he_str )
8851        sv_dat = HeVAL(he_str);
8852       if ( ! sv_dat ) {
8853        /* croak baby croak */
8854        Perl_croak(aTHX_
8855         "panic: paren_name hash element allocation failed");
8856       } else if ( SvPOK(sv_dat) ) {
8857        /* (?|...) can mean we have dupes so scan to check
8858        its already been stored. Maybe a flag indicating
8859        we are inside such a construct would be useful,
8860        but the arrays are likely to be quite small, so
8861        for now we punt -- dmq */
8862        IV count = SvIV(sv_dat);
8863        I32 *pv = (I32*)SvPVX(sv_dat);
8864        IV i;
8865        for ( i = 0 ; i < count ; i++ ) {
8866         if ( pv[i] == RExC_npar ) {
8867          count = 0;
8868          break;
8869         }
8870        }
8871        if ( count ) {
8872         pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8873         SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8874         pv[count] = RExC_npar;
8875         SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8876        }
8877       } else {
8878        (void)SvUPGRADE(sv_dat,SVt_PVNV);
8879        sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8880        SvIOK_on(sv_dat);
8881        SvIV_set(sv_dat, 1);
8882       }
8883 #ifdef DEBUGGING
8884       /* Yes this does cause a memory leak in debugging Perls */
8885       if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8886        SvREFCNT_dec_NN(svname);
8887 #endif
8888
8889       /*sv_dump(sv_dat);*/
8890      }
8891      nextchar(pRExC_state);
8892      paren = 1;
8893      goto capturing_parens;
8894     }
8895     RExC_seen |= REG_SEEN_LOOKBEHIND;
8896     RExC_in_lookbehind++;
8897     RExC_parse++;
8898    case '=':           /* (?=...) */
8899     RExC_seen_zerolen++;
8900     break;
8901    case '!':           /* (?!...) */
8902     RExC_seen_zerolen++;
8903     if (*RExC_parse == ')') {
8904      ret=reg_node(pRExC_state, OPFAIL);
8905      nextchar(pRExC_state);
8906      return ret;
8907     }
8908     break;
8909    case '|':           /* (?|...) */
8910     /* branch reset, behave like a (?:...) except that
8911     buffers in alternations share the same numbers */
8912     paren = ':';
8913     after_freeze = freeze_paren = RExC_npar;
8914     break;
8915    case ':':           /* (?:...) */
8916    case '>':           /* (?>...) */
8917     break;
8918    case '$':           /* (?$...) */
8919    case '@':           /* (?@...) */
8920     vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8921     break;
8922    case '#':           /* (?#...) */
8923     /* XXX As soon as we disallow separating the '?' and '*' (by
8924     * spaces or (?#...) comment), it is believed that this case
8925     * will be unreachable and can be removed.  See
8926     * [perl #117327] */
8927     while (*RExC_parse && *RExC_parse != ')')
8928      RExC_parse++;
8929     if (*RExC_parse != ')')
8930      FAIL("Sequence (?#... not terminated");
8931     nextchar(pRExC_state);
8932     *flagp = TRYAGAIN;
8933     return NULL;
8934    case '0' :           /* (?0) */
8935    case 'R' :           /* (?R) */
8936     if (*RExC_parse != ')')
8937      FAIL("Sequence (?R) not terminated");
8938     ret = reg_node(pRExC_state, GOSTART);
8939     *flagp |= POSTPONED;
8940     nextchar(pRExC_state);
8941     return ret;
8942     /*notreached*/
8943    { /* named and numeric backreferences */
8944     I32 num;
8945    case '&':            /* (?&NAME) */
8946     parse_start = RExC_parse - 1;
8947    named_recursion:
8948     {
8949       SV *sv_dat = reg_scan_name(pRExC_state,
8950        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8951       num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8952     }
8953     goto gen_recurse_regop;
8954     assert(0); /* NOT REACHED */
8955    case '+':
8956     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8957      RExC_parse++;
8958      vFAIL("Illegal pattern");
8959     }
8960     goto parse_recursion;
8961     /* NOT REACHED*/
8962    case '-': /* (?-1) */
8963     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8964      RExC_parse--; /* rewind to let it be handled later */
8965      goto parse_flags;
8966     }
8967     /*FALLTHROUGH */
8968    case '1': case '2': case '3': case '4': /* (?1) */
8969    case '5': case '6': case '7': case '8': case '9':
8970     RExC_parse--;
8971    parse_recursion:
8972     num = atoi(RExC_parse);
8973     parse_start = RExC_parse - 1; /* MJD */
8974     if (*RExC_parse == '-')
8975      RExC_parse++;
8976     while (isDIGIT(*RExC_parse))
8977       RExC_parse++;
8978     if (*RExC_parse!=')')
8979      vFAIL("Expecting close bracket");
8980
8981    gen_recurse_regop:
8982     if ( paren == '-' ) {
8983      /*
8984      Diagram of capture buffer numbering.
8985      Top line is the normal capture buffer numbers
8986      Bottom line is the negative indexing as from
8987      the X (the (?-2))
8988
8989      +   1 2    3 4 5 X          6 7
8990      /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8991      -   5 4    3 2 1 X          x x
8992
8993      */
8994      num = RExC_npar + num;
8995      if (num < 1)  {
8996       RExC_parse++;
8997       vFAIL("Reference to nonexistent group");
8998      }
8999     } else if ( paren == '+' ) {
9000      num = RExC_npar + num - 1;
9001     }
9002
9003     ret = reganode(pRExC_state, GOSUB, num);
9004     if (!SIZE_ONLY) {
9005      if (num > (I32)RExC_rx->nparens) {
9006       RExC_parse++;
9007       vFAIL("Reference to nonexistent group");
9008      }
9009      ARG2L_SET( ret, RExC_recurse_count++);
9010      RExC_emit++;
9011      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9012       "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
9013     } else {
9014      RExC_size++;
9015      }
9016      RExC_seen |= REG_SEEN_RECURSE;
9017     Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9018     Set_Node_Offset(ret, parse_start); /* MJD */
9019
9020     *flagp |= POSTPONED;
9021     nextchar(pRExC_state);
9022     return ret;
9023    } /* named and numeric backreferences */
9024    assert(0); /* NOT REACHED */
9025
9026    case '?':           /* (??...) */
9027     is_logical = 1;
9028     if (*RExC_parse != '{') {
9029      RExC_parse++;
9030      vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9031      /*NOTREACHED*/
9032     }
9033     *flagp |= POSTPONED;
9034     paren = *RExC_parse++;
9035     /* FALL THROUGH */
9036    case '{':           /* (?{...}) */
9037    {
9038     U32 n = 0;
9039     struct reg_code_block *cb;
9040
9041     RExC_seen_zerolen++;
9042
9043     if (   !pRExC_state->num_code_blocks
9044      || pRExC_state->code_index >= pRExC_state->num_code_blocks
9045      || pRExC_state->code_blocks[pRExC_state->code_index].start
9046       != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9047        - RExC_start)
9048     ) {
9049      if (RExC_pm_flags & PMf_USE_RE_EVAL)
9050       FAIL("panic: Sequence (?{...}): no code block found\n");
9051      FAIL("Eval-group not allowed at runtime, use re 'eval'");
9052     }
9053     /* this is a pre-compiled code block (?{...}) */
9054     cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9055     RExC_parse = RExC_start + cb->end;
9056     if (!SIZE_ONLY) {
9057      OP *o = cb->block;
9058      if (cb->src_regex) {
9059       n = add_data(pRExC_state, 2, "rl");
9060       RExC_rxi->data->data[n] =
9061        (void*)SvREFCNT_inc((SV*)cb->src_regex);
9062       RExC_rxi->data->data[n+1] = (void*)o;
9063      }
9064      else {
9065       n = add_data(pRExC_state, 1,
9066        (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
9067       RExC_rxi->data->data[n] = (void*)o;
9068      }
9069     }
9070     pRExC_state->code_index++;
9071     nextchar(pRExC_state);
9072
9073     if (is_logical) {
9074      regnode *eval;
9075      ret = reg_node(pRExC_state, LOGICAL);
9076      eval = reganode(pRExC_state, EVAL, n);
9077      if (!SIZE_ONLY) {
9078       ret->flags = 2;
9079       /* for later propagation into (??{}) return value */
9080       eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9081      }
9082      REGTAIL(pRExC_state, ret, eval);
9083      /* deal with the length of this later - MJD */
9084      return ret;
9085     }
9086     ret = reganode(pRExC_state, EVAL, n);
9087     Set_Node_Length(ret, RExC_parse - parse_start + 1);
9088     Set_Node_Offset(ret, parse_start);
9089     return ret;
9090    }
9091    case '(':           /* (?(?{...})...) and (?(?=...)...) */
9092    {
9093     int is_define= 0;
9094     if (RExC_parse[0] == '?') {        /* (?(?...)) */
9095      if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9096       || RExC_parse[1] == '<'
9097       || RExC_parse[1] == '{') { /* Lookahead or eval. */
9098       I32 flag;
9099       regnode *tail;
9100
9101       ret = reg_node(pRExC_state, LOGICAL);
9102       if (!SIZE_ONLY)
9103        ret->flags = 1;
9104
9105       tail = reg(pRExC_state, 1, &flag, depth+1);
9106       if (flag & RESTART_UTF8) {
9107        *flagp = RESTART_UTF8;
9108        return NULL;
9109       }
9110       REGTAIL(pRExC_state, ret, tail);
9111       goto insert_if;
9112      }
9113     }
9114     else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9115       || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9116     {
9117      char ch = RExC_parse[0] == '<' ? '>' : '\'';
9118      char *name_start= RExC_parse++;
9119      U32 num = 0;
9120      SV *sv_dat=reg_scan_name(pRExC_state,
9121       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9122      if (RExC_parse == name_start || *RExC_parse != ch)
9123       vFAIL2("Sequence (?(%c... not terminated",
9124        (ch == '>' ? '<' : ch));
9125      RExC_parse++;
9126      if (!SIZE_ONLY) {
9127       num = add_data( pRExC_state, 1, "S" );
9128       RExC_rxi->data->data[num]=(void*)sv_dat;
9129       SvREFCNT_inc_simple_void(sv_dat);
9130      }
9131      ret = reganode(pRExC_state,NGROUPP,num);
9132      goto insert_if_check_paren;
9133     }
9134     else if (RExC_parse[0] == 'D' &&
9135       RExC_parse[1] == 'E' &&
9136       RExC_parse[2] == 'F' &&
9137       RExC_parse[3] == 'I' &&
9138       RExC_parse[4] == 'N' &&
9139       RExC_parse[5] == 'E')
9140     {
9141      ret = reganode(pRExC_state,DEFINEP,0);
9142      RExC_parse +=6 ;
9143      is_define = 1;
9144      goto insert_if_check_paren;
9145     }
9146     else if (RExC_parse[0] == 'R') {
9147      RExC_parse++;
9148      parno = 0;
9149      if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9150       parno = atoi(RExC_parse++);
9151       while (isDIGIT(*RExC_parse))
9152        RExC_parse++;
9153      } else if (RExC_parse[0] == '&') {
9154       SV *sv_dat;
9155       RExC_parse++;
9156       sv_dat = reg_scan_name(pRExC_state,
9157         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9158        parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9159      }
9160      ret = reganode(pRExC_state,INSUBP,parno);
9161      goto insert_if_check_paren;
9162     }
9163     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9164      /* (?(1)...) */
9165      char c;
9166      parno = atoi(RExC_parse++);
9167
9168      while (isDIGIT(*RExC_parse))
9169       RExC_parse++;
9170      ret = reganode(pRExC_state, GROUPP, parno);
9171
9172     insert_if_check_paren:
9173      if ((c = *nextchar(pRExC_state)) != ')')
9174       vFAIL("Switch condition not recognized");
9175     insert_if:
9176      REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9177      br = regbranch(pRExC_state, &flags, 1,depth+1);
9178      if (br == NULL) {
9179       if (flags & RESTART_UTF8) {
9180        *flagp = RESTART_UTF8;
9181        return NULL;
9182       }
9183       FAIL2("panic: regbranch returned NULL, flags=%#X",
9184        flags);
9185      } else
9186       REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9187      c = *nextchar(pRExC_state);
9188      if (flags&HASWIDTH)
9189       *flagp |= HASWIDTH;
9190      if (c == '|') {
9191       if (is_define)
9192        vFAIL("(?(DEFINE)....) does not allow branches");
9193       lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9194       if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9195        if (flags & RESTART_UTF8) {
9196         *flagp = RESTART_UTF8;
9197         return NULL;
9198        }
9199        FAIL2("panic: regbranch returned NULL, flags=%#X",
9200         flags);
9201       }
9202       REGTAIL(pRExC_state, ret, lastbr);
9203       if (flags&HASWIDTH)
9204        *flagp |= HASWIDTH;
9205       c = *nextchar(pRExC_state);
9206      }
9207      else
9208       lastbr = NULL;
9209      if (c != ')')
9210       vFAIL("Switch (?(condition)... contains too many branches");
9211      ender = reg_node(pRExC_state, TAIL);
9212      REGTAIL(pRExC_state, br, ender);
9213      if (lastbr) {
9214       REGTAIL(pRExC_state, lastbr, ender);
9215       REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9216      }
9217      else
9218       REGTAIL(pRExC_state, ret, ender);
9219      RExC_size++; /* XXX WHY do we need this?!!
9220          For large programs it seems to be required
9221          but I can't figure out why. -- dmq*/
9222      return ret;
9223     }
9224     else {
9225      vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
9226     }
9227    }
9228    case '[':           /* (?[ ... ]) */
9229     return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9230           oregcomp_parse);
9231    case 0:
9232     RExC_parse--; /* for vFAIL to print correctly */
9233     vFAIL("Sequence (? incomplete");
9234     break;
9235    default: /* e.g., (?i) */
9236     --RExC_parse;
9237    parse_flags:
9238     parse_lparen_question_flags(pRExC_state);
9239     if (UCHARAT(RExC_parse) != ':') {
9240      nextchar(pRExC_state);
9241      *flagp = TRYAGAIN;
9242      return NULL;
9243     }
9244     paren = ':';
9245     nextchar(pRExC_state);
9246     ret = NULL;
9247     goto parse_rest;
9248    } /* end switch */
9249   }
9250   else {                  /* (...) */
9251   capturing_parens:
9252    parno = RExC_npar;
9253    RExC_npar++;
9254
9255    ret = reganode(pRExC_state, OPEN, parno);
9256    if (!SIZE_ONLY ){
9257     if (!RExC_nestroot)
9258      RExC_nestroot = parno;
9259     if (RExC_seen & REG_SEEN_RECURSE
9260      && !RExC_open_parens[parno-1])
9261     {
9262      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9263       "Setting open paren #%"IVdf" to %d\n",
9264       (IV)parno, REG_NODE_NUM(ret)));
9265      RExC_open_parens[parno-1]= ret;
9266     }
9267    }
9268    Set_Node_Length(ret, 1); /* MJD */
9269    Set_Node_Offset(ret, RExC_parse); /* MJD */
9270    is_open = 1;
9271   }
9272  }
9273  else                        /* ! paren */
9274   ret = NULL;
9275
9276    parse_rest:
9277  /* Pick up the branches, linking them together. */
9278  parse_start = RExC_parse;   /* MJD */
9279  br = regbranch(pRExC_state, &flags, 1,depth+1);
9280
9281  /*     branch_len = (paren != 0); */
9282
9283  if (br == NULL) {
9284   if (flags & RESTART_UTF8) {
9285    *flagp = RESTART_UTF8;
9286    return NULL;
9287   }
9288   FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
9289  }
9290  if (*RExC_parse == '|') {
9291   if (!SIZE_ONLY && RExC_extralen) {
9292    reginsert(pRExC_state, BRANCHJ, br, depth+1);
9293   }
9294   else {                  /* MJD */
9295    reginsert(pRExC_state, BRANCH, br, depth+1);
9296    Set_Node_Length(br, paren != 0);
9297    Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9298   }
9299   have_branch = 1;
9300   if (SIZE_ONLY)
9301    RExC_extralen += 1;  /* For BRANCHJ-BRANCH. */
9302  }
9303  else if (paren == ':') {
9304   *flagp |= flags&SIMPLE;
9305  }
9306  if (is_open) {    /* Starts with OPEN. */
9307   REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9308  }
9309  else if (paren != '?')  /* Not Conditional */
9310   ret = br;
9311  *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9312  lastbr = br;
9313  while (*RExC_parse == '|') {
9314   if (!SIZE_ONLY && RExC_extralen) {
9315    ender = reganode(pRExC_state, LONGJMP,0);
9316    REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9317   }
9318   if (SIZE_ONLY)
9319    RExC_extralen += 2;  /* Account for LONGJMP. */
9320   nextchar(pRExC_state);
9321   if (freeze_paren) {
9322    if (RExC_npar > after_freeze)
9323     after_freeze = RExC_npar;
9324    RExC_npar = freeze_paren;
9325   }
9326   br = regbranch(pRExC_state, &flags, 0, depth+1);
9327
9328   if (br == NULL) {
9329    if (flags & RESTART_UTF8) {
9330     *flagp = RESTART_UTF8;
9331     return NULL;
9332    }
9333    FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
9334   }
9335   REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9336   lastbr = br;
9337   *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9338  }
9339
9340  if (have_branch || paren != ':') {
9341   /* Make a closing node, and hook it on the end. */
9342   switch (paren) {
9343   case ':':
9344    ender = reg_node(pRExC_state, TAIL);
9345    break;
9346   case 1: case 2:
9347    ender = reganode(pRExC_state, CLOSE, parno);
9348    if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9349     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9350       "Setting close paren #%"IVdf" to %d\n",
9351       (IV)parno, REG_NODE_NUM(ender)));
9352     RExC_close_parens[parno-1]= ender;
9353     if (RExC_nestroot == parno)
9354      RExC_nestroot = 0;
9355    }
9356    Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9357    Set_Node_Length(ender,1); /* MJD */
9358    break;
9359   case '<':
9360   case ',':
9361   case '=':
9362   case '!':
9363    *flagp &= ~HASWIDTH;
9364    /* FALL THROUGH */
9365   case '>':
9366    ender = reg_node(pRExC_state, SUCCEED);
9367    break;
9368   case 0:
9369    ender = reg_node(pRExC_state, END);
9370    if (!SIZE_ONLY) {
9371     assert(!RExC_opend); /* there can only be one! */
9372     RExC_opend = ender;
9373    }
9374    break;
9375   }
9376   DEBUG_PARSE_r(if (!SIZE_ONLY) {
9377    SV * const mysv_val1=sv_newmortal();
9378    SV * const mysv_val2=sv_newmortal();
9379    DEBUG_PARSE_MSG("lsbr");
9380    regprop(RExC_rx, mysv_val1, lastbr);
9381    regprop(RExC_rx, mysv_val2, ender);
9382    PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9383       SvPV_nolen_const(mysv_val1),
9384       (IV)REG_NODE_NUM(lastbr),
9385       SvPV_nolen_const(mysv_val2),
9386       (IV)REG_NODE_NUM(ender),
9387       (IV)(ender - lastbr)
9388    );
9389   });
9390   REGTAIL(pRExC_state, lastbr, ender);
9391
9392   if (have_branch && !SIZE_ONLY) {
9393    char is_nothing= 1;
9394    if (depth==1)
9395     RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9396
9397    /* Hook the tails of the branches to the closing node. */
9398    for (br = ret; br; br = regnext(br)) {
9399     const U8 op = PL_regkind[OP(br)];
9400     if (op == BRANCH) {
9401      REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9402      if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9403       is_nothing= 0;
9404     }
9405     else if (op == BRANCHJ) {
9406      REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9407      /* for now we always disable this optimisation * /
9408      if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9409      */
9410       is_nothing= 0;
9411     }
9412    }
9413    if (is_nothing) {
9414     br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9415     DEBUG_PARSE_r(if (!SIZE_ONLY) {
9416      SV * const mysv_val1=sv_newmortal();
9417      SV * const mysv_val2=sv_newmortal();
9418      DEBUG_PARSE_MSG("NADA");
9419      regprop(RExC_rx, mysv_val1, ret);
9420      regprop(RExC_rx, mysv_val2, ender);
9421      PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9422         SvPV_nolen_const(mysv_val1),
9423         (IV)REG_NODE_NUM(ret),
9424         SvPV_nolen_const(mysv_val2),
9425         (IV)REG_NODE_NUM(ender),
9426         (IV)(ender - ret)
9427      );
9428     });
9429     OP(br)= NOTHING;
9430     if (OP(ender) == TAIL) {
9431      NEXT_OFF(br)= 0;
9432      RExC_emit= br + 1;
9433     } else {
9434      regnode *opt;
9435      for ( opt= br + 1; opt < ender ; opt++ )
9436       OP(opt)= OPTIMIZED;
9437      NEXT_OFF(br)= ender - br;
9438     }
9439    }
9440   }
9441  }
9442
9443  {
9444   const char *p;
9445   static const char parens[] = "=!<,>";
9446
9447   if (paren && (p = strchr(parens, paren))) {
9448    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9449    int flag = (p - parens) > 1;
9450
9451    if (paren == '>')
9452     node = SUSPEND, flag = 0;
9453    reginsert(pRExC_state, node,ret, depth+1);
9454    Set_Node_Cur_Length(ret);
9455    Set_Node_Offset(ret, parse_start + 1);
9456    ret->flags = flag;
9457    REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9458   }
9459  }
9460
9461  /* Check for proper termination. */
9462  if (paren) {
9463   /* restore original flags, but keep (?p) */
9464   RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
9465   if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9466    RExC_parse = oregcomp_parse;
9467    vFAIL("Unmatched (");
9468   }
9469  }
9470  else if (!paren && RExC_parse < RExC_end) {
9471   if (*RExC_parse == ')') {
9472    RExC_parse++;
9473    vFAIL("Unmatched )");
9474   }
9475   else
9476    FAIL("Junk on end of regexp"); /* "Can't happen". */
9477   assert(0); /* NOTREACHED */
9478  }
9479
9480  if (RExC_in_lookbehind) {
9481   RExC_in_lookbehind--;
9482  }
9483  if (after_freeze > RExC_npar)
9484   RExC_npar = after_freeze;
9485  return(ret);
9486 }
9487
9488 /*
9489  - regbranch - one alternative of an | operator
9490  *
9491  * Implements the concatenation operator.
9492  *
9493  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9494  * restarted.
9495  */
9496 STATIC regnode *
9497 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9498 {
9499  dVAR;
9500  regnode *ret;
9501  regnode *chain = NULL;
9502  regnode *latest;
9503  I32 flags = 0, c = 0;
9504  GET_RE_DEBUG_FLAGS_DECL;
9505
9506  PERL_ARGS_ASSERT_REGBRANCH;
9507
9508  DEBUG_PARSE("brnc");
9509
9510  if (first)
9511   ret = NULL;
9512  else {
9513   if (!SIZE_ONLY && RExC_extralen)
9514    ret = reganode(pRExC_state, BRANCHJ,0);
9515   else {
9516    ret = reg_node(pRExC_state, BRANCH);
9517    Set_Node_Length(ret, 1);
9518   }
9519  }
9520
9521  if (!first && SIZE_ONLY)
9522   RExC_extralen += 1;   /* BRANCHJ */
9523
9524  *flagp = WORST;   /* Tentatively. */
9525
9526  RExC_parse--;
9527  nextchar(pRExC_state);
9528  while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9529   flags &= ~TRYAGAIN;
9530   latest = regpiece(pRExC_state, &flags,depth+1);
9531   if (latest == NULL) {
9532    if (flags & TRYAGAIN)
9533     continue;
9534    if (flags & RESTART_UTF8) {
9535     *flagp = RESTART_UTF8;
9536     return NULL;
9537    }
9538    FAIL2("panic: regpiece returned NULL, flags=%#X", flags);
9539   }
9540   else if (ret == NULL)
9541    ret = latest;
9542   *flagp |= flags&(HASWIDTH|POSTPONED);
9543   if (chain == NULL)  /* First piece. */
9544    *flagp |= flags&SPSTART;
9545   else {
9546    RExC_naughty++;
9547    REGTAIL(pRExC_state, chain, latest);
9548   }
9549   chain = latest;
9550   c++;
9551  }
9552  if (chain == NULL) { /* Loop ran zero times. */
9553   chain = reg_node(pRExC_state, NOTHING);
9554   if (ret == NULL)
9555    ret = chain;
9556  }
9557  if (c == 1) {
9558   *flagp |= flags&SIMPLE;
9559  }
9560
9561  return ret;
9562 }
9563
9564 /*
9565  - regpiece - something followed by possible [*+?]
9566  *
9567  * Note that the branching code sequences used for ? and the general cases
9568  * of * and + are somewhat optimized:  they use the same NOTHING node as
9569  * both the endmarker for their branch list and the body of the last branch.
9570  * It might seem that this node could be dispensed with entirely, but the
9571  * endmarker role is not redundant.
9572  *
9573  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
9574  * TRYAGAIN.
9575  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9576  * restarted.
9577  */
9578 STATIC regnode *
9579 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9580 {
9581  dVAR;
9582  regnode *ret;
9583  char op;
9584  char *next;
9585  I32 flags;
9586  const char * const origparse = RExC_parse;
9587  I32 min;
9588  I32 max = REG_INFTY;
9589 #ifdef RE_TRACK_PATTERN_OFFSETS
9590  char *parse_start;
9591 #endif
9592  const char *maxpos = NULL;
9593
9594  /* Save the original in case we change the emitted regop to a FAIL. */
9595  regnode * const orig_emit = RExC_emit;
9596
9597  GET_RE_DEBUG_FLAGS_DECL;
9598
9599  PERL_ARGS_ASSERT_REGPIECE;
9600
9601  DEBUG_PARSE("piec");
9602
9603  ret = regatom(pRExC_state, &flags,depth+1);
9604  if (ret == NULL) {
9605   if (flags & (TRYAGAIN|RESTART_UTF8))
9606    *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
9607   else
9608    FAIL2("panic: regatom returned NULL, flags=%#X", flags);
9609   return(NULL);
9610  }
9611
9612  op = *RExC_parse;
9613
9614  if (op == '{' && regcurly(RExC_parse, FALSE)) {
9615   maxpos = NULL;
9616 #ifdef RE_TRACK_PATTERN_OFFSETS
9617   parse_start = RExC_parse; /* MJD */
9618 #endif
9619   next = RExC_parse + 1;
9620   while (isDIGIT(*next) || *next == ',') {
9621    if (*next == ',') {
9622     if (maxpos)
9623      break;
9624     else
9625      maxpos = next;
9626    }
9627    next++;
9628   }
9629   if (*next == '}') {  /* got one */
9630    if (!maxpos)
9631     maxpos = next;
9632    RExC_parse++;
9633    min = atoi(RExC_parse);
9634    if (*maxpos == ',')
9635     maxpos++;
9636    else
9637     maxpos = RExC_parse;
9638    max = atoi(maxpos);
9639    if (!max && *maxpos != '0')
9640     max = REG_INFTY;  /* meaning "infinity" */
9641    else if (max >= REG_INFTY)
9642     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9643    RExC_parse = next;
9644    nextchar(pRExC_state);
9645    if (max < min) {    /* If can't match, warn and optimize to fail
9646         unconditionally */
9647     if (SIZE_ONLY) {
9648      ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9649
9650      /* We can't back off the size because we have to reserve
9651      * enough space for all the things we are about to throw
9652      * away, but we can shrink it by the ammount we are about
9653      * to re-use here */
9654      RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9655     }
9656     else {
9657      RExC_emit = orig_emit;
9658     }
9659     ret = reg_node(pRExC_state, OPFAIL);
9660     return ret;
9661    }
9662
9663   do_curly:
9664    if ((flags&SIMPLE)) {
9665     RExC_naughty += 2 + RExC_naughty / 2;
9666     reginsert(pRExC_state, CURLY, ret, depth+1);
9667     Set_Node_Offset(ret, parse_start+1); /* MJD */
9668     Set_Node_Cur_Length(ret);
9669    }
9670    else {
9671     regnode * const w = reg_node(pRExC_state, WHILEM);
9672
9673     w->flags = 0;
9674     REGTAIL(pRExC_state, ret, w);
9675     if (!SIZE_ONLY && RExC_extralen) {
9676      reginsert(pRExC_state, LONGJMP,ret, depth+1);
9677      reginsert(pRExC_state, NOTHING,ret, depth+1);
9678      NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9679     }
9680     reginsert(pRExC_state, CURLYX,ret, depth+1);
9681         /* MJD hk */
9682     Set_Node_Offset(ret, parse_start+1);
9683     Set_Node_Length(ret,
9684         op == '{' ? (RExC_parse - parse_start) : 1);
9685
9686     if (!SIZE_ONLY && RExC_extralen)
9687      NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9688     REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9689     if (SIZE_ONLY)
9690      RExC_whilem_seen++, RExC_extralen += 3;
9691     RExC_naughty += 4 + RExC_naughty; /* compound interest */
9692    }
9693    ret->flags = 0;
9694
9695    if (min > 0)
9696     *flagp = WORST;
9697    if (max > 0)
9698     *flagp |= HASWIDTH;
9699    if (!SIZE_ONLY) {
9700     ARG1_SET(ret, (U16)min);
9701     ARG2_SET(ret, (U16)max);
9702    }
9703
9704    goto nest_check;
9705   }
9706  }
9707
9708  if (!ISMULT1(op)) {
9709   *flagp = flags;
9710   return(ret);
9711  }
9712
9713 #if 0    /* Now runtime fix should be reliable. */
9714
9715  /* if this is reinstated, don't forget to put this back into perldiag:
9716
9717    =item Regexp *+ operand could be empty at {#} in regex m/%s/
9718
9719   (F) The part of the regexp subject to either the * or + quantifier
9720   could match an empty string. The {#} shows in the regular
9721   expression about where the problem was discovered.
9722
9723  */
9724
9725  if (!(flags&HASWIDTH) && op != '?')
9726  vFAIL("Regexp *+ operand could be empty");
9727 #endif
9728
9729 #ifdef RE_TRACK_PATTERN_OFFSETS
9730  parse_start = RExC_parse;
9731 #endif
9732  nextchar(pRExC_state);
9733
9734  *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9735
9736  if (op == '*' && (flags&SIMPLE)) {
9737   reginsert(pRExC_state, STAR, ret, depth+1);
9738   ret->flags = 0;
9739   RExC_naughty += 4;
9740  }
9741  else if (op == '*') {
9742   min = 0;
9743   goto do_curly;
9744  }
9745  else if (op == '+' && (flags&SIMPLE)) {
9746   reginsert(pRExC_state, PLUS, ret, depth+1);
9747   ret->flags = 0;
9748   RExC_naughty += 3;
9749  }
9750  else if (op == '+') {
9751   min = 1;
9752   goto do_curly;
9753  }
9754  else if (op == '?') {
9755   min = 0; max = 1;
9756   goto do_curly;
9757  }
9758   nest_check:
9759  if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9760   SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9761   ckWARN3reg(RExC_parse,
9762     "%.*s matches null string many times",
9763     (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9764     origparse);
9765   (void)ReREFCNT_inc(RExC_rx_sv);
9766  }
9767
9768  if (RExC_parse < RExC_end && *RExC_parse == '?') {
9769   nextchar(pRExC_state);
9770   reginsert(pRExC_state, MINMOD, ret, depth+1);
9771   REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9772  }
9773 #ifndef REG_ALLOW_MINMOD_SUSPEND
9774  else
9775 #endif
9776  if (RExC_parse < RExC_end && *RExC_parse == '+') {
9777   regnode *ender;
9778   nextchar(pRExC_state);
9779   ender = reg_node(pRExC_state, SUCCEED);
9780   REGTAIL(pRExC_state, ret, ender);
9781   reginsert(pRExC_state, SUSPEND, ret, depth+1);
9782   ret->flags = 0;
9783   ender = reg_node(pRExC_state, TAIL);
9784   REGTAIL(pRExC_state, ret, ender);
9785   /*ret= ender;*/
9786  }
9787
9788  if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9789   RExC_parse++;
9790   vFAIL("Nested quantifiers");
9791  }
9792
9793  return(ret);
9794 }
9795
9796 STATIC bool
9797 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9798   const bool strict   /* Apply stricter parsing rules? */
9799  )
9800 {
9801
9802  /* This is expected to be called by a parser routine that has recognized '\N'
9803    and needs to handle the rest. RExC_parse is expected to point at the first
9804    char following the N at the time of the call.  On successful return,
9805    RExC_parse has been updated to point to just after the sequence identified
9806    by this routine, and <*flagp> has been updated.
9807
9808    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9809    character class.
9810
9811    \N may begin either a named sequence, or if outside a character class, mean
9812    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9813    attempted to decide which, and in the case of a named sequence, converted it
9814    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9815    where c1... are the characters in the sequence.  For single-quoted regexes,
9816    the tokenizer passes the \N sequence through unchanged; this code will not
9817    attempt to determine this nor expand those, instead raising a syntax error.
9818    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9819    or there is no '}', it signals that this \N occurrence means to match a
9820    non-newline.
9821
9822    Only the \N{U+...} form should occur in a character class, for the same
9823    reason that '.' inside a character class means to just match a period: it
9824    just doesn't make sense.
9825
9826    The function raises an error (via vFAIL), and doesn't return for various
9827    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
9828    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
9829    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
9830    only possible if node_p is non-NULL.
9831
9832
9833    If <valuep> is non-null, it means the caller can accept an input sequence
9834    consisting of a just a single code point; <*valuep> is set to that value
9835    if the input is such.
9836
9837    If <node_p> is non-null it signifies that the caller can accept any other
9838    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
9839    is set as follows:
9840  1) \N means not-a-NL: points to a newly created REG_ANY node;
9841  2) \N{}:              points to a new NOTHING node;
9842  3) otherwise:         points to a new EXACT node containing the resolved
9843       string.
9844    Note that FALSE is returned for single code point sequences if <valuep> is
9845    null.
9846  */
9847
9848  char * endbrace;    /* '}' following the name */
9849  char* p;
9850  char *endchar; /* Points to '.' or '}' ending cur char in the input
9851       stream */
9852  bool has_multiple_chars; /* true if the input stream contains a sequence of
9853         more than one character */
9854
9855  GET_RE_DEBUG_FLAGS_DECL;
9856
9857  PERL_ARGS_ASSERT_GROK_BSLASH_N;
9858
9859  GET_RE_DEBUG_FLAGS;
9860
9861  assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
9862
9863  /* The [^\n] meaning of \N ignores spaces and comments under the /x
9864  * modifier.  The other meaning does not */
9865  p = (RExC_flags & RXf_PMf_EXTENDED)
9866   ? regwhite( pRExC_state, RExC_parse )
9867   : RExC_parse;
9868
9869  /* Disambiguate between \N meaning a named character versus \N meaning
9870  * [^\n].  The former is assumed when it can't be the latter. */
9871  if (*p != '{' || regcurly(p, FALSE)) {
9872   RExC_parse = p;
9873   if (! node_p) {
9874    /* no bare \N in a charclass */
9875    if (in_char_class) {
9876     vFAIL("\\N in a character class must be a named character: \\N{...}");
9877    }
9878    return FALSE;
9879   }
9880   nextchar(pRExC_state);
9881   *node_p = reg_node(pRExC_state, REG_ANY);
9882   *flagp |= HASWIDTH|SIMPLE;
9883   RExC_naughty++;
9884   RExC_parse--;
9885   Set_Node_Length(*node_p, 1); /* MJD */
9886   return TRUE;
9887  }
9888
9889  /* Here, we have decided it should be a named character or sequence */
9890
9891  /* The test above made sure that the next real character is a '{', but
9892  * under the /x modifier, it could be separated by space (or a comment and
9893  * \n) and this is not allowed (for consistency with \x{...} and the
9894  * tokenizer handling of \N{NAME}). */
9895  if (*RExC_parse != '{') {
9896   vFAIL("Missing braces on \\N{}");
9897  }
9898
9899  RExC_parse++; /* Skip past the '{' */
9900
9901  if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9902   || ! (endbrace == RExC_parse  /* nothing between the {} */
9903    || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9904     && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9905  {
9906   if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9907   vFAIL("\\N{NAME} must be resolved by the lexer");
9908  }
9909
9910  if (endbrace == RExC_parse) {   /* empty: \N{} */
9911   bool ret = TRUE;
9912   if (node_p) {
9913    *node_p = reg_node(pRExC_state,NOTHING);
9914   }
9915   else if (in_char_class) {
9916    if (SIZE_ONLY && in_char_class) {
9917     if (strict) {
9918      RExC_parse++;   /* Position after the "}" */
9919      vFAIL("Zero length \\N{}");
9920     }
9921     else {
9922      ckWARNreg(RExC_parse,
9923        "Ignoring zero length \\N{} in character class");
9924     }
9925    }
9926    ret = FALSE;
9927   }
9928   else {
9929    return FALSE;
9930   }
9931   nextchar(pRExC_state);
9932   return ret;
9933  }
9934
9935  RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9936  RExC_parse += 2; /* Skip past the 'U+' */
9937
9938  endchar = RExC_parse + strcspn(RExC_parse, ".}");
9939
9940  /* Code points are separated by dots.  If none, there is only one code
9941  * point, and is terminated by the brace */
9942  has_multiple_chars = (endchar < endbrace);
9943
9944  if (valuep && (! has_multiple_chars || in_char_class)) {
9945   /* We only pay attention to the first char of
9946   multichar strings being returned in char classes. I kinda wonder
9947   if this makes sense as it does change the behaviour
9948   from earlier versions, OTOH that behaviour was broken
9949   as well. XXX Solution is to recharacterize as
9950   [rest-of-class]|multi1|multi2... */
9951
9952   STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9953   I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9954    | PERL_SCAN_DISALLOW_PREFIX
9955    | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9956
9957   *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9958
9959   /* The tokenizer should have guaranteed validity, but it's possible to
9960   * bypass it by using single quoting, so check */
9961   if (length_of_hex == 0
9962    || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9963   {
9964    RExC_parse += length_of_hex; /* Includes all the valid */
9965    RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9966        ? UTF8SKIP(RExC_parse)
9967        : 1;
9968    /* Guard against malformed utf8 */
9969    if (RExC_parse >= endchar) {
9970     RExC_parse = endchar;
9971    }
9972    vFAIL("Invalid hexadecimal number in \\N{U+...}");
9973   }
9974
9975   if (in_char_class && has_multiple_chars) {
9976    if (strict) {
9977     RExC_parse = endbrace;
9978     vFAIL("\\N{} in character class restricted to one character");
9979    }
9980    else {
9981     ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9982    }
9983   }
9984
9985   RExC_parse = endbrace + 1;
9986  }
9987  else if (! node_p || ! has_multiple_chars) {
9988
9989   /* Here, the input is legal, but not according to the caller's
9990   * options.  We fail without advancing the parse, so that the
9991   * caller can try again */
9992   RExC_parse = p;
9993   return FALSE;
9994  }
9995  else {
9996
9997   /* What is done here is to convert this to a sub-pattern of the form
9998   * (?:\x{char1}\x{char2}...)
9999   * and then call reg recursively.  That way, it retains its atomicness,
10000   * while not having to worry about special handling that some code
10001   * points may have.  toke.c has converted the original Unicode values
10002   * to native, so that we can just pass on the hex values unchanged.  We
10003   * do have to set a flag to keep recoding from happening in the
10004   * recursion */
10005
10006   SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10007   STRLEN len;
10008   char *orig_end = RExC_end;
10009   I32 flags;
10010
10011   while (RExC_parse < endbrace) {
10012
10013    /* Convert to notation the rest of the code understands */
10014    sv_catpv(substitute_parse, "\\x{");
10015    sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10016    sv_catpv(substitute_parse, "}");
10017
10018    /* Point to the beginning of the next character in the sequence. */
10019    RExC_parse = endchar + 1;
10020    endchar = RExC_parse + strcspn(RExC_parse, ".}");
10021   }
10022   sv_catpv(substitute_parse, ")");
10023
10024   RExC_parse = SvPV(substitute_parse, len);
10025
10026   /* Don't allow empty number */
10027   if (len < 8) {
10028    vFAIL("Invalid hexadecimal number in \\N{U+...}");
10029   }
10030   RExC_end = RExC_parse + len;
10031
10032   /* The values are Unicode, and therefore not subject to recoding */
10033   RExC_override_recoding = 1;
10034
10035   if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10036    if (flags & RESTART_UTF8) {
10037     *flagp = RESTART_UTF8;
10038     return FALSE;
10039    }
10040    FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#X",
10041     flags);
10042   }
10043   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10044
10045   RExC_parse = endbrace;
10046   RExC_end = orig_end;
10047   RExC_override_recoding = 0;
10048
10049   nextchar(pRExC_state);
10050  }
10051
10052  return TRUE;
10053 }
10054
10055
10056 /*
10057  * reg_recode
10058  *
10059  * It returns the code point in utf8 for the value in *encp.
10060  *    value: a code value in the source encoding
10061  *    encp:  a pointer to an Encode object
10062  *
10063  * If the result from Encode is not a single character,
10064  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10065  */
10066 STATIC UV
10067 S_reg_recode(pTHX_ const char value, SV **encp)
10068 {
10069  STRLEN numlen = 1;
10070  SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10071  const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10072  const STRLEN newlen = SvCUR(sv);
10073  UV uv = UNICODE_REPLACEMENT;
10074
10075  PERL_ARGS_ASSERT_REG_RECODE;
10076
10077  if (newlen)
10078   uv = SvUTF8(sv)
10079    ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10080    : *(U8*)s;
10081
10082  if (!newlen || numlen != newlen) {
10083   uv = UNICODE_REPLACEMENT;
10084   *encp = NULL;
10085  }
10086  return uv;
10087 }
10088
10089 PERL_STATIC_INLINE U8
10090 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10091 {
10092  U8 op;
10093
10094  PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10095
10096  if (! FOLD) {
10097   return EXACT;
10098  }
10099
10100  op = get_regex_charset(RExC_flags);
10101  if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10102   op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10103     been, so there is no hole */
10104  }
10105
10106  return op + EXACTF;
10107 }
10108
10109 PERL_STATIC_INLINE void
10110 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10111 {
10112  /* This knows the details about sizing an EXACTish node, setting flags for
10113  * it (by setting <*flagp>, and potentially populating it with a single
10114  * character.
10115  *
10116  * If <len> (the length in bytes) is non-zero, this function assumes that
10117  * the node has already been populated, and just does the sizing.  In this
10118  * case <code_point> should be the final code point that has already been
10119  * placed into the node.  This value will be ignored except that under some
10120  * circumstances <*flagp> is set based on it.
10121  *
10122  * If <len> is zero, the function assumes that the node is to contain only
10123  * the single character given by <code_point> and calculates what <len>
10124  * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10125  * additionally will populate the node's STRING with <code_point>, if <len>
10126  * is 0.  In both cases <*flagp> is appropriately set
10127  *
10128  * It knows that under FOLD, the Latin Sharp S and UTF characters above
10129  * 255, must be folded (the former only when the rules indicate it can
10130  * match 'ss') */
10131
10132  bool len_passed_in = cBOOL(len != 0);
10133  U8 character[UTF8_MAXBYTES_CASE+1];
10134
10135  PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10136
10137  if (! len_passed_in) {
10138   if (UTF) {
10139    if (FOLD && (! LOC || code_point > 255)) {
10140     _to_uni_fold_flags(NATIVE_TO_UNI(code_point),
10141         character,
10142         &len,
10143         FOLD_FLAGS_FULL | ((LOC)
10144              ? FOLD_FLAGS_LOCALE
10145              : (ASCII_FOLD_RESTRICTED)
10146              ? FOLD_FLAGS_NOMIX_ASCII
10147              : 0));
10148    }
10149    else {
10150     uvchr_to_utf8( character, code_point);
10151     len = UTF8SKIP(character);
10152    }
10153   }
10154   else if (! FOLD
10155     || code_point != LATIN_SMALL_LETTER_SHARP_S
10156     || ASCII_FOLD_RESTRICTED
10157     || ! AT_LEAST_UNI_SEMANTICS)
10158   {
10159    *character = (U8) code_point;
10160    len = 1;
10161   }
10162   else {
10163    *character = 's';
10164    *(character + 1) = 's';
10165    len = 2;
10166   }
10167  }
10168
10169  if (SIZE_ONLY) {
10170   RExC_size += STR_SZ(len);
10171  }
10172  else {
10173   RExC_emit += STR_SZ(len);
10174   STR_LEN(node) = len;
10175   if (! len_passed_in) {
10176    Copy((char *) character, STRING(node), len, char);
10177   }
10178  }
10179
10180  *flagp |= HASWIDTH;
10181
10182  /* A single character node is SIMPLE, except for the special-cased SHARP S
10183  * under /di. */
10184  if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10185   && (code_point != LATIN_SMALL_LETTER_SHARP_S
10186    || ! FOLD || ! DEPENDS_SEMANTICS))
10187  {
10188   *flagp |= SIMPLE;
10189  }
10190 }
10191
10192 /*
10193  - regatom - the lowest level
10194
10195    Try to identify anything special at the start of the pattern. If there
10196    is, then handle it as required. This may involve generating a single regop,
10197    such as for an assertion; or it may involve recursing, such as to
10198    handle a () structure.
10199
10200    If the string doesn't start with something special then we gobble up
10201    as much literal text as we can.
10202
10203    Once we have been able to handle whatever type of thing started the
10204    sequence, we return.
10205
10206    Note: we have to be careful with escapes, as they can be both literal
10207    and special, and in the case of \10 and friends, context determines which.
10208
10209    A summary of the code structure is:
10210
10211    switch (first_byte) {
10212   cases for each special:
10213    handle this special;
10214    break;
10215   case '\\':
10216    switch (2nd byte) {
10217     cases for each unambiguous special:
10218      handle this special;
10219      break;
10220     cases for each ambigous special/literal:
10221      disambiguate;
10222      if (special)  handle here
10223      else goto defchar;
10224     default: // unambiguously literal:
10225      goto defchar;
10226    }
10227   default:  // is a literal char
10228    // FALL THROUGH
10229   defchar:
10230    create EXACTish node for literal;
10231    while (more input and node isn't full) {
10232     switch (input_byte) {
10233     cases for each special;
10234      make sure parse pointer is set so that the next call to
10235       regatom will see this special first
10236      goto loopdone; // EXACTish node terminated by prev. char
10237     default:
10238      append char to EXACTISH node;
10239     }
10240     get next input byte;
10241    }
10242   loopdone:
10243    }
10244    return the generated node;
10245
10246    Specifically there are two separate switches for handling
10247    escape sequences, with the one for handling literal escapes requiring
10248    a dummy entry for all of the special escapes that are actually handled
10249    by the other.
10250
10251    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10252    TRYAGAIN.
10253    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10254    restarted.
10255    Otherwise does not return NULL.
10256 */
10257
10258 STATIC regnode *
10259 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10260 {
10261  dVAR;
10262  regnode *ret = NULL;
10263  I32 flags = 0;
10264  char *parse_start = RExC_parse;
10265  U8 op;
10266  int invert = 0;
10267
10268  GET_RE_DEBUG_FLAGS_DECL;
10269
10270  *flagp = WORST;  /* Tentatively. */
10271
10272  DEBUG_PARSE("atom");
10273
10274  PERL_ARGS_ASSERT_REGATOM;
10275
10276 tryagain:
10277  switch ((U8)*RExC_parse) {
10278  case '^':
10279   RExC_seen_zerolen++;
10280   nextchar(pRExC_state);
10281   if (RExC_flags & RXf_PMf_MULTILINE)
10282    ret = reg_node(pRExC_state, MBOL);
10283   else if (RExC_flags & RXf_PMf_SINGLELINE)
10284    ret = reg_node(pRExC_state, SBOL);
10285   else
10286    ret = reg_node(pRExC_state, BOL);
10287   Set_Node_Length(ret, 1); /* MJD */
10288   break;
10289  case '$':
10290   nextchar(pRExC_state);
10291   if (*RExC_parse)
10292    RExC_seen_zerolen++;
10293   if (RExC_flags & RXf_PMf_MULTILINE)
10294    ret = reg_node(pRExC_state, MEOL);
10295   else if (RExC_flags & RXf_PMf_SINGLELINE)
10296    ret = reg_node(pRExC_state, SEOL);
10297   else
10298    ret = reg_node(pRExC_state, EOL);
10299   Set_Node_Length(ret, 1); /* MJD */
10300   break;
10301  case '.':
10302   nextchar(pRExC_state);
10303   if (RExC_flags & RXf_PMf_SINGLELINE)
10304    ret = reg_node(pRExC_state, SANY);
10305   else
10306    ret = reg_node(pRExC_state, REG_ANY);
10307   *flagp |= HASWIDTH|SIMPLE;
10308   RExC_naughty++;
10309   Set_Node_Length(ret, 1); /* MJD */
10310   break;
10311  case '[':
10312  {
10313   char * const oregcomp_parse = ++RExC_parse;
10314   ret = regclass(pRExC_state, flagp,depth+1,
10315      FALSE, /* means parse the whole char class */
10316      TRUE, /* allow multi-char folds */
10317      FALSE, /* don't silence non-portable warnings. */
10318      NULL);
10319   if (*RExC_parse != ']') {
10320    RExC_parse = oregcomp_parse;
10321    vFAIL("Unmatched [");
10322   }
10323   if (ret == NULL) {
10324    if (*flagp & RESTART_UTF8)
10325     return NULL;
10326    FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
10327     *flagp);
10328   }
10329   nextchar(pRExC_state);
10330   Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10331   break;
10332  }
10333  case '(':
10334   nextchar(pRExC_state);
10335   ret = reg(pRExC_state, 2, &flags,depth+1);
10336   if (ret == NULL) {
10337     if (flags & TRYAGAIN) {
10338      if (RExC_parse == RExC_end) {
10339       /* Make parent create an empty node if needed. */
10340       *flagp |= TRYAGAIN;
10341       return(NULL);
10342      }
10343      goto tryagain;
10344     }
10345     if (flags & RESTART_UTF8) {
10346      *flagp = RESTART_UTF8;
10347      return NULL;
10348     }
10349     FAIL2("panic: reg returned NULL to regatom, flags=%#X", flags);
10350   }
10351   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10352   break;
10353  case '|':
10354  case ')':
10355   if (flags & TRYAGAIN) {
10356    *flagp |= TRYAGAIN;
10357    return NULL;
10358   }
10359   vFAIL("Internal urp");
10360         /* Supposed to be caught earlier. */
10361   break;
10362  case '{':
10363   if (!regcurly(RExC_parse, FALSE)) {
10364    RExC_parse++;
10365    goto defchar;
10366   }
10367   /* FALL THROUGH */
10368  case '?':
10369  case '+':
10370  case '*':
10371   RExC_parse++;
10372   vFAIL("Quantifier follows nothing");
10373   break;
10374  case '\\':
10375   /* Special Escapes
10376
10377   This switch handles escape sequences that resolve to some kind
10378   of special regop and not to literal text. Escape sequnces that
10379   resolve to literal text are handled below in the switch marked
10380   "Literal Escapes".
10381
10382   Every entry in this switch *must* have a corresponding entry
10383   in the literal escape switch. However, the opposite is not
10384   required, as the default for this switch is to jump to the
10385   literal text handling code.
10386   */
10387   switch ((U8)*++RExC_parse) {
10388    U8 arg;
10389   /* Special Escapes */
10390   case 'A':
10391    RExC_seen_zerolen++;
10392    ret = reg_node(pRExC_state, SBOL);
10393    *flagp |= SIMPLE;
10394    goto finish_meta_pat;
10395   case 'G':
10396    ret = reg_node(pRExC_state, GPOS);
10397    RExC_seen |= REG_SEEN_GPOS;
10398    *flagp |= SIMPLE;
10399    goto finish_meta_pat;
10400   case 'K':
10401    RExC_seen_zerolen++;
10402    ret = reg_node(pRExC_state, KEEPS);
10403    *flagp |= SIMPLE;
10404    /* XXX:dmq : disabling in-place substitution seems to
10405    * be necessary here to avoid cases of memory corruption, as
10406    * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10407    */
10408    RExC_seen |= REG_SEEN_LOOKBEHIND;
10409    goto finish_meta_pat;
10410   case 'Z':
10411    ret = reg_node(pRExC_state, SEOL);
10412    *flagp |= SIMPLE;
10413    RExC_seen_zerolen++;  /* Do not optimize RE away */
10414    goto finish_meta_pat;
10415   case 'z':
10416    ret = reg_node(pRExC_state, EOS);
10417    *flagp |= SIMPLE;
10418    RExC_seen_zerolen++;  /* Do not optimize RE away */
10419    goto finish_meta_pat;
10420   case 'C':
10421    ret = reg_node(pRExC_state, CANY);
10422    RExC_seen |= REG_SEEN_CANY;
10423    *flagp |= HASWIDTH|SIMPLE;
10424    goto finish_meta_pat;
10425   case 'X':
10426    ret = reg_node(pRExC_state, CLUMP);
10427    *flagp |= HASWIDTH;
10428    goto finish_meta_pat;
10429
10430   case 'W':
10431    invert = 1;
10432    /* FALLTHROUGH */
10433   case 'w':
10434    arg = ANYOF_WORDCHAR;
10435    goto join_posix;
10436
10437   case 'b':
10438    RExC_seen_zerolen++;
10439    RExC_seen |= REG_SEEN_LOOKBEHIND;
10440    op = BOUND + get_regex_charset(RExC_flags);
10441    if (op > BOUNDA) {  /* /aa is same as /a */
10442     op = BOUNDA;
10443    }
10444    ret = reg_node(pRExC_state, op);
10445    FLAGS(ret) = get_regex_charset(RExC_flags);
10446    *flagp |= SIMPLE;
10447    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10448     ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10449    }
10450    goto finish_meta_pat;
10451   case 'B':
10452    RExC_seen_zerolen++;
10453    RExC_seen |= REG_SEEN_LOOKBEHIND;
10454    op = NBOUND + get_regex_charset(RExC_flags);
10455    if (op > NBOUNDA) { /* /aa is same as /a */
10456     op = NBOUNDA;
10457    }
10458    ret = reg_node(pRExC_state, op);
10459    FLAGS(ret) = get_regex_charset(RExC_flags);
10460    *flagp |= SIMPLE;
10461    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10462     ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10463    }
10464    goto finish_meta_pat;
10465
10466   case 'D':
10467    invert = 1;
10468    /* FALLTHROUGH */
10469   case 'd':
10470    arg = ANYOF_DIGIT;
10471    goto join_posix;
10472
10473   case 'R':
10474    ret = reg_node(pRExC_state, LNBREAK);
10475    *flagp |= HASWIDTH|SIMPLE;
10476    goto finish_meta_pat;
10477
10478   case 'H':
10479    invert = 1;
10480    /* FALLTHROUGH */
10481   case 'h':
10482    arg = ANYOF_BLANK;
10483    op = POSIXU;
10484    goto join_posix_op_known;
10485
10486   case 'V':
10487    invert = 1;
10488    /* FALLTHROUGH */
10489   case 'v':
10490    arg = ANYOF_VERTWS;
10491    op = POSIXU;
10492    goto join_posix_op_known;
10493
10494   case 'S':
10495    invert = 1;
10496    /* FALLTHROUGH */
10497   case 's':
10498    arg = ANYOF_SPACE;
10499
10500   join_posix:
10501
10502    op = POSIXD + get_regex_charset(RExC_flags);
10503    if (op > POSIXA) {  /* /aa is same as /a */
10504     op = POSIXA;
10505    }
10506
10507   join_posix_op_known:
10508
10509    if (invert) {
10510     op += NPOSIXD - POSIXD;
10511    }
10512
10513    ret = reg_node(pRExC_state, op);
10514    if (! SIZE_ONLY) {
10515     FLAGS(ret) = namedclass_to_classnum(arg);
10516    }
10517
10518    *flagp |= HASWIDTH|SIMPLE;
10519    /* FALL THROUGH */
10520
10521   finish_meta_pat:
10522    nextchar(pRExC_state);
10523    Set_Node_Length(ret, 2); /* MJD */
10524    break;
10525   case 'p':
10526   case 'P':
10527    {
10528 #ifdef DEBUGGING
10529     char* parse_start = RExC_parse - 2;
10530 #endif
10531
10532     RExC_parse--;
10533
10534     ret = regclass(pRExC_state, flagp,depth+1,
10535        TRUE, /* means just parse this element */
10536        FALSE, /* don't allow multi-char folds */
10537        FALSE, /* don't silence non-portable warnings.
10538           It would be a bug if these returned
10539           non-portables */
10540        NULL);
10541     /* regclass() can only return RESTART_UTF8 if multi-char folds
10542     are allowed.  */
10543     if (!ret)
10544      FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
10545       *flagp);
10546
10547     RExC_parse--;
10548
10549     Set_Node_Offset(ret, parse_start + 2);
10550     Set_Node_Cur_Length(ret);
10551     nextchar(pRExC_state);
10552    }
10553    break;
10554   case 'N':
10555    /* Handle \N and \N{NAME} with multiple code points here and not
10556    * below because it can be multicharacter. join_exact() will join
10557    * them up later on.  Also this makes sure that things like
10558    * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10559    * The options to the grok function call causes it to fail if the
10560    * sequence is just a single code point.  We then go treat it as
10561    * just another character in the current EXACT node, and hence it
10562    * gets uniform treatment with all the other characters.  The
10563    * special treatment for quantifiers is not needed for such single
10564    * character sequences */
10565    ++RExC_parse;
10566    if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10567         FALSE /* not strict */ )) {
10568     if (*flagp & RESTART_UTF8)
10569      return NULL;
10570     RExC_parse--;
10571     goto defchar;
10572    }
10573    break;
10574   case 'k':    /* Handle \k<NAME> and \k'NAME' */
10575   parse_named_seq:
10576   {
10577    char ch= RExC_parse[1];
10578    if (ch != '<' && ch != '\'' && ch != '{') {
10579     RExC_parse++;
10580     vFAIL2("Sequence %.2s... not terminated",parse_start);
10581    } else {
10582     /* this pretty much dupes the code for (?P=...) in reg(), if
10583     you change this make sure you change that */
10584     char* name_start = (RExC_parse += 2);
10585     U32 num = 0;
10586     SV *sv_dat = reg_scan_name(pRExC_state,
10587      SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10588     ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10589     if (RExC_parse == name_start || *RExC_parse != ch)
10590      vFAIL2("Sequence %.3s... not terminated",parse_start);
10591
10592     if (!SIZE_ONLY) {
10593      num = add_data( pRExC_state, 1, "S" );
10594      RExC_rxi->data->data[num]=(void*)sv_dat;
10595      SvREFCNT_inc_simple_void(sv_dat);
10596     }
10597
10598     RExC_sawback = 1;
10599     ret = reganode(pRExC_state,
10600        ((! FOLD)
10601         ? NREF
10602         : (ASCII_FOLD_RESTRICTED)
10603         ? NREFFA
10604         : (AT_LEAST_UNI_SEMANTICS)
10605          ? NREFFU
10606          : (LOC)
10607          ? NREFFL
10608          : NREFF),
10609         num);
10610     *flagp |= HASWIDTH;
10611
10612     /* override incorrect value set in reganode MJD */
10613     Set_Node_Offset(ret, parse_start+1);
10614     Set_Node_Cur_Length(ret); /* MJD */
10615     nextchar(pRExC_state);
10616
10617    }
10618    break;
10619   }
10620   case 'g':
10621   case '1': case '2': case '3': case '4':
10622   case '5': case '6': case '7': case '8': case '9':
10623    {
10624     I32 num;
10625     bool isg = *RExC_parse == 'g';
10626     bool isrel = 0;
10627     bool hasbrace = 0;
10628     if (isg) {
10629      RExC_parse++;
10630      if (*RExC_parse == '{') {
10631       RExC_parse++;
10632       hasbrace = 1;
10633      }
10634      if (*RExC_parse == '-') {
10635       RExC_parse++;
10636       isrel = 1;
10637      }
10638      if (hasbrace && !isDIGIT(*RExC_parse)) {
10639       if (isrel) RExC_parse--;
10640       RExC_parse -= 2;
10641       goto parse_named_seq;
10642     }   }
10643     num = atoi(RExC_parse);
10644     if (isg && num == 0)
10645      vFAIL("Reference to invalid group 0");
10646     if (isrel) {
10647      num = RExC_npar - num;
10648      if (num < 1)
10649       vFAIL("Reference to nonexistent or unclosed group");
10650     }
10651     if (!isg && num > 9 && num >= RExC_npar)
10652      /* Probably a character specified in octal, e.g. \35 */
10653      goto defchar;
10654     else {
10655      char * const parse_start = RExC_parse - 1; /* MJD */
10656      while (isDIGIT(*RExC_parse))
10657       RExC_parse++;
10658      if (parse_start == RExC_parse - 1)
10659       vFAIL("Unterminated \\g... pattern");
10660      if (hasbrace) {
10661       if (*RExC_parse != '}')
10662        vFAIL("Unterminated \\g{...} pattern");
10663       RExC_parse++;
10664      }
10665      if (!SIZE_ONLY) {
10666       if (num > (I32)RExC_rx->nparens)
10667        vFAIL("Reference to nonexistent group");
10668      }
10669      RExC_sawback = 1;
10670      ret = reganode(pRExC_state,
10671         ((! FOLD)
10672          ? REF
10673          : (ASCII_FOLD_RESTRICTED)
10674          ? REFFA
10675          : (AT_LEAST_UNI_SEMANTICS)
10676           ? REFFU
10677           : (LOC)
10678           ? REFFL
10679           : REFF),
10680          num);
10681      *flagp |= HASWIDTH;
10682
10683      /* override incorrect value set in reganode MJD */
10684      Set_Node_Offset(ret, parse_start+1);
10685      Set_Node_Cur_Length(ret); /* MJD */
10686      RExC_parse--;
10687      nextchar(pRExC_state);
10688     }
10689    }
10690    break;
10691   case '\0':
10692    if (RExC_parse >= RExC_end)
10693     FAIL("Trailing \\");
10694    /* FALL THROUGH */
10695   default:
10696    /* Do not generate "unrecognized" warnings here, we fall
10697    back into the quick-grab loop below */
10698    parse_start--;
10699    goto defchar;
10700   }
10701   break;
10702
10703  case '#':
10704   if (RExC_flags & RXf_PMf_EXTENDED) {
10705    if ( reg_skipcomment( pRExC_state ) )
10706     goto tryagain;
10707   }
10708   /* FALL THROUGH */
10709
10710  default:
10711
10712    parse_start = RExC_parse - 1;
10713
10714    RExC_parse++;
10715
10716   defchar: {
10717    STRLEN len = 0;
10718    UV ender;
10719    char *p;
10720    char *s;
10721 #define MAX_NODE_STRING_SIZE 127
10722    char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10723    char *s0;
10724    U8 upper_parse = MAX_NODE_STRING_SIZE;
10725    STRLEN foldlen;
10726    U8 node_type;
10727    bool next_is_quantifier;
10728    char * oldp = NULL;
10729
10730    /* If a folding node contains only code points that don't
10731    * participate in folds, it can be changed into an EXACT node,
10732    * which allows the optimizer more things to look for */
10733    bool maybe_exact;
10734
10735    ender = 0;
10736    node_type = compute_EXACTish(pRExC_state);
10737    ret = reg_node(pRExC_state, node_type);
10738
10739    /* In pass1, folded, we use a temporary buffer instead of the
10740    * actual node, as the node doesn't exist yet */
10741    s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10742
10743    s0 = s;
10744
10745   reparse:
10746
10747    /* We do the EXACTFish to EXACT node only if folding, and not if in
10748    * locale, as whether a character folds or not isn't known until
10749    * runtime */
10750    maybe_exact = FOLD && ! LOC;
10751
10752    /* XXX The node can hold up to 255 bytes, yet this only goes to
10753    * 127.  I (khw) do not know why.  Keeping it somewhat less than
10754    * 255 allows us to not have to worry about overflow due to
10755    * converting to utf8 and fold expansion, but that value is
10756    * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10757    * split up by this limit into a single one using the real max of
10758    * 255.  Even at 127, this breaks under rare circumstances.  If
10759    * folding, we do not want to split a node at a character that is a
10760    * non-final in a multi-char fold, as an input string could just
10761    * happen to want to match across the node boundary.  The join
10762    * would solve that problem if the join actually happens.  But a
10763    * series of more than two nodes in a row each of 127 would cause
10764    * the first join to succeed to get to 254, but then there wouldn't
10765    * be room for the next one, which could at be one of those split
10766    * multi-char folds.  I don't know of any fool-proof solution.  One
10767    * could back off to end with only a code point that isn't such a
10768    * non-final, but it is possible for there not to be any in the
10769    * entire node. */
10770    for (p = RExC_parse - 1;
10771     len < upper_parse && p < RExC_end;
10772     len++)
10773    {
10774     oldp = p;
10775
10776     if (RExC_flags & RXf_PMf_EXTENDED)
10777      p = regwhite( pRExC_state, p );
10778     switch ((U8)*p) {
10779     case '^':
10780     case '$':
10781     case '.':
10782     case '[':
10783     case '(':
10784     case ')':
10785     case '|':
10786      goto loopdone;
10787     case '\\':
10788      /* Literal Escapes Switch
10789
10790      This switch is meant to handle escape sequences that
10791      resolve to a literal character.
10792
10793      Every escape sequence that represents something
10794      else, like an assertion or a char class, is handled
10795      in the switch marked 'Special Escapes' above in this
10796      routine, but also has an entry here as anything that
10797      isn't explicitly mentioned here will be treated as
10798      an unescaped equivalent literal.
10799      */
10800
10801      switch ((U8)*++p) {
10802      /* These are all the special escapes. */
10803      case 'A':             /* Start assertion */
10804      case 'b': case 'B':   /* Word-boundary assertion*/
10805      case 'C':             /* Single char !DANGEROUS! */
10806      case 'd': case 'D':   /* digit class */
10807      case 'g': case 'G':   /* generic-backref, pos assertion */
10808      case 'h': case 'H':   /* HORIZWS */
10809      case 'k': case 'K':   /* named backref, keep marker */
10810      case 'p': case 'P':   /* Unicode property */
10811        case 'R':   /* LNBREAK */
10812      case 's': case 'S':   /* space class */
10813      case 'v': case 'V':   /* VERTWS */
10814      case 'w': case 'W':   /* word class */
10815      case 'X':             /* eXtended Unicode "combining character sequence" */
10816      case 'z': case 'Z':   /* End of line/string assertion */
10817       --p;
10818       goto loopdone;
10819
10820      /* Anything after here is an escape that resolves to a
10821      literal. (Except digits, which may or may not)
10822      */
10823      case 'n':
10824       ender = '\n';
10825       p++;
10826       break;
10827      case 'N': /* Handle a single-code point named character. */
10828       /* The options cause it to fail if a multiple code
10829       * point sequence.  Handle those in the switch() above
10830       * */
10831       RExC_parse = p + 1;
10832       if (! grok_bslash_N(pRExC_state, NULL, &ender,
10833            flagp, depth, FALSE,
10834            FALSE /* not strict */ ))
10835       {
10836        if (*flagp & RESTART_UTF8)
10837         FAIL("panic: grok_bslash_N set RESTART_UTF8");
10838        RExC_parse = p = oldp;
10839        goto loopdone;
10840       }
10841       p = RExC_parse;
10842       if (ender > 0xff) {
10843        REQUIRE_UTF8;
10844       }
10845       break;
10846      case 'r':
10847       ender = '\r';
10848       p++;
10849       break;
10850      case 't':
10851       ender = '\t';
10852       p++;
10853       break;
10854      case 'f':
10855       ender = '\f';
10856       p++;
10857       break;
10858      case 'e':
10859       ender = ASCII_TO_NATIVE('\033');
10860       p++;
10861       break;
10862      case 'a':
10863       ender = ASCII_TO_NATIVE('\007');
10864       p++;
10865       break;
10866      case 'o':
10867       {
10868        UV result;
10869        const char* error_msg;
10870
10871        bool valid = grok_bslash_o(&p,
10872              &result,
10873              &error_msg,
10874              TRUE, /* out warnings */
10875              FALSE, /* not strict */
10876              TRUE, /* Output warnings
10877                 for non-
10878                 portables */
10879              UTF);
10880        if (! valid) {
10881         RExC_parse = p; /* going to die anyway; point
10882             to exact spot of failure */
10883         vFAIL(error_msg);
10884        }
10885        ender = result;
10886        if (PL_encoding && ender < 0x100) {
10887         goto recode_encoding;
10888        }
10889        if (ender > 0xff) {
10890         REQUIRE_UTF8;
10891        }
10892        break;
10893       }
10894      case 'x':
10895       {
10896        UV result = UV_MAX; /* initialize to erroneous
10897             value */
10898        const char* error_msg;
10899
10900        bool valid = grok_bslash_x(&p,
10901              &result,
10902              &error_msg,
10903              TRUE, /* out warnings */
10904              FALSE, /* not strict */
10905              TRUE, /* Output warnings
10906                 for non-
10907                 portables */
10908              UTF);
10909        if (! valid) {
10910         RExC_parse = p; /* going to die anyway; point
10911             to exact spot of failure */
10912         vFAIL(error_msg);
10913        }
10914        ender = result;
10915
10916        if (PL_encoding && ender < 0x100) {
10917         goto recode_encoding;
10918        }
10919        if (ender > 0xff) {
10920         REQUIRE_UTF8;
10921        }
10922        break;
10923       }
10924      case 'c':
10925       p++;
10926       ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10927       break;
10928      case '0': case '1': case '2': case '3':case '4':
10929      case '5': case '6': case '7':
10930       if (*p == '0' ||
10931        (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10932       {
10933        I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10934        STRLEN numlen = 3;
10935        ender = grok_oct(p, &numlen, &flags, NULL);
10936        if (ender > 0xff) {
10937         REQUIRE_UTF8;
10938        }
10939        p += numlen;
10940        if (SIZE_ONLY   /* like \08, \178 */
10941         && numlen < 3
10942         && p < RExC_end
10943         && isDIGIT(*p) && ckWARN(WARN_REGEXP))
10944        {
10945         reg_warn_non_literal_string(
10946           p + 1,
10947           form_short_octal_warning(p, numlen));
10948        }
10949       }
10950       else {  /* Not to be treated as an octal constant, go
10951         find backref */
10952        --p;
10953        goto loopdone;
10954       }
10955       if (PL_encoding && ender < 0x100)
10956        goto recode_encoding;
10957       break;
10958      recode_encoding:
10959       if (! RExC_override_recoding) {
10960        SV* enc = PL_encoding;
10961        ender = reg_recode((const char)(U8)ender, &enc);
10962        if (!enc && SIZE_ONLY)
10963         ckWARNreg(p, "Invalid escape in the specified encoding");
10964        REQUIRE_UTF8;
10965       }
10966       break;
10967      case '\0':
10968       if (p >= RExC_end)
10969        FAIL("Trailing \\");
10970       /* FALL THROUGH */
10971      default:
10972       if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
10973        /* Include any { following the alpha to emphasize
10974        * that it could be part of an escape at some point
10975        * in the future */
10976        int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
10977        ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
10978       }
10979       goto normal_default;
10980      } /* End of switch on '\' */
10981      break;
10982     default:    /* A literal character */
10983
10984      if (! SIZE_ONLY
10985       && RExC_flags & RXf_PMf_EXTENDED
10986       && ckWARN(WARN_DEPRECATED)
10987       && is_PATWS_non_low(p, UTF))
10988      {
10989       vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
10990         "Escape literal pattern white space under /x");
10991      }
10992
10993     normal_default:
10994      if (UTF8_IS_START(*p) && UTF) {
10995       STRLEN numlen;
10996       ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10997            &numlen, UTF8_ALLOW_DEFAULT);
10998       p += numlen;
10999      }
11000      else
11001       ender = (U8) *p++;
11002      break;
11003     } /* End of switch on the literal */
11004
11005     /* Here, have looked at the literal character and <ender>
11006     * contains its ordinal, <p> points to the character after it
11007     */
11008
11009     if ( RExC_flags & RXf_PMf_EXTENDED)
11010      p = regwhite( pRExC_state, p );
11011
11012     /* If the next thing is a quantifier, it applies to this
11013     * character only, which means that this character has to be in
11014     * its own node and can't just be appended to the string in an
11015     * existing node, so if there are already other characters in
11016     * the node, close the node with just them, and set up to do
11017     * this character again next time through, when it will be the
11018     * only thing in its new node */
11019     if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11020     {
11021      p = oldp;
11022      goto loopdone;
11023     }
11024
11025     if (FOLD) {
11026      if (UTF
11027        /* See comments for join_exact() as to why we fold
11028        * this non-UTF at compile time */
11029       || (node_type == EXACTFU
11030        && ender == LATIN_SMALL_LETTER_SHARP_S))
11031      {
11032
11033
11034       /* Prime the casefolded buffer.  Locale rules, which
11035       * apply only to code points < 256, aren't known until
11036       * execution, so for them, just output the original
11037       * character using utf8.  If we start to fold non-UTF
11038       * patterns, be sure to update join_exact() */
11039       if (LOC && ender < 256) {
11040        if (UNI_IS_INVARIANT(ender)) {
11041         *s = (U8) ender;
11042         foldlen = 1;
11043        } else {
11044         *s = UTF8_TWO_BYTE_HI(ender);
11045         *(s + 1) = UTF8_TWO_BYTE_LO(ender);
11046         foldlen = 2;
11047        }
11048       }
11049       else {
11050        UV folded = _to_uni_fold_flags(
11051           ender,
11052           (U8 *) s,
11053           &foldlen,
11054           FOLD_FLAGS_FULL
11055           | ((LOC) ?  FOLD_FLAGS_LOCALE
11056              : (ASCII_FOLD_RESTRICTED)
11057              ? FOLD_FLAGS_NOMIX_ASCII
11058              : 0)
11059            );
11060
11061        /* If this node only contains non-folding code
11062        * points so far, see if this new one is also
11063        * non-folding */
11064        if (maybe_exact) {
11065         if (folded != ender) {
11066          maybe_exact = FALSE;
11067         }
11068         else {
11069          /* Here the fold is the original; we have
11070          * to check further to see if anything
11071          * folds to it */
11072          if (! PL_utf8_foldable) {
11073           SV* swash = swash_init("utf8",
11074               "_Perl_Any_Folds",
11075               &PL_sv_undef, 1, 0);
11076           PL_utf8_foldable =
11077              _get_swash_invlist(swash);
11078           SvREFCNT_dec_NN(swash);
11079          }
11080          if (_invlist_contains_cp(PL_utf8_foldable,
11081                ender))
11082          {
11083           maybe_exact = FALSE;
11084          }
11085         }
11086        }
11087        ender = folded;
11088       }
11089       s += foldlen;
11090
11091       /* The loop increments <len> each time, as all but this
11092       * path (and the one just below for UTF) through it add
11093       * a single byte to the EXACTish node.  But this one
11094       * has changed len to be the correct final value, so
11095       * subtract one to cancel out the increment that
11096       * follows */
11097       len += foldlen - 1;
11098      }
11099      else {
11100       *(s++) = (char) ender;
11101       maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
11102      }
11103     }
11104     else if (UTF) {
11105      const STRLEN unilen = reguni(pRExC_state, ender, s);
11106      if (unilen > 0) {
11107      s   += unilen;
11108      len += unilen;
11109      }
11110
11111      /* See comment just above for - 1 */
11112      len--;
11113     }
11114     else {
11115      REGC((char)ender, s++);
11116     }
11117
11118     if (next_is_quantifier) {
11119
11120      /* Here, the next input is a quantifier, and to get here,
11121      * the current character is the only one in the node.
11122      * Also, here <len> doesn't include the final byte for this
11123      * character */
11124      len++;
11125      goto loopdone;
11126     }
11127
11128    } /* End of loop through literal characters */
11129
11130    /* Here we have either exhausted the input or ran out of room in
11131    * the node.  (If we encountered a character that can't be in the
11132    * node, transfer is made directly to <loopdone>, and so we
11133    * wouldn't have fallen off the end of the loop.)  In the latter
11134    * case, we artificially have to split the node into two, because
11135    * we just don't have enough space to hold everything.  This
11136    * creates a problem if the final character participates in a
11137    * multi-character fold in the non-final position, as a match that
11138    * should have occurred won't, due to the way nodes are matched,
11139    * and our artificial boundary.  So back off until we find a non-
11140    * problematic character -- one that isn't at the beginning or
11141    * middle of such a fold.  (Either it doesn't participate in any
11142    * folds, or appears only in the final position of all the folds it
11143    * does participate in.)  A better solution with far fewer false
11144    * positives, and that would fill the nodes more completely, would
11145    * be to actually have available all the multi-character folds to
11146    * test against, and to back-off only far enough to be sure that
11147    * this node isn't ending with a partial one.  <upper_parse> is set
11148    * further below (if we need to reparse the node) to include just
11149    * up through that final non-problematic character that this code
11150    * identifies, so when it is set to less than the full node, we can
11151    * skip the rest of this */
11152    if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11153
11154     const STRLEN full_len = len;
11155
11156     assert(len >= MAX_NODE_STRING_SIZE);
11157
11158     /* Here, <s> points to the final byte of the final character.
11159     * Look backwards through the string until find a non-
11160     * problematic character */
11161
11162     if (! UTF) {
11163
11164      /* These two have no multi-char folds to non-UTF characters
11165      */
11166      if (ASCII_FOLD_RESTRICTED || LOC) {
11167       goto loopdone;
11168      }
11169
11170      while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11171      len = s - s0 + 1;
11172     }
11173     else {
11174      if (!  PL_NonL1NonFinalFold) {
11175       PL_NonL1NonFinalFold = _new_invlist_C_array(
11176           NonL1_Perl_Non_Final_Folds_invlist);
11177      }
11178
11179      /* Point to the first byte of the final character */
11180      s = (char *) utf8_hop((U8 *) s, -1);
11181
11182      while (s >= s0) {   /* Search backwards until find
11183           non-problematic char */
11184       if (UTF8_IS_INVARIANT(*s)) {
11185
11186        /* There are no ascii characters that participate
11187        * in multi-char folds under /aa.  In EBCDIC, the
11188        * non-ascii invariants are all control characters,
11189        * so don't ever participate in any folds. */
11190        if (ASCII_FOLD_RESTRICTED
11191         || ! IS_NON_FINAL_FOLD(*s))
11192        {
11193         break;
11194        }
11195       }
11196       else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11197
11198        /* No Latin1 characters participate in multi-char
11199        * folds under /l */
11200        if (LOC
11201         || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
11202                 *s, *(s+1))))
11203        {
11204         break;
11205        }
11206       }
11207       else if (! _invlist_contains_cp(
11208           PL_NonL1NonFinalFold,
11209           valid_utf8_to_uvchr((U8 *) s, NULL)))
11210       {
11211        break;
11212       }
11213
11214       /* Here, the current character is problematic in that
11215       * it does occur in the non-final position of some
11216       * fold, so try the character before it, but have to
11217       * special case the very first byte in the string, so
11218       * we don't read outside the string */
11219       s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11220      } /* End of loop backwards through the string */
11221
11222      /* If there were only problematic characters in the string,
11223      * <s> will point to before s0, in which case the length
11224      * should be 0, otherwise include the length of the
11225      * non-problematic character just found */
11226      len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11227     }
11228
11229     /* Here, have found the final character, if any, that is
11230     * non-problematic as far as ending the node without splitting
11231     * it across a potential multi-char fold.  <len> contains the
11232     * number of bytes in the node up-to and including that
11233     * character, or is 0 if there is no such character, meaning
11234     * the whole node contains only problematic characters.  In
11235     * this case, give up and just take the node as-is.  We can't
11236     * do any better */
11237     if (len == 0) {
11238      len = full_len;
11239     } else {
11240
11241      /* Here, the node does contain some characters that aren't
11242      * problematic.  If one such is the final character in the
11243      * node, we are done */
11244      if (len == full_len) {
11245       goto loopdone;
11246      }
11247      else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11248
11249       /* If the final character is problematic, but the
11250       * penultimate is not, back-off that last character to
11251       * later start a new node with it */
11252       p = oldp;
11253       goto loopdone;
11254      }
11255
11256      /* Here, the final non-problematic character is earlier
11257      * in the input than the penultimate character.  What we do
11258      * is reparse from the beginning, going up only as far as
11259      * this final ok one, thus guaranteeing that the node ends
11260      * in an acceptable character.  The reason we reparse is
11261      * that we know how far in the character is, but we don't
11262      * know how to correlate its position with the input parse.
11263      * An alternate implementation would be to build that
11264      * correlation as we go along during the original parse,
11265      * but that would entail extra work for every node, whereas
11266      * this code gets executed only when the string is too
11267      * large for the node, and the final two characters are
11268      * problematic, an infrequent occurrence.  Yet another
11269      * possible strategy would be to save the tail of the
11270      * string, and the next time regatom is called, initialize
11271      * with that.  The problem with this is that unless you
11272      * back off one more character, you won't be guaranteed
11273      * regatom will get called again, unless regbranch,
11274      * regpiece ... are also changed.  If you do back off that
11275      * extra character, so that there is input guaranteed to
11276      * force calling regatom, you can't handle the case where
11277      * just the first character in the node is acceptable.  I
11278      * (khw) decided to try this method which doesn't have that
11279      * pitfall; if performance issues are found, we can do a
11280      * combination of the current approach plus that one */
11281      upper_parse = len;
11282      len = 0;
11283      s = s0;
11284      goto reparse;
11285     }
11286    }   /* End of verifying node ends with an appropriate char */
11287
11288   loopdone:   /* Jumped to when encounters something that shouldn't be in
11289      the node */
11290
11291    /* If 'maybe_exact' is still set here, means there are no
11292    * code points in the node that participate in folds */
11293    if (FOLD && maybe_exact) {
11294     OP(ret) = EXACT;
11295    }
11296
11297    /* I (khw) don't know if you can get here with zero length, but the
11298    * old code handled this situation by creating a zero-length EXACT
11299    * node.  Might as well be NOTHING instead */
11300    if (len == 0) {
11301     OP(ret) = NOTHING;
11302    }
11303    else{
11304     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11305    }
11306
11307    RExC_parse = p - 1;
11308    Set_Node_Cur_Length(ret); /* MJD */
11309    nextchar(pRExC_state);
11310    {
11311     /* len is STRLEN which is unsigned, need to copy to signed */
11312     IV iv = len;
11313     if (iv < 0)
11314      vFAIL("Internal disaster");
11315    }
11316
11317   } /* End of label 'defchar:' */
11318   break;
11319  } /* End of giant switch on input character */
11320
11321  return(ret);
11322 }
11323
11324 STATIC char *
11325 S_regwhite( RExC_state_t *pRExC_state, char *p )
11326 {
11327  const char *e = RExC_end;
11328
11329  PERL_ARGS_ASSERT_REGWHITE;
11330
11331  while (p < e) {
11332   if (isSPACE(*p))
11333    ++p;
11334   else if (*p == '#') {
11335    bool ended = 0;
11336    do {
11337     if (*p++ == '\n') {
11338      ended = 1;
11339      break;
11340     }
11341    } while (p < e);
11342    if (!ended)
11343     RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11344   }
11345   else
11346    break;
11347  }
11348  return p;
11349 }
11350
11351 STATIC char *
11352 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11353 {
11354  /* Returns the next non-pattern-white space, non-comment character (the
11355  * latter only if 'recognize_comment is true) in the string p, which is
11356  * ended by RExC_end.  If there is no line break ending a comment,
11357  * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11358  const char *e = RExC_end;
11359
11360  PERL_ARGS_ASSERT_REGPATWS;
11361
11362  while (p < e) {
11363   STRLEN len;
11364   if ((len = is_PATWS_safe(p, e, UTF))) {
11365    p += len;
11366   }
11367   else if (recognize_comment && *p == '#') {
11368    bool ended = 0;
11369    do {
11370     p++;
11371     if (is_LNBREAK_safe(p, e, UTF)) {
11372      ended = 1;
11373      break;
11374     }
11375    } while (p < e);
11376    if (!ended)
11377     RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11378   }
11379   else
11380    break;
11381  }
11382  return p;
11383 }
11384
11385 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11386    Character classes ([:foo:]) can also be negated ([:^foo:]).
11387    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11388    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11389    but trigger failures because they are currently unimplemented. */
11390
11391 #define POSIXCC_DONE(c)   ((c) == ':')
11392 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11393 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11394
11395 PERL_STATIC_INLINE I32
11396 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
11397 {
11398  dVAR;
11399  I32 namedclass = OOB_NAMEDCLASS;
11400
11401  PERL_ARGS_ASSERT_REGPPOSIXCC;
11402
11403  if (value == '[' && RExC_parse + 1 < RExC_end &&
11404   /* I smell either [: or [= or [. -- POSIX has been here, right? */
11405   POSIXCC(UCHARAT(RExC_parse)))
11406  {
11407   const char c = UCHARAT(RExC_parse);
11408   char* const s = RExC_parse++;
11409
11410   while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11411    RExC_parse++;
11412   if (RExC_parse == RExC_end) {
11413    if (strict) {
11414
11415     /* Try to give a better location for the error (than the end of
11416     * the string) by looking for the matching ']' */
11417     RExC_parse = s;
11418     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11419      RExC_parse++;
11420     }
11421     vFAIL2("Unmatched '%c' in POSIX class", c);
11422    }
11423    /* Grandfather lone [:, [=, [. */
11424    RExC_parse = s;
11425   }
11426   else {
11427    const char* const t = RExC_parse++; /* skip over the c */
11428    assert(*t == c);
11429
11430    if (UCHARAT(RExC_parse) == ']') {
11431     const char *posixcc = s + 1;
11432     RExC_parse++; /* skip over the ending ] */
11433
11434     if (*s == ':') {
11435      const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11436      const I32 skip = t - posixcc;
11437
11438      /* Initially switch on the length of the name.  */
11439      switch (skip) {
11440      case 4:
11441       if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11442               this is the Perl \w
11443               */
11444        namedclass = ANYOF_WORDCHAR;
11445       break;
11446      case 5:
11447       /* Names all of length 5.  */
11448       /* alnum alpha ascii blank cntrl digit graph lower
11449       print punct space upper  */
11450       /* Offset 4 gives the best switch position.  */
11451       switch (posixcc[4]) {
11452       case 'a':
11453        if (memEQ(posixcc, "alph", 4)) /* alpha */
11454         namedclass = ANYOF_ALPHA;
11455        break;
11456       case 'e':
11457        if (memEQ(posixcc, "spac", 4)) /* space */
11458         namedclass = ANYOF_PSXSPC;
11459        break;
11460       case 'h':
11461        if (memEQ(posixcc, "grap", 4)) /* graph */
11462         namedclass = ANYOF_GRAPH;
11463        break;
11464       case 'i':
11465        if (memEQ(posixcc, "asci", 4)) /* ascii */
11466         namedclass = ANYOF_ASCII;
11467        break;
11468       case 'k':
11469        if (memEQ(posixcc, "blan", 4)) /* blank */
11470         namedclass = ANYOF_BLANK;
11471        break;
11472       case 'l':
11473        if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11474         namedclass = ANYOF_CNTRL;
11475        break;
11476       case 'm':
11477        if (memEQ(posixcc, "alnu", 4)) /* alnum */
11478         namedclass = ANYOF_ALPHANUMERIC;
11479        break;
11480       case 'r':
11481        if (memEQ(posixcc, "lowe", 4)) /* lower */
11482         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11483        else if (memEQ(posixcc, "uppe", 4)) /* upper */
11484         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11485        break;
11486       case 't':
11487        if (memEQ(posixcc, "digi", 4)) /* digit */
11488         namedclass = ANYOF_DIGIT;
11489        else if (memEQ(posixcc, "prin", 4)) /* print */
11490         namedclass = ANYOF_PRINT;
11491        else if (memEQ(posixcc, "punc", 4)) /* punct */
11492         namedclass = ANYOF_PUNCT;
11493        break;
11494       }
11495       break;
11496      case 6:
11497       if (memEQ(posixcc, "xdigit", 6))
11498        namedclass = ANYOF_XDIGIT;
11499       break;
11500      }
11501
11502      if (namedclass == OOB_NAMEDCLASS)
11503       Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11504          t - s - 1, s + 1);
11505
11506      /* The #defines are structured so each complement is +1 to
11507      * the normal one */
11508      if (complement) {
11509       namedclass++;
11510      }
11511      assert (posixcc[skip] == ':');
11512      assert (posixcc[skip+1] == ']');
11513     } else if (!SIZE_ONLY) {
11514      /* [[=foo=]] and [[.foo.]] are still future. */
11515
11516      /* adjust RExC_parse so the warning shows after
11517      the class closes */
11518      while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11519       RExC_parse++;
11520      vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11521     }
11522    } else {
11523     /* Maternal grandfather:
11524     * "[:" ending in ":" but not in ":]" */
11525     if (strict) {
11526      vFAIL("Unmatched '[' in POSIX class");
11527     }
11528
11529     /* Grandfather lone [:, [=, [. */
11530     RExC_parse = s;
11531    }
11532   }
11533  }
11534
11535  return namedclass;
11536 }
11537
11538 STATIC bool
11539 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
11540 {
11541  /* This applies some heuristics at the current parse position (which should
11542  * be at a '[') to see if what follows might be intended to be a [:posix:]
11543  * class.  It returns true if it really is a posix class, of course, but it
11544  * also can return true if it thinks that what was intended was a posix
11545  * class that didn't quite make it.
11546  *
11547  * It will return true for
11548  *      [:alphanumerics:
11549  *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
11550  *                         ')' indicating the end of the (?[
11551  *      [:any garbage including %^&$ punctuation:]
11552  *
11553  * This is designed to be called only from S_handle_regex_sets; it could be
11554  * easily adapted to be called from the spot at the beginning of regclass()
11555  * that checks to see in a normal bracketed class if the surrounding []
11556  * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
11557  * change long-standing behavior, so I (khw) didn't do that */
11558  char* p = RExC_parse + 1;
11559  char first_char = *p;
11560
11561  PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
11562
11563  assert(*(p - 1) == '[');
11564
11565  if (! POSIXCC(first_char)) {
11566   return FALSE;
11567  }
11568
11569  p++;
11570  while (p < RExC_end && isWORDCHAR(*p)) p++;
11571
11572  if (p >= RExC_end) {
11573   return FALSE;
11574  }
11575
11576  if (p - RExC_parse > 2    /* Got at least 1 word character */
11577   && (*p == first_char
11578    || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11579  {
11580   return TRUE;
11581  }
11582
11583  p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11584
11585  return (p
11586    && p - RExC_parse > 2 /* [:] evaluates to colon;
11587          [::] is a bad posix class. */
11588    && first_char == *(p - 1));
11589 }
11590
11591 STATIC regnode *
11592 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
11593     char * const oregcomp_parse)
11594 {
11595  /* Handle the (?[...]) construct to do set operations */
11596
11597  U8 curchar;
11598  UV start, end; /* End points of code point ranges */
11599  SV* result_string;
11600  char *save_end, *save_parse;
11601  SV* final;
11602  STRLEN len;
11603  regnode* node;
11604  AV* stack;
11605  const bool save_fold = FOLD;
11606
11607  GET_RE_DEBUG_FLAGS_DECL;
11608
11609  PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
11610
11611  if (LOC) {
11612   vFAIL("(?[...]) not valid in locale");
11613  }
11614  RExC_uni_semantics = 1;
11615
11616  /* This will return only an ANYOF regnode, or (unlikely) something smaller
11617  * (such as EXACT).  Thus we can skip most everything if just sizing.  We
11618  * call regclass to handle '[]' so as to not have to reinvent its parsing
11619  * rules here (throwing away the size it computes each time).  And, we exit
11620  * upon an unescaped ']' that isn't one ending a regclass.  To do both
11621  * these things, we need to realize that something preceded by a backslash
11622  * is escaped, so we have to keep track of backslashes */
11623  if (SIZE_ONLY) {
11624
11625   Perl_ck_warner_d(aTHX_
11626    packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11627    "The regex_sets feature is experimental" REPORT_LOCATION,
11628    (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
11629
11630   while (RExC_parse < RExC_end) {
11631    SV* current = NULL;
11632    RExC_parse = regpatws(pRExC_state, RExC_parse,
11633         TRUE); /* means recognize comments */
11634    switch (*RExC_parse) {
11635     default:
11636      break;
11637     case '\\':
11638      /* Skip the next byte (which could cause us to end up in
11639      * the middle of a UTF-8 character, but since none of those
11640      * are confusable with anything we currently handle in this
11641      * switch (invariants all), it's safe.  We'll just hit the
11642      * default: case next time and keep on incrementing until
11643      * we find one of the invariants we do handle. */
11644      RExC_parse++;
11645      break;
11646     case '[':
11647     {
11648      /* If this looks like it is a [:posix:] class, leave the
11649      * parse pointer at the '[' to fool regclass() into
11650      * thinking it is part of a '[[:posix:]]'.  That function
11651      * will use strict checking to force a syntax error if it
11652      * doesn't work out to a legitimate class */
11653      bool is_posix_class
11654          = could_it_be_a_POSIX_class(pRExC_state);
11655      if (! is_posix_class) {
11656       RExC_parse++;
11657      }
11658
11659      /* regclass() can only return RESTART_UTF8 if multi-char
11660      folds are allowed.  */
11661      if (!regclass(pRExC_state, flagp,depth+1,
11662         is_posix_class, /* parse the whole char
11663              class only if not a
11664              posix class */
11665         FALSE, /* don't allow multi-char folds */
11666         TRUE, /* silence non-portable warnings. */
11667         &current))
11668       FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11669        *flagp);
11670
11671      /* function call leaves parse pointing to the ']', except
11672      * if we faked it */
11673      if (is_posix_class) {
11674       RExC_parse--;
11675      }
11676
11677      SvREFCNT_dec(current);   /* In case it returned something */
11678      break;
11679     }
11680
11681     case ']':
11682      RExC_parse++;
11683      if (RExC_parse < RExC_end
11684       && *RExC_parse == ')')
11685      {
11686       node = reganode(pRExC_state, ANYOF, 0);
11687       RExC_size += ANYOF_SKIP;
11688       nextchar(pRExC_state);
11689       Set_Node_Length(node,
11690         RExC_parse - oregcomp_parse + 1); /* MJD */
11691       return node;
11692      }
11693      goto no_close;
11694    }
11695    RExC_parse++;
11696   }
11697
11698   no_close:
11699   FAIL("Syntax error in (?[...])");
11700  }
11701
11702  /* Pass 2 only after this.  Everything in this construct is a
11703  * metacharacter.  Operands begin with either a '\' (for an escape
11704  * sequence), or a '[' for a bracketed character class.  Any other
11705  * character should be an operator, or parenthesis for grouping.  Both
11706  * types of operands are handled by calling regclass() to parse them.  It
11707  * is called with a parameter to indicate to return the computed inversion
11708  * list.  The parsing here is implemented via a stack.  Each entry on the
11709  * stack is a single character representing one of the operators, or the
11710  * '('; or else a pointer to an operand inversion list. */
11711
11712 #define IS_OPERAND(a)  (! SvIOK(a))
11713
11714  /* The stack starts empty.  It is a syntax error if the first thing parsed
11715  * is a binary operator; everything else is pushed on the stack.  When an
11716  * operand is parsed, the top of the stack is examined.  If it is a binary
11717  * operator, the item before it should be an operand, and both are replaced
11718  * by the result of doing that operation on the new operand and the one on
11719  * the stack.   Thus a sequence of binary operands is reduced to a single
11720  * one before the next one is parsed.
11721  *
11722  * A unary operator may immediately follow a binary in the input, for
11723  * example
11724  *      [a] + ! [b]
11725  * When an operand is parsed and the top of the stack is a unary operator,
11726  * the operation is performed, and then the stack is rechecked to see if
11727  * this new operand is part of a binary operation; if so, it is handled as
11728  * above.
11729  *
11730  * A '(' is simply pushed on the stack; it is valid only if the stack is
11731  * empty, or the top element of the stack is an operator or another '('
11732  * (for which the parenthesized expression will become an operand).  By the
11733  * time the corresponding ')' is parsed everything in between should have
11734  * been parsed and evaluated to a single operand (or else is a syntax
11735  * error), and is handled as a regular operand */
11736
11737  sv_2mortal((SV *)(stack = newAV()));
11738
11739  while (RExC_parse < RExC_end) {
11740   I32 top_index = av_tindex(stack);
11741   SV** top_ptr;
11742   SV* current = NULL;
11743
11744   /* Skip white space */
11745   RExC_parse = regpatws(pRExC_state, RExC_parse,
11746         TRUE); /* means recognize comments */
11747   if (RExC_parse >= RExC_end) {
11748    Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
11749   }
11750   if ((curchar = UCHARAT(RExC_parse)) == ']') {
11751    break;
11752   }
11753
11754   switch (curchar) {
11755
11756    case '?':
11757     if (av_tindex(stack) >= 0   /* This makes sure that we can
11758            safely subtract 1 from
11759            RExC_parse in the next clause.
11760            If we have something on the
11761            stack, we have parsed something
11762            */
11763      && UCHARAT(RExC_parse - 1) == '('
11764      && RExC_parse < RExC_end)
11765     {
11766      /* If is a '(?', could be an embedded '(?flags:(?[...])'.
11767      * This happens when we have some thing like
11768      *
11769      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
11770      *   ...
11771      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
11772      *
11773      * Here we would be handling the interpolated
11774      * '$thai_or_lao'.  We handle this by a recursive call to
11775      * ourselves which returns the inversion list the
11776      * interpolated expression evaluates to.  We use the flags
11777      * from the interpolated pattern. */
11778      U32 save_flags = RExC_flags;
11779      const char * const save_parse = ++RExC_parse;
11780
11781      parse_lparen_question_flags(pRExC_state);
11782
11783      if (RExC_parse == save_parse  /* Makes sure there was at
11784              least one flag (or this
11785              embedding wasn't compiled)
11786             */
11787       || RExC_parse >= RExC_end - 4
11788       || UCHARAT(RExC_parse) != ':'
11789       || UCHARAT(++RExC_parse) != '('
11790       || UCHARAT(++RExC_parse) != '?'
11791       || UCHARAT(++RExC_parse) != '[')
11792      {
11793
11794       /* In combination with the above, this moves the
11795       * pointer to the point just after the first erroneous
11796       * character (or if there are no flags, to where they
11797       * should have been) */
11798       if (RExC_parse >= RExC_end - 4) {
11799        RExC_parse = RExC_end;
11800       }
11801       else if (RExC_parse != save_parse) {
11802        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11803       }
11804       vFAIL("Expecting '(?flags:(?[...'");
11805      }
11806      RExC_parse++;
11807      (void) handle_regex_sets(pRExC_state, &current, flagp,
11808              depth+1, oregcomp_parse);
11809
11810      /* Here, 'current' contains the embedded expression's
11811      * inversion list, and RExC_parse points to the trailing
11812      * ']'; the next character should be the ')' which will be
11813      * paired with the '(' that has been put on the stack, so
11814      * the whole embedded expression reduces to '(operand)' */
11815      RExC_parse++;
11816
11817      RExC_flags = save_flags;
11818      goto handle_operand;
11819     }
11820     /* FALL THROUGH */
11821
11822    default:
11823     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11824     vFAIL("Unexpected character");
11825
11826    case '\\':
11827     /* regclass() can only return RESTART_UTF8 if multi-char
11828     folds are allowed.  */
11829     if (!regclass(pRExC_state, flagp,depth+1,
11830        TRUE, /* means parse just the next thing */
11831        FALSE, /* don't allow multi-char folds */
11832        FALSE, /* don't silence non-portable warnings.  */
11833        &current))
11834      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11835       *flagp);
11836     /* regclass() will return with parsing just the \ sequence,
11837     * leaving the parse pointer at the next thing to parse */
11838     RExC_parse--;
11839     goto handle_operand;
11840
11841    case '[':   /* Is a bracketed character class */
11842    {
11843     bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
11844
11845     if (! is_posix_class) {
11846      RExC_parse++;
11847     }
11848
11849     /* regclass() can only return RESTART_UTF8 if multi-char
11850     folds are allowed.  */
11851     if(!regclass(pRExC_state, flagp,depth+1,
11852        is_posix_class, /* parse the whole char class
11853             only if not a posix class */
11854        FALSE, /* don't allow multi-char folds */
11855        FALSE, /* don't silence non-portable warnings.  */
11856        &current))
11857      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11858       *flagp);
11859     /* function call leaves parse pointing to the ']', except if we
11860     * faked it */
11861     if (is_posix_class) {
11862      RExC_parse--;
11863     }
11864
11865     goto handle_operand;
11866    }
11867
11868    case '&':
11869    case '|':
11870    case '+':
11871    case '-':
11872    case '^':
11873     if (top_index < 0
11874      || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
11875      || ! IS_OPERAND(*top_ptr))
11876     {
11877      RExC_parse++;
11878      vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
11879     }
11880     av_push(stack, newSVuv(curchar));
11881     break;
11882
11883    case '!':
11884     av_push(stack, newSVuv(curchar));
11885     break;
11886
11887    case '(':
11888     if (top_index >= 0) {
11889      top_ptr = av_fetch(stack, top_index, FALSE);
11890      assert(top_ptr);
11891      if (IS_OPERAND(*top_ptr)) {
11892       RExC_parse++;
11893       vFAIL("Unexpected '(' with no preceding operator");
11894      }
11895     }
11896     av_push(stack, newSVuv(curchar));
11897     break;
11898
11899    case ')':
11900    {
11901     SV* lparen;
11902     if (top_index < 1
11903      || ! (current = av_pop(stack))
11904      || ! IS_OPERAND(current)
11905      || ! (lparen = av_pop(stack))
11906      || IS_OPERAND(lparen)
11907      || SvUV(lparen) != '(')
11908     {
11909      SvREFCNT_dec(current);
11910      RExC_parse++;
11911      vFAIL("Unexpected ')'");
11912     }
11913     top_index -= 2;
11914     SvREFCNT_dec_NN(lparen);
11915
11916     /* FALL THROUGH */
11917    }
11918
11919    handle_operand:
11920
11921     /* Here, we have an operand to process, in 'current' */
11922
11923     if (top_index < 0) {    /* Just push if stack is empty */
11924      av_push(stack, current);
11925     }
11926     else {
11927      SV* top = av_pop(stack);
11928      SV *prev = NULL;
11929      char current_operator;
11930
11931      if (IS_OPERAND(top)) {
11932       SvREFCNT_dec_NN(top);
11933       SvREFCNT_dec_NN(current);
11934       vFAIL("Operand with no preceding operator");
11935      }
11936      current_operator = (char) SvUV(top);
11937      switch (current_operator) {
11938       case '(':   /* Push the '(' back on followed by the new
11939          operand */
11940        av_push(stack, top);
11941        av_push(stack, current);
11942        SvREFCNT_inc(top);  /* Counters the '_dec' done
11943             just after the 'break', so
11944             it doesn't get wrongly freed
11945             */
11946        break;
11947
11948       case '!':
11949        _invlist_invert(current);
11950
11951        /* Unlike binary operators, the top of the stack,
11952        * now that this unary one has been popped off, may
11953        * legally be an operator, and we now have operand
11954        * for it. */
11955        top_index--;
11956        SvREFCNT_dec_NN(top);
11957        goto handle_operand;
11958
11959       case '&':
11960        prev = av_pop(stack);
11961        _invlist_intersection(prev,
11962             current,
11963             &current);
11964        av_push(stack, current);
11965        break;
11966
11967       case '|':
11968       case '+':
11969        prev = av_pop(stack);
11970        _invlist_union(prev, current, &current);
11971        av_push(stack, current);
11972        break;
11973
11974       case '-':
11975        prev = av_pop(stack);;
11976        _invlist_subtract(prev, current, &current);
11977        av_push(stack, current);
11978        break;
11979
11980       case '^':   /* The union minus the intersection */
11981       {
11982        SV* i = NULL;
11983        SV* u = NULL;
11984        SV* element;
11985
11986        prev = av_pop(stack);
11987        _invlist_union(prev, current, &u);
11988        _invlist_intersection(prev, current, &i);
11989        /* _invlist_subtract will overwrite current
11990         without freeing what it already contains */
11991        element = current;
11992        _invlist_subtract(u, i, &current);
11993        av_push(stack, current);
11994        SvREFCNT_dec_NN(i);
11995        SvREFCNT_dec_NN(u);
11996        SvREFCNT_dec_NN(element);
11997        break;
11998       }
11999
12000       default:
12001        Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
12002     }
12003     SvREFCNT_dec_NN(top);
12004     SvREFCNT_dec(prev);
12005    }
12006   }
12007
12008   RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12009  }
12010
12011  if (av_tindex(stack) < 0   /* Was empty */
12012   || ((final = av_pop(stack)) == NULL)
12013   || ! IS_OPERAND(final)
12014   || av_tindex(stack) >= 0)  /* More left on stack */
12015  {
12016   vFAIL("Incomplete expression within '(?[ ])'");
12017  }
12018
12019  /* Here, 'final' is the resultant inversion list from evaluating the
12020  * expression.  Return it if so requested */
12021  if (return_invlist) {
12022   *return_invlist = final;
12023   return END;
12024  }
12025
12026  /* Otherwise generate a resultant node, based on 'final'.  regclass() is
12027  * expecting a string of ranges and individual code points */
12028  invlist_iterinit(final);
12029  result_string = newSVpvs("");
12030  while (invlist_iternext(final, &start, &end)) {
12031   if (start == end) {
12032    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
12033   }
12034   else {
12035    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
12036              start,          end);
12037   }
12038  }
12039
12040  save_parse = RExC_parse;
12041  RExC_parse = SvPV(result_string, len);
12042  save_end = RExC_end;
12043  RExC_end = RExC_parse + len;
12044
12045  /* We turn off folding around the call, as the class we have constructed
12046  * already has all folding taken into consideration, and we don't want
12047  * regclass() to add to that */
12048  RExC_flags &= ~RXf_PMf_FOLD;
12049  /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
12050  */
12051  node = regclass(pRExC_state, flagp,depth+1,
12052      FALSE, /* means parse the whole char class */
12053      FALSE, /* don't allow multi-char folds */
12054      TRUE, /* silence non-portable warnings.  The above may very
12055        well have generated non-portable code points, but
12056        they're valid on this machine */
12057      NULL);
12058  if (!node)
12059   FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
12060      PTR2UV(flagp));
12061  if (save_fold) {
12062   RExC_flags |= RXf_PMf_FOLD;
12063  }
12064  RExC_parse = save_parse + 1;
12065  RExC_end = save_end;
12066  SvREFCNT_dec_NN(final);
12067  SvREFCNT_dec_NN(result_string);
12068
12069  nextchar(pRExC_state);
12070  Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
12071  return node;
12072 }
12073 #undef IS_OPERAND
12074
12075 /* The names of properties whose definitions are not known at compile time are
12076  * stored in this SV, after a constant heading.  So if the length has been
12077  * changed since initialization, then there is a run-time definition. */
12078 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12079
12080 STATIC regnode *
12081 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12082     const bool stop_at_1,  /* Just parse the next thing, don't
12083           look for a full character class */
12084     bool allow_multi_folds,
12085     const bool silence_non_portable,   /* Don't output warnings
12086              about too large
12087              characters */
12088     SV** ret_invlist)  /* Return an inversion list, not a node */
12089 {
12090  /* parse a bracketed class specification.  Most of these will produce an
12091  * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12092  * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
12093  * under /i with multi-character folds: it will be rewritten following the
12094  * paradigm of this example, where the <multi-fold>s are characters which
12095  * fold to multiple character sequences:
12096  *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12097  * gets effectively rewritten as:
12098  *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12099  * reg() gets called (recursively) on the rewritten version, and this
12100  * function will return what it constructs.  (Actually the <multi-fold>s
12101  * aren't physically removed from the [abcdefghi], it's just that they are
12102  * ignored in the recursion by means of a flag:
12103  * <RExC_in_multi_char_class>.)
12104  *
12105  * ANYOF nodes contain a bit map for the first 256 characters, with the
12106  * corresponding bit set if that character is in the list.  For characters
12107  * above 255, a range list or swash is used.  There are extra bits for \w,
12108  * etc. in locale ANYOFs, as what these match is not determinable at
12109  * compile time
12110  *
12111  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12112  * to be restarted.  This can only happen if ret_invlist is non-NULL.
12113  */
12114
12115  dVAR;
12116  UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12117  IV range = 0;
12118  UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12119  regnode *ret;
12120  STRLEN numlen;
12121  IV namedclass = OOB_NAMEDCLASS;
12122  char *rangebegin = NULL;
12123  bool need_class = 0;
12124  SV *listsv = NULL;
12125  STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12126          than just initialized.  */
12127  SV* properties = NULL;    /* Code points that match \p{} \P{} */
12128  SV* posixes = NULL;     /* Code points that match classes like, [:word:],
12129        extended beyond the Latin1 range */
12130  UV element_count = 0;   /* Number of distinct elements in the class.
12131        Optimizations may be possible if this is tiny */
12132  AV * multi_char_matches = NULL; /* Code points that fold to more than one
12133          character; used under /i */
12134  UV n;
12135  char * stop_ptr = RExC_end;    /* where to stop parsing */
12136  const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12137             space? */
12138  const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12139
12140  /* Unicode properties are stored in a swash; this holds the current one
12141  * being parsed.  If this swash is the only above-latin1 component of the
12142  * character class, an optimization is to pass it directly on to the
12143  * execution engine.  Otherwise, it is set to NULL to indicate that there
12144  * are other things in the class that have to be dealt with at execution
12145  * time */
12146  SV* swash = NULL;  /* Code points that match \p{} \P{} */
12147
12148  /* Set if a component of this character class is user-defined; just passed
12149  * on to the engine */
12150  bool has_user_defined_property = FALSE;
12151
12152  /* inversion list of code points this node matches only when the target
12153  * string is in UTF-8.  (Because is under /d) */
12154  SV* depends_list = NULL;
12155
12156  /* inversion list of code points this node matches.  For much of the
12157  * function, it includes only those that match regardless of the utf8ness
12158  * of the target string */
12159  SV* cp_list = NULL;
12160
12161 #ifdef EBCDIC
12162  /* In a range, counts how many 0-2 of the ends of it came from literals,
12163  * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
12164  UV literal_endpoint = 0;
12165 #endif
12166  bool invert = FALSE;    /* Is this class to be complemented */
12167
12168  /* Is there any thing like \W or [:^digit:] that matches above the legal
12169  * Unicode range? */
12170  bool runtime_posix_matches_above_Unicode = FALSE;
12171
12172  regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12173   case we need to change the emitted regop to an EXACT. */
12174  const char * orig_parse = RExC_parse;
12175  const I32 orig_size = RExC_size;
12176  GET_RE_DEBUG_FLAGS_DECL;
12177
12178  PERL_ARGS_ASSERT_REGCLASS;
12179 #ifndef DEBUGGING
12180  PERL_UNUSED_ARG(depth);
12181 #endif
12182
12183  DEBUG_PARSE("clas");
12184
12185  /* Assume we are going to generate an ANYOF node. */
12186  ret = reganode(pRExC_state, ANYOF, 0);
12187
12188  if (SIZE_ONLY) {
12189   RExC_size += ANYOF_SKIP;
12190   listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12191  }
12192  else {
12193   ANYOF_FLAGS(ret) = 0;
12194
12195   RExC_emit += ANYOF_SKIP;
12196   if (LOC) {
12197    ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12198   }
12199   listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12200   initial_listsv_len = SvCUR(listsv);
12201   SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
12202  }
12203
12204  if (skip_white) {
12205   RExC_parse = regpatws(pRExC_state, RExC_parse,
12206        FALSE /* means don't recognize comments */);
12207  }
12208
12209  if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
12210   RExC_parse++;
12211   invert = TRUE;
12212   allow_multi_folds = FALSE;
12213   RExC_naughty++;
12214   if (skip_white) {
12215    RExC_parse = regpatws(pRExC_state, RExC_parse,
12216         FALSE /* means don't recognize comments */);
12217   }
12218  }
12219
12220  /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12221  if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12222   const char *s = RExC_parse;
12223   const char  c = *s++;
12224
12225   while (isWORDCHAR(*s))
12226    s++;
12227   if (*s && c == *s && s[1] == ']') {
12228    SAVEFREESV(RExC_rx_sv);
12229    ckWARN3reg(s+2,
12230      "POSIX syntax [%c %c] belongs inside character classes",
12231      c, c);
12232    (void)ReREFCNT_inc(RExC_rx_sv);
12233   }
12234  }
12235
12236  /* If the caller wants us to just parse a single element, accomplish this
12237  * by faking the loop ending condition */
12238  if (stop_at_1 && RExC_end > RExC_parse) {
12239   stop_ptr = RExC_parse + 1;
12240  }
12241
12242  /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12243  if (UCHARAT(RExC_parse) == ']')
12244   goto charclassloop;
12245
12246 parseit:
12247  while (1) {
12248   if  (RExC_parse >= stop_ptr) {
12249    break;
12250   }
12251
12252   if (skip_white) {
12253    RExC_parse = regpatws(pRExC_state, RExC_parse,
12254         FALSE /* means don't recognize comments */);
12255   }
12256
12257   if  (UCHARAT(RExC_parse) == ']') {
12258    break;
12259   }
12260
12261  charclassloop:
12262
12263   namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12264   save_value = value;
12265   save_prevvalue = prevvalue;
12266
12267   if (!range) {
12268    rangebegin = RExC_parse;
12269    element_count++;
12270   }
12271   if (UTF) {
12272    value = utf8n_to_uvchr((U8*)RExC_parse,
12273         RExC_end - RExC_parse,
12274         &numlen, UTF8_ALLOW_DEFAULT);
12275    RExC_parse += numlen;
12276   }
12277   else
12278    value = UCHARAT(RExC_parse++);
12279
12280   if (value == '['
12281    && RExC_parse < RExC_end
12282    && POSIXCC(UCHARAT(RExC_parse)))
12283   {
12284    namedclass = regpposixcc(pRExC_state, value, strict);
12285   }
12286   else if (value == '\\') {
12287    if (UTF) {
12288     value = utf8n_to_uvchr((U8*)RExC_parse,
12289         RExC_end - RExC_parse,
12290         &numlen, UTF8_ALLOW_DEFAULT);
12291     RExC_parse += numlen;
12292    }
12293    else
12294     value = UCHARAT(RExC_parse++);
12295
12296    /* Some compilers cannot handle switching on 64-bit integer
12297    * values, therefore value cannot be an UV.  Yes, this will
12298    * be a problem later if we want switch on Unicode.
12299    * A similar issue a little bit later when switching on
12300    * namedclass. --jhi */
12301
12302    /* If the \ is escaping white space when white space is being
12303    * skipped, it means that that white space is wanted literally, and
12304    * is already in 'value'.  Otherwise, need to translate the escape
12305    * into what it signifies. */
12306    if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12307
12308    case 'w': namedclass = ANYOF_WORDCHAR; break;
12309    case 'W': namedclass = ANYOF_NWORDCHAR; break;
12310    case 's': namedclass = ANYOF_SPACE; break;
12311    case 'S': namedclass = ANYOF_NSPACE; break;
12312    case 'd': namedclass = ANYOF_DIGIT; break;
12313    case 'D': namedclass = ANYOF_NDIGIT; break;
12314    case 'v': namedclass = ANYOF_VERTWS; break;
12315    case 'V': namedclass = ANYOF_NVERTWS; break;
12316    case 'h': namedclass = ANYOF_HORIZWS; break;
12317    case 'H': namedclass = ANYOF_NHORIZWS; break;
12318    case 'N':  /* Handle \N{NAME} in class */
12319     {
12320      /* We only pay attention to the first char of
12321      multichar strings being returned. I kinda wonder
12322      if this makes sense as it does change the behaviour
12323      from earlier versions, OTOH that behaviour was broken
12324      as well. */
12325      if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12326          TRUE, /* => charclass */
12327          strict))
12328      {
12329       if (*flagp & RESTART_UTF8)
12330        FAIL("panic: grok_bslash_N set RESTART_UTF8");
12331       goto parseit;
12332      }
12333     }
12334     break;
12335    case 'p':
12336    case 'P':
12337     {
12338     char *e;
12339
12340     /* We will handle any undefined properties ourselves */
12341     U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12342
12343     if (RExC_parse >= RExC_end)
12344      vFAIL2("Empty \\%c{}", (U8)value);
12345     if (*RExC_parse == '{') {
12346      const U8 c = (U8)value;
12347      e = strchr(RExC_parse++, '}');
12348      if (!e)
12349       vFAIL2("Missing right brace on \\%c{}", c);
12350      while (isSPACE(UCHARAT(RExC_parse)))
12351       RExC_parse++;
12352      if (e == RExC_parse)
12353       vFAIL2("Empty \\%c{}", c);
12354      n = e - RExC_parse;
12355      while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12356       n--;
12357     }
12358     else {
12359      e = RExC_parse;
12360      n = 1;
12361     }
12362     if (!SIZE_ONLY) {
12363      SV* invlist;
12364      char* name;
12365
12366      if (UCHARAT(RExC_parse) == '^') {
12367       RExC_parse++;
12368       n--;
12369       /* toggle.  (The rhs xor gets the single bit that
12370       * differs between P and p; the other xor inverts just
12371       * that bit) */
12372       value ^= 'P' ^ 'p';
12373
12374       while (isSPACE(UCHARAT(RExC_parse))) {
12375        RExC_parse++;
12376        n--;
12377       }
12378      }
12379      /* Try to get the definition of the property into
12380      * <invlist>.  If /i is in effect, the effective property
12381      * will have its name be <__NAME_i>.  The design is
12382      * discussed in commit
12383      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12384      Newx(name, n + sizeof("_i__\n"), char);
12385
12386      sprintf(name, "%s%.*s%s\n",
12387          (FOLD) ? "__" : "",
12388          (int)n,
12389          RExC_parse,
12390          (FOLD) ? "_i" : ""
12391      );
12392
12393      /* Look up the property name, and get its swash and
12394      * inversion list, if the property is found  */
12395      if (swash) {
12396       SvREFCNT_dec_NN(swash);
12397      }
12398      swash = _core_swash_init("utf8", name, &PL_sv_undef,
12399            1, /* binary */
12400            0, /* not tr/// */
12401            NULL, /* No inversion list */
12402            &swash_init_flags
12403            );
12404      if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12405       if (swash) {
12406        SvREFCNT_dec_NN(swash);
12407        swash = NULL;
12408       }
12409
12410       /* Here didn't find it.  It could be a user-defined
12411       * property that will be available at run-time.  If we
12412       * accept only compile-time properties, is an error;
12413       * otherwise add it to the list for run-time look up */
12414       if (ret_invlist) {
12415        RExC_parse = e + 1;
12416        vFAIL3("Property '%.*s' is unknown", (int) n, name);
12417       }
12418       Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
12419           (value == 'p' ? '+' : '!'),
12420           name);
12421       has_user_defined_property = TRUE;
12422
12423       /* We don't know yet, so have to assume that the
12424       * property could match something in the Latin1 range,
12425       * hence something that isn't utf8.  Note that this
12426       * would cause things in <depends_list> to match
12427       * inappropriately, except that any \p{}, including
12428       * this one forces Unicode semantics, which means there
12429       * is <no depends_list> */
12430       ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12431      }
12432      else {
12433
12434       /* Here, did get the swash and its inversion list.  If
12435       * the swash is from a user-defined property, then this
12436       * whole character class should be regarded as such */
12437       has_user_defined_property =
12438          (swash_init_flags
12439          & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12440
12441       /* Invert if asking for the complement */
12442       if (value == 'P') {
12443        _invlist_union_complement_2nd(properties,
12444               invlist,
12445               &properties);
12446
12447        /* The swash can't be used as-is, because we've
12448        * inverted things; delay removing it to here after
12449        * have copied its invlist above */
12450        SvREFCNT_dec_NN(swash);
12451        swash = NULL;
12452       }
12453       else {
12454        _invlist_union(properties, invlist, &properties);
12455       }
12456      }
12457      Safefree(name);
12458     }
12459     RExC_parse = e + 1;
12460     namedclass = ANYOF_UNIPROP;  /* no official name, but it's
12461             named */
12462
12463     /* \p means they want Unicode semantics */
12464     RExC_uni_semantics = 1;
12465     }
12466     break;
12467    case 'n': value = '\n';   break;
12468    case 'r': value = '\r';   break;
12469    case 't': value = '\t';   break;
12470    case 'f': value = '\f';   break;
12471    case 'b': value = '\b';   break;
12472    case 'e': value = ASCII_TO_NATIVE('\033');break;
12473    case 'a': value = ASCII_TO_NATIVE('\007');break;
12474    case 'o':
12475     RExC_parse--; /* function expects to be pointed at the 'o' */
12476     {
12477      const char* error_msg;
12478      bool valid = grok_bslash_o(&RExC_parse,
12479            &value,
12480            &error_msg,
12481            SIZE_ONLY,   /* warnings in pass
12482                1 only */
12483            strict,
12484            silence_non_portable,
12485            UTF);
12486      if (! valid) {
12487       vFAIL(error_msg);
12488      }
12489     }
12490     if (PL_encoding && value < 0x100) {
12491      goto recode_encoding;
12492     }
12493     break;
12494    case 'x':
12495     RExC_parse--; /* function expects to be pointed at the 'x' */
12496     {
12497      const char* error_msg;
12498      bool valid = grok_bslash_x(&RExC_parse,
12499            &value,
12500            &error_msg,
12501            TRUE, /* Output warnings */
12502            strict,
12503            silence_non_portable,
12504            UTF);
12505      if (! valid) {
12506       vFAIL(error_msg);
12507      }
12508     }
12509     if (PL_encoding && value < 0x100)
12510      goto recode_encoding;
12511     break;
12512    case 'c':
12513     value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12514     break;
12515    case '0': case '1': case '2': case '3': case '4':
12516    case '5': case '6': case '7':
12517     {
12518      /* Take 1-3 octal digits */
12519      I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12520      numlen = (strict) ? 4 : 3;
12521      value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12522      RExC_parse += numlen;
12523      if (numlen != 3) {
12524       if (strict) {
12525        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12526        vFAIL("Need exactly 3 octal digits");
12527       }
12528       else if (! SIZE_ONLY /* like \08, \178 */
12529         && numlen < 3
12530         && RExC_parse < RExC_end
12531         && isDIGIT(*RExC_parse)
12532         && ckWARN(WARN_REGEXP))
12533       {
12534        SAVEFREESV(RExC_rx_sv);
12535        reg_warn_non_literal_string(
12536         RExC_parse + 1,
12537         form_short_octal_warning(RExC_parse, numlen));
12538        (void)ReREFCNT_inc(RExC_rx_sv);
12539       }
12540      }
12541      if (PL_encoding && value < 0x100)
12542       goto recode_encoding;
12543      break;
12544     }
12545    recode_encoding:
12546     if (! RExC_override_recoding) {
12547      SV* enc = PL_encoding;
12548      value = reg_recode((const char)(U8)value, &enc);
12549      if (!enc) {
12550       if (strict) {
12551        vFAIL("Invalid escape in the specified encoding");
12552       }
12553       else if (SIZE_ONLY) {
12554        ckWARNreg(RExC_parse,
12555         "Invalid escape in the specified encoding");
12556       }
12557      }
12558      break;
12559     }
12560    default:
12561     /* Allow \_ to not give an error */
12562     if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12563      if (strict) {
12564       vFAIL2("Unrecognized escape \\%c in character class",
12565        (int)value);
12566      }
12567      else {
12568       SAVEFREESV(RExC_rx_sv);
12569       ckWARN2reg(RExC_parse,
12570        "Unrecognized escape \\%c in character class passed through",
12571        (int)value);
12572       (void)ReREFCNT_inc(RExC_rx_sv);
12573      }
12574     }
12575     break;
12576    }   /* End of switch on char following backslash */
12577   } /* end of handling backslash escape sequences */
12578 #ifdef EBCDIC
12579   else
12580    literal_endpoint++;
12581 #endif
12582
12583   /* Here, we have the current token in 'value' */
12584
12585   /* What matches in a locale is not known until runtime.  This includes
12586   * what the Posix classes (like \w, [:space:]) match.  Room must be
12587   * reserved (one time per class) to store such classes, either if Perl
12588   * is compiled so that locale nodes always should have this space, or
12589   * if there is such class info to be stored.  The space will contain a
12590   * bit for each named class that is to be matched against.  This isn't
12591   * needed for \p{} and pseudo-classes, as they are not affected by
12592   * locale, and hence are dealt with separately */
12593   if (LOC
12594    && ! need_class
12595    && (ANYOF_LOCALE == ANYOF_CLASS
12596     || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12597   {
12598    need_class = 1;
12599    if (SIZE_ONLY) {
12600     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12601    }
12602    else {
12603     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12604     ANYOF_CLASS_ZERO(ret);
12605    }
12606    ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12607   }
12608
12609   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12610
12611    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
12612    * literal, as is the character that began the false range, i.e.
12613    * the 'a' in the examples */
12614    if (range) {
12615     if (!SIZE_ONLY) {
12616      const int w = (RExC_parse >= rangebegin)
12617         ? RExC_parse - rangebegin
12618         : 0;
12619      if (strict) {
12620       vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
12621      }
12622      else {
12623       SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12624       ckWARN4reg(RExC_parse,
12625         "False [] range \"%*.*s\"",
12626         w, w, rangebegin);
12627       (void)ReREFCNT_inc(RExC_rx_sv);
12628       cp_list = add_cp_to_invlist(cp_list, '-');
12629       cp_list = add_cp_to_invlist(cp_list, prevvalue);
12630      }
12631     }
12632
12633     range = 0; /* this was not a true range */
12634     element_count += 2; /* So counts for three values */
12635    }
12636
12637    if (! SIZE_ONLY) {
12638     U8 classnum = namedclass_to_classnum(namedclass);
12639     if (namedclass >= ANYOF_MAX) {  /* If a special class */
12640      if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12641
12642       /* Here, should be \h, \H, \v, or \V.  Neither /d nor
12643       * /l make a difference in what these match.  There
12644       * would be problems if these characters had folds
12645       * other than themselves, as cp_list is subject to
12646       * folding. */
12647       if (classnum != _CC_VERTSPACE) {
12648        assert(   namedclass == ANYOF_HORIZWS
12649         || namedclass == ANYOF_NHORIZWS);
12650
12651        /* It turns out that \h is just a synonym for
12652        * XPosixBlank */
12653        classnum = _CC_BLANK;
12654       }
12655
12656       _invlist_union_maybe_complement_2nd(
12657         cp_list,
12658         PL_XPosix_ptrs[classnum],
12659         cBOOL(namedclass % 2), /* Complement if odd
12660               (NHORIZWS, NVERTWS)
12661               */
12662         &cp_list);
12663      }
12664     }
12665     else if (classnum == _CC_ASCII) {
12666 #ifdef HAS_ISASCII
12667      if (LOC) {
12668       ANYOF_CLASS_SET(ret, namedclass);
12669      }
12670      else
12671 #endif  /* Not isascii(); just use the hard-coded definition for it */
12672       _invlist_union_maybe_complement_2nd(
12673         posixes,
12674         PL_ASCII,
12675         cBOOL(namedclass % 2), /* Complement if odd
12676               (NASCII) */
12677         &posixes);
12678     }
12679     else {  /* Garden variety class */
12680
12681      /* The ascii range inversion list */
12682      SV* ascii_source = PL_Posix_ptrs[classnum];
12683
12684      /* The full Latin1 range inversion list */
12685      SV* l1_source = PL_L1Posix_ptrs[classnum];
12686
12687      /* This code is structured into two major clauses.  The
12688      * first is for classes whose complete definitions may not
12689      * already be known.  It not, the Latin1 definition
12690      * (guaranteed to already known) is used plus code is
12691      * generated to load the rest at run-time (only if needed).
12692      * If the complete definition is known, it drops down to
12693      * the second clause, where the complete definition is
12694      * known */
12695
12696      if (classnum < _FIRST_NON_SWASH_CC) {
12697
12698       /* Here, the class has a swash, which may or not
12699       * already be loaded */
12700
12701       /* The name of the property to use to match the full
12702       * eXtended Unicode range swash for this character
12703       * class */
12704       const char *Xname = swash_property_names[classnum];
12705
12706       /* If returning the inversion list, we can't defer
12707       * getting this until runtime */
12708       if (ret_invlist && !  PL_utf8_swash_ptrs[classnum]) {
12709        PL_utf8_swash_ptrs[classnum] =
12710         _core_swash_init("utf8", Xname, &PL_sv_undef,
12711            1, /* binary */
12712            0, /* not tr/// */
12713            NULL, /* No inversion list */
12714            NULL  /* No flags */
12715            );
12716        assert(PL_utf8_swash_ptrs[classnum]);
12717       }
12718       if ( !  PL_utf8_swash_ptrs[classnum]) {
12719        if (namedclass % 2 == 0) { /* A non-complemented
12720               class */
12721         /* If not /a matching, there are code points we
12722         * don't know at compile time.  Arrange for the
12723         * unknown matches to be loaded at run-time, if
12724         * needed */
12725         if (! AT_LEAST_ASCII_RESTRICTED) {
12726          Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12727                 Xname);
12728         }
12729         if (LOC) {  /* Under locale, set run-time
12730            lookup */
12731          ANYOF_CLASS_SET(ret, namedclass);
12732         }
12733         else {
12734          /* Add the current class's code points to
12735          * the running total */
12736          _invlist_union(posixes,
12737             (AT_LEAST_ASCII_RESTRICTED)
12738               ? ascii_source
12739               : l1_source,
12740             &posixes);
12741         }
12742        }
12743        else {  /* A complemented class */
12744         if (AT_LEAST_ASCII_RESTRICTED) {
12745          /* Under /a should match everything above
12746          * ASCII, plus the complement of the set's
12747          * ASCII matches */
12748          _invlist_union_complement_2nd(posixes,
12749                 ascii_source,
12750                 &posixes);
12751         }
12752         else {
12753          /* Arrange for the unknown matches to be
12754          * loaded at run-time, if needed */
12755          Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12756                 Xname);
12757          runtime_posix_matches_above_Unicode = TRUE;
12758          if (LOC) {
12759           ANYOF_CLASS_SET(ret, namedclass);
12760          }
12761          else {
12762
12763           /* We want to match everything in
12764           * Latin1, except those things that
12765           * l1_source matches */
12766           SV* scratch_list = NULL;
12767           _invlist_subtract(PL_Latin1, l1_source,
12768               &scratch_list);
12769
12770           /* Add the list from this class to the
12771           * running total */
12772           if (! posixes) {
12773            posixes = scratch_list;
12774           }
12775           else {
12776            _invlist_union(posixes,
12777               scratch_list,
12778               &posixes);
12779            SvREFCNT_dec_NN(scratch_list);
12780           }
12781           if (DEPENDS_SEMANTICS) {
12782            ANYOF_FLAGS(ret)
12783             |= ANYOF_NON_UTF8_LATIN1_ALL;
12784           }
12785          }
12786         }
12787        }
12788        goto namedclass_done;
12789       }
12790
12791       /* Here, there is a swash loaded for the class.  If no
12792       * inversion list for it yet, get it */
12793       if (! PL_XPosix_ptrs[classnum]) {
12794        PL_XPosix_ptrs[classnum]
12795        = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12796       }
12797      }
12798
12799      /* Here there is an inversion list already loaded for the
12800      * entire class */
12801
12802      if (namedclass % 2 == 0) {  /* A non-complemented class,
12803             like ANYOF_PUNCT */
12804       if (! LOC) {
12805        /* For non-locale, just add it to any existing list
12806        * */
12807        _invlist_union(posixes,
12808           (AT_LEAST_ASCII_RESTRICTED)
12809            ? ascii_source
12810            : PL_XPosix_ptrs[classnum],
12811           &posixes);
12812       }
12813       else {  /* Locale */
12814        SV* scratch_list = NULL;
12815
12816        /* For above Latin1 code points, we use the full
12817        * Unicode range */
12818        _invlist_intersection(PL_AboveLatin1,
12819             PL_XPosix_ptrs[classnum],
12820             &scratch_list);
12821        /* And set the output to it, adding instead if
12822        * there already is an output.  Checking if
12823        * 'posixes' is NULL first saves an extra clone.
12824        * Its reference count will be decremented at the
12825        * next union, etc, or if this is the only
12826        * instance, at the end of the routine */
12827        if (! posixes) {
12828         posixes = scratch_list;
12829        }
12830        else {
12831         _invlist_union(posixes, scratch_list, &posixes);
12832         SvREFCNT_dec_NN(scratch_list);
12833        }
12834
12835 #ifndef HAS_ISBLANK
12836        if (namedclass != ANYOF_BLANK) {
12837 #endif
12838         /* Set this class in the node for runtime
12839         * matching */
12840         ANYOF_CLASS_SET(ret, namedclass);
12841 #ifndef HAS_ISBLANK
12842        }
12843        else {
12844         /* No isblank(), use the hard-coded ASCII-range
12845         * blanks, adding them to the running total. */
12846
12847         _invlist_union(posixes, ascii_source, &posixes);
12848        }
12849 #endif
12850       }
12851      }
12852      else {  /* A complemented class, like ANYOF_NPUNCT */
12853       if (! LOC) {
12854        _invlist_union_complement_2nd(
12855             posixes,
12856             (AT_LEAST_ASCII_RESTRICTED)
12857              ? ascii_source
12858              : PL_XPosix_ptrs[classnum],
12859             &posixes);
12860        /* Under /d, everything in the upper half of the
12861        * Latin1 range matches this complement */
12862        if (DEPENDS_SEMANTICS) {
12863         ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12864        }
12865       }
12866       else {  /* Locale */
12867        SV* scratch_list = NULL;
12868        _invlist_subtract(PL_AboveLatin1,
12869            PL_XPosix_ptrs[classnum],
12870            &scratch_list);
12871        if (! posixes) {
12872         posixes = scratch_list;
12873        }
12874        else {
12875         _invlist_union(posixes, scratch_list, &posixes);
12876         SvREFCNT_dec_NN(scratch_list);
12877        }
12878 #ifndef HAS_ISBLANK
12879        if (namedclass != ANYOF_NBLANK) {
12880 #endif
12881         ANYOF_CLASS_SET(ret, namedclass);
12882 #ifndef HAS_ISBLANK
12883        }
12884        else {
12885         /* Get the list of all code points in Latin1
12886         * that are not ASCII blanks, and add them to
12887         * the running total */
12888         _invlist_subtract(PL_Latin1, ascii_source,
12889             &scratch_list);
12890         _invlist_union(posixes, scratch_list, &posixes);
12891         SvREFCNT_dec_NN(scratch_list);
12892        }
12893 #endif
12894       }
12895      }
12896     }
12897    namedclass_done:
12898     continue;   /* Go get next character */
12899    }
12900   } /* end of namedclass \blah */
12901
12902   /* Here, we have a single value.  If 'range' is set, it is the ending
12903   * of a range--check its validity.  Later, we will handle each
12904   * individual code point in the range.  If 'range' isn't set, this
12905   * could be the beginning of a range, so check for that by looking
12906   * ahead to see if the next real character to be processed is the range
12907   * indicator--the minus sign */
12908
12909   if (skip_white) {
12910    RExC_parse = regpatws(pRExC_state, RExC_parse,
12911         FALSE /* means don't recognize comments */);
12912   }
12913
12914   if (range) {
12915    if (prevvalue > value) /* b-a */ {
12916     const int w = RExC_parse - rangebegin;
12917     Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12918     range = 0; /* not a valid range */
12919    }
12920   }
12921   else {
12922    prevvalue = value; /* save the beginning of the potential range */
12923    if (! stop_at_1     /* Can't be a range if parsing just one thing */
12924     && *RExC_parse == '-')
12925    {
12926     char* next_char_ptr = RExC_parse + 1;
12927     if (skip_white) {   /* Get the next real char after the '-' */
12928      next_char_ptr = regpatws(pRExC_state,
12929            RExC_parse + 1,
12930            FALSE); /* means don't recognize
12931               comments */
12932     }
12933
12934     /* If the '-' is at the end of the class (just before the ']',
12935     * it is a literal minus; otherwise it is a range */
12936     if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
12937      RExC_parse = next_char_ptr;
12938
12939      /* a bad range like \w-, [:word:]- ? */
12940      if (namedclass > OOB_NAMEDCLASS) {
12941       if (strict || ckWARN(WARN_REGEXP)) {
12942        const int w =
12943         RExC_parse >= rangebegin ?
12944         RExC_parse - rangebegin : 0;
12945        if (strict) {
12946         vFAIL4("False [] range \"%*.*s\"",
12947          w, w, rangebegin);
12948        }
12949        else {
12950         vWARN4(RExC_parse,
12951          "False [] range \"%*.*s\"",
12952          w, w, rangebegin);
12953        }
12954       }
12955       if (!SIZE_ONLY) {
12956        cp_list = add_cp_to_invlist(cp_list, '-');
12957       }
12958       element_count++;
12959      } else
12960       range = 1; /* yeah, it's a range! */
12961      continue; /* but do it the next time */
12962     }
12963    }
12964   }
12965
12966   /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12967   * if not */
12968
12969   /* non-Latin1 code point implies unicode semantics.  Must be set in
12970   * pass1 so is there for the whole of pass 2 */
12971   if (value > 255) {
12972    RExC_uni_semantics = 1;
12973   }
12974
12975   /* Ready to process either the single value, or the completed range.
12976   * For single-valued non-inverted ranges, we consider the possibility
12977   * of multi-char folds.  (We made a conscious decision to not do this
12978   * for the other cases because it can often lead to non-intuitive
12979   * results.  For example, you have the peculiar case that:
12980   *  "s s" =~ /^[^\xDF]+$/i => Y
12981   *  "ss"  =~ /^[^\xDF]+$/i => N
12982   *
12983   * See [perl #89750] */
12984   if (FOLD && allow_multi_folds && value == prevvalue) {
12985    if (value == LATIN_SMALL_LETTER_SHARP_S
12986     || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12987               value)))
12988    {
12989     /* Here <value> is indeed a multi-char fold.  Get what it is */
12990
12991     U8 foldbuf[UTF8_MAXBYTES_CASE];
12992     STRLEN foldlen;
12993
12994     UV folded = _to_uni_fold_flags(
12995         value,
12996         foldbuf,
12997         &foldlen,
12998         FOLD_FLAGS_FULL
12999         | ((LOC) ?  FOLD_FLAGS_LOCALE
13000            : (ASCII_FOLD_RESTRICTED)
13001            ? FOLD_FLAGS_NOMIX_ASCII
13002            : 0)
13003         );
13004
13005     /* Here, <folded> should be the first character of the
13006     * multi-char fold of <value>, with <foldbuf> containing the
13007     * whole thing.  But, if this fold is not allowed (because of
13008     * the flags), <fold> will be the same as <value>, and should
13009     * be processed like any other character, so skip the special
13010     * handling */
13011     if (folded != value) {
13012
13013      /* Skip if we are recursed, currently parsing the class
13014      * again.  Otherwise add this character to the list of
13015      * multi-char folds. */
13016      if (! RExC_in_multi_char_class) {
13017       AV** this_array_ptr;
13018       AV* this_array;
13019       STRLEN cp_count = utf8_length(foldbuf,
13020              foldbuf + foldlen);
13021       SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13022
13023       Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13024
13025
13026       if (! multi_char_matches) {
13027        multi_char_matches = newAV();
13028       }
13029
13030       /* <multi_char_matches> is actually an array of arrays.
13031       * There will be one or two top-level elements: [2],
13032       * and/or [3].  The [2] element is an array, each
13033       * element thereof is a character which folds to two
13034       * characters; likewise for [3].  (Unicode guarantees a
13035       * maximum of 3 characters in any fold.)  When we
13036       * rewrite the character class below, we will do so
13037       * such that the longest folds are written first, so
13038       * that it prefers the longest matching strings first.
13039       * This is done even if it turns out that any
13040       * quantifier is non-greedy, out of programmer
13041       * laziness.  Tom Christiansen has agreed that this is
13042       * ok.  This makes the test for the ligature 'ffi' come
13043       * before the test for 'ff' */
13044       if (av_exists(multi_char_matches, cp_count)) {
13045        this_array_ptr = (AV**) av_fetch(multi_char_matches,
13046                cp_count, FALSE);
13047        this_array = *this_array_ptr;
13048       }
13049       else {
13050        this_array = newAV();
13051        av_store(multi_char_matches, cp_count,
13052          (SV*) this_array);
13053       }
13054       av_push(this_array, multi_fold);
13055      }
13056
13057      /* This element should not be processed further in this
13058      * class */
13059      element_count--;
13060      value = save_value;
13061      prevvalue = save_prevvalue;
13062      continue;
13063     }
13064    }
13065   }
13066
13067   /* Deal with this element of the class */
13068   if (! SIZE_ONLY) {
13069 #ifndef EBCDIC
13070    cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
13071 #else
13072    SV* this_range = _new_invlist(1);
13073    _append_range_to_invlist(this_range, prevvalue, value);
13074
13075    /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
13076    * If this range was specified using something like 'i-j', we want
13077    * to include only the 'i' and the 'j', and not anything in
13078    * between, so exclude non-ASCII, non-alphabetics from it.
13079    * However, if the range was specified with something like
13080    * [\x89-\x91] or [\x89-j], all code points within it should be
13081    * included.  literal_endpoint==2 means both ends of the range used
13082    * a literal character, not \x{foo} */
13083    if (literal_endpoint == 2
13084     && (prevvalue >= 'a' && value <= 'z')
13085      || (prevvalue >= 'A' && value <= 'Z'))
13086    {
13087     _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13088          &this_range);
13089    }
13090    _invlist_union(cp_list, this_range, &cp_list);
13091    literal_endpoint = 0;
13092 #endif
13093   }
13094
13095   range = 0; /* this range (if it was one) is done now */
13096  } /* End of loop through all the text within the brackets */
13097
13098  /* If anything in the class expands to more than one character, we have to
13099  * deal with them by building up a substitute parse string, and recursively
13100  * calling reg() on it, instead of proceeding */
13101  if (multi_char_matches) {
13102   SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13103   I32 cp_count;
13104   STRLEN len;
13105   char *save_end = RExC_end;
13106   char *save_parse = RExC_parse;
13107   bool first_time = TRUE;     /* First multi-char occurrence doesn't get
13108          a "|" */
13109   I32 reg_flags;
13110
13111   assert(! invert);
13112 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
13113   because too confusing */
13114   if (invert) {
13115    sv_catpv(substitute_parse, "(?:");
13116   }
13117 #endif
13118
13119   /* Look at the longest folds first */
13120   for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13121
13122    if (av_exists(multi_char_matches, cp_count)) {
13123     AV** this_array_ptr;
13124     SV* this_sequence;
13125
13126     this_array_ptr = (AV**) av_fetch(multi_char_matches,
13127             cp_count, FALSE);
13128     while ((this_sequence = av_pop(*this_array_ptr)) !=
13129                 &PL_sv_undef)
13130     {
13131      if (! first_time) {
13132       sv_catpv(substitute_parse, "|");
13133      }
13134      first_time = FALSE;
13135
13136      sv_catpv(substitute_parse, SvPVX(this_sequence));
13137     }
13138    }
13139   }
13140
13141   /* If the character class contains anything else besides these
13142   * multi-character folds, have to include it in recursive parsing */
13143   if (element_count) {
13144    sv_catpv(substitute_parse, "|[");
13145    sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13146    sv_catpv(substitute_parse, "]");
13147   }
13148
13149   sv_catpv(substitute_parse, ")");
13150 #if 0
13151   if (invert) {
13152    /* This is a way to get the parse to skip forward a whole named
13153    * sequence instead of matching the 2nd character when it fails the
13154    * first */
13155    sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13156   }
13157 #endif
13158
13159   RExC_parse = SvPV(substitute_parse, len);
13160   RExC_end = RExC_parse + len;
13161   RExC_in_multi_char_class = 1;
13162   RExC_emit = (regnode *)orig_emit;
13163
13164   ret = reg(pRExC_state, 1, &reg_flags, depth+1);
13165
13166   *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13167
13168   RExC_parse = save_parse;
13169   RExC_end = save_end;
13170   RExC_in_multi_char_class = 0;
13171   SvREFCNT_dec_NN(multi_char_matches);
13172   return ret;
13173  }
13174
13175  /* If the character class contains only a single element, it may be
13176  * optimizable into another node type which is smaller and runs faster.
13177  * Check if this is the case for this class */
13178  if (element_count == 1 && ! ret_invlist) {
13179   U8 op = END;
13180   U8 arg = 0;
13181
13182   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
13183            [:digit:] or \p{foo} */
13184
13185    /* All named classes are mapped into POSIXish nodes, with its FLAG
13186    * argument giving which class it is */
13187    switch ((I32)namedclass) {
13188     case ANYOF_UNIPROP:
13189      break;
13190
13191     /* These don't depend on the charset modifiers.  They always
13192     * match under /u rules */
13193     case ANYOF_NHORIZWS:
13194     case ANYOF_HORIZWS:
13195      namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13196      /* FALLTHROUGH */
13197
13198     case ANYOF_NVERTWS:
13199     case ANYOF_VERTWS:
13200      op = POSIXU;
13201      goto join_posix;
13202
13203     /* The actual POSIXish node for all the rest depends on the
13204     * charset modifier.  The ones in the first set depend only on
13205     * ASCII or, if available on this platform, locale */
13206     case ANYOF_ASCII:
13207     case ANYOF_NASCII:
13208 #ifdef HAS_ISASCII
13209      op = (LOC) ? POSIXL : POSIXA;
13210 #else
13211      op = POSIXA;
13212 #endif
13213      goto join_posix;
13214
13215     case ANYOF_NCASED:
13216     case ANYOF_LOWER:
13217     case ANYOF_NLOWER:
13218     case ANYOF_UPPER:
13219     case ANYOF_NUPPER:
13220      /* under /a could be alpha */
13221      if (FOLD) {
13222       if (ASCII_RESTRICTED) {
13223        namedclass = ANYOF_ALPHA + (namedclass % 2);
13224       }
13225       else if (! LOC) {
13226        break;
13227       }
13228      }
13229      /* FALLTHROUGH */
13230
13231     /* The rest have more possibilities depending on the charset.
13232     * We take advantage of the enum ordering of the charset
13233     * modifiers to get the exact node type, */
13234     default:
13235      op = POSIXD + get_regex_charset(RExC_flags);
13236      if (op > POSIXA) { /* /aa is same as /a */
13237       op = POSIXA;
13238      }
13239 #ifndef HAS_ISBLANK
13240      if (op == POSIXL
13241       && (namedclass == ANYOF_BLANK
13242        || namedclass == ANYOF_NBLANK))
13243      {
13244       op = POSIXA;
13245      }
13246 #endif
13247
13248     join_posix:
13249      /* The odd numbered ones are the complements of the
13250      * next-lower even number one */
13251      if (namedclass % 2 == 1) {
13252       invert = ! invert;
13253       namedclass--;
13254      }
13255      arg = namedclass_to_classnum(namedclass);
13256      break;
13257    }
13258   }
13259   else if (value == prevvalue) {
13260
13261    /* Here, the class consists of just a single code point */
13262
13263    if (invert) {
13264     if (! LOC && value == '\n') {
13265      op = REG_ANY; /* Optimize [^\n] */
13266      *flagp |= HASWIDTH|SIMPLE;
13267      RExC_naughty++;
13268     }
13269    }
13270    else if (value < 256 || UTF) {
13271
13272     /* Optimize a single value into an EXACTish node, but not if it
13273     * would require converting the pattern to UTF-8. */
13274     op = compute_EXACTish(pRExC_state);
13275    }
13276   } /* Otherwise is a range */
13277   else if (! LOC) {   /* locale could vary these */
13278    if (prevvalue == '0') {
13279     if (value == '9') {
13280      arg = _CC_DIGIT;
13281      op = POSIXA;
13282     }
13283    }
13284   }
13285
13286   /* Here, we have changed <op> away from its initial value iff we found
13287   * an optimization */
13288   if (op != END) {
13289
13290    /* Throw away this ANYOF regnode, and emit the calculated one,
13291    * which should correspond to the beginning, not current, state of
13292    * the parse */
13293    const char * cur_parse = RExC_parse;
13294    RExC_parse = (char *)orig_parse;
13295    if ( SIZE_ONLY) {
13296     if (! LOC) {
13297
13298      /* To get locale nodes to not use the full ANYOF size would
13299      * require moving the code above that writes the portions
13300      * of it that aren't in other nodes to after this point.
13301      * e.g.  ANYOF_CLASS_SET */
13302      RExC_size = orig_size;
13303     }
13304    }
13305    else {
13306     RExC_emit = (regnode *)orig_emit;
13307     if (PL_regkind[op] == POSIXD) {
13308      if (invert) {
13309       op += NPOSIXD - POSIXD;
13310      }
13311     }
13312    }
13313
13314    ret = reg_node(pRExC_state, op);
13315
13316    if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13317     if (! SIZE_ONLY) {
13318      FLAGS(ret) = arg;
13319     }
13320     *flagp |= HASWIDTH|SIMPLE;
13321    }
13322    else if (PL_regkind[op] == EXACT) {
13323     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13324    }
13325
13326    RExC_parse = (char *) cur_parse;
13327
13328    SvREFCNT_dec(posixes);
13329    SvREFCNT_dec(cp_list);
13330    return ret;
13331   }
13332  }
13333
13334  if (SIZE_ONLY)
13335   return ret;
13336  /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13337
13338  /* If folding, we calculate all characters that could fold to or from the
13339  * ones already on the list */
13340  if (FOLD && cp_list) {
13341   UV start, end; /* End points of code point ranges */
13342
13343   SV* fold_intersection = NULL;
13344
13345   /* If the highest code point is within Latin1, we can use the
13346   * compiled-in Alphas list, and not have to go out to disk.  This
13347   * yields two false positives, the masculine and feminine ordinal
13348   * indicators, which are weeded out below using the
13349   * IS_IN_SOME_FOLD_L1() macro */
13350   if (invlist_highest(cp_list) < 256) {
13351    _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13352               &fold_intersection);
13353   }
13354   else {
13355
13356    /* Here, there are non-Latin1 code points, so we will have to go
13357    * fetch the list of all the characters that participate in folds
13358    */
13359    if (! PL_utf8_foldable) {
13360     SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13361          &PL_sv_undef, 1, 0);
13362     PL_utf8_foldable = _get_swash_invlist(swash);
13363     SvREFCNT_dec_NN(swash);
13364    }
13365
13366    /* This is a hash that for a particular fold gives all characters
13367    * that are involved in it */
13368    if (! PL_utf8_foldclosures) {
13369
13370     /* If we were unable to find any folds, then we likely won't be
13371     * able to find the closures.  So just create an empty list.
13372     * Folding will effectively be restricted to the non-Unicode
13373     * rules hard-coded into Perl.  (This case happens legitimately
13374     * during compilation of Perl itself before the Unicode tables
13375     * are generated) */
13376     if (_invlist_len(PL_utf8_foldable) == 0) {
13377      PL_utf8_foldclosures = newHV();
13378     }
13379     else {
13380      /* If the folds haven't been read in, call a fold function
13381      * to force that */
13382      if (! PL_utf8_tofold) {
13383       U8 dummy[UTF8_MAXBYTES+1];
13384
13385       /* This string is just a short named one above \xff */
13386       to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13387       assert(PL_utf8_tofold); /* Verify that worked */
13388      }
13389      PL_utf8_foldclosures =
13390          _swash_inversion_hash(PL_utf8_tofold);
13391     }
13392    }
13393
13394    /* Only the characters in this class that participate in folds need
13395    * be checked.  Get the intersection of this class and all the
13396    * possible characters that are foldable.  This can quickly narrow
13397    * down a large class */
13398    _invlist_intersection(PL_utf8_foldable, cp_list,
13399         &fold_intersection);
13400   }
13401
13402   /* Now look at the foldable characters in this class individually */
13403   invlist_iterinit(fold_intersection);
13404   while (invlist_iternext(fold_intersection, &start, &end)) {
13405    UV j;
13406
13407    /* Locale folding for Latin1 characters is deferred until runtime */
13408    if (LOC && start < 256) {
13409     start = 256;
13410    }
13411
13412    /* Look at every character in the range */
13413    for (j = start; j <= end; j++) {
13414
13415     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13416     STRLEN foldlen;
13417     SV** listp;
13418
13419     if (j < 256) {
13420
13421      /* We have the latin1 folding rules hard-coded here so that
13422      * an innocent-looking character class, like /[ks]/i won't
13423      * have to go out to disk to find the possible matches.
13424      * XXX It would be better to generate these via regen, in
13425      * case a new version of the Unicode standard adds new
13426      * mappings, though that is not really likely, and may be
13427      * caught by the default: case of the switch below. */
13428
13429      if (IS_IN_SOME_FOLD_L1(j)) {
13430
13431       /* ASCII is always matched; non-ASCII is matched only
13432       * under Unicode rules */
13433       if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13434        cp_list =
13435         add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13436       }
13437       else {
13438        depends_list =
13439        add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13440       }
13441      }
13442
13443      if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13444       && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13445      {
13446       /* Certain Latin1 characters have matches outside
13447       * Latin1.  To get here, <j> is one of those
13448       * characters.   None of these matches is valid for
13449       * ASCII characters under /aa, which is why the 'if'
13450       * just above excludes those.  These matches only
13451       * happen when the target string is utf8.  The code
13452       * below adds the single fold closures for <j> to the
13453       * inversion list. */
13454       switch (j) {
13455        case 'k':
13456        case 'K':
13457         cp_list =
13458          add_cp_to_invlist(cp_list, KELVIN_SIGN);
13459         break;
13460        case 's':
13461        case 'S':
13462         cp_list = add_cp_to_invlist(cp_list,
13463              LATIN_SMALL_LETTER_LONG_S);
13464         break;
13465        case MICRO_SIGN:
13466         cp_list = add_cp_to_invlist(cp_list,
13467              GREEK_CAPITAL_LETTER_MU);
13468         cp_list = add_cp_to_invlist(cp_list,
13469              GREEK_SMALL_LETTER_MU);
13470         break;
13471        case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13472        case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13473         cp_list =
13474          add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13475         break;
13476        case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13477         cp_list = add_cp_to_invlist(cp_list,
13478           LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13479         break;
13480        case LATIN_SMALL_LETTER_SHARP_S:
13481         cp_list = add_cp_to_invlist(cp_list,
13482             LATIN_CAPITAL_LETTER_SHARP_S);
13483         break;
13484        case 'F': case 'f':
13485        case 'I': case 'i':
13486        case 'L': case 'l':
13487        case 'T': case 't':
13488        case 'A': case 'a':
13489        case 'H': case 'h':
13490        case 'J': case 'j':
13491        case 'N': case 'n':
13492        case 'W': case 'w':
13493        case 'Y': case 'y':
13494         /* These all are targets of multi-character
13495         * folds from code points that require UTF8 to
13496         * express, so they can't match unless the
13497         * target string is in UTF-8, so no action here
13498         * is necessary, as regexec.c properly handles
13499         * the general case for UTF-8 matching and
13500         * multi-char folds */
13501         break;
13502        default:
13503         /* Use deprecated warning to increase the
13504         * chances of this being output */
13505         ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13506         break;
13507       }
13508      }
13509      continue;
13510     }
13511
13512     /* Here is an above Latin1 character.  We don't have the rules
13513     * hard-coded for it.  First, get its fold.  This is the simple
13514     * fold, as the multi-character folds have been handled earlier
13515     * and separated out */
13516     _to_uni_fold_flags(j, foldbuf, &foldlen,
13517            ((LOC)
13518            ? FOLD_FLAGS_LOCALE
13519            : (ASCII_FOLD_RESTRICTED)
13520             ? FOLD_FLAGS_NOMIX_ASCII
13521             : 0));
13522
13523     /* Single character fold of above Latin1.  Add everything in
13524     * its fold closure to the list that this node should match.
13525     * The fold closures data structure is a hash with the keys
13526     * being the UTF-8 of every character that is folded to, like
13527     * 'k', and the values each an array of all code points that
13528     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
13529     * Multi-character folds are not included */
13530     if ((listp = hv_fetch(PL_utf8_foldclosures,
13531          (char *) foldbuf, foldlen, FALSE)))
13532     {
13533      AV* list = (AV*) *listp;
13534      IV k;
13535      for (k = 0; k <= av_len(list); k++) {
13536       SV** c_p = av_fetch(list, k, FALSE);
13537       UV c;
13538       if (c_p == NULL) {
13539        Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13540       }
13541       c = SvUV(*c_p);
13542
13543       /* /aa doesn't allow folds between ASCII and non-; /l
13544       * doesn't allow them between above and below 256 */
13545       if ((ASCII_FOLD_RESTRICTED
13546         && (isASCII(c) != isASCII(j)))
13547        || (LOC && ((c < 256) != (j < 256))))
13548       {
13549        continue;
13550       }
13551
13552       /* Folds involving non-ascii Latin1 characters
13553       * under /d are added to a separate list */
13554       if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13555       {
13556        cp_list = add_cp_to_invlist(cp_list, c);
13557       }
13558       else {
13559       depends_list = add_cp_to_invlist(depends_list, c);
13560       }
13561      }
13562     }
13563    }
13564   }
13565   SvREFCNT_dec_NN(fold_intersection);
13566  }
13567
13568  /* And combine the result (if any) with any inversion list from posix
13569  * classes.  The lists are kept separate up to now because we don't want to
13570  * fold the classes (folding of those is automatically handled by the swash
13571  * fetching code) */
13572  if (posixes) {
13573   if (! DEPENDS_SEMANTICS) {
13574    if (cp_list) {
13575     _invlist_union(cp_list, posixes, &cp_list);
13576     SvREFCNT_dec_NN(posixes);
13577    }
13578    else {
13579     cp_list = posixes;
13580    }
13581   }
13582   else {
13583    /* Under /d, we put into a separate list the Latin1 things that
13584    * match only when the target string is utf8 */
13585    SV* nonascii_but_latin1_properties = NULL;
13586    _invlist_intersection(posixes, PL_Latin1,
13587         &nonascii_but_latin1_properties);
13588    _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13589        &nonascii_but_latin1_properties);
13590    _invlist_subtract(posixes, nonascii_but_latin1_properties,
13591        &posixes);
13592    if (cp_list) {
13593     _invlist_union(cp_list, posixes, &cp_list);
13594     SvREFCNT_dec_NN(posixes);
13595    }
13596    else {
13597     cp_list = posixes;
13598    }
13599
13600    if (depends_list) {
13601     _invlist_union(depends_list, nonascii_but_latin1_properties,
13602        &depends_list);
13603     SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13604    }
13605    else {
13606     depends_list = nonascii_but_latin1_properties;
13607    }
13608   }
13609  }
13610
13611  /* And combine the result (if any) with any inversion list from properties.
13612  * The lists are kept separate up to now so that we can distinguish the two
13613  * in regards to matching above-Unicode.  A run-time warning is generated
13614  * if a Unicode property is matched against a non-Unicode code point. But,
13615  * we allow user-defined properties to match anything, without any warning,
13616  * and we also suppress the warning if there is a portion of the character
13617  * class that isn't a Unicode property, and which matches above Unicode, \W
13618  * or [\x{110000}] for example.
13619  * (Note that in this case, unlike the Posix one above, there is no
13620  * <depends_list>, because having a Unicode property forces Unicode
13621  * semantics */
13622  if (properties) {
13623   bool warn_super = ! has_user_defined_property;
13624   if (cp_list) {
13625
13626    /* If it matters to the final outcome, see if a non-property
13627    * component of the class matches above Unicode.  If so, the
13628    * warning gets suppressed.  This is true even if just a single
13629    * such code point is specified, as though not strictly correct if
13630    * another such code point is matched against, the fact that they
13631    * are using above-Unicode code points indicates they should know
13632    * the issues involved */
13633    if (warn_super) {
13634     bool non_prop_matches_above_Unicode =
13635        runtime_posix_matches_above_Unicode
13636        | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13637     if (invert) {
13638      non_prop_matches_above_Unicode =
13639            !  non_prop_matches_above_Unicode;
13640     }
13641     warn_super = ! non_prop_matches_above_Unicode;
13642    }
13643
13644    _invlist_union(properties, cp_list, &cp_list);
13645    SvREFCNT_dec_NN(properties);
13646   }
13647   else {
13648    cp_list = properties;
13649   }
13650
13651   if (warn_super) {
13652    OP(ret) = ANYOF_WARN_SUPER;
13653   }
13654  }
13655
13656  /* Here, we have calculated what code points should be in the character
13657  * class.
13658  *
13659  * Now we can see about various optimizations.  Fold calculation (which we
13660  * did above) needs to take place before inversion.  Otherwise /[^k]/i
13661  * would invert to include K, which under /i would match k, which it
13662  * shouldn't.  Therefore we can't invert folded locale now, as it won't be
13663  * folded until runtime */
13664
13665  /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13666  * at compile time.  Besides not inverting folded locale now, we can't
13667  * invert if there are things such as \w, which aren't known until runtime
13668  * */
13669  if (invert
13670   && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13671   && ! depends_list
13672   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13673  {
13674   _invlist_invert(cp_list);
13675
13676   /* Any swash can't be used as-is, because we've inverted things */
13677   if (swash) {
13678    SvREFCNT_dec_NN(swash);
13679    swash = NULL;
13680   }
13681
13682   /* Clear the invert flag since have just done it here */
13683   invert = FALSE;
13684  }
13685
13686  if (ret_invlist) {
13687   *ret_invlist = cp_list;
13688   SvREFCNT_dec(swash);
13689
13690   /* Discard the generated node */
13691   if (SIZE_ONLY) {
13692    RExC_size = orig_size;
13693   }
13694   else {
13695    RExC_emit = orig_emit;
13696   }
13697   return orig_emit;
13698  }
13699
13700  /* If we didn't do folding, it's because some information isn't available
13701  * until runtime; set the run-time fold flag for these.  (We don't have to
13702  * worry about properties folding, as that is taken care of by the swash
13703  * fetching) */
13704  if (FOLD && LOC)
13705  {
13706  ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13707  }
13708
13709  /* Some character classes are equivalent to other nodes.  Such nodes take
13710  * up less room and generally fewer operations to execute than ANYOF nodes.
13711  * Above, we checked for and optimized into some such equivalents for
13712  * certain common classes that are easy to test.  Getting to this point in
13713  * the code means that the class didn't get optimized there.  Since this
13714  * code is only executed in Pass 2, it is too late to save space--it has
13715  * been allocated in Pass 1, and currently isn't given back.  But turning
13716  * things into an EXACTish node can allow the optimizer to join it to any
13717  * adjacent such nodes.  And if the class is equivalent to things like /./,
13718  * expensive run-time swashes can be avoided.  Now that we have more
13719  * complete information, we can find things necessarily missed by the
13720  * earlier code.  I (khw) am not sure how much to look for here.  It would
13721  * be easy, but perhaps too slow, to check any candidates against all the
13722  * node types they could possibly match using _invlistEQ(). */
13723
13724  if (cp_list
13725   && ! invert
13726   && ! depends_list
13727   && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13728   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13729  {
13730   UV start, end;
13731   U8 op = END;  /* The optimzation node-type */
13732   const char * cur_parse= RExC_parse;
13733
13734   invlist_iterinit(cp_list);
13735   if (! invlist_iternext(cp_list, &start, &end)) {
13736
13737    /* Here, the list is empty.  This happens, for example, when a
13738    * Unicode property is the only thing in the character class, and
13739    * it doesn't match anything.  (perluniprops.pod notes such
13740    * properties) */
13741    op = OPFAIL;
13742    *flagp |= HASWIDTH|SIMPLE;
13743   }
13744   else if (start == end) {    /* The range is a single code point */
13745    if (! invlist_iternext(cp_list, &start, &end)
13746
13747      /* Don't do this optimization if it would require changing
13748      * the pattern to UTF-8 */
13749     && (start < 256 || UTF))
13750    {
13751     /* Here, the list contains a single code point.  Can optimize
13752     * into an EXACT node */
13753
13754     value = start;
13755
13756     if (! FOLD) {
13757      op = EXACT;
13758     }
13759     else if (LOC) {
13760
13761      /* A locale node under folding with one code point can be
13762      * an EXACTFL, as its fold won't be calculated until
13763      * runtime */
13764      op = EXACTFL;
13765     }
13766     else {
13767
13768      /* Here, we are generally folding, but there is only one
13769      * code point to match.  If we have to, we use an EXACT
13770      * node, but it would be better for joining with adjacent
13771      * nodes in the optimization pass if we used the same
13772      * EXACTFish node that any such are likely to be.  We can
13773      * do this iff the code point doesn't participate in any
13774      * folds.  For example, an EXACTF of a colon is the same as
13775      * an EXACT one, since nothing folds to or from a colon. */
13776      if (value < 256) {
13777       if (IS_IN_SOME_FOLD_L1(value)) {
13778        op = EXACT;
13779       }
13780      }
13781      else {
13782       if (! PL_utf8_foldable) {
13783        SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13784             &PL_sv_undef, 1, 0);
13785        PL_utf8_foldable = _get_swash_invlist(swash);
13786        SvREFCNT_dec_NN(swash);
13787       }
13788       if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13789        op = EXACT;
13790       }
13791      }
13792
13793      /* If we haven't found the node type, above, it means we
13794      * can use the prevailing one */
13795      if (op == END) {
13796       op = compute_EXACTish(pRExC_state);
13797      }
13798     }
13799    }
13800   }
13801   else if (start == 0) {
13802    if (end == UV_MAX) {
13803     op = SANY;
13804     *flagp |= HASWIDTH|SIMPLE;
13805     RExC_naughty++;
13806    }
13807    else if (end == '\n' - 1
13808      && invlist_iternext(cp_list, &start, &end)
13809      && start == '\n' + 1 && end == UV_MAX)
13810    {
13811     op = REG_ANY;
13812     *flagp |= HASWIDTH|SIMPLE;
13813     RExC_naughty++;
13814    }
13815   }
13816   invlist_iterfinish(cp_list);
13817
13818   if (op != END) {
13819    RExC_parse = (char *)orig_parse;
13820    RExC_emit = (regnode *)orig_emit;
13821
13822    ret = reg_node(pRExC_state, op);
13823
13824    RExC_parse = (char *)cur_parse;
13825
13826    if (PL_regkind[op] == EXACT) {
13827     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13828    }
13829
13830    SvREFCNT_dec_NN(cp_list);
13831    return ret;
13832   }
13833  }
13834
13835  /* Here, <cp_list> contains all the code points we can determine at
13836  * compile time that match under all conditions.  Go through it, and
13837  * for things that belong in the bitmap, put them there, and delete from
13838  * <cp_list>.  While we are at it, see if everything above 255 is in the
13839  * list, and if so, set a flag to speed up execution */
13840  ANYOF_BITMAP_ZERO(ret);
13841  if (cp_list) {
13842
13843   /* This gets set if we actually need to modify things */
13844   bool change_invlist = FALSE;
13845
13846   UV start, end;
13847
13848   /* Start looking through <cp_list> */
13849   invlist_iterinit(cp_list);
13850   while (invlist_iternext(cp_list, &start, &end)) {
13851    UV high;
13852    int i;
13853
13854    if (end == UV_MAX && start <= 256) {
13855     ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13856    }
13857
13858    /* Quit if are above what we should change */
13859    if (start > 255) {
13860     break;
13861    }
13862
13863    change_invlist = TRUE;
13864
13865    /* Set all the bits in the range, up to the max that we are doing */
13866    high = (end < 255) ? end : 255;
13867    for (i = start; i <= (int) high; i++) {
13868     if (! ANYOF_BITMAP_TEST(ret, i)) {
13869      ANYOF_BITMAP_SET(ret, i);
13870      prevvalue = value;
13871      value = i;
13872     }
13873    }
13874   }
13875   invlist_iterfinish(cp_list);
13876
13877   /* Done with loop; remove any code points that are in the bitmap from
13878   * <cp_list> */
13879   if (change_invlist) {
13880    _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13881   }
13882
13883   /* If have completely emptied it, remove it completely */
13884   if (_invlist_len(cp_list) == 0) {
13885    SvREFCNT_dec_NN(cp_list);
13886    cp_list = NULL;
13887   }
13888  }
13889
13890  if (invert) {
13891   ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13892  }
13893
13894  /* Here, the bitmap has been populated with all the Latin1 code points that
13895  * always match.  Can now add to the overall list those that match only
13896  * when the target string is UTF-8 (<depends_list>). */
13897  if (depends_list) {
13898   if (cp_list) {
13899    _invlist_union(cp_list, depends_list, &cp_list);
13900    SvREFCNT_dec_NN(depends_list);
13901   }
13902   else {
13903    cp_list = depends_list;
13904   }
13905  }
13906
13907  /* If there is a swash and more than one element, we can't use the swash in
13908  * the optimization below. */
13909  if (swash && element_count > 1) {
13910   SvREFCNT_dec_NN(swash);
13911   swash = NULL;
13912  }
13913
13914  if (! cp_list
13915   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13916  {
13917   ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13918  }
13919  else {
13920   /* av[0] stores the character class description in its textual form:
13921   *       used later (regexec.c:Perl_regclass_swash()) to initialize the
13922   *       appropriate swash, and is also useful for dumping the regnode.
13923   * av[1] if NULL, is a placeholder to later contain the swash computed
13924   *       from av[0].  But if no further computation need be done, the
13925   *       swash is stored there now.
13926   * av[2] stores the cp_list inversion list for use in addition or
13927   *       instead of av[0]; used only if av[1] is NULL
13928   * av[3] is set if any component of the class is from a user-defined
13929   *       property; used only if av[1] is NULL */
13930   AV * const av = newAV();
13931   SV *rv;
13932
13933   av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13934       ? SvREFCNT_inc(listsv) : &PL_sv_undef);
13935   if (swash) {
13936    av_store(av, 1, swash);
13937    SvREFCNT_dec_NN(cp_list);
13938   }
13939   else {
13940    av_store(av, 1, NULL);
13941    if (cp_list) {
13942     av_store(av, 2, cp_list);
13943     av_store(av, 3, newSVuv(has_user_defined_property));
13944    }
13945   }
13946
13947   rv = newRV_noinc(MUTABLE_SV(av));
13948   n = add_data(pRExC_state, 1, "s");
13949   RExC_rxi->data->data[n] = (void*)rv;
13950   ARG_SET(ret, n);
13951  }
13952
13953  *flagp |= HASWIDTH|SIMPLE;
13954  return ret;
13955 }
13956 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13957
13958
13959 /* reg_skipcomment()
13960
13961    Absorbs an /x style # comments from the input stream.
13962    Returns true if there is more text remaining in the stream.
13963    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13964    terminates the pattern without including a newline.
13965
13966    Note its the callers responsibility to ensure that we are
13967    actually in /x mode
13968
13969 */
13970
13971 STATIC bool
13972 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13973 {
13974  bool ended = 0;
13975
13976  PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13977
13978  while (RExC_parse < RExC_end)
13979   if (*RExC_parse++ == '\n') {
13980    ended = 1;
13981    break;
13982   }
13983  if (!ended) {
13984   /* we ran off the end of the pattern without ending
13985   the comment, so we have to add an \n when wrapping */
13986   RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13987   return 0;
13988  } else
13989   return 1;
13990 }
13991
13992 /* nextchar()
13993
13994    Advances the parse position, and optionally absorbs
13995    "whitespace" from the inputstream.
13996
13997    Without /x "whitespace" means (?#...) style comments only,
13998    with /x this means (?#...) and # comments and whitespace proper.
13999
14000    Returns the RExC_parse point from BEFORE the scan occurs.
14001
14002    This is the /x friendly way of saying RExC_parse++.
14003 */
14004
14005 STATIC char*
14006 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14007 {
14008  char* const retval = RExC_parse++;
14009
14010  PERL_ARGS_ASSERT_NEXTCHAR;
14011
14012  for (;;) {
14013   if (RExC_end - RExC_parse >= 3
14014    && *RExC_parse == '('
14015    && RExC_parse[1] == '?'
14016    && RExC_parse[2] == '#')
14017   {
14018    while (*RExC_parse != ')') {
14019     if (RExC_parse == RExC_end)
14020      FAIL("Sequence (?#... not terminated");
14021     RExC_parse++;
14022    }
14023    RExC_parse++;
14024    continue;
14025   }
14026   if (RExC_flags & RXf_PMf_EXTENDED) {
14027    if (isSPACE(*RExC_parse)) {
14028     RExC_parse++;
14029     continue;
14030    }
14031    else if (*RExC_parse == '#') {
14032     if ( reg_skipcomment( pRExC_state ) )
14033      continue;
14034    }
14035   }
14036   return retval;
14037  }
14038 }
14039
14040 /*
14041 - reg_node - emit a node
14042 */
14043 STATIC regnode *   /* Location. */
14044 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
14045 {
14046  dVAR;
14047  regnode *ptr;
14048  regnode * const ret = RExC_emit;
14049  GET_RE_DEBUG_FLAGS_DECL;
14050
14051  PERL_ARGS_ASSERT_REG_NODE;
14052
14053  if (SIZE_ONLY) {
14054   SIZE_ALIGN(RExC_size);
14055   RExC_size += 1;
14056   return(ret);
14057  }
14058  if (RExC_emit >= RExC_emit_bound)
14059   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14060     op, RExC_emit, RExC_emit_bound);
14061
14062  NODE_ALIGN_FILL(ret);
14063  ptr = ret;
14064  FILL_ADVANCE_NODE(ptr, op);
14065  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
14066 #ifdef RE_TRACK_PATTERN_OFFSETS
14067  if (RExC_offsets) {         /* MJD */
14068   MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
14069    "reg_node", __LINE__,
14070    PL_reg_name[op],
14071    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
14072     ? "Overwriting end of array!\n" : "OK",
14073    (UV)(RExC_emit - RExC_emit_start),
14074    (UV)(RExC_parse - RExC_start),
14075    (UV)RExC_offsets[0]));
14076   Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14077  }
14078 #endif
14079  RExC_emit = ptr;
14080  return(ret);
14081 }
14082
14083 /*
14084 - reganode - emit a node with an argument
14085 */
14086 STATIC regnode *   /* Location. */
14087 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14088 {
14089  dVAR;
14090  regnode *ptr;
14091  regnode * const ret = RExC_emit;
14092  GET_RE_DEBUG_FLAGS_DECL;
14093
14094  PERL_ARGS_ASSERT_REGANODE;
14095
14096  if (SIZE_ONLY) {
14097   SIZE_ALIGN(RExC_size);
14098   RExC_size += 2;
14099   /*
14100   We can't do this:
14101
14102   assert(2==regarglen[op]+1);
14103
14104   Anything larger than this has to allocate the extra amount.
14105   If we changed this to be:
14106
14107   RExC_size += (1 + regarglen[op]);
14108
14109   then it wouldn't matter. Its not clear what side effect
14110   might come from that so its not done so far.
14111   -- dmq
14112   */
14113   return(ret);
14114  }
14115  if (RExC_emit >= RExC_emit_bound)
14116   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14117     op, RExC_emit, RExC_emit_bound);
14118
14119  NODE_ALIGN_FILL(ret);
14120  ptr = ret;
14121  FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14122  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
14123 #ifdef RE_TRACK_PATTERN_OFFSETS
14124  if (RExC_offsets) {         /* MJD */
14125   MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14126    "reganode",
14127    __LINE__,
14128    PL_reg_name[op],
14129    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14130    "Overwriting end of array!\n" : "OK",
14131    (UV)(RExC_emit - RExC_emit_start),
14132    (UV)(RExC_parse - RExC_start),
14133    (UV)RExC_offsets[0]));
14134   Set_Cur_Node_Offset;
14135  }
14136 #endif
14137  RExC_emit = ptr;
14138  return(ret);
14139 }
14140
14141 /*
14142 - reguni - emit (if appropriate) a Unicode character
14143 */
14144 STATIC STRLEN
14145 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14146 {
14147  dVAR;
14148
14149  PERL_ARGS_ASSERT_REGUNI;
14150
14151  return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14152 }
14153
14154 /*
14155 - reginsert - insert an operator in front of already-emitted operand
14156 *
14157 * Means relocating the operand.
14158 */
14159 STATIC void
14160 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14161 {
14162  dVAR;
14163  regnode *src;
14164  regnode *dst;
14165  regnode *place;
14166  const int offset = regarglen[(U8)op];
14167  const int size = NODE_STEP_REGNODE + offset;
14168  GET_RE_DEBUG_FLAGS_DECL;
14169
14170  PERL_ARGS_ASSERT_REGINSERT;
14171  PERL_UNUSED_ARG(depth);
14172 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14173  DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14174  if (SIZE_ONLY) {
14175   RExC_size += size;
14176   return;
14177  }
14178
14179  src = RExC_emit;
14180  RExC_emit += size;
14181  dst = RExC_emit;
14182  if (RExC_open_parens) {
14183   int paren;
14184   /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14185   for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14186    if ( RExC_open_parens[paren] >= opnd ) {
14187     /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14188     RExC_open_parens[paren] += size;
14189    } else {
14190     /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14191    }
14192    if ( RExC_close_parens[paren] >= opnd ) {
14193     /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14194     RExC_close_parens[paren] += size;
14195    } else {
14196     /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14197    }
14198   }
14199  }
14200
14201  while (src > opnd) {
14202   StructCopy(--src, --dst, regnode);
14203 #ifdef RE_TRACK_PATTERN_OFFSETS
14204   if (RExC_offsets) {     /* MJD 20010112 */
14205    MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14206     "reg_insert",
14207     __LINE__,
14208     PL_reg_name[op],
14209     (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14210      ? "Overwriting end of array!\n" : "OK",
14211     (UV)(src - RExC_emit_start),
14212     (UV)(dst - RExC_emit_start),
14213     (UV)RExC_offsets[0]));
14214    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14215    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14216   }
14217 #endif
14218  }
14219
14220
14221  place = opnd;  /* Op node, where operand used to be. */
14222 #ifdef RE_TRACK_PATTERN_OFFSETS
14223  if (RExC_offsets) {         /* MJD */
14224   MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14225    "reginsert",
14226    __LINE__,
14227    PL_reg_name[op],
14228    (UV)(place - RExC_emit_start) > RExC_offsets[0]
14229    ? "Overwriting end of array!\n" : "OK",
14230    (UV)(place - RExC_emit_start),
14231    (UV)(RExC_parse - RExC_start),
14232    (UV)RExC_offsets[0]));
14233   Set_Node_Offset(place, RExC_parse);
14234   Set_Node_Length(place, 1);
14235  }
14236 #endif
14237  src = NEXTOPER(place);
14238  FILL_ADVANCE_NODE(place, op);
14239  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
14240  Zero(src, offset, regnode);
14241 }
14242
14243 /*
14244 - regtail - set the next-pointer at the end of a node chain of p to val.
14245 - SEE ALSO: regtail_study
14246 */
14247 /* TODO: All three parms should be const */
14248 STATIC void
14249 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14250 {
14251  dVAR;
14252  regnode *scan;
14253  GET_RE_DEBUG_FLAGS_DECL;
14254
14255  PERL_ARGS_ASSERT_REGTAIL;
14256 #ifndef DEBUGGING
14257  PERL_UNUSED_ARG(depth);
14258 #endif
14259
14260  if (SIZE_ONLY)
14261   return;
14262
14263  /* Find last node. */
14264  scan = p;
14265  for (;;) {
14266   regnode * const temp = regnext(scan);
14267   DEBUG_PARSE_r({
14268    SV * const mysv=sv_newmortal();
14269    DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14270    regprop(RExC_rx, mysv, scan);
14271    PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14272     SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14273      (temp == NULL ? "->" : ""),
14274      (temp == NULL ? PL_reg_name[OP(val)] : "")
14275    );
14276   });
14277   if (temp == NULL)
14278    break;
14279   scan = temp;
14280  }
14281
14282  if (reg_off_by_arg[OP(scan)]) {
14283   ARG_SET(scan, val - scan);
14284  }
14285  else {
14286   NEXT_OFF(scan) = val - scan;
14287  }
14288 }
14289
14290 #ifdef DEBUGGING
14291 /*
14292 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14293 - Look for optimizable sequences at the same time.
14294 - currently only looks for EXACT chains.
14295
14296 This is experimental code. The idea is to use this routine to perform
14297 in place optimizations on branches and groups as they are constructed,
14298 with the long term intention of removing optimization from study_chunk so
14299 that it is purely analytical.
14300
14301 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14302 to control which is which.
14303
14304 */
14305 /* TODO: All four parms should be const */
14306
14307 STATIC U8
14308 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14309 {
14310  dVAR;
14311  regnode *scan;
14312  U8 exact = PSEUDO;
14313 #ifdef EXPERIMENTAL_INPLACESCAN
14314  I32 min = 0;
14315 #endif
14316  GET_RE_DEBUG_FLAGS_DECL;
14317
14318  PERL_ARGS_ASSERT_REGTAIL_STUDY;
14319
14320
14321  if (SIZE_ONLY)
14322   return exact;
14323
14324  /* Find last node. */
14325
14326  scan = p;
14327  for (;;) {
14328   regnode * const temp = regnext(scan);
14329 #ifdef EXPERIMENTAL_INPLACESCAN
14330   if (PL_regkind[OP(scan)] == EXACT) {
14331    bool has_exactf_sharp_s; /* Unexamined in this routine */
14332    if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14333     return EXACT;
14334   }
14335 #endif
14336   if ( exact ) {
14337    switch (OP(scan)) {
14338     case EXACT:
14339     case EXACTF:
14340     case EXACTFA:
14341     case EXACTFU:
14342     case EXACTFU_SS:
14343     case EXACTFU_TRICKYFOLD:
14344     case EXACTFL:
14345       if( exact == PSEUDO )
14346        exact= OP(scan);
14347       else if ( exact != OP(scan) )
14348        exact= 0;
14349     case NOTHING:
14350      break;
14351     default:
14352      exact= 0;
14353    }
14354   }
14355   DEBUG_PARSE_r({
14356    SV * const mysv=sv_newmortal();
14357    DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14358    regprop(RExC_rx, mysv, scan);
14359    PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14360     SvPV_nolen_const(mysv),
14361     REG_NODE_NUM(scan),
14362     PL_reg_name[exact]);
14363   });
14364   if (temp == NULL)
14365    break;
14366   scan = temp;
14367  }
14368  DEBUG_PARSE_r({
14369   SV * const mysv_val=sv_newmortal();
14370   DEBUG_PARSE_MSG("");
14371   regprop(RExC_rx, mysv_val, val);
14372   PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14373      SvPV_nolen_const(mysv_val),
14374      (IV)REG_NODE_NUM(val),
14375      (IV)(val - scan)
14376   );
14377  });
14378  if (reg_off_by_arg[OP(scan)]) {
14379   ARG_SET(scan, val - scan);
14380  }
14381  else {
14382   NEXT_OFF(scan) = val - scan;
14383  }
14384
14385  return exact;
14386 }
14387 #endif
14388
14389 /*
14390  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14391  */
14392 #ifdef DEBUGGING
14393 static void
14394 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14395 {
14396  int bit;
14397  int set=0;
14398  regex_charset cs;
14399
14400  for (bit=0; bit<32; bit++) {
14401   if (flags & (1<<bit)) {
14402    if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
14403     continue;
14404    }
14405    if (!set++ && lead)
14406     PerlIO_printf(Perl_debug_log, "%s",lead);
14407    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14408   }
14409  }
14410  if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14411    if (!set++ && lead) {
14412     PerlIO_printf(Perl_debug_log, "%s",lead);
14413    }
14414    switch (cs) {
14415     case REGEX_UNICODE_CHARSET:
14416      PerlIO_printf(Perl_debug_log, "UNICODE");
14417      break;
14418     case REGEX_LOCALE_CHARSET:
14419      PerlIO_printf(Perl_debug_log, "LOCALE");
14420      break;
14421     case REGEX_ASCII_RESTRICTED_CHARSET:
14422      PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14423      break;
14424     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14425      PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14426      break;
14427     default:
14428      PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14429      break;
14430    }
14431  }
14432  if (lead)  {
14433   if (set)
14434    PerlIO_printf(Perl_debug_log, "\n");
14435   else
14436    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14437  }
14438 }
14439 #endif
14440
14441 void
14442 Perl_regdump(pTHX_ const regexp *r)
14443 {
14444 #ifdef DEBUGGING
14445  dVAR;
14446  SV * const sv = sv_newmortal();
14447  SV *dsv= sv_newmortal();
14448  RXi_GET_DECL(r,ri);
14449  GET_RE_DEBUG_FLAGS_DECL;
14450
14451  PERL_ARGS_ASSERT_REGDUMP;
14452
14453  (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14454
14455  /* Header fields of interest. */
14456  if (r->anchored_substr) {
14457   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14458    RE_SV_DUMPLEN(r->anchored_substr), 30);
14459   PerlIO_printf(Perl_debug_log,
14460      "anchored %s%s at %"IVdf" ",
14461      s, RE_SV_TAIL(r->anchored_substr),
14462      (IV)r->anchored_offset);
14463  } else if (r->anchored_utf8) {
14464   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14465    RE_SV_DUMPLEN(r->anchored_utf8), 30);
14466   PerlIO_printf(Perl_debug_log,
14467      "anchored utf8 %s%s at %"IVdf" ",
14468      s, RE_SV_TAIL(r->anchored_utf8),
14469      (IV)r->anchored_offset);
14470  }
14471  if (r->float_substr) {
14472   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14473    RE_SV_DUMPLEN(r->float_substr), 30);
14474   PerlIO_printf(Perl_debug_log,
14475      "floating %s%s at %"IVdf"..%"UVuf" ",
14476      s, RE_SV_TAIL(r->float_substr),
14477      (IV)r->float_min_offset, (UV)r->float_max_offset);
14478  } else if (r->float_utf8) {
14479   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14480    RE_SV_DUMPLEN(r->float_utf8), 30);
14481   PerlIO_printf(Perl_debug_log,
14482      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14483      s, RE_SV_TAIL(r->float_utf8),
14484      (IV)r->float_min_offset, (UV)r->float_max_offset);
14485  }
14486  if (r->check_substr || r->check_utf8)
14487   PerlIO_printf(Perl_debug_log,
14488      (const char *)
14489      (r->check_substr == r->float_substr
14490      && r->check_utf8 == r->float_utf8
14491      ? "(checking floating" : "(checking anchored"));
14492  if (r->extflags & RXf_NOSCAN)
14493   PerlIO_printf(Perl_debug_log, " noscan");
14494  if (r->extflags & RXf_CHECK_ALL)
14495   PerlIO_printf(Perl_debug_log, " isall");
14496  if (r->check_substr || r->check_utf8)
14497   PerlIO_printf(Perl_debug_log, ") ");
14498
14499  if (ri->regstclass) {
14500   regprop(r, sv, ri->regstclass);
14501   PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14502  }
14503  if (r->extflags & RXf_ANCH) {
14504   PerlIO_printf(Perl_debug_log, "anchored");
14505   if (r->extflags & RXf_ANCH_BOL)
14506    PerlIO_printf(Perl_debug_log, "(BOL)");
14507   if (r->extflags & RXf_ANCH_MBOL)
14508    PerlIO_printf(Perl_debug_log, "(MBOL)");
14509   if (r->extflags & RXf_ANCH_SBOL)
14510    PerlIO_printf(Perl_debug_log, "(SBOL)");
14511   if (r->extflags & RXf_ANCH_GPOS)
14512    PerlIO_printf(Perl_debug_log, "(GPOS)");
14513   PerlIO_putc(Perl_debug_log, ' ');
14514  }
14515  if (r->extflags & RXf_GPOS_SEEN)
14516   PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14517  if (r->intflags & PREGf_SKIP)
14518   PerlIO_printf(Perl_debug_log, "plus ");
14519  if (r->intflags & PREGf_IMPLICIT)
14520   PerlIO_printf(Perl_debug_log, "implicit ");
14521  PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14522  if (r->extflags & RXf_EVAL_SEEN)
14523   PerlIO_printf(Perl_debug_log, "with eval ");
14524  PerlIO_printf(Perl_debug_log, "\n");
14525  DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
14526 #else
14527  PERL_ARGS_ASSERT_REGDUMP;
14528  PERL_UNUSED_CONTEXT;
14529  PERL_UNUSED_ARG(r);
14530 #endif /* DEBUGGING */
14531 }
14532
14533 /*
14534 - regprop - printable representation of opcode
14535 */
14536 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
14537 STMT_START { \
14538   if (do_sep) {                           \
14539    Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
14540    if (flags & ANYOF_INVERT)           \
14541     /*make sure the invert info is in each */ \
14542     sv_catpvs(sv, "^");             \
14543    do_sep = 0;                         \
14544   }                                       \
14545 } STMT_END
14546
14547 void
14548 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14549 {
14550 #ifdef DEBUGGING
14551  dVAR;
14552  int k;
14553
14554  /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14555  static const char * const anyofs[] = {
14556 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14557  || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
14558  || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
14559  || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
14560  || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
14561  || _CC_VERTSPACE != 16
14562   #error Need to adjust order of anyofs[]
14563 #endif
14564   "[\\w]",
14565   "[\\W]",
14566   "[\\d]",
14567   "[\\D]",
14568   "[:alpha:]",
14569   "[:^alpha:]",
14570   "[:lower:]",
14571   "[:^lower:]",
14572   "[:upper:]",
14573   "[:^upper:]",
14574   "[:punct:]",
14575   "[:^punct:]",
14576   "[:print:]",
14577   "[:^print:]",
14578   "[:alnum:]",
14579   "[:^alnum:]",
14580   "[:graph:]",
14581   "[:^graph:]",
14582   "[:cased:]",
14583   "[:^cased:]",
14584   "[\\s]",
14585   "[\\S]",
14586   "[:blank:]",
14587   "[:^blank:]",
14588   "[:xdigit:]",
14589   "[:^xdigit:]",
14590   "[:space:]",
14591   "[:^space:]",
14592   "[:cntrl:]",
14593   "[:^cntrl:]",
14594   "[:ascii:]",
14595   "[:^ascii:]",
14596   "[\\v]",
14597   "[\\V]"
14598  };
14599  RXi_GET_DECL(prog,progi);
14600  GET_RE_DEBUG_FLAGS_DECL;
14601
14602  PERL_ARGS_ASSERT_REGPROP;
14603
14604  sv_setpvs(sv, "");
14605
14606  if (OP(o) > REGNODE_MAX)  /* regnode.type is unsigned */
14607   /* It would be nice to FAIL() here, but this may be called from
14608   regexec.c, and it would be hard to supply pRExC_state. */
14609   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14610  sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14611
14612  k = PL_regkind[OP(o)];
14613
14614  if (k == EXACT) {
14615   sv_catpvs(sv, " ");
14616   /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14617   * is a crude hack but it may be the best for now since
14618   * we have no flag "this EXACTish node was UTF-8"
14619   * --jhi */
14620   pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14621     PERL_PV_ESCAPE_UNI_DETECT |
14622     PERL_PV_ESCAPE_NONASCII   |
14623     PERL_PV_PRETTY_ELLIPSES   |
14624     PERL_PV_PRETTY_LTGT       |
14625     PERL_PV_PRETTY_NOCLEAR
14626     );
14627  } else if (k == TRIE) {
14628   /* print the details of the trie in dumpuntil instead, as
14629   * progi->data isn't available here */
14630   const char op = OP(o);
14631   const U32 n = ARG(o);
14632   const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14633    (reg_ac_data *)progi->data->data[n] :
14634    NULL;
14635   const reg_trie_data * const trie
14636    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14637
14638   Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14639   DEBUG_TRIE_COMPILE_r(
14640    Perl_sv_catpvf(aTHX_ sv,
14641     "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
14642     (UV)trie->startstate,
14643     (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14644     (UV)trie->wordcount,
14645     (UV)trie->minlen,
14646     (UV)trie->maxlen,
14647     (UV)TRIE_CHARCOUNT(trie),
14648     (UV)trie->uniquecharcount
14649    )
14650   );
14651   if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14652    int i;
14653    int rangestart = -1;
14654    U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
14655    sv_catpvs(sv, "[");
14656    for (i = 0; i <= 256; i++) {
14657     if (i < 256 && BITMAP_TEST(bitmap,i)) {
14658      if (rangestart == -1)
14659       rangestart = i;
14660     } else if (rangestart != -1) {
14661      if (i <= rangestart + 3)
14662       for (; rangestart < i; rangestart++)
14663        put_byte(sv, rangestart);
14664      else {
14665       put_byte(sv, rangestart);
14666       sv_catpvs(sv, "-");
14667       put_byte(sv, i - 1);
14668      }
14669      rangestart = -1;
14670     }
14671    }
14672    sv_catpvs(sv, "]");
14673   }
14674
14675  } else if (k == CURLY) {
14676   if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14677    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14678   Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14679  }
14680  else if (k == WHILEM && o->flags)   /* Ordinal/of */
14681   Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14682  else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14683   Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
14684   if ( RXp_PAREN_NAMES(prog) ) {
14685    if ( k != REF || (OP(o) < NREF)) {
14686     AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14687     SV **name= av_fetch(list, ARG(o), 0 );
14688     if (name)
14689      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14690    }
14691    else {
14692     AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14693     SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14694     I32 *nums=(I32*)SvPVX(sv_dat);
14695     SV **name= av_fetch(list, nums[0], 0 );
14696     I32 n;
14697     if (name) {
14698      for ( n=0; n<SvIVX(sv_dat); n++ ) {
14699       Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14700          (n ? "," : ""), (IV)nums[n]);
14701      }
14702      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14703     }
14704    }
14705   }
14706  } else if (k == GOSUB)
14707   Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14708  else if (k == VERB) {
14709   if (!o->flags)
14710    Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14711       SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14712  } else if (k == LOGICAL)
14713   Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
14714  else if (k == ANYOF) {
14715   int i, rangestart = -1;
14716   const U8 flags = ANYOF_FLAGS(o);
14717   int do_sep = 0;
14718
14719
14720   if (flags & ANYOF_LOCALE)
14721    sv_catpvs(sv, "{loc}");
14722   if (flags & ANYOF_LOC_FOLD)
14723    sv_catpvs(sv, "{i}");
14724   Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14725   if (flags & ANYOF_INVERT)
14726    sv_catpvs(sv, "^");
14727
14728   /* output what the standard cp 0-255 bitmap matches */
14729   for (i = 0; i <= 256; i++) {
14730    if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
14731     if (rangestart == -1)
14732      rangestart = i;
14733    } else if (rangestart != -1) {
14734     if (i <= rangestart + 3)
14735      for (; rangestart < i; rangestart++)
14736       put_byte(sv, rangestart);
14737     else {
14738      put_byte(sv, rangestart);
14739      sv_catpvs(sv, "-");
14740      put_byte(sv, i - 1);
14741     }
14742     do_sep = 1;
14743     rangestart = -1;
14744    }
14745   }
14746
14747   EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14748   /* output any special charclass tests (used entirely under use locale) */
14749   if (ANYOF_CLASS_TEST_ANY_SET(o))
14750    for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
14751     if (ANYOF_CLASS_TEST(o,i)) {
14752      sv_catpv(sv, anyofs[i]);
14753      do_sep = 1;
14754     }
14755
14756   EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14757
14758   if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14759    sv_catpvs(sv, "{non-utf8-latin1-all}");
14760   }
14761
14762   /* output information about the unicode matching */
14763   if (flags & ANYOF_UNICODE_ALL)
14764    sv_catpvs(sv, "{unicode_all}");
14765   else if (ANYOF_NONBITMAP(o))
14766    sv_catpvs(sv, "{unicode}");
14767   if (flags & ANYOF_NONBITMAP_NON_UTF8)
14768    sv_catpvs(sv, "{outside bitmap}");
14769
14770   if (ANYOF_NONBITMAP(o)) {
14771    SV *lv; /* Set if there is something outside the bit map */
14772    SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14773    bool byte_output = FALSE;   /* If something in the bitmap has been
14774           output */
14775
14776    if (lv && lv != &PL_sv_undef) {
14777     if (sw) {
14778      U8 s[UTF8_MAXBYTES_CASE+1];
14779
14780      for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
14781       uvchr_to_utf8(s, i);
14782
14783       if (i < 256
14784        && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
14785                things already
14786                output as part
14787                of the bitmap */
14788        && swash_fetch(sw, s, TRUE))
14789       {
14790        if (rangestart == -1)
14791         rangestart = i;
14792       } else if (rangestart != -1) {
14793        byte_output = TRUE;
14794        if (i <= rangestart + 3)
14795         for (; rangestart < i; rangestart++) {
14796          put_byte(sv, rangestart);
14797         }
14798        else {
14799         put_byte(sv, rangestart);
14800         sv_catpvs(sv, "-");
14801         put_byte(sv, i-1);
14802        }
14803        rangestart = -1;
14804       }
14805      }
14806     }
14807
14808     {
14809      char *s = savesvpv(lv);
14810      char * const origs = s;
14811
14812      while (*s && *s != '\n')
14813       s++;
14814
14815      if (*s == '\n') {
14816       const char * const t = ++s;
14817
14818       if (byte_output) {
14819        sv_catpvs(sv, " ");
14820       }
14821
14822       while (*s) {
14823        if (*s == '\n') {
14824
14825         /* Truncate very long output */
14826         if (s - origs > 256) {
14827          Perl_sv_catpvf(aTHX_ sv,
14828             "%.*s...",
14829             (int) (s - origs - 1),
14830             t);
14831          goto out_dump;
14832         }
14833         *s = ' ';
14834        }
14835        else if (*s == '\t') {
14836         *s = '-';
14837        }
14838        s++;
14839       }
14840       if (s[-1] == ' ')
14841        s[-1] = 0;
14842
14843       sv_catpv(sv, t);
14844      }
14845
14846     out_dump:
14847
14848      Safefree(origs);
14849     }
14850     SvREFCNT_dec_NN(lv);
14851    }
14852   }
14853
14854   Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14855  }
14856  else if (k == POSIXD || k == NPOSIXD) {
14857   U8 index = FLAGS(o) * 2;
14858   if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14859    Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14860   }
14861   else {
14862    sv_catpv(sv, anyofs[index]);
14863   }
14864  }
14865  else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14866   Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14867 #else
14868  PERL_UNUSED_CONTEXT;
14869  PERL_UNUSED_ARG(sv);
14870  PERL_UNUSED_ARG(o);
14871  PERL_UNUSED_ARG(prog);
14872 #endif /* DEBUGGING */
14873 }
14874
14875 SV *
14876 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14877 {    /* Assume that RE_INTUIT is set */
14878  dVAR;
14879  struct regexp *const prog = ReANY(r);
14880  GET_RE_DEBUG_FLAGS_DECL;
14881
14882  PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14883  PERL_UNUSED_CONTEXT;
14884
14885  DEBUG_COMPILE_r(
14886   {
14887    const char * const s = SvPV_nolen_const(prog->check_substr
14888      ? prog->check_substr : prog->check_utf8);
14889
14890    if (!PL_colorset) reginitcolors();
14891    PerlIO_printf(Perl_debug_log,
14892      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14893      PL_colors[4],
14894      prog->check_substr ? "" : "utf8 ",
14895      PL_colors[5],PL_colors[0],
14896      s,
14897      PL_colors[1],
14898      (strlen(s) > 60 ? "..." : ""));
14899   } );
14900
14901  return prog->check_substr ? prog->check_substr : prog->check_utf8;
14902 }
14903
14904 /*
14905    pregfree()
14906
14907    handles refcounting and freeing the perl core regexp structure. When
14908    it is necessary to actually free the structure the first thing it
14909    does is call the 'free' method of the regexp_engine associated to
14910    the regexp, allowing the handling of the void *pprivate; member
14911    first. (This routine is not overridable by extensions, which is why
14912    the extensions free is called first.)
14913
14914    See regdupe and regdupe_internal if you change anything here.
14915 */
14916 #ifndef PERL_IN_XSUB_RE
14917 void
14918 Perl_pregfree(pTHX_ REGEXP *r)
14919 {
14920  SvREFCNT_dec(r);
14921 }
14922
14923 void
14924 Perl_pregfree2(pTHX_ REGEXP *rx)
14925 {
14926  dVAR;
14927  struct regexp *const r = ReANY(rx);
14928  GET_RE_DEBUG_FLAGS_DECL;
14929
14930  PERL_ARGS_ASSERT_PREGFREE2;
14931
14932  if (r->mother_re) {
14933   ReREFCNT_dec(r->mother_re);
14934  } else {
14935   CALLREGFREE_PVT(rx); /* free the private data */
14936   SvREFCNT_dec(RXp_PAREN_NAMES(r));
14937   Safefree(r->xpv_len_u.xpvlenu_pv);
14938  }
14939  if (r->substrs) {
14940   SvREFCNT_dec(r->anchored_substr);
14941   SvREFCNT_dec(r->anchored_utf8);
14942   SvREFCNT_dec(r->float_substr);
14943   SvREFCNT_dec(r->float_utf8);
14944   Safefree(r->substrs);
14945  }
14946  RX_MATCH_COPY_FREE(rx);
14947 #ifdef PERL_ANY_COW
14948  SvREFCNT_dec(r->saved_copy);
14949 #endif
14950  Safefree(r->offs);
14951  SvREFCNT_dec(r->qr_anoncv);
14952  rx->sv_u.svu_rx = 0;
14953 }
14954
14955 /*  reg_temp_copy()
14956
14957  This is a hacky workaround to the structural issue of match results
14958  being stored in the regexp structure which is in turn stored in
14959  PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14960  could be PL_curpm in multiple contexts, and could require multiple
14961  result sets being associated with the pattern simultaneously, such
14962  as when doing a recursive match with (??{$qr})
14963
14964  The solution is to make a lightweight copy of the regexp structure
14965  when a qr// is returned from the code executed by (??{$qr}) this
14966  lightweight copy doesn't actually own any of its data except for
14967  the starp/end and the actual regexp structure itself.
14968
14969 */
14970
14971
14972 REGEXP *
14973 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14974 {
14975  struct regexp *ret;
14976  struct regexp *const r = ReANY(rx);
14977  const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
14978
14979  PERL_ARGS_ASSERT_REG_TEMP_COPY;
14980
14981  if (!ret_x)
14982   ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14983  else {
14984   SvOK_off((SV *)ret_x);
14985   if (islv) {
14986    /* For PVLVs, SvANY points to the xpvlv body while sv_u points
14987    to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
14988    made both spots point to the same regexp body.) */
14989    REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
14990    assert(!SvPVX(ret_x));
14991    ret_x->sv_u.svu_rx = temp->sv_any;
14992    temp->sv_any = NULL;
14993    SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
14994    SvREFCNT_dec_NN(temp);
14995    /* SvCUR still resides in the xpvlv struct, so the regexp copy-
14996    ing below will not set it. */
14997    SvCUR_set(ret_x, SvCUR(rx));
14998   }
14999  }
15000  /* This ensures that SvTHINKFIRST(sv) is true, and hence that
15001  sv_force_normal(sv) is called.  */
15002  SvFAKE_on(ret_x);
15003  ret = ReANY(ret_x);
15004
15005  SvFLAGS(ret_x) |= SvUTF8(rx);
15006  /* We share the same string buffer as the original regexp, on which we
15007  hold a reference count, incremented when mother_re is set below.
15008  The string pointer is copied here, being part of the regexp struct.
15009  */
15010  memcpy(&(ret->xpv_cur), &(r->xpv_cur),
15011   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
15012  if (r->offs) {
15013   const I32 npar = r->nparens+1;
15014   Newx(ret->offs, npar, regexp_paren_pair);
15015   Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15016  }
15017  if (r->substrs) {
15018   Newx(ret->substrs, 1, struct reg_substr_data);
15019   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15020
15021   SvREFCNT_inc_void(ret->anchored_substr);
15022   SvREFCNT_inc_void(ret->anchored_utf8);
15023   SvREFCNT_inc_void(ret->float_substr);
15024   SvREFCNT_inc_void(ret->float_utf8);
15025
15026   /* check_substr and check_utf8, if non-NULL, point to either their
15027   anchored or float namesakes, and don't hold a second reference.  */
15028  }
15029  RX_MATCH_COPIED_off(ret_x);
15030 #ifdef PERL_ANY_COW
15031  ret->saved_copy = NULL;
15032 #endif
15033  ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
15034  SvREFCNT_inc_void(ret->qr_anoncv);
15035
15036  return ret_x;
15037 }
15038 #endif
15039
15040 /* regfree_internal()
15041
15042    Free the private data in a regexp. This is overloadable by
15043    extensions. Perl takes care of the regexp structure in pregfree(),
15044    this covers the *pprivate pointer which technically perl doesn't
15045    know about, however of course we have to handle the
15046    regexp_internal structure when no extension is in use.
15047
15048    Note this is called before freeing anything in the regexp
15049    structure.
15050  */
15051
15052 void
15053 Perl_regfree_internal(pTHX_ REGEXP * const rx)
15054 {
15055  dVAR;
15056  struct regexp *const r = ReANY(rx);
15057  RXi_GET_DECL(r,ri);
15058  GET_RE_DEBUG_FLAGS_DECL;
15059
15060  PERL_ARGS_ASSERT_REGFREE_INTERNAL;
15061
15062  DEBUG_COMPILE_r({
15063   if (!PL_colorset)
15064    reginitcolors();
15065   {
15066    SV *dsv= sv_newmortal();
15067    RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
15068     dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
15069    PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
15070     PL_colors[4],PL_colors[5],s);
15071   }
15072  });
15073 #ifdef RE_TRACK_PATTERN_OFFSETS
15074  if (ri->u.offsets)
15075   Safefree(ri->u.offsets);             /* 20010421 MJD */
15076 #endif
15077  if (ri->code_blocks) {
15078   int n;
15079   for (n = 0; n < ri->num_code_blocks; n++)
15080    SvREFCNT_dec(ri->code_blocks[n].src_regex);
15081   Safefree(ri->code_blocks);
15082  }
15083
15084  if (ri->data) {
15085   int n = ri->data->count;
15086
15087   while (--n >= 0) {
15088   /* If you add a ->what type here, update the comment in regcomp.h */
15089    switch (ri->data->what[n]) {
15090    case 'a':
15091    case 'r':
15092    case 's':
15093    case 'S':
15094    case 'u':
15095     SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15096     break;
15097    case 'f':
15098     Safefree(ri->data->data[n]);
15099     break;
15100    case 'l':
15101    case 'L':
15102     break;
15103    case 'T':
15104     { /* Aho Corasick add-on structure for a trie node.
15105      Used in stclass optimization only */
15106      U32 refcount;
15107      reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15108      OP_REFCNT_LOCK;
15109      refcount = --aho->refcount;
15110      OP_REFCNT_UNLOCK;
15111      if ( !refcount ) {
15112       PerlMemShared_free(aho->states);
15113       PerlMemShared_free(aho->fail);
15114       /* do this last!!!! */
15115       PerlMemShared_free(ri->data->data[n]);
15116       PerlMemShared_free(ri->regstclass);
15117      }
15118     }
15119     break;
15120    case 't':
15121     {
15122      /* trie structure. */
15123      U32 refcount;
15124      reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15125      OP_REFCNT_LOCK;
15126      refcount = --trie->refcount;
15127      OP_REFCNT_UNLOCK;
15128      if ( !refcount ) {
15129       PerlMemShared_free(trie->charmap);
15130       PerlMemShared_free(trie->states);
15131       PerlMemShared_free(trie->trans);
15132       if (trie->bitmap)
15133        PerlMemShared_free(trie->bitmap);
15134       if (trie->jump)
15135        PerlMemShared_free(trie->jump);
15136       PerlMemShared_free(trie->wordinfo);
15137       /* do this last!!!! */
15138       PerlMemShared_free(ri->data->data[n]);
15139      }
15140     }
15141     break;
15142    default:
15143     Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15144    }
15145   }
15146   Safefree(ri->data->what);
15147   Safefree(ri->data);
15148  }
15149
15150  Safefree(ri);
15151 }
15152
15153 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15154 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15155 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
15156
15157 /*
15158    re_dup - duplicate a regexp.
15159
15160    This routine is expected to clone a given regexp structure. It is only
15161    compiled under USE_ITHREADS.
15162
15163    After all of the core data stored in struct regexp is duplicated
15164    the regexp_engine.dupe method is used to copy any private data
15165    stored in the *pprivate pointer. This allows extensions to handle
15166    any duplication it needs to do.
15167
15168    See pregfree() and regfree_internal() if you change anything here.
15169 */
15170 #if defined(USE_ITHREADS)
15171 #ifndef PERL_IN_XSUB_RE
15172 void
15173 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15174 {
15175  dVAR;
15176  I32 npar;
15177  const struct regexp *r = ReANY(sstr);
15178  struct regexp *ret = ReANY(dstr);
15179
15180  PERL_ARGS_ASSERT_RE_DUP_GUTS;
15181
15182  npar = r->nparens+1;
15183  Newx(ret->offs, npar, regexp_paren_pair);
15184  Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15185
15186  if (ret->substrs) {
15187   /* Do it this way to avoid reading from *r after the StructCopy().
15188   That way, if any of the sv_dup_inc()s dislodge *r from the L1
15189   cache, it doesn't matter.  */
15190   const bool anchored = r->check_substr
15191    ? r->check_substr == r->anchored_substr
15192    : r->check_utf8 == r->anchored_utf8;
15193   Newx(ret->substrs, 1, struct reg_substr_data);
15194   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15195
15196   ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15197   ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15198   ret->float_substr = sv_dup_inc(ret->float_substr, param);
15199   ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15200
15201   /* check_substr and check_utf8, if non-NULL, point to either their
15202   anchored or float namesakes, and don't hold a second reference.  */
15203
15204   if (ret->check_substr) {
15205    if (anchored) {
15206     assert(r->check_utf8 == r->anchored_utf8);
15207     ret->check_substr = ret->anchored_substr;
15208     ret->check_utf8 = ret->anchored_utf8;
15209    } else {
15210     assert(r->check_substr == r->float_substr);
15211     assert(r->check_utf8 == r->float_utf8);
15212     ret->check_substr = ret->float_substr;
15213     ret->check_utf8 = ret->float_utf8;
15214    }
15215   } else if (ret->check_utf8) {
15216    if (anchored) {
15217     ret->check_utf8 = ret->anchored_utf8;
15218    } else {
15219     ret->check_utf8 = ret->float_utf8;
15220    }
15221   }
15222  }
15223
15224  RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15225  ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15226
15227  if (ret->pprivate)
15228   RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15229
15230  if (RX_MATCH_COPIED(dstr))
15231   ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
15232  else
15233   ret->subbeg = NULL;
15234 #ifdef PERL_ANY_COW
15235  ret->saved_copy = NULL;
15236 #endif
15237
15238  /* Whether mother_re be set or no, we need to copy the string.  We
15239  cannot refrain from copying it when the storage points directly to
15240  our mother regexp, because that's
15241    1: a buffer in a different thread
15242    2: something we no longer hold a reference on
15243    so we need to copy it locally.  */
15244  RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15245  ret->mother_re   = NULL;
15246  ret->gofs = 0;
15247 }
15248 #endif /* PERL_IN_XSUB_RE */
15249
15250 /*
15251    regdupe_internal()
15252
15253    This is the internal complement to regdupe() which is used to copy
15254    the structure pointed to by the *pprivate pointer in the regexp.
15255    This is the core version of the extension overridable cloning hook.
15256    The regexp structure being duplicated will be copied by perl prior
15257    to this and will be provided as the regexp *r argument, however
15258    with the /old/ structures pprivate pointer value. Thus this routine
15259    may override any copying normally done by perl.
15260
15261    It returns a pointer to the new regexp_internal structure.
15262 */
15263
15264 void *
15265 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15266 {
15267  dVAR;
15268  struct regexp *const r = ReANY(rx);
15269  regexp_internal *reti;
15270  int len;
15271  RXi_GET_DECL(r,ri);
15272
15273  PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15274
15275  len = ProgLen(ri);
15276
15277  Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15278  Copy(ri->program, reti->program, len+1, regnode);
15279
15280  reti->num_code_blocks = ri->num_code_blocks;
15281  if (ri->code_blocks) {
15282   int n;
15283   Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15284     struct reg_code_block);
15285   Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15286     struct reg_code_block);
15287   for (n = 0; n < ri->num_code_blocks; n++)
15288    reti->code_blocks[n].src_regex = (REGEXP*)
15289      sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15290  }
15291  else
15292   reti->code_blocks = NULL;
15293
15294  reti->regstclass = NULL;
15295
15296  if (ri->data) {
15297   struct reg_data *d;
15298   const int count = ri->data->count;
15299   int i;
15300
15301   Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15302     char, struct reg_data);
15303   Newx(d->what, count, U8);
15304
15305   d->count = count;
15306   for (i = 0; i < count; i++) {
15307    d->what[i] = ri->data->what[i];
15308    switch (d->what[i]) {
15309     /* see also regcomp.h and regfree_internal() */
15310    case 'a': /* actually an AV, but the dup function is identical.  */
15311    case 'r':
15312    case 's':
15313    case 'S':
15314    case 'u': /* actually an HV, but the dup function is identical.  */
15315     d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15316     break;
15317    case 'f':
15318     /* This is cheating. */
15319     Newx(d->data[i], 1, struct regnode_charclass_class);
15320     StructCopy(ri->data->data[i], d->data[i],
15321        struct regnode_charclass_class);
15322     reti->regstclass = (regnode*)d->data[i];
15323     break;
15324    case 'T':
15325     /* Trie stclasses are readonly and can thus be shared
15326     * without duplication. We free the stclass in pregfree
15327     * when the corresponding reg_ac_data struct is freed.
15328     */
15329     reti->regstclass= ri->regstclass;
15330     /* Fall through */
15331    case 't':
15332     OP_REFCNT_LOCK;
15333     ((reg_trie_data*)ri->data->data[i])->refcount++;
15334     OP_REFCNT_UNLOCK;
15335     /* Fall through */
15336    case 'l':
15337    case 'L':
15338     d->data[i] = ri->data->data[i];
15339     break;
15340    default:
15341     Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15342    }
15343   }
15344
15345   reti->data = d;
15346  }
15347  else
15348   reti->data = NULL;
15349
15350  reti->name_list_idx = ri->name_list_idx;
15351
15352 #ifdef RE_TRACK_PATTERN_OFFSETS
15353  if (ri->u.offsets) {
15354   Newx(reti->u.offsets, 2*len+1, U32);
15355   Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15356  }
15357 #else
15358  SetProgLen(reti,len);
15359 #endif
15360
15361  return (void*)reti;
15362 }
15363
15364 #endif    /* USE_ITHREADS */
15365
15366 #ifndef PERL_IN_XSUB_RE
15367
15368 /*
15369  - regnext - dig the "next" pointer out of a node
15370  */
15371 regnode *
15372 Perl_regnext(pTHX_ regnode *p)
15373 {
15374  dVAR;
15375  I32 offset;
15376
15377  if (!p)
15378   return(NULL);
15379
15380  if (OP(p) > REGNODE_MAX) {  /* regnode.type is unsigned */
15381   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15382  }
15383
15384  offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15385  if (offset == 0)
15386   return(NULL);
15387
15388  return(p+offset);
15389 }
15390 #endif
15391
15392 STATIC void
15393 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
15394 {
15395  va_list args;
15396  STRLEN l1 = strlen(pat1);
15397  STRLEN l2 = strlen(pat2);
15398  char buf[512];
15399  SV *msv;
15400  const char *message;
15401
15402  PERL_ARGS_ASSERT_RE_CROAK2;
15403
15404  if (l1 > 510)
15405   l1 = 510;
15406  if (l1 + l2 > 510)
15407   l2 = 510 - l1;
15408  Copy(pat1, buf, l1 , char);
15409  Copy(pat2, buf + l1, l2 , char);
15410  buf[l1 + l2] = '\n';
15411  buf[l1 + l2 + 1] = '\0';
15412 #ifdef I_STDARG
15413  /* ANSI variant takes additional second argument */
15414  va_start(args, pat2);
15415 #else
15416  va_start(args);
15417 #endif
15418  msv = vmess(buf, &args);
15419  va_end(args);
15420  message = SvPV_const(msv,l1);
15421  if (l1 > 512)
15422   l1 = 512;
15423  Copy(message, buf, l1 , char);
15424  buf[l1-1] = '\0';   /* Overwrite \n */
15425  Perl_croak(aTHX_ "%s", buf);
15426 }
15427
15428 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
15429
15430 #ifndef PERL_IN_XSUB_RE
15431 void
15432 Perl_save_re_context(pTHX)
15433 {
15434  dVAR;
15435
15436  struct re_save_state *state;
15437
15438  SAVEVPTR(PL_curcop);
15439  SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
15440
15441  state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
15442  PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
15443  SSPUSHUV(SAVEt_RE_STATE);
15444
15445  Copy(&PL_reg_state, state, 1, struct re_save_state);
15446
15447  PL_reg_oldsaved = NULL;
15448  PL_reg_oldsavedlen = 0;
15449  PL_reg_oldsavedoffset = 0;
15450  PL_reg_oldsavedcoffset = 0;
15451  PL_reg_maxiter = 0;
15452  PL_reg_leftiter = 0;
15453  PL_reg_poscache = NULL;
15454  PL_reg_poscache_size = 0;
15455 #ifdef PERL_ANY_COW
15456  PL_nrs = NULL;
15457 #endif
15458
15459  /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15460  if (PL_curpm) {
15461   const REGEXP * const rx = PM_GETRE(PL_curpm);
15462   if (rx) {
15463    U32 i;
15464    for (i = 1; i <= RX_NPARENS(rx); i++) {
15465     char digits[TYPE_CHARS(long)];
15466     const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15467     GV *const *const gvp
15468      = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15469
15470     if (gvp) {
15471      GV * const gv = *gvp;
15472      if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15473       save_scalar(gv);
15474     }
15475    }
15476   }
15477  }
15478 }
15479 #endif
15480
15481 #ifdef DEBUGGING
15482
15483 STATIC void
15484 S_put_byte(pTHX_ SV *sv, int c)
15485 {
15486  PERL_ARGS_ASSERT_PUT_BYTE;
15487
15488  /* Our definition of isPRINT() ignores locales, so only bytes that are
15489  not part of UTF-8 are considered printable. I assume that the same
15490  holds for UTF-EBCDIC.
15491  Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15492  which Wikipedia says:
15493
15494  EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15495  ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15496  identical, to the ASCII delete (DEL) or rubout control character. ...
15497  it is typically mapped to hexadecimal code 9F, in order to provide a
15498  unique character mapping in both directions)
15499
15500  So the old condition can be simplified to !isPRINT(c)  */
15501  if (!isPRINT(c)) {
15502   if (c < 256) {
15503    Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
15504   }
15505   else {
15506    Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15507   }
15508  }
15509  else {
15510   const char string = c;
15511   if (c == '-' || c == ']' || c == '\\' || c == '^')
15512    sv_catpvs(sv, "\\");
15513   sv_catpvn(sv, &string, 1);
15514  }
15515 }
15516
15517
15518 #define CLEAR_OPTSTART \
15519  if (optstart) STMT_START { \
15520    DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
15521    optstart=NULL; \
15522  } STMT_END
15523
15524 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
15525
15526 STATIC const regnode *
15527 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15528    const regnode *last, const regnode *plast,
15529    SV* sv, I32 indent, U32 depth)
15530 {
15531  dVAR;
15532  U8 op = PSEUDO; /* Arbitrary non-END op. */
15533  const regnode *next;
15534  const regnode *optstart= NULL;
15535
15536  RXi_GET_DECL(r,ri);
15537  GET_RE_DEBUG_FLAGS_DECL;
15538
15539  PERL_ARGS_ASSERT_DUMPUNTIL;
15540
15541 #ifdef DEBUG_DUMPUNTIL
15542  PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15543   last ? last-start : 0,plast ? plast-start : 0);
15544 #endif
15545
15546  if (plast && plast < last)
15547   last= plast;
15548
15549  while (PL_regkind[op] != END && (!last || node < last)) {
15550   /* While that wasn't END last time... */
15551   NODE_ALIGN(node);
15552   op = OP(node);
15553   if (op == CLOSE || op == WHILEM)
15554    indent--;
15555   next = regnext((regnode *)node);
15556
15557   /* Where, what. */
15558   if (OP(node) == OPTIMIZED) {
15559    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15560     optstart = node;
15561    else
15562     goto after_print;
15563   } else
15564    CLEAR_OPTSTART;
15565
15566   regprop(r, sv, node);
15567   PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15568      (int)(2*indent + 1), "", SvPVX_const(sv));
15569
15570   if (OP(node) != OPTIMIZED) {
15571    if (next == NULL)  /* Next ptr. */
15572     PerlIO_printf(Perl_debug_log, " (0)");
15573    else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15574     PerlIO_printf(Perl_debug_log, " (FAIL)");
15575    else
15576     PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15577    (void)PerlIO_putc(Perl_debug_log, '\n');
15578   }
15579
15580  after_print:
15581   if (PL_regkind[(U8)op] == BRANCHJ) {
15582    assert(next);
15583    {
15584     const regnode *nnode = (OP(next) == LONGJMP
15585          ? regnext((regnode *)next)
15586          : next);
15587     if (last && nnode > last)
15588      nnode = last;
15589     DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15590    }
15591   }
15592   else if (PL_regkind[(U8)op] == BRANCH) {
15593    assert(next);
15594    DUMPUNTIL(NEXTOPER(node), next);
15595   }
15596   else if ( PL_regkind[(U8)op]  == TRIE ) {
15597    const regnode *this_trie = node;
15598    const char op = OP(node);
15599    const U32 n = ARG(node);
15600    const reg_ac_data * const ac = op>=AHOCORASICK ?
15601    (reg_ac_data *)ri->data->data[n] :
15602    NULL;
15603    const reg_trie_data * const trie =
15604     (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
15605 #ifdef DEBUGGING
15606    AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15607 #endif
15608    const regnode *nextbranch= NULL;
15609    I32 word_idx;
15610    sv_setpvs(sv, "");
15611    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15612     SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15613
15614     PerlIO_printf(Perl_debug_log, "%*s%s ",
15615     (int)(2*(indent+3)), "",
15616      elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15617        PL_colors[0], PL_colors[1],
15618        (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15619        PERL_PV_PRETTY_ELLIPSES    |
15620        PERL_PV_PRETTY_LTGT
15621        )
15622        : "???"
15623     );
15624     if (trie->jump) {
15625      U16 dist= trie->jump[word_idx+1];
15626      PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15627         (UV)((dist ? this_trie + dist : next) - start));
15628      if (dist) {
15629       if (!nextbranch)
15630        nextbranch= this_trie + trie->jump[0];
15631       DUMPUNTIL(this_trie + dist, nextbranch);
15632      }
15633      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15634       nextbranch= regnext((regnode *)nextbranch);
15635     } else {
15636      PerlIO_printf(Perl_debug_log, "\n");
15637     }
15638    }
15639    if (last && next > last)
15640     node= last;
15641    else
15642     node= next;
15643   }
15644   else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
15645    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15646      NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15647   }
15648   else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15649    assert(next);
15650    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15651   }
15652   else if ( op == PLUS || op == STAR) {
15653    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15654   }
15655   else if (PL_regkind[(U8)op] == ANYOF) {
15656    /* arglen 1 + class block */
15657    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15658      ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15659    node = NEXTOPER(node);
15660   }
15661   else if (PL_regkind[(U8)op] == EXACT) {
15662    /* Literal string, where present. */
15663    node += NODE_SZ_STR(node) - 1;
15664    node = NEXTOPER(node);
15665   }
15666   else {
15667    node = NEXTOPER(node);
15668    node += regarglen[(U8)op];
15669   }
15670   if (op == CURLYX || op == OPEN)
15671    indent++;
15672  }
15673  CLEAR_OPTSTART;
15674 #ifdef DEBUG_DUMPUNTIL
15675  PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15676 #endif
15677  return node;
15678 }
15679
15680 #endif /* DEBUGGING */
15681
15682 /*
15683  * Local variables:
15684  * c-indentation-style: bsd
15685  * c-basic-offset: 4
15686  * indent-tabs-mode: nil
15687  * End:
15688  *
15689  * ex: set ts=8 sts=4 sw=4 et:
15690  */