src/5018003/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  * Copyright (c) 1986 by University of Toronto.
  42  * Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  * Permission is granted to anyone to use this software for any
  45  * purpose on any computer system, and to redistribute it freely,
  46  * subject to the following restrictions:
  47  *
  48  * 1. The author is not responsible for the consequences of use of
  49  *  this software, no matter how awful, even if they arise
  50  *  from defects in it.
  51  *
  52  * 2. The origin of this software must not be misrepresented, either
  53  *  by explicit claim or by omission.
  54  *
  55  * 3. Altered versions must be plainly marked as such, and must not
  56  *  be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #undef PERL_IN_XSUB_RE
  76 #define PERL_IN_XSUB_RE 1
  77 #include "perl.h"
  78 #undef PERL_IN_XSUB_RE
  79
  80 #ifndef PERL_IN_XSUB_RE
  81 #include "re_defs.h"
  82 #endif
  83
  84 #define REG_COMP_C
  85 #ifdef PERL_IN_XSUB_RE
  86 #  include "re_comp.h"
  87 extern const struct regexp_engine my_reg_engine;
  88 #else
  89 #  include "regcomp.h"
  90 #endif
  91
  92 #include "dquote_static.c"
  93 #include "charclass_invlists.h"
  94 #include "inline_invlist.c"
  95 #include "unicode_constants.h"
  96
  97 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
 100
 101 #ifdef op
 102 #undef op
 103 #endif /* op */
 104
 105 #ifdef MSDOS
 106 #  if defined(BUGGY_MSC6)
 107  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 108 #    pragma optimize("a",off)
 109  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 110 #    pragma optimize("w",on )
 111 #  endif /* BUGGY_MSC6 */
 112 #endif /* MSDOS */
 113
 114 #ifndef STATIC
 115 #define STATIC static
 116 #endif
 117
 118
 119 typedef struct RExC_state_t {
 120  U32  flags;   /* RXf_* are we folding, multilining? */
 121  U32  pm_flags;  /* PMf_* stuff from the calling PMOP */
 122  char *precomp;  /* uncompiled string. */
 123  REGEXP *rx_sv;   /* The SV that is the regexp. */
 124  regexp *rx;                    /* perl core regexp structure */
 125  regexp_internal *rxi;           /* internal data for regexp object pprivate field */
 126  char *start;   /* Start of input for compile */
 127  char *end;   /* End of input for compile */
 128  char *parse;   /* Input-scan pointer. */
 129  I32  whilem_seen;  /* number of WHILEM in this expr */
 130  regnode *emit_start;  /* Start of emitted-code area */
 131  regnode *emit_bound;  /* First regnode outside of the allocated space */
 132  regnode *emit;   /* Code-emit pointer; &regdummy = don't = compiling */
 133  I32  naughty;  /* How bad is this pattern? */
 134  I32  sawback;  /* Did we see \1, ...? */
 135  U32  seen;
 136  I32  size;   /* Code size. */
 137  I32  npar;   /* Capture buffer count, (OPEN). */
 138  I32  cpar;   /* Capture buffer count, (CLOSE). */
 139  I32  nestroot;  /* root parens we are in - used by accept */
 140  I32  extralen;
 141  I32  seen_zerolen;
 142  regnode **open_parens;  /* pointers to open parens */
 143  regnode **close_parens;  /* pointers to close parens */
 144  regnode *opend;   /* END node in program */
 145  I32  utf8;  /* whether the pattern is utf8 or not */
 146  I32  orig_utf8; /* whether the pattern was originally in utf8 */
 147         /* XXX use this for future optimisation of case
 148         * where pattern must be upgraded to utf8. */
 149  I32  uni_semantics; /* If a d charset modifier should use unicode
 150         rules, even if the pattern is not in
 151         utf8 */
 152  HV  *paren_names;  /* Paren names */
 153
 154  regnode **recurse;  /* Recurse regops */
 155  I32  recurse_count;  /* Number of recurse regops */
 156  I32  in_lookbehind;
 157  I32  contains_locale;
 158  I32  override_recoding;
 159  I32  in_multi_char_class;
 160  struct reg_code_block *code_blocks; /* positions of literal (?{})
 161            within pattern */
 162  int  num_code_blocks; /* size of code_blocks[] */
 163  int  code_index;  /* next code_blocks[] slot */
 164 #if ADD_TO_REGEXEC
 165  char  *starttry;  /* -Dr: where regtry was called. */
 166 #define RExC_starttry (pRExC_state->starttry)
 167 #endif
 168  SV  *runtime_code_qr; /* qr with the runtime code blocks */
 169 #ifdef DEBUGGING
 170  const char  *lastparse;
 171  I32         lastnum;
 172  AV          *paren_name_list;       /* idx -> name */
 173 #define RExC_lastparse (pRExC_state->lastparse)
 174 #define RExC_lastnum (pRExC_state->lastnum)
 175 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 176 #endif
 177 } RExC_state_t;
 178
 179 #define RExC_flags (pRExC_state->flags)
 180 #define RExC_pm_flags (pRExC_state->pm_flags)
 181 #define RExC_precomp (pRExC_state->precomp)
 182 #define RExC_rx_sv (pRExC_state->rx_sv)
 183 #define RExC_rx  (pRExC_state->rx)
 184 #define RExC_rxi (pRExC_state->rxi)
 185 #define RExC_start (pRExC_state->start)
 186 #define RExC_end (pRExC_state->end)
 187 #define RExC_parse (pRExC_state->parse)
 188 #define RExC_whilem_seen (pRExC_state->whilem_seen)
 189 #ifdef RE_TRACK_PATTERN_OFFSETS
 190 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
 191 #endif
 192 #define RExC_emit (pRExC_state->emit)
 193 #define RExC_emit_start (pRExC_state->emit_start)
 194 #define RExC_emit_bound (pRExC_state->emit_bound)
 195 #define RExC_naughty (pRExC_state->naughty)
 196 #define RExC_sawback (pRExC_state->sawback)
 197 #define RExC_seen (pRExC_state->seen)
 198 #define RExC_size (pRExC_state->size)
 199 #define RExC_npar (pRExC_state->npar)
 200 #define RExC_nestroot   (pRExC_state->nestroot)
 201 #define RExC_extralen (pRExC_state->extralen)
 202 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
 203 #define RExC_utf8 (pRExC_state->utf8)
 204 #define RExC_uni_semantics (pRExC_state->uni_semantics)
 205 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
 206 #define RExC_open_parens (pRExC_state->open_parens)
 207 #define RExC_close_parens (pRExC_state->close_parens)
 208 #define RExC_opend (pRExC_state->opend)
 209 #define RExC_paren_names (pRExC_state->paren_names)
 210 #define RExC_recurse (pRExC_state->recurse)
 211 #define RExC_recurse_count (pRExC_state->recurse_count)
 212 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
 213 #define RExC_contains_locale (pRExC_state->contains_locale)
 214 #define RExC_override_recoding (pRExC_state->override_recoding)
 215 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 216
 217
 218 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
 219 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 220   ((*s) == '{' && regcurly(s, FALSE)))
 221
 222 #ifdef SPSTART
 223 #undef SPSTART  /* dratted cpp namespace... */
 224 #endif
 225 /*
 226  * Flags to be passed up and down.
 227  */
 228 #define WORST  0 /* Worst case. */
 229 #define HASWIDTH 0x01 /* Known to match non-null strings. */
 230
 231 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 232  * character.  (There needs to be a case: in the switch statement in regexec.c
 233  * for any node marked SIMPLE.)  Note that this is not the same thing as
 234  * REGNODE_SIMPLE */
 235 #define SIMPLE  0x02
 236 #define SPSTART  0x04 /* Starts with * or + */
 237 #define POSTPONED 0x08    /* (?1),(?&name), (??{...}) or similar */
 238 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
 239 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 240
 241 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 242
 243 /* whether trie related optimizations are enabled */
 244 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 245 #define TRIE_STUDY_OPT
 246 #define FULL_TRIE_STUDY
 247 #define TRIE_STCLASS
 248 #endif
 249
 250
 251
 252 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 253 #define PBITVAL(paren) (1 << ((paren) & 7))
 254 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 255 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 256 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 257
 258 #define REQUIRE_UTF8 STMT_START {                                       \
 259          if (!UTF) {                           \
 260           *flagp = RESTART_UTF8;            \
 261           return NULL;                      \
 262          }                                     \
 263       } STMT_END
 264
 265 /* This converts the named class defined in regcomp.h to its equivalent class
 266  * number defined in handy.h. */
 267 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 268 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 269
 270 /* About scan_data_t.
 271
 272   During optimisation we recurse through the regexp program performing
 273   various inplace (keyhole style) optimisations. In addition study_chunk
 274   and scan_commit populate this data structure with information about
 275   what strings MUST appear in the pattern. We look for the longest
 276   string that must appear at a fixed location, and we look for the
 277   longest string that may appear at a floating location. So for instance
 278   in the pattern:
 279
 280  /FOO[xX]A.*B[xX]BAR/
 281
 282   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 283   strings (because they follow a .* construct). study_chunk will identify
 284   both FOO and BAR as being the longest fixed and floating strings respectively.
 285
 286   The strings can be composites, for instance
 287
 288  /(f)(o)(o)/
 289
 290   will result in a composite fixed substring 'foo'.
 291
 292   For each string some basic information is maintained:
 293
 294   - offset or min_offset
 295  This is the position the string must appear at, or not before.
 296  It also implicitly (when combined with minlenp) tells us how many
 297  characters must match before the string we are searching for.
 298  Likewise when combined with minlenp and the length of the string it
 299  tells us how many characters must appear after the string we have
 300  found.
 301
 302   - max_offset
 303  Only used for floating strings. This is the rightmost point that
 304  the string can appear at. If set to I32 max it indicates that the
 305  string can occur infinitely far to the right.
 306
 307   - minlenp
 308  A pointer to the minimum number of characters of the pattern that the
 309  string was found inside. This is important as in the case of positive
 310  lookahead or positive lookbehind we can have multiple patterns
 311  involved. Consider
 312
 313  /(?=FOO).*F/
 314
 315  The minimum length of the pattern overall is 3, the minimum length
 316  of the lookahead part is 3, but the minimum length of the part that
 317  will actually match is 1. So 'FOO's minimum length is 3, but the
 318  minimum length for the F is 1. This is important as the minimum length
 319  is used to determine offsets in front of and behind the string being
 320  looked for.  Since strings can be composites this is the length of the
 321  pattern at the time it was committed with a scan_commit. Note that
 322  the length is calculated by study_chunk, so that the minimum lengths
 323  are not known until the full pattern has been compiled, thus the
 324  pointer to the value.
 325
 326   - lookbehind
 327
 328  In the case of lookbehind the string being searched for can be
 329  offset past the start point of the final matching string.
 330  If this value was just blithely removed from the min_offset it would
 331  invalidate some of the calculations for how many chars must match
 332  before or after (as they are derived from min_offset and minlen and
 333  the length of the string being searched for).
 334  When the final pattern is compiled and the data is moved from the
 335  scan_data_t structure into the regexp structure the information
 336  about lookbehind is factored in, with the information that would
 337  have been lost precalculated in the end_shift field for the
 338  associated string.
 339
 340   The fields pos_min and pos_delta are used to store the minimum offset
 341   and the delta to the maximum offset at the current point in the pattern.
 342
 343 */
 344
 345 typedef struct scan_data_t {
 346  /*I32 len_min;      unused */
 347  /*I32 len_delta;    unused */
 348  I32 pos_min;
 349  I32 pos_delta;
 350  SV *last_found;
 351  I32 last_end;     /* min value, <0 unless valid. */
 352  I32 last_start_min;
 353  I32 last_start_max;
 354  SV **longest;     /* Either &l_fixed, or &l_float. */
 355  SV *longest_fixed;      /* longest fixed string found in pattern */
 356  I32 offset_fixed;       /* offset where it starts */
 357  I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 358  I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 359  SV *longest_float;      /* longest floating string found in pattern */
 360  I32 offset_float_min;   /* earliest point in string it can appear */
 361  I32 offset_float_max;   /* latest point in string it can appear */
 362  I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 363  I32 lookbehind_float;   /* is the position of the string modified by LB */
 364  I32 flags;
 365  I32 whilem_c;
 366  I32 *last_closep;
 367  struct regnode_charclass_class *start_class;
 368 } scan_data_t;
 369
 370 /*
 371  * Forward declarations for pregcomp()'s friends.
 372  */
 373
 374 static const scan_data_t zero_scan_data =
 375   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 376
 377 #define SF_BEFORE_EOL  (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 378 #define SF_BEFORE_SEOL  0x0001
 379 #define SF_BEFORE_MEOL  0x0002
 380 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 381 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 382
 383 #ifdef NO_UNARY_PLUS
 384 #  define SF_FIX_SHIFT_EOL (0+2)
 385 #  define SF_FL_SHIFT_EOL  (0+4)
 386 #else
 387 #  define SF_FIX_SHIFT_EOL (+2)
 388 #  define SF_FL_SHIFT_EOL  (+4)
 389 #endif
 390
 391 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 392 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 393
 394 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 395 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 396 #define SF_IS_INF  0x0040
 397 #define SF_HAS_PAR  0x0080
 398 #define SF_IN_PAR  0x0100
 399 #define SF_HAS_EVAL  0x0200
 400 #define SCF_DO_SUBSTR  0x0400
 401 #define SCF_DO_STCLASS_AND 0x0800
 402 #define SCF_DO_STCLASS_OR 0x1000
 403 #define SCF_DO_STCLASS  (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 404 #define SCF_WHILEM_VISITED_POS 0x2000
 405
 406 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 407 #define SCF_SEEN_ACCEPT         0x8000
 408
 409 #define UTF cBOOL(RExC_utf8)
 410
 411 /* The enums for all these are ordered so things work out correctly */
 412 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 413 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 414 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 415 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 416 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 417 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 418 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 419
 420 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 421
 422 #define OOB_NAMEDCLASS  -1
 423
 424 /* There is no code point that is out-of-bounds, so this is problematic.  But
 425  * its only current use is to initialize a variable that is always set before
 426  * looked at. */
 427 #define OOB_UNICODE  0xDEADBEEF
 428
 429 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 430 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 431
 432
 433 /* length of regex to show in messages that don't mark a position within */
 434 #define RegexLengthToShowInErrorMessages 127
 435
 436 /*
 437  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 438  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 439  * op/pragma/warn/regcomp.
 440  */
 441 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 442 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 443
 444 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 445
 446 /*
 447  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 448  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 449  * "...".
 450  */
 451 #define _FAIL(code) STMT_START {     \
 452  const char *ellipses = "";      \
 453  IV len = RExC_end - RExC_precomp;     \
 454                   \
 455  if (!SIZE_ONLY)       \
 456   SAVEFREESV(RExC_rx_sv);      \
 457  if (len > RegexLengthToShowInErrorMessages) {   \
 458   /* chop 10 shorter than the max, to ensure meaning of "..." */ \
 459   len = RegexLengthToShowInErrorMessages - 10;   \
 460   ellipses = "...";      \
 461  }         \
 462  code;                                                               \
 463 } STMT_END
 464
 465 #define FAIL(msg) _FAIL(       \
 466  Perl_croak(aTHX_ "%s in regex m/%.*s%s/",     \
 467    msg, (int)len, RExC_precomp, ellipses))
 468
 469 #define FAIL2(msg,arg) _FAIL(       \
 470  Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 471    arg, (int)len, RExC_precomp, ellipses))
 472
 473 /*
 474  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 475  */
 476 #define Simple_vFAIL(m) STMT_START {     \
 477  const IV offset = RExC_parse - RExC_precomp;   \
 478  Perl_croak(aTHX_ "%s" REPORT_LOCATION,    \
 479    m, (int)offset, RExC_precomp, RExC_precomp + offset); \
 480 } STMT_END
 481
 482 /*
 483  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 484  */
 485 #define vFAIL(m) STMT_START {    \
 486  if (!SIZE_ONLY)     \
 487   SAVEFREESV(RExC_rx_sv);    \
 488  Simple_vFAIL(m);     \
 489 } STMT_END
 490
 491 /*
 492  * Like Simple_vFAIL(), but accepts two arguments.
 493  */
 494 #define Simple_vFAIL2(m,a1) STMT_START {   \
 495  const IV offset = RExC_parse - RExC_precomp;   \
 496  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,   \
 497    (int)offset, RExC_precomp, RExC_precomp + offset); \
 498 } STMT_END
 499
 500 /*
 501  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 502  */
 503 #define vFAIL2(m,a1) STMT_START {   \
 504  if (!SIZE_ONLY)     \
 505   SAVEFREESV(RExC_rx_sv);    \
 506  Simple_vFAIL2(m, a1);    \
 507 } STMT_END
 508
 509
 510 /*
 511  * Like Simple_vFAIL(), but accepts three arguments.
 512  */
 513 #define Simple_vFAIL3(m, a1, a2) STMT_START {   \
 514  const IV offset = RExC_parse - RExC_precomp;  \
 515  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,  \
 516    (int)offset, RExC_precomp, RExC_precomp + offset); \
 517 } STMT_END
 518
 519 /*
 520  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 521  */
 522 #define vFAIL3(m,a1,a2) STMT_START {   \
 523  if (!SIZE_ONLY)     \
 524   SAVEFREESV(RExC_rx_sv);    \
 525  Simple_vFAIL3(m, a1, a2);    \
 526 } STMT_END
 527
 528 /*
 529  * Like Simple_vFAIL(), but accepts four arguments.
 530  */
 531 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {  \
 532  const IV offset = RExC_parse - RExC_precomp;  \
 533  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,  \
 534    (int)offset, RExC_precomp, RExC_precomp + offset); \
 535 } STMT_END
 536
 537 #define vFAIL4(m,a1,a2,a3) STMT_START {   \
 538  if (!SIZE_ONLY)     \
 539   SAVEFREESV(RExC_rx_sv);    \
 540  Simple_vFAIL4(m, a1, a2, a3);   \
 541 } STMT_END
 542
 543 /* m is not necessarily a "literal string", in this macro */
 544 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 545  const IV offset = loc - RExC_precomp;                               \
 546  Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 547    m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 548 } STMT_END
 549
 550 #define ckWARNreg(loc,m) STMT_START {     \
 551  const IV offset = loc - RExC_precomp;    \
 552  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 553    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 554 } STMT_END
 555
 556 #define vWARN_dep(loc, m) STMT_START {            \
 557  const IV offset = loc - RExC_precomp;    \
 558  Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
 559    (int)offset, RExC_precomp, RExC_precomp + offset);         \
 560 } STMT_END
 561
 562 #define ckWARNdep(loc,m) STMT_START {            \
 563  const IV offset = loc - RExC_precomp;    \
 564  Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                 \
 565    m REPORT_LOCATION,      \
 566    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 567 } STMT_END
 568
 569 #define ckWARNregdep(loc,m) STMT_START {    \
 570  const IV offset = loc - RExC_precomp;    \
 571  Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 572    m REPORT_LOCATION,      \
 573    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 574 } STMT_END
 575
 576 #define ckWARN2regdep(loc,m, a1) STMT_START {    \
 577  const IV offset = loc - RExC_precomp;    \
 578  Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 579    m REPORT_LOCATION,      \
 580    a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
 581 } STMT_END
 582
 583 #define ckWARN2reg(loc, m, a1) STMT_START {    \
 584  const IV offset = loc - RExC_precomp;    \
 585  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 586    a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
 587 } STMT_END
 588
 589 #define vWARN3(loc, m, a1, a2) STMT_START {    \
 590  const IV offset = loc - RExC_precomp;    \
 591  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 592    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
 593 } STMT_END
 594
 595 #define ckWARN3reg(loc, m, a1, a2) STMT_START {    \
 596  const IV offset = loc - RExC_precomp;    \
 597  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 598    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
 599 } STMT_END
 600
 601 #define vWARN4(loc, m, a1, a2, a3) STMT_START {    \
 602  const IV offset = loc - RExC_precomp;    \
 603  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 604    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 605 } STMT_END
 606
 607 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {   \
 608  const IV offset = loc - RExC_precomp;    \
 609  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 610    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 611 } STMT_END
 612
 613 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {   \
 614  const IV offset = loc - RExC_precomp;    \
 615  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 616    a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 617 } STMT_END
 618
 619
 620 /* Allow for side effects in s */
 621 #define REGC(c,s) STMT_START {   \
 622  if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 623 } STMT_END
 624
 625 /* Macros for recording node offsets.   20001227 mjd@plover.com
 626  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 627  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 628  * Element 0 holds the number n.
 629  * Position is 1 indexed.
 630  */
 631 #ifndef RE_TRACK_PATTERN_OFFSETS
 632 #define Set_Node_Offset_To_R(node,byte)
 633 #define Set_Node_Offset(node,byte)
 634 #define Set_Cur_Node_Offset
 635 #define Set_Node_Length_To_R(node,len)
 636 #define Set_Node_Length(node,len)
 637 #define Set_Node_Cur_Length(node)
 638 #define Node_Offset(n)
 639 #define Node_Length(n)
 640 #define Set_Node_Offset_Length(node,offset,len)
 641 #define ProgLen(ri) ri->u.proglen
 642 #define SetProgLen(ri,x) ri->u.proglen = x
 643 #else
 644 #define ProgLen(ri) ri->u.offsets[0]
 645 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 646 #define Set_Node_Offset_To_R(node,byte) STMT_START {   \
 647  if (! SIZE_ONLY) {       \
 648   MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",  \
 649      __LINE__, (int)(node), (int)(byte)));  \
 650   if((node) < 0) {      \
 651    Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 652   } else {       \
 653    RExC_offsets[2*(node)-1] = (byte);    \
 654   }        \
 655  }         \
 656 } STMT_END
 657
 658 #define Set_Node_Offset(node,byte) \
 659  Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 660 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 661
 662 #define Set_Node_Length_To_R(node,len) STMT_START {   \
 663  if (! SIZE_ONLY) {       \
 664   MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",  \
 665     __LINE__, (int)(node), (int)(len)));   \
 666   if((node) < 0) {      \
 667    Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 668   } else {       \
 669    RExC_offsets[2*(node)] = (len);    \
 670   }        \
 671  }         \
 672 } STMT_END
 673
 674 #define Set_Node_Length(node,len) \
 675  Set_Node_Length_To_R((node)-RExC_emit_start, len)
 676 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 677 #define Set_Node_Cur_Length(node) \
 678  Set_Node_Length(node, RExC_parse - parse_start)
 679
 680 /* Get offsets and lengths */
 681 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 682 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 683
 684 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
 685  Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
 686  Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
 687 } STMT_END
 688 #endif
 689
 690 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 691 #define EXPERIMENTAL_INPLACESCAN
 692 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 693
 694 #define DEBUG_STUDYDATA(str,data,depth)                              \
 695 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 696  PerlIO_printf(Perl_debug_log,                                    \
 697   "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 698   " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 699   (int)(depth)*2, "",                                          \
 700   (IV)((data)->pos_min),                                       \
 701   (IV)((data)->pos_delta),                                     \
 702   (UV)((data)->flags),                                         \
 703   (IV)((data)->whilem_c),                                      \
 704   (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 705   is_inf ? "INF " : ""                                         \
 706  );                                                               \
 707  if ((data)->last_found)                                          \
 708   PerlIO_printf(Perl_debug_log,                                \
 709    "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 710    " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 711    SvPVX_const((data)->last_found),                         \
 712    (IV)((data)->last_end),                                  \
 713    (IV)((data)->last_start_min),                            \
 714    (IV)((data)->last_start_max),                            \
 715    ((data)->longest &&                                      \
 716    (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 717    SvPVX_const((data)->longest_fixed),                      \
 718    (IV)((data)->offset_fixed),                              \
 719    ((data)->longest &&                                      \
 720    (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 721    SvPVX_const((data)->longest_float),                      \
 722    (IV)((data)->offset_float_min),                          \
 723    (IV)((data)->offset_float_max)                           \
 724   );                                                           \
 725  PerlIO_printf(Perl_debug_log,"\n");                              \
 726 });
 727
 728 /* Mark that we cannot extend a found fixed substring at this point.
 729    Update the longest found anchored substring and the longest found
 730    floating substrings if needed. */
 731
 732 STATIC void
 733 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 734 {
 735  const STRLEN l = CHR_SVLEN(data->last_found);
 736  const STRLEN old_l = CHR_SVLEN(*data->longest);
 737  GET_RE_DEBUG_FLAGS_DECL;
 738
 739  PERL_ARGS_ASSERT_SCAN_COMMIT;
 740
 741  if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 742   SvSetMagicSV(*data->longest, data->last_found);
 743   if (*data->longest == data->longest_fixed) {
 744    data->offset_fixed = l ? data->last_start_min : data->pos_min;
 745    if (data->flags & SF_BEFORE_EOL)
 746     data->flags
 747      |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 748    else
 749     data->flags &= ~SF_FIX_BEFORE_EOL;
 750    data->minlen_fixed=minlenp;
 751    data->lookbehind_fixed=0;
 752   }
 753   else { /* *data->longest == data->longest_float */
 754    data->offset_float_min = l ? data->last_start_min : data->pos_min;
 755    data->offset_float_max = (l
 756          ? data->last_start_max
 757          : (data->pos_delta == I32_MAX ? I32_MAX : data->pos_min + data->pos_delta));
 758    if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 759     data->offset_float_max = I32_MAX;
 760    if (data->flags & SF_BEFORE_EOL)
 761     data->flags
 762      |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 763    else
 764     data->flags &= ~SF_FL_BEFORE_EOL;
 765    data->minlen_float=minlenp;
 766    data->lookbehind_float=0;
 767   }
 768  }
 769  SvCUR_set(data->last_found, 0);
 770  {
 771   SV * const sv = data->last_found;
 772   if (SvUTF8(sv) && SvMAGICAL(sv)) {
 773    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 774    if (mg)
 775     mg->mg_len = 0;
 776   }
 777  }
 778  data->last_end = -1;
 779  data->flags &= ~SF_BEFORE_EOL;
 780  DEBUG_STUDYDATA("commit: ",data,0);
 781 }
 782
 783 /* These macros set, clear and test whether the synthetic start class ('ssc',
 784  * given by the parameter) matches an empty string (EOS).  This uses the
 785  * 'next_off' field in the node, to save a bit in the flags field.  The ssc
 786  * stands alone, so there is never a next_off, so this field is otherwise
 787  * unused.  The EOS information is used only for compilation, but theoretically
 788  * it could be passed on to the execution code.  This could be used to store
 789  * more than one bit of information, but only this one is currently used. */
 790 #define SET_SSC_EOS(node)   STMT_START { (node)->next_off = TRUE; } STMT_END
 791 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
 792 #define TEST_SSC_EOS(node)  cBOOL((node)->next_off)
 793
 794 /* Can match anything (initialization) */
 795 STATIC void
 796 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 797 {
 798  PERL_ARGS_ASSERT_CL_ANYTHING;
 799
 800  ANYOF_BITMAP_SETALL(cl);
 801  cl->flags = ANYOF_UNICODE_ALL;
 802  SET_SSC_EOS(cl);
 803
 804  /* If any portion of the regex is to operate under locale rules,
 805  * initialization includes it.  The reason this isn't done for all regexes
 806  * is that the optimizer was written under the assumption that locale was
 807  * all-or-nothing.  Given the complexity and lack of documentation in the
 808  * optimizer, and that there are inadequate test cases for locale, so many
 809  * parts of it may not work properly, it is safest to avoid locale unless
 810  * necessary. */
 811  if (RExC_contains_locale) {
 812   ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 813   cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
 814  }
 815  else {
 816   ANYOF_CLASS_ZERO(cl);     /* Only /l uses class now */
 817  }
 818 }
 819
 820 /* Can match anything (initialization) */
 821 STATIC int
 822 S_cl_is_anything(const struct regnode_charclass_class *cl)
 823 {
 824  int value;
 825
 826  PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 827
 828  for (value = 0; value < ANYOF_MAX; value += 2)
 829   if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 830    return 1;
 831  if (!(cl->flags & ANYOF_UNICODE_ALL))
 832   return 0;
 833  if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 834   return 0;
 835  return 1;
 836 }
 837
 838 /* Can match anything (initialization) */
 839 STATIC void
 840 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 841 {
 842  PERL_ARGS_ASSERT_CL_INIT;
 843
 844  Zero(cl, 1, struct regnode_charclass_class);
 845  cl->type = ANYOF;
 846  cl_anything(pRExC_state, cl);
 847  ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 848 }
 849
 850 /* These two functions currently do the exact same thing */
 851 #define cl_init_zero  S_cl_init
 852
 853 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 854  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 855  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 856 STATIC void
 857 S_cl_and(struct regnode_charclass_class *cl,
 858   const struct regnode_charclass_class *and_with)
 859 {
 860  PERL_ARGS_ASSERT_CL_AND;
 861
 862  assert(PL_regkind[and_with->type] == ANYOF);
 863
 864  /* I (khw) am not sure all these restrictions are necessary XXX */
 865  if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 866   && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 867   && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 868   && !(and_with->flags & ANYOF_LOC_FOLD)
 869   && !(cl->flags & ANYOF_LOC_FOLD)) {
 870   int i;
 871
 872   if (and_with->flags & ANYOF_INVERT)
 873    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 874     cl->bitmap[i] &= ~and_with->bitmap[i];
 875   else
 876    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 877     cl->bitmap[i] &= and_with->bitmap[i];
 878  } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 879
 880  if (and_with->flags & ANYOF_INVERT) {
 881
 882   /* Here, the and'ed node is inverted.  Get the AND of the flags that
 883   * aren't affected by the inversion.  Those that are affected are
 884   * handled individually below */
 885   U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 886   cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 887   cl->flags |= affected_flags;
 888
 889   /* We currently don't know how to deal with things that aren't in the
 890   * bitmap, but we know that the intersection is no greater than what
 891   * is already in cl, so let there be false positives that get sorted
 892   * out after the synthetic start class succeeds, and the node is
 893   * matched for real. */
 894
 895   /* The inversion of these two flags indicate that the resulting
 896   * intersection doesn't have them */
 897   if (and_with->flags & ANYOF_UNICODE_ALL) {
 898    cl->flags &= ~ANYOF_UNICODE_ALL;
 899   }
 900   if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 901    cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 902   }
 903  }
 904  else {   /* and'd node is not inverted */
 905   U8 outside_bitmap_but_not_utf8; /* Temp variable */
 906
 907   if (! ANYOF_NONBITMAP(and_with)) {
 908
 909    /* Here 'and_with' doesn't match anything outside the bitmap
 910    * (except possibly ANYOF_UNICODE_ALL), which means the
 911    * intersection can't either, except for ANYOF_UNICODE_ALL, in
 912    * which case we don't know what the intersection is, but it's no
 913    * greater than what cl already has, so can just leave it alone,
 914    * with possible false positives */
 915    if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 916     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 917     cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 918    }
 919   }
 920   else if (! ANYOF_NONBITMAP(cl)) {
 921
 922    /* Here, 'and_with' does match something outside the bitmap, and cl
 923    * doesn't have a list of things to match outside the bitmap.  If
 924    * cl can match all code points above 255, the intersection will
 925    * be those above-255 code points that 'and_with' matches.  If cl
 926    * can't match all Unicode code points, it means that it can't
 927    * match anything outside the bitmap (since the 'if' that got us
 928    * into this block tested for that), so we leave the bitmap empty.
 929    */
 930    if (cl->flags & ANYOF_UNICODE_ALL) {
 931     ARG_SET(cl, ARG(and_with));
 932
 933     /* and_with's ARG may match things that don't require UTF8.
 934     * And now cl's will too, in spite of this being an 'and'.  See
 935     * the comments below about the kludge */
 936     cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 937    }
 938   }
 939   else {
 940    /* Here, both 'and_with' and cl match something outside the
 941    * bitmap.  Currently we do not do the intersection, so just match
 942    * whatever cl had at the beginning.  */
 943   }
 944
 945
 946   /* Take the intersection of the two sets of flags.  However, the
 947   * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 948   * kludge around the fact that this flag is not treated like the others
 949   * which are initialized in cl_anything().  The way the optimizer works
 950   * is that the synthetic start class (SSC) is initialized to match
 951   * anything, and then the first time a real node is encountered, its
 952   * values are AND'd with the SSC's with the result being the values of
 953   * the real node.  However, there are paths through the optimizer where
 954   * the AND never gets called, so those initialized bits are set
 955   * inappropriately, which is not usually a big deal, as they just cause
 956   * false positives in the SSC, which will just mean a probably
 957   * imperceptible slow down in execution.  However this bit has a
 958   * higher false positive consequence in that it can cause utf8.pm,
 959   * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 960   * bigger slowdown and also causes significant extra memory to be used.
 961   * In order to prevent this, the code now takes a different tack.  The
 962   * bit isn't set unless some part of the regular expression needs it,
 963   * but once set it won't get cleared.  This means that these extra
 964   * modules won't get loaded unless there was some path through the
 965   * pattern that would have required them anyway, and  so any false
 966   * positives that occur by not ANDing them out when they could be
 967   * aren't as severe as they would be if we treated this bit like all
 968   * the others */
 969   outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 970          & ANYOF_NONBITMAP_NON_UTF8;
 971   cl->flags &= and_with->flags;
 972   cl->flags |= outside_bitmap_but_not_utf8;
 973  }
 974 }
 975
 976 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 977  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 978  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 979 STATIC void
 980 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 981 {
 982  PERL_ARGS_ASSERT_CL_OR;
 983
 984  if (or_with->flags & ANYOF_INVERT) {
 985
 986   /* Here, the or'd node is to be inverted.  This means we take the
 987   * complement of everything not in the bitmap, but currently we don't
 988   * know what that is, so give up and match anything */
 989   if (ANYOF_NONBITMAP(or_with)) {
 990    cl_anything(pRExC_state, cl);
 991   }
 992   /* We do not use
 993   * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 994   *   <= (B1 | !B2) | (CL1 | !CL2)
 995   * which is wasteful if CL2 is small, but we ignore CL2:
 996   *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 997   * XXXX Can we handle case-fold?  Unclear:
 998   *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 999   *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
1000   */
1001   else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1002    && !(or_with->flags & ANYOF_LOC_FOLD)
1003    && !(cl->flags & ANYOF_LOC_FOLD) ) {
1004    int i;
1005
1006    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1007     cl->bitmap[i] |= ~or_with->bitmap[i];
1008   } /* XXXX: logic is complicated otherwise */
1009   else {
1010    cl_anything(pRExC_state, cl);
1011   }
1012
1013   /* And, we can just take the union of the flags that aren't affected
1014   * by the inversion */
1015   cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
1016
1017   /* For the remaining flags:
1018    ANYOF_UNICODE_ALL and inverted means to not match anything above
1019      255, which means that the union with cl should just be
1020      what cl has in it, so can ignore this flag
1021    ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1022      is 127-255 to match them, but then invert that, so the
1023      union with cl should just be what cl has in it, so can
1024      ignore this flag
1025   */
1026  } else {    /* 'or_with' is not inverted */
1027   /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1028   if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1029    && (!(or_with->flags & ANYOF_LOC_FOLD)
1030     || (cl->flags & ANYOF_LOC_FOLD)) ) {
1031    int i;
1032
1033    /* OR char bitmap and class bitmap separately */
1034    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1035     cl->bitmap[i] |= or_with->bitmap[i];
1036    if (or_with->flags & ANYOF_CLASS) {
1037     ANYOF_CLASS_OR(or_with, cl);
1038    }
1039   }
1040   else { /* XXXX: logic is complicated, leave it along for a moment. */
1041    cl_anything(pRExC_state, cl);
1042   }
1043
1044   if (ANYOF_NONBITMAP(or_with)) {
1045
1046    /* Use the added node's outside-the-bit-map match if there isn't a
1047    * conflict.  If there is a conflict (both nodes match something
1048    * outside the bitmap, but what they match outside is not the same
1049    * pointer, and hence not easily compared until XXX we extend
1050    * inversion lists this far), give up and allow the start class to
1051    * match everything outside the bitmap.  If that stuff is all above
1052    * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1053    if (! ANYOF_NONBITMAP(cl)) {
1054     ARG_SET(cl, ARG(or_with));
1055    }
1056    else if (ARG(cl) != ARG(or_with)) {
1057
1058     if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1059      cl_anything(pRExC_state, cl);
1060     }
1061     else {
1062      cl->flags |= ANYOF_UNICODE_ALL;
1063     }
1064    }
1065   }
1066
1067   /* Take the union */
1068   cl->flags |= or_with->flags;
1069  }
1070 }
1071
1072 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1073 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1074 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1075 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1076
1077
1078 #ifdef DEBUGGING
1079 /*
1080    dump_trie(trie,widecharmap,revcharmap)
1081    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1082    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1083
1084    These routines dump out a trie in a somewhat readable format.
1085    The _interim_ variants are used for debugging the interim
1086    tables that are used to generate the final compressed
1087    representation which is what dump_trie expects.
1088
1089    Part of the reason for their existence is to provide a form
1090    of documentation as to how the different representations function.
1091
1092 */
1093
1094 /*
1095   Dumps the final compressed table form of the trie to Perl_debug_log.
1096   Used for debugging make_trie().
1097 */
1098
1099 STATIC void
1100 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1101    AV *revcharmap, U32 depth)
1102 {
1103  U32 state;
1104  SV *sv=sv_newmortal();
1105  int colwidth= widecharmap ? 6 : 4;
1106  U16 word;
1107  GET_RE_DEBUG_FLAGS_DECL;
1108
1109  PERL_ARGS_ASSERT_DUMP_TRIE;
1110
1111  PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1112   (int)depth * 2 + 2,"",
1113   "Match","Base","Ofs" );
1114
1115  for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1116   SV ** const tmp = av_fetch( revcharmap, state, 0);
1117   if ( tmp ) {
1118    PerlIO_printf( Perl_debug_log, "%*s",
1119     colwidth,
1120     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1121        PL_colors[0], PL_colors[1],
1122        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1123        PERL_PV_ESCAPE_FIRSTCHAR
1124     )
1125    );
1126   }
1127  }
1128  PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1129   (int)depth * 2 + 2,"");
1130
1131  for( state = 0 ; state < trie->uniquecharcount ; state++ )
1132   PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1133  PerlIO_printf( Perl_debug_log, "\n");
1134
1135  for( state = 1 ; state < trie->statecount ; state++ ) {
1136   const U32 base = trie->states[ state ].trans.base;
1137
1138   PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1139
1140   if ( trie->states[ state ].wordnum ) {
1141    PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1142   } else {
1143    PerlIO_printf( Perl_debug_log, "%6s", "" );
1144   }
1145
1146   PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1147
1148   if ( base ) {
1149    U32 ofs = 0;
1150
1151    while( ( base + ofs  < trie->uniquecharcount ) ||
1152     ( base + ofs - trie->uniquecharcount < trie->lasttrans
1153      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1154      ofs++;
1155
1156    PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1157
1158    for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1159     if ( ( base + ofs >= trie->uniquecharcount ) &&
1160      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1161      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1162     {
1163     PerlIO_printf( Perl_debug_log, "%*"UVXf,
1164      colwidth,
1165      (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1166     } else {
1167      PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1168     }
1169    }
1170
1171    PerlIO_printf( Perl_debug_log, "]");
1172
1173   }
1174   PerlIO_printf( Perl_debug_log, "\n" );
1175  }
1176  PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1177  for (word=1; word <= trie->wordcount; word++) {
1178   PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1179    (int)word, (int)(trie->wordinfo[word].prev),
1180    (int)(trie->wordinfo[word].len));
1181  }
1182  PerlIO_printf(Perl_debug_log, "\n" );
1183 }
1184 /*
1185   Dumps a fully constructed but uncompressed trie in list form.
1186   List tries normally only are used for construction when the number of
1187   possible chars (trie->uniquecharcount) is very high.
1188   Used for debugging make_trie().
1189 */
1190 STATIC void
1191 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1192       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1193       U32 depth)
1194 {
1195  U32 state;
1196  SV *sv=sv_newmortal();
1197  int colwidth= widecharmap ? 6 : 4;
1198  GET_RE_DEBUG_FLAGS_DECL;
1199
1200  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1201
1202  /* print out the table precompression.  */
1203  PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1204   (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1205   "------:-----+-----------------\n" );
1206
1207  for( state=1 ; state < next_alloc ; state ++ ) {
1208   U16 charid;
1209
1210   PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1211    (int)depth * 2 + 2,"", (UV)state  );
1212   if ( ! trie->states[ state ].wordnum ) {
1213    PerlIO_printf( Perl_debug_log, "%5s| ","");
1214   } else {
1215    PerlIO_printf( Perl_debug_log, "W%4x| ",
1216     trie->states[ state ].wordnum
1217    );
1218   }
1219   for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1220    SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1221    if ( tmp ) {
1222     PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1223      colwidth,
1224      pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1225        PL_colors[0], PL_colors[1],
1226        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1227        PERL_PV_ESCAPE_FIRSTCHAR
1228      ) ,
1229      TRIE_LIST_ITEM(state,charid).forid,
1230      (UV)TRIE_LIST_ITEM(state,charid).newstate
1231     );
1232     if (!(charid % 10))
1233      PerlIO_printf(Perl_debug_log, "\n%*s| ",
1234       (int)((depth * 2) + 14), "");
1235    }
1236   }
1237   PerlIO_printf( Perl_debug_log, "\n");
1238  }
1239 }
1240
1241 /*
1242   Dumps a fully constructed but uncompressed trie in table form.
1243   This is the normal DFA style state transition table, with a few
1244   twists to facilitate compression later.
1245   Used for debugging make_trie().
1246 */
1247 STATIC void
1248 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1249       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1250       U32 depth)
1251 {
1252  U32 state;
1253  U16 charid;
1254  SV *sv=sv_newmortal();
1255  int colwidth= widecharmap ? 6 : 4;
1256  GET_RE_DEBUG_FLAGS_DECL;
1257
1258  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1259
1260  /*
1261  print out the table precompression so that we can do a visual check
1262  that they are identical.
1263  */
1264
1265  PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1266
1267  for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1268   SV ** const tmp = av_fetch( revcharmap, charid, 0);
1269   if ( tmp ) {
1270    PerlIO_printf( Perl_debug_log, "%*s",
1271     colwidth,
1272     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1273        PL_colors[0], PL_colors[1],
1274        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1275        PERL_PV_ESCAPE_FIRSTCHAR
1276     )
1277    );
1278   }
1279  }
1280
1281  PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1282
1283  for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1284   PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1285  }
1286
1287  PerlIO_printf( Perl_debug_log, "\n" );
1288
1289  for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1290
1291   PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1292    (int)depth * 2 + 2,"",
1293    (UV)TRIE_NODENUM( state ) );
1294
1295   for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1296    UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1297    if (v)
1298     PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1299    else
1300     PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1301   }
1302   if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1303    PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1304   } else {
1305    PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1306    trie->states[ TRIE_NODENUM( state ) ].wordnum );
1307   }
1308  }
1309 }
1310
1311 #endif
1312
1313
1314 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1315   startbranch: the first branch in the whole branch sequence
1316   first      : start branch of sequence of branch-exact nodes.
1317    May be the same as startbranch
1318   last       : Thing following the last branch.
1319    May be the same as tail.
1320   tail       : item following the branch sequence
1321   count      : words in the sequence
1322   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1323   depth      : indent depth
1324
1325 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1326
1327 A trie is an N'ary tree where the branches are determined by digital
1328 decomposition of the key. IE, at the root node you look up the 1st character and
1329 follow that branch repeat until you find the end of the branches. Nodes can be
1330 marked as "accepting" meaning they represent a complete word. Eg:
1331
1332   /he|she|his|hers/
1333
1334 would convert into the following structure. Numbers represent states, letters
1335 following numbers represent valid transitions on the letter from that state, if
1336 the number is in square brackets it represents an accepting state, otherwise it
1337 will be in parenthesis.
1338
1339  +-h->+-e->[3]-+-r->(8)-+-s->[9]
1340  |    |
1341  |   (2)
1342  |    |
1343  (1)   +-i->(6)-+-s->[7]
1344  |
1345  +-s->(3)-+-h->(4)-+-e->[5]
1346
1347  Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1348
1349 This shows that when matching against the string 'hers' we will begin at state 1
1350 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1351 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1352 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1353 single traverse. We store a mapping from accepting to state to which word was
1354 matched, and then when we have multiple possibilities we try to complete the
1355 rest of the regex in the order in which they occured in the alternation.
1356
1357 The only prior NFA like behaviour that would be changed by the TRIE support is
1358 the silent ignoring of duplicate alternations which are of the form:
1359
1360  / (DUPE|DUPE) X? (?{ ... }) Y /x
1361
1362 Thus EVAL blocks following a trie may be called a different number of times with
1363 and without the optimisation. With the optimisations dupes will be silently
1364 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1365 the following demonstrates:
1366
1367  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1368
1369 which prints out 'word' three times, but
1370
1371  'words'=~/(word|word|word)(?{ print $1 })S/
1372
1373 which doesnt print it out at all. This is due to other optimisations kicking in.
1374
1375 Example of what happens on a structural level:
1376
1377 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1378
1379    1: CURLYM[1] {1,32767}(18)
1380    5:   BRANCH(8)
1381    6:     EXACT <ac>(16)
1382    8:   BRANCH(11)
1383    9:     EXACT <ad>(16)
1384   11:   BRANCH(14)
1385   12:     EXACT <ab>(16)
1386   16:   SUCCEED(0)
1387   17:   NOTHING(18)
1388   18: END(0)
1389
1390 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1391 and should turn into:
1392
1393    1: CURLYM[1] {1,32767}(18)
1394    5:   TRIE(16)
1395   [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1396   <ac>
1397   <ad>
1398   <ab>
1399   16:   SUCCEED(0)
1400   17:   NOTHING(18)
1401   18: END(0)
1402
1403 Cases where tail != last would be like /(?foo|bar)baz/:
1404
1405    1: BRANCH(4)
1406    2:   EXACT <foo>(8)
1407    4: BRANCH(7)
1408    5:   EXACT <bar>(8)
1409    7: TAIL(8)
1410    8: EXACT <baz>(10)
1411   10: END(0)
1412
1413 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1414 and would end up looking like:
1415
1416  1: TRIE(8)
1417  [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1418   <foo>
1419   <bar>
1420    7: TAIL(8)
1421    8: EXACT <baz>(10)
1422   10: END(0)
1423
1424  d = uvuni_to_utf8_flags(d, uv, 0);
1425
1426 is the recommended Unicode-aware way of saying
1427
1428  *(d++) = uv;
1429 */
1430
1431 #define TRIE_STORE_REVCHAR(val)                                            \
1432  STMT_START {                                                           \
1433   if (UTF) {          \
1434    SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1435    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);    \
1436    unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1437    SvCUR_set(zlopp, kapow - flrbbbbb);       \
1438    SvPOK_on(zlopp);         \
1439    SvUTF8_on(zlopp);         \
1440    av_push(revcharmap, zlopp);        \
1441   } else {          \
1442    char ooooff = (char)val;                                           \
1443    av_push(revcharmap, newSVpvn(&ooooff, 1));      \
1444   }           \
1445   } STMT_END
1446
1447 #define TRIE_READ_CHAR STMT_START {                                                     \
1448  wordlen++;                                                                          \
1449  if ( UTF ) {                                                                        \
1450   /* if it is UTF then it is either already folded, or does not need folding */   \
1451   uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1452  }                                                                                   \
1453  else if (folder == PL_fold_latin1) {                                                \
1454   /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1455   if ( foldlen > 0 ) {                                                            \
1456   uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1457   foldlen -= len;                                                              \
1458   scan += len;                                                                 \
1459   len = 0;                                                                     \
1460   } else {                                                                        \
1461    len = 1;                                                                    \
1462    uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1463    skiplen = UNISKIP(uvc);                                                     \
1464    foldlen -= skiplen;                                                         \
1465    scan = foldbuf + skiplen;                                                   \
1466   }                                                                               \
1467  } else {                                                                            \
1468   /* raw data, will be folded later if needed */                                  \
1469   uvc = (U32)*uc;                                                                 \
1470   len = 1;                                                                        \
1471  }                                                                                   \
1472 } STMT_END
1473
1474
1475
1476 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1477  if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1478   U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1479   Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1480  }                                                           \
1481  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1482  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1483  TRIE_LIST_CUR( state )++;                                   \
1484 } STMT_END
1485
1486 #define TRIE_LIST_NEW(state) STMT_START {                       \
1487  Newxz( trie->states[ state ].trans.list,               \
1488   4, reg_trie_trans_le );                                 \
1489  TRIE_LIST_CUR( state ) = 1;                                \
1490  TRIE_LIST_LEN( state ) = 4;                                \
1491 } STMT_END
1492
1493 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1494  U16 dupe= trie->states[ state ].wordnum;                    \
1495  regnode * const noper_next = regnext( noper );              \
1496                 \
1497  DEBUG_r({                                                   \
1498   /* store the word for dumping */                        \
1499   SV* tmp;                                                \
1500   if (OP(noper) != NOTHING)                               \
1501    tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1502   else                                                    \
1503    tmp = newSVpvn_utf8( "", 0, UTF );   \
1504   av_push( trie_words, tmp );                             \
1505  });                                                         \
1506                 \
1507  curword++;                                                  \
1508  trie->wordinfo[curword].prev   = 0;                         \
1509  trie->wordinfo[curword].len    = wordlen;                   \
1510  trie->wordinfo[curword].accept = state;                     \
1511                 \
1512  if ( noper_next < tail ) {                                  \
1513   if (!trie->jump)                                        \
1514    trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1515   trie->jump[curword] = (U16)(noper_next - convert);      \
1516   if (!jumper)                                            \
1517    jumper = noper_next;                                \
1518   if (!nextbranch)                                        \
1519    nextbranch= regnext(cur);                           \
1520  }                                                           \
1521                 \
1522  if ( dupe ) {                                               \
1523   /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1524   /* chain, so that when the bits of chain are later    */\
1525   /* linked together, the dups appear in the chain      */\
1526   trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1527   trie->wordinfo[dupe].prev = curword;                    \
1528  } else {                                                    \
1529   /* we haven't inserted this word yet.                */ \
1530   trie->states[ state ].wordnum = curword;                \
1531  }                                                           \
1532 } STMT_END
1533
1534
1535 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)  \
1536  ( ( base + charid >=  ucharcount     \
1537   && base + charid < ubound     \
1538   && state == trie->trans[ base - ucharcount + charid ].check \
1539   && trie->trans[ base - ucharcount + charid ].next )  \
1540   ? trie->trans[ base - ucharcount + charid ].next  \
1541   : ( state==1 ? special : 0 )     \
1542  )
1543
1544 #define MADE_TRIE       1
1545 #define MADE_JUMP_TRIE  2
1546 #define MADE_EXACT_TRIE 4
1547
1548 STATIC I32
1549 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1550 {
1551  dVAR;
1552  /* first pass, loop through and scan words */
1553  reg_trie_data *trie;
1554  HV *widecharmap = NULL;
1555  AV *revcharmap = newAV();
1556  regnode *cur;
1557  const U32 uniflags = UTF8_ALLOW_DEFAULT;
1558  STRLEN len = 0;
1559  UV uvc = 0;
1560  U16 curword = 0;
1561  U32 next_alloc = 0;
1562  regnode *jumper = NULL;
1563  regnode *nextbranch = NULL;
1564  regnode *convert = NULL;
1565  U32 *prev_states; /* temp array mapping each state to previous one */
1566  /* we just use folder as a flag in utf8 */
1567  const U8 * folder = NULL;
1568
1569 #ifdef DEBUGGING
1570  const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1571  AV *trie_words = NULL;
1572  /* along with revcharmap, this only used during construction but both are
1573  * useful during debugging so we store them in the struct when debugging.
1574  */
1575 #else
1576  const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1577  STRLEN trie_charcount=0;
1578 #endif
1579  SV *re_trie_maxbuff;
1580  GET_RE_DEBUG_FLAGS_DECL;
1581
1582  PERL_ARGS_ASSERT_MAKE_TRIE;
1583 #ifndef DEBUGGING
1584  PERL_UNUSED_ARG(depth);
1585 #endif
1586
1587  switch (flags) {
1588   case EXACT: break;
1589   case EXACTFA:
1590   case EXACTFU_SS:
1591   case EXACTFU_TRICKYFOLD:
1592   case EXACTFU: folder = PL_fold_latin1; break;
1593   case EXACTF:  folder = PL_fold; break;
1594   case EXACTFL: folder = PL_fold_locale; break;
1595   default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1596  }
1597
1598  trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1599  trie->refcount = 1;
1600  trie->startstate = 1;
1601  trie->wordcount = word_count;
1602  RExC_rxi->data->data[ data_slot ] = (void*)trie;
1603  trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1604  if (flags == EXACT)
1605   trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1606  trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1607      trie->wordcount+1, sizeof(reg_trie_wordinfo));
1608
1609  DEBUG_r({
1610   trie_words = newAV();
1611  });
1612
1613  re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1614  if (!SvIOK(re_trie_maxbuff)) {
1615   sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1616  }
1617  DEBUG_TRIE_COMPILE_r({
1618     PerlIO_printf( Perl_debug_log,
1619     "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1620     (int)depth * 2 + 2, "",
1621     REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1622     REG_NODE_NUM(last), REG_NODE_NUM(tail),
1623     (int)depth);
1624  });
1625
1626    /* Find the node we are going to overwrite */
1627  if ( first == startbranch && OP( last ) != BRANCH ) {
1628   /* whole branch chain */
1629   convert = first;
1630  } else {
1631   /* branch sub-chain */
1632   convert = NEXTOPER( first );
1633  }
1634
1635  /*  -- First loop and Setup --
1636
1637  We first traverse the branches and scan each word to determine if it
1638  contains widechars, and how many unique chars there are, this is
1639  important as we have to build a table with at least as many columns as we
1640  have unique chars.
1641
1642  We use an array of integers to represent the character codes 0..255
1643  (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1644  native representation of the character value as the key and IV's for the
1645  coded index.
1646
1647  *TODO* If we keep track of how many times each character is used we can
1648  remap the columns so that the table compression later on is more
1649  efficient in terms of memory by ensuring the most common value is in the
1650  middle and the least common are on the outside.  IMO this would be better
1651  than a most to least common mapping as theres a decent chance the most
1652  common letter will share a node with the least common, meaning the node
1653  will not be compressible. With a middle is most common approach the worst
1654  case is when we have the least common nodes twice.
1655
1656  */
1657
1658  for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1659   regnode *noper = NEXTOPER( cur );
1660   const U8 *uc = (U8*)STRING( noper );
1661   const U8 *e  = uc + STR_LEN( noper );
1662   STRLEN foldlen = 0;
1663   U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1664   STRLEN skiplen = 0;
1665   const U8 *scan = (U8*)NULL;
1666   U32 wordlen      = 0;         /* required init */
1667   STRLEN chars = 0;
1668   bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1669
1670   if (OP(noper) == NOTHING) {
1671    regnode *noper_next= regnext(noper);
1672    if (noper_next != tail && OP(noper_next) == flags) {
1673     noper = noper_next;
1674     uc= (U8*)STRING(noper);
1675     e= uc + STR_LEN(noper);
1676     trie->minlen= STR_LEN(noper);
1677    } else {
1678     trie->minlen= 0;
1679     continue;
1680    }
1681   }
1682
1683   if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1684    TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1685           regardless of encoding */
1686    if (OP( noper ) == EXACTFU_SS) {
1687     /* false positives are ok, so just set this */
1688     TRIE_BITMAP_SET(trie,0xDF);
1689    }
1690   }
1691   for ( ; uc < e ; uc += len ) {
1692    TRIE_CHARCOUNT(trie)++;
1693    TRIE_READ_CHAR;
1694    chars++;
1695    if ( uvc < 256 ) {
1696     if ( folder ) {
1697      U8 folded= folder[ (U8) uvc ];
1698      if ( !trie->charmap[ folded ] ) {
1699       trie->charmap[ folded ]=( ++trie->uniquecharcount );
1700       TRIE_STORE_REVCHAR( folded );
1701      }
1702     }
1703     if ( !trie->charmap[ uvc ] ) {
1704      trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1705      TRIE_STORE_REVCHAR( uvc );
1706     }
1707     if ( set_bit ) {
1708      /* store the codepoint in the bitmap, and its folded
1709      * equivalent. */
1710      TRIE_BITMAP_SET(trie, uvc);
1711
1712      /* store the folded codepoint */
1713      if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1714
1715      if ( !UTF ) {
1716       /* store first byte of utf8 representation of
1717       variant codepoints */
1718       if (! UNI_IS_INVARIANT(uvc)) {
1719        TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1720       }
1721      }
1722      set_bit = 0; /* We've done our bit :-) */
1723     }
1724    } else {
1725     SV** svpp;
1726     if ( !widecharmap )
1727      widecharmap = newHV();
1728
1729     svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1730
1731     if ( !svpp )
1732      Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1733
1734     if ( !SvTRUE( *svpp ) ) {
1735      sv_setiv( *svpp, ++trie->uniquecharcount );
1736      TRIE_STORE_REVCHAR(uvc);
1737     }
1738    }
1739   }
1740   if( cur == first ) {
1741    trie->minlen = chars;
1742    trie->maxlen = chars;
1743   } else if (chars < trie->minlen) {
1744    trie->minlen = chars;
1745   } else if (chars > trie->maxlen) {
1746    trie->maxlen = chars;
1747   }
1748   if (OP( noper ) == EXACTFU_SS) {
1749    /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1750    if (trie->minlen > 1)
1751     trie->minlen= 1;
1752   }
1753   if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1754    /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1755    *        - We assume that any such sequence might match a 2 byte string */
1756    if (trie->minlen > 2 )
1757     trie->minlen= 2;
1758   }
1759
1760  } /* end first pass */
1761  DEBUG_TRIE_COMPILE_r(
1762   PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1763     (int)depth * 2 + 2,"",
1764     ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1765     (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1766     (int)trie->minlen, (int)trie->maxlen )
1767  );
1768
1769  /*
1770   We now know what we are dealing with in terms of unique chars and
1771   string sizes so we can calculate how much memory a naive
1772   representation using a flat table  will take. If it's over a reasonable
1773   limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1774   conservative but potentially much slower representation using an array
1775   of lists.
1776
1777   At the end we convert both representations into the same compressed
1778   form that will be used in regexec.c for matching with. The latter
1779   is a form that cannot be used to construct with but has memory
1780   properties similar to the list form and access properties similar
1781   to the table form making it both suitable for fast searches and
1782   small enough that its feasable to store for the duration of a program.
1783
1784   See the comment in the code where the compressed table is produced
1785   inplace from the flat tabe representation for an explanation of how
1786   the compression works.
1787
1788  */
1789
1790
1791  Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1792  prev_states[1] = 0;
1793
1794  if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1795   /*
1796    Second Pass -- Array Of Lists Representation
1797
1798    Each state will be represented by a list of charid:state records
1799    (reg_trie_trans_le) the first such element holds the CUR and LEN
1800    points of the allocated array. (See defines above).
1801
1802    We build the initial structure using the lists, and then convert
1803    it into the compressed table form which allows faster lookups
1804    (but cant be modified once converted).
1805   */
1806
1807   STRLEN transcount = 1;
1808
1809   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1810    "%*sCompiling trie using list compiler\n",
1811    (int)depth * 2 + 2, ""));
1812
1813   trie->states = (reg_trie_state *)
1814    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1815         sizeof(reg_trie_state) );
1816   TRIE_LIST_NEW(1);
1817   next_alloc = 2;
1818
1819   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1820
1821    regnode *noper   = NEXTOPER( cur );
1822    U8 *uc           = (U8*)STRING( noper );
1823    const U8 *e      = uc + STR_LEN( noper );
1824    U32 state        = 1;         /* required init */
1825    U16 charid       = 0;         /* sanity init */
1826    U8 *scan         = (U8*)NULL; /* sanity init */
1827    STRLEN foldlen   = 0;         /* required init */
1828    U32 wordlen      = 0;         /* required init */
1829    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1830    STRLEN skiplen   = 0;
1831
1832    if (OP(noper) == NOTHING) {
1833     regnode *noper_next= regnext(noper);
1834     if (noper_next != tail && OP(noper_next) == flags) {
1835      noper = noper_next;
1836      uc= (U8*)STRING(noper);
1837      e= uc + STR_LEN(noper);
1838     }
1839    }
1840
1841    if (OP(noper) != NOTHING) {
1842     for ( ; uc < e ; uc += len ) {
1843
1844      TRIE_READ_CHAR;
1845
1846      if ( uvc < 256 ) {
1847       charid = trie->charmap[ uvc ];
1848      } else {
1849       SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1850       if ( !svpp ) {
1851        charid = 0;
1852       } else {
1853        charid=(U16)SvIV( *svpp );
1854       }
1855      }
1856      /* charid is now 0 if we dont know the char read, or nonzero if we do */
1857      if ( charid ) {
1858
1859       U16 check;
1860       U32 newstate = 0;
1861
1862       charid--;
1863       if ( !trie->states[ state ].trans.list ) {
1864        TRIE_LIST_NEW( state );
1865       }
1866       for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1867        if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1868         newstate = TRIE_LIST_ITEM( state, check ).newstate;
1869         break;
1870        }
1871       }
1872       if ( ! newstate ) {
1873        newstate = next_alloc++;
1874        prev_states[newstate] = state;
1875        TRIE_LIST_PUSH( state, charid, newstate );
1876        transcount++;
1877       }
1878       state = newstate;
1879      } else {
1880       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1881      }
1882     }
1883    }
1884    TRIE_HANDLE_WORD(state);
1885
1886   } /* end second pass */
1887
1888   /* next alloc is the NEXT state to be allocated */
1889   trie->statecount = next_alloc;
1890   trie->states = (reg_trie_state *)
1891    PerlMemShared_realloc( trie->states,
1892         next_alloc
1893         * sizeof(reg_trie_state) );
1894
1895   /* and now dump it out before we compress it */
1896   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1897               revcharmap, next_alloc,
1898               depth+1)
1899   );
1900
1901   trie->trans = (reg_trie_trans *)
1902    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1903   {
1904    U32 state;
1905    U32 tp = 0;
1906    U32 zp = 0;
1907
1908
1909    for( state=1 ; state < next_alloc ; state ++ ) {
1910     U32 base=0;
1911
1912     /*
1913     DEBUG_TRIE_COMPILE_MORE_r(
1914      PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1915     );
1916     */
1917
1918     if (trie->states[state].trans.list) {
1919      U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1920      U16 maxid=minid;
1921      U16 idx;
1922
1923      for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1924       const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1925       if ( forid < minid ) {
1926        minid=forid;
1927       } else if ( forid > maxid ) {
1928        maxid=forid;
1929       }
1930      }
1931      if ( transcount < tp + maxid - minid + 1) {
1932       transcount *= 2;
1933       trie->trans = (reg_trie_trans *)
1934        PerlMemShared_realloc( trie->trans,
1935              transcount
1936              * sizeof(reg_trie_trans) );
1937       Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1938      }
1939      base = trie->uniquecharcount + tp - minid;
1940      if ( maxid == minid ) {
1941       U32 set = 0;
1942       for ( ; zp < tp ; zp++ ) {
1943        if ( ! trie->trans[ zp ].next ) {
1944         base = trie->uniquecharcount + zp - minid;
1945         trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1946         trie->trans[ zp ].check = state;
1947         set = 1;
1948         break;
1949        }
1950       }
1951       if ( !set ) {
1952        trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1953        trie->trans[ tp ].check = state;
1954        tp++;
1955        zp = tp;
1956       }
1957      } else {
1958       for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1959        const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1960        trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1961        trie->trans[ tid ].check = state;
1962       }
1963       tp += ( maxid - minid + 1 );
1964      }
1965      Safefree(trie->states[ state ].trans.list);
1966     }
1967     /*
1968     DEBUG_TRIE_COMPILE_MORE_r(
1969      PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1970     );
1971     */
1972     trie->states[ state ].trans.base=base;
1973    }
1974    trie->lasttrans = tp + 1;
1975   }
1976  } else {
1977   /*
1978   Second Pass -- Flat Table Representation.
1979
1980   we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1981   We know that we will need Charcount+1 trans at most to store the data
1982   (one row per char at worst case) So we preallocate both structures
1983   assuming worst case.
1984
1985   We then construct the trie using only the .next slots of the entry
1986   structs.
1987
1988   We use the .check field of the first entry of the node temporarily to
1989   make compression both faster and easier by keeping track of how many non
1990   zero fields are in the node.
1991
1992   Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1993   transition.
1994
1995   There are two terms at use here: state as a TRIE_NODEIDX() which is a
1996   number representing the first entry of the node, and state as a
1997   TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1998   TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1999   are 2 entrys per node. eg:
2000
2001    A B       A B
2002   1. 2 4    1. 3 7
2003   2. 0 3    3. 0 5
2004   3. 0 0    5. 0 0
2005   4. 0 0    7. 0 0
2006
2007   The table is internally in the right hand, idx form. However as we also
2008   have to deal with the states array which is indexed by nodenum we have to
2009   use TRIE_NODENUM() to convert.
2010
2011   */
2012   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2013    "%*sCompiling trie using table compiler\n",
2014    (int)depth * 2 + 2, ""));
2015
2016   trie->trans = (reg_trie_trans *)
2017    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2018         * trie->uniquecharcount + 1,
2019         sizeof(reg_trie_trans) );
2020   trie->states = (reg_trie_state *)
2021    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2022         sizeof(reg_trie_state) );
2023   next_alloc = trie->uniquecharcount + 1;
2024
2025
2026   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2027
2028    regnode *noper   = NEXTOPER( cur );
2029    const U8 *uc     = (U8*)STRING( noper );
2030    const U8 *e      = uc + STR_LEN( noper );
2031
2032    U32 state        = 1;         /* required init */
2033
2034    U16 charid       = 0;         /* sanity init */
2035    U32 accept_state = 0;         /* sanity init */
2036    U8 *scan         = (U8*)NULL; /* sanity init */
2037
2038    STRLEN foldlen   = 0;         /* required init */
2039    U32 wordlen      = 0;         /* required init */
2040    STRLEN skiplen   = 0;
2041    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2042
2043    if (OP(noper) == NOTHING) {
2044     regnode *noper_next= regnext(noper);
2045     if (noper_next != tail && OP(noper_next) == flags) {
2046      noper = noper_next;
2047      uc= (U8*)STRING(noper);
2048      e= uc + STR_LEN(noper);
2049     }
2050    }
2051
2052    if ( OP(noper) != NOTHING ) {
2053     for ( ; uc < e ; uc += len ) {
2054
2055      TRIE_READ_CHAR;
2056
2057      if ( uvc < 256 ) {
2058       charid = trie->charmap[ uvc ];
2059      } else {
2060       SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2061       charid = svpp ? (U16)SvIV(*svpp) : 0;
2062      }
2063      if ( charid ) {
2064       charid--;
2065       if ( !trie->trans[ state + charid ].next ) {
2066        trie->trans[ state + charid ].next = next_alloc;
2067        trie->trans[ state ].check++;
2068        prev_states[TRIE_NODENUM(next_alloc)]
2069          = TRIE_NODENUM(state);
2070        next_alloc += trie->uniquecharcount;
2071       }
2072       state = trie->trans[ state + charid ].next;
2073      } else {
2074       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2075      }
2076      /* charid is now 0 if we dont know the char read, or nonzero if we do */
2077     }
2078    }
2079    accept_state = TRIE_NODENUM( state );
2080    TRIE_HANDLE_WORD(accept_state);
2081
2082   } /* end second pass */
2083
2084   /* and now dump it out before we compress it */
2085   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2086               revcharmap,
2087               next_alloc, depth+1));
2088
2089   {
2090   /*
2091   * Inplace compress the table.*
2092
2093   For sparse data sets the table constructed by the trie algorithm will
2094   be mostly 0/FAIL transitions or to put it another way mostly empty.
2095   (Note that leaf nodes will not contain any transitions.)
2096
2097   This algorithm compresses the tables by eliminating most such
2098   transitions, at the cost of a modest bit of extra work during lookup:
2099
2100   - Each states[] entry contains a .base field which indicates the
2101   index in the state[] array wheres its transition data is stored.
2102
2103   - If .base is 0 there are no valid transitions from that node.
2104
2105   - If .base is nonzero then charid is added to it to find an entry in
2106   the trans array.
2107
2108   -If trans[states[state].base+charid].check!=state then the
2109   transition is taken to be a 0/Fail transition. Thus if there are fail
2110   transitions at the front of the node then the .base offset will point
2111   somewhere inside the previous nodes data (or maybe even into a node
2112   even earlier), but the .check field determines if the transition is
2113   valid.
2114
2115   XXX - wrong maybe?
2116   The following process inplace converts the table to the compressed
2117   table: We first do not compress the root node 1,and mark all its
2118   .check pointers as 1 and set its .base pointer as 1 as well. This
2119   allows us to do a DFA construction from the compressed table later,
2120   and ensures that any .base pointers we calculate later are greater
2121   than 0.
2122
2123   - We set 'pos' to indicate the first entry of the second node.
2124
2125   - We then iterate over the columns of the node, finding the first and
2126   last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2127   and set the .check pointers accordingly, and advance pos
2128   appropriately and repreat for the next node. Note that when we copy
2129   the next pointers we have to convert them from the original
2130   NODEIDX form to NODENUM form as the former is not valid post
2131   compression.
2132
2133   - If a node has no transitions used we mark its base as 0 and do not
2134   advance the pos pointer.
2135
2136   - If a node only has one transition we use a second pointer into the
2137   structure to fill in allocated fail transitions from other states.
2138   This pointer is independent of the main pointer and scans forward
2139   looking for null transitions that are allocated to a state. When it
2140   finds one it writes the single transition into the "hole".  If the
2141   pointer doesnt find one the single transition is appended as normal.
2142
2143   - Once compressed we can Renew/realloc the structures to release the
2144   excess space.
2145
2146   See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2147   specifically Fig 3.47 and the associated pseudocode.
2148
2149   demq
2150   */
2151   const U32 laststate = TRIE_NODENUM( next_alloc );
2152   U32 state, charid;
2153   U32 pos = 0, zp=0;
2154   trie->statecount = laststate;
2155
2156   for ( state = 1 ; state < laststate ; state++ ) {
2157    U8 flag = 0;
2158    const U32 stateidx = TRIE_NODEIDX( state );
2159    const U32 o_used = trie->trans[ stateidx ].check;
2160    U32 used = trie->trans[ stateidx ].check;
2161    trie->trans[ stateidx ].check = 0;
2162
2163    for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2164     if ( flag || trie->trans[ stateidx + charid ].next ) {
2165      if ( trie->trans[ stateidx + charid ].next ) {
2166       if (o_used == 1) {
2167        for ( ; zp < pos ; zp++ ) {
2168         if ( ! trie->trans[ zp ].next ) {
2169          break;
2170         }
2171        }
2172        trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2173        trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2174        trie->trans[ zp ].check = state;
2175        if ( ++zp > pos ) pos = zp;
2176        break;
2177       }
2178       used--;
2179      }
2180      if ( !flag ) {
2181       flag = 1;
2182       trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2183      }
2184      trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2185      trie->trans[ pos ].check = state;
2186      pos++;
2187     }
2188    }
2189   }
2190   trie->lasttrans = pos + 1;
2191   trie->states = (reg_trie_state *)
2192    PerlMemShared_realloc( trie->states, laststate
2193         * sizeof(reg_trie_state) );
2194   DEBUG_TRIE_COMPILE_MORE_r(
2195     PerlIO_printf( Perl_debug_log,
2196      "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2197      (int)depth * 2 + 2,"",
2198      (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2199      (IV)next_alloc,
2200      (IV)pos,
2201      ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2202    );
2203
2204   } /* end table compress */
2205  }
2206  DEBUG_TRIE_COMPILE_MORE_r(
2207    PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2208     (int)depth * 2 + 2, "",
2209     (UV)trie->statecount,
2210     (UV)trie->lasttrans)
2211  );
2212  /* resize the trans array to remove unused space */
2213  trie->trans = (reg_trie_trans *)
2214   PerlMemShared_realloc( trie->trans, trie->lasttrans
2215        * sizeof(reg_trie_trans) );
2216
2217  {   /* Modify the program and insert the new TRIE node */
2218   U8 nodetype =(U8)(flags & 0xFF);
2219   char *str=NULL;
2220
2221 #ifdef DEBUGGING
2222   regnode *optimize = NULL;
2223 #ifdef RE_TRACK_PATTERN_OFFSETS
2224
2225   U32 mjd_offset = 0;
2226   U32 mjd_nodelen = 0;
2227 #endif /* RE_TRACK_PATTERN_OFFSETS */
2228 #endif /* DEBUGGING */
2229   /*
2230   This means we convert either the first branch or the first Exact,
2231   depending on whether the thing following (in 'last') is a branch
2232   or not and whther first is the startbranch (ie is it a sub part of
2233   the alternation or is it the whole thing.)
2234   Assuming its a sub part we convert the EXACT otherwise we convert
2235   the whole branch sequence, including the first.
2236   */
2237   /* Find the node we are going to overwrite */
2238   if ( first != startbranch || OP( last ) == BRANCH ) {
2239    /* branch sub-chain */
2240    NEXT_OFF( first ) = (U16)(last - first);
2241 #ifdef RE_TRACK_PATTERN_OFFSETS
2242    DEBUG_r({
2243     mjd_offset= Node_Offset((convert));
2244     mjd_nodelen= Node_Length((convert));
2245    });
2246 #endif
2247    /* whole branch chain */
2248   }
2249 #ifdef RE_TRACK_PATTERN_OFFSETS
2250   else {
2251    DEBUG_r({
2252     const  regnode *nop = NEXTOPER( convert );
2253     mjd_offset= Node_Offset((nop));
2254     mjd_nodelen= Node_Length((nop));
2255    });
2256   }
2257   DEBUG_OPTIMISE_r(
2258    PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2259     (int)depth * 2 + 2, "",
2260     (UV)mjd_offset, (UV)mjd_nodelen)
2261   );
2262 #endif
2263   /* But first we check to see if there is a common prefix we can
2264   split out as an EXACT and put in front of the TRIE node.  */
2265   trie->startstate= 1;
2266   if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2267    U32 state;
2268    for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2269     U32 ofs = 0;
2270     I32 idx = -1;
2271     U32 count = 0;
2272     const U32 base = trie->states[ state ].trans.base;
2273
2274     if ( trie->states[state].wordnum )
2275       count = 1;
2276
2277     for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2278      if ( ( base + ofs >= trie->uniquecharcount ) &&
2279       ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2280       trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2281      {
2282       if ( ++count > 1 ) {
2283        SV **tmp = av_fetch( revcharmap, ofs, 0);
2284        const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2285        if ( state == 1 ) break;
2286        if ( count == 2 ) {
2287         Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2288         DEBUG_OPTIMISE_r(
2289          PerlIO_printf(Perl_debug_log,
2290           "%*sNew Start State=%"UVuf" Class: [",
2291           (int)depth * 2 + 2, "",
2292           (UV)state));
2293         if (idx >= 0) {
2294          SV ** const tmp = av_fetch( revcharmap, idx, 0);
2295          const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2296
2297          TRIE_BITMAP_SET(trie,*ch);
2298          if ( folder )
2299           TRIE_BITMAP_SET(trie, folder[ *ch ]);
2300          DEBUG_OPTIMISE_r(
2301           PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2302          );
2303         }
2304        }
2305        TRIE_BITMAP_SET(trie,*ch);
2306        if ( folder )
2307         TRIE_BITMAP_SET(trie,folder[ *ch ]);
2308        DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2309       }
2310       idx = ofs;
2311      }
2312     }
2313     if ( count == 1 ) {
2314      SV **tmp = av_fetch( revcharmap, idx, 0);
2315      STRLEN len;
2316      char *ch = SvPV( *tmp, len );
2317      DEBUG_OPTIMISE_r({
2318       SV *sv=sv_newmortal();
2319       PerlIO_printf( Perl_debug_log,
2320        "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2321        (int)depth * 2 + 2, "",
2322        (UV)state, (UV)idx,
2323        pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2324         PL_colors[0], PL_colors[1],
2325         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2326         PERL_PV_ESCAPE_FIRSTCHAR
2327        )
2328       );
2329      });
2330      if ( state==1 ) {
2331       OP( convert ) = nodetype;
2332       str=STRING(convert);
2333       STR_LEN(convert)=0;
2334      }
2335      STR_LEN(convert) += len;
2336      while (len--)
2337       *str++ = *ch++;
2338     } else {
2339 #ifdef DEBUGGING
2340      if (state>1)
2341       DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2342 #endif
2343      break;
2344     }
2345    }
2346    trie->prefixlen = (state-1);
2347    if (str) {
2348     regnode *n = convert+NODE_SZ_STR(convert);
2349     NEXT_OFF(convert) = NODE_SZ_STR(convert);
2350     trie->startstate = state;
2351     trie->minlen -= (state - 1);
2352     trie->maxlen -= (state - 1);
2353 #ifdef DEBUGGING
2354    /* At least the UNICOS C compiler choked on this
2355     * being argument to DEBUG_r(), so let's just have
2356     * it right here. */
2357    if (
2358 #ifdef PERL_EXT_RE_BUILD
2359     1
2360 #else
2361     DEBUG_r_TEST
2362 #endif
2363     ) {
2364     regnode *fix = convert;
2365     U32 word = trie->wordcount;
2366     mjd_nodelen++;
2367     Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2368     while( ++fix < n ) {
2369      Set_Node_Offset_Length(fix, 0, 0);
2370     }
2371     while (word--) {
2372      SV ** const tmp = av_fetch( trie_words, word, 0 );
2373      if (tmp) {
2374       if ( STR_LEN(convert) <= SvCUR(*tmp) )
2375        sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2376       else
2377        sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2378      }
2379     }
2380    }
2381 #endif
2382     if (trie->maxlen) {
2383      convert = n;
2384     } else {
2385      NEXT_OFF(convert) = (U16)(tail - convert);
2386      DEBUG_r(optimize= n);
2387     }
2388    }
2389   }
2390   if (!jumper)
2391    jumper = last;
2392   if ( trie->maxlen ) {
2393    NEXT_OFF( convert ) = (U16)(tail - convert);
2394    ARG_SET( convert, data_slot );
2395    /* Store the offset to the first unabsorbed branch in
2396    jump[0], which is otherwise unused by the jump logic.
2397    We use this when dumping a trie and during optimisation. */
2398    if (trie->jump)
2399     trie->jump[0] = (U16)(nextbranch - convert);
2400
2401    /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2402    *   and there is a bitmap
2403    *   and the first "jump target" node we found leaves enough room
2404    * then convert the TRIE node into a TRIEC node, with the bitmap
2405    * embedded inline in the opcode - this is hypothetically faster.
2406    */
2407    if ( !trie->states[trie->startstate].wordnum
2408     && trie->bitmap
2409     && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2410    {
2411     OP( convert ) = TRIEC;
2412     Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2413     PerlMemShared_free(trie->bitmap);
2414     trie->bitmap= NULL;
2415    } else
2416     OP( convert ) = TRIE;
2417
2418    /* store the type in the flags */
2419    convert->flags = nodetype;
2420    DEBUG_r({
2421    optimize = convert
2422      + NODE_STEP_REGNODE
2423      + regarglen[ OP( convert ) ];
2424    });
2425    /* XXX We really should free up the resource in trie now,
2426     as we won't use them - (which resources?) dmq */
2427   }
2428   /* needed for dumping*/
2429   DEBUG_r(if (optimize) {
2430    regnode *opt = convert;
2431
2432    while ( ++opt < optimize) {
2433     Set_Node_Offset_Length(opt,0,0);
2434    }
2435    /*
2436     Try to clean up some of the debris left after the
2437     optimisation.
2438    */
2439    while( optimize < jumper ) {
2440     mjd_nodelen += Node_Length((optimize));
2441     OP( optimize ) = OPTIMIZED;
2442     Set_Node_Offset_Length(optimize,0,0);
2443     optimize++;
2444    }
2445    Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2446   });
2447  } /* end node insert */
2448  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2449
2450  /*  Finish populating the prev field of the wordinfo array.  Walk back
2451  *  from each accept state until we find another accept state, and if
2452  *  so, point the first word's .prev field at the second word. If the
2453  *  second already has a .prev field set, stop now. This will be the
2454  *  case either if we've already processed that word's accept state,
2455  *  or that state had multiple words, and the overspill words were
2456  *  already linked up earlier.
2457  */
2458  {
2459   U16 word;
2460   U32 state;
2461   U16 prev;
2462
2463   for (word=1; word <= trie->wordcount; word++) {
2464    prev = 0;
2465    if (trie->wordinfo[word].prev)
2466     continue;
2467    state = trie->wordinfo[word].accept;
2468    while (state) {
2469     state = prev_states[state];
2470     if (!state)
2471      break;
2472     prev = trie->states[state].wordnum;
2473     if (prev)
2474      break;
2475    }
2476    trie->wordinfo[word].prev = prev;
2477   }
2478   Safefree(prev_states);
2479  }
2480
2481
2482  /* and now dump out the compressed format */
2483  DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2484
2485  RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2486 #ifdef DEBUGGING
2487  RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2488  RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2489 #else
2490  SvREFCNT_dec_NN(revcharmap);
2491 #endif
2492  return trie->jump
2493   ? MADE_JUMP_TRIE
2494   : trie->startstate>1
2495    ? MADE_EXACT_TRIE
2496    : MADE_TRIE;
2497 }
2498
2499 STATIC void
2500 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2501 {
2502 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2503
2504    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2505    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2506    ISBN 0-201-10088-6
2507
2508    We find the fail state for each state in the trie, this state is the longest proper
2509    suffix of the current state's 'word' that is also a proper prefix of another word in our
2510    trie. State 1 represents the word '' and is thus the default fail state. This allows
2511    the DFA not to have to restart after its tried and failed a word at a given point, it
2512    simply continues as though it had been matching the other word in the first place.
2513    Consider
2514  'abcdgu'=~/abcdefg|cdgu/
2515    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2516    fail, which would bring us to the state representing 'd' in the second word where we would
2517    try 'g' and succeed, proceeding to match 'cdgu'.
2518  */
2519  /* add a fail transition */
2520  const U32 trie_offset = ARG(source);
2521  reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2522  U32 *q;
2523  const U32 ucharcount = trie->uniquecharcount;
2524  const U32 numstates = trie->statecount;
2525  const U32 ubound = trie->lasttrans + ucharcount;
2526  U32 q_read = 0;
2527  U32 q_write = 0;
2528  U32 charid;
2529  U32 base = trie->states[ 1 ].trans.base;
2530  U32 *fail;
2531  reg_ac_data *aho;
2532  const U32 data_slot = add_data( pRExC_state, 1, "T" );
2533  GET_RE_DEBUG_FLAGS_DECL;
2534
2535  PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2536 #ifndef DEBUGGING
2537  PERL_UNUSED_ARG(depth);
2538 #endif
2539
2540
2541  ARG_SET( stclass, data_slot );
2542  aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2543  RExC_rxi->data->data[ data_slot ] = (void*)aho;
2544  aho->trie=trie_offset;
2545  aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2546  Copy( trie->states, aho->states, numstates, reg_trie_state );
2547  Newxz( q, numstates, U32);
2548  aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2549  aho->refcount = 1;
2550  fail = aho->fail;
2551  /* initialize fail[0..1] to be 1 so that we always have
2552  a valid final fail state */
2553  fail[ 0 ] = fail[ 1 ] = 1;
2554
2555  for ( charid = 0; charid < ucharcount ; charid++ ) {
2556   const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2557   if ( newstate ) {
2558    q[ q_write ] = newstate;
2559    /* set to point at the root */
2560    fail[ q[ q_write++ ] ]=1;
2561   }
2562  }
2563  while ( q_read < q_write) {
2564   const U32 cur = q[ q_read++ % numstates ];
2565   base = trie->states[ cur ].trans.base;
2566
2567   for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2568    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2569    if (ch_state) {
2570     U32 fail_state = cur;
2571     U32 fail_base;
2572     do {
2573      fail_state = fail[ fail_state ];
2574      fail_base = aho->states[ fail_state ].trans.base;
2575     } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2576
2577     fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2578     fail[ ch_state ] = fail_state;
2579     if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2580     {
2581       aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2582     }
2583     q[ q_write++ % numstates] = ch_state;
2584    }
2585   }
2586  }
2587  /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2588  when we fail in state 1, this allows us to use the
2589  charclass scan to find a valid start char. This is based on the principle
2590  that theres a good chance the string being searched contains lots of stuff
2591  that cant be a start char.
2592  */
2593  fail[ 0 ] = fail[ 1 ] = 0;
2594  DEBUG_TRIE_COMPILE_r({
2595   PerlIO_printf(Perl_debug_log,
2596      "%*sStclass Failtable (%"UVuf" states): 0",
2597      (int)(depth * 2), "", (UV)numstates
2598   );
2599   for( q_read=1; q_read<numstates; q_read++ ) {
2600    PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2601   }
2602   PerlIO_printf(Perl_debug_log, "\n");
2603  });
2604  Safefree(q);
2605  /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2606 }
2607
2608
2609 /*
2610  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2611  * These need to be revisited when a newer toolchain becomes available.
2612  */
2613 #if defined(__sparc64__) && defined(__GNUC__)
2614 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2615 #       undef  SPARC64_GCC_WORKAROUND
2616 #       define SPARC64_GCC_WORKAROUND 1
2617 #   endif
2618 #endif
2619
2620 #define DEBUG_PEEP(str,scan,depth) \
2621  DEBUG_OPTIMISE_r({if (scan){ \
2622  SV * const mysv=sv_newmortal(); \
2623  regnode *Next = regnext(scan); \
2624  regprop(RExC_rx, mysv, scan); \
2625  PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2626  (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2627  Next ? (REG_NODE_NUM(Next)) : 0 ); \
2628    }});
2629
2630
2631 /* The below joins as many adjacent EXACTish nodes as possible into a single
2632  * one.  The regop may be changed if the node(s) contain certain sequences that
2633  * require special handling.  The joining is only done if:
2634  * 1) there is room in the current conglomerated node to entirely contain the
2635  *    next one.
2636  * 2) they are the exact same node type
2637  *
2638  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2639  * these get optimized out
2640  *
2641  * If a node is to match under /i (folded), the number of characters it matches
2642  * can be different than its character length if it contains a multi-character
2643  * fold.  *min_subtract is set to the total delta of the input nodes.
2644  *
2645  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2646  * and contains LATIN SMALL LETTER SHARP S
2647  *
2648  * This is as good a place as any to discuss the design of handling these
2649  * multi-character fold sequences.  It's been wrong in Perl for a very long
2650  * time.  There are three code points in Unicode whose multi-character folds
2651  * were long ago discovered to mess things up.  The previous designs for
2652  * dealing with these involved assigning a special node for them.  This
2653  * approach doesn't work, as evidenced by this example:
2654  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2655  * Both these fold to "sss", but if the pattern is parsed to create a node that
2656  * would match just the \xDF, it won't be able to handle the case where a
2657  * successful match would have to cross the node's boundary.  The new approach
2658  * that hopefully generally solves the problem generates an EXACTFU_SS node
2659  * that is "sss".
2660  *
2661  * It turns out that there are problems with all multi-character folds, and not
2662  * just these three.  Now the code is general, for all such cases, but the
2663  * three still have some special handling.  The approach taken is:
2664  * 1)   This routine examines each EXACTFish node that could contain multi-
2665  *      character fold sequences.  It returns in *min_subtract how much to
2666  *      subtract from the the actual length of the string to get a real minimum
2667  *      match length; it is 0 if there are no multi-char folds.  This delta is
2668  *      used by the caller to adjust the min length of the match, and the delta
2669  *      between min and max, so that the optimizer doesn't reject these
2670  *      possibilities based on size constraints.
2671  * 2)   Certain of these sequences require special handling by the trie code,
2672  *      so, if found, this code changes the joined node type to special ops:
2673  *      EXACTFU_TRICKYFOLD and EXACTFU_SS.
2674  * 3)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2675  *      is used for an EXACTFU node that contains at least one "ss" sequence in
2676  *      it.  For non-UTF-8 patterns and strings, this is the only case where
2677  *      there is a possible fold length change.  That means that a regular
2678  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
2679  *      with length changes, and so can be processed faster.  regexec.c takes
2680  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
2681  *      pre-folded by regcomp.c.  This saves effort in regex matching.
2682  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2683  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2684  *      down by forcing the pattern into UTF8 unless necessary.  Also what
2685  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
2686  *      possibilities for the non-UTF8 patterns are quite simple, except for
2687  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
2688  *      members of a fold-pair, and arrays are set up for all of them so that
2689  *      the other member of the pair can be found quickly.  Code elsewhere in
2690  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2691  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
2692  *      described in the next item.
2693  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2694  *      'ss' or not is not knowable at compile time.  It will match iff the
2695  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2696  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2697  *      it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2698  *      described in item 3).  An assumption that the optimizer part of
2699  *      regexec.c (probably unwittingly) makes is that a character in the
2700  *      pattern corresponds to at most a single character in the target string.
2701  *      (And I do mean character, and not byte here, unlike other parts of the
2702  *      documentation that have never been updated to account for multibyte
2703  *      Unicode.)  This assumption is wrong only in this case, as all other
2704  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2705  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2706  *      reluctant to try to change this assumption, so instead the code punts.
2707  *      This routine examines EXACTF nodes for the sharp s, and returns a
2708  *      boolean indicating whether or not the node is an EXACTF node that
2709  *      contains a sharp s.  When it is true, the caller sets a flag that later
2710  *      causes the optimizer in this file to not set values for the floating
2711  *      and fixed string lengths, and thus avoids the optimizer code in
2712  *      regexec.c that makes the invalid assumption.  Thus, there is no
2713  *      optimization based on string lengths for EXACTF nodes that contain the
2714  *      sharp s.  This only happens for /id rules (which means the pattern
2715  *      isn't in UTF-8).
2716  */
2717
2718 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2719  if (PL_regkind[OP(scan)] == EXACT) \
2720   join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2721
2722 STATIC U32
2723 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2724  /* Merge several consecutive EXACTish nodes into one. */
2725  regnode *n = regnext(scan);
2726  U32 stringok = 1;
2727  regnode *next = scan + NODE_SZ_STR(scan);
2728  U32 merged = 0;
2729  U32 stopnow = 0;
2730 #ifdef DEBUGGING
2731  regnode *stop = scan;
2732  GET_RE_DEBUG_FLAGS_DECL;
2733 #else
2734  PERL_UNUSED_ARG(depth);
2735 #endif
2736
2737  PERL_ARGS_ASSERT_JOIN_EXACT;
2738 #ifndef EXPERIMENTAL_INPLACESCAN
2739  PERL_UNUSED_ARG(flags);
2740  PERL_UNUSED_ARG(val);
2741 #endif
2742  DEBUG_PEEP("join",scan,depth);
2743
2744  /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2745  * EXACT ones that are mergeable to the current one. */
2746  while (n
2747   && (PL_regkind[OP(n)] == NOTHING
2748    || (stringok && OP(n) == OP(scan)))
2749   && NEXT_OFF(n)
2750   && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2751  {
2752
2753   if (OP(n) == TAIL || n > next)
2754    stringok = 0;
2755   if (PL_regkind[OP(n)] == NOTHING) {
2756    DEBUG_PEEP("skip:",n,depth);
2757    NEXT_OFF(scan) += NEXT_OFF(n);
2758    next = n + NODE_STEP_REGNODE;
2759 #ifdef DEBUGGING
2760    if (stringok)
2761     stop = n;
2762 #endif
2763    n = regnext(n);
2764   }
2765   else if (stringok) {
2766    const unsigned int oldl = STR_LEN(scan);
2767    regnode * const nnext = regnext(n);
2768
2769    /* XXX I (khw) kind of doubt that this works on platforms where
2770    * U8_MAX is above 255 because of lots of other assumptions */
2771    /* Don't join if the sum can't fit into a single node */
2772    if (oldl + STR_LEN(n) > U8_MAX)
2773     break;
2774
2775    DEBUG_PEEP("merg",n,depth);
2776    merged++;
2777
2778    NEXT_OFF(scan) += NEXT_OFF(n);
2779    STR_LEN(scan) += STR_LEN(n);
2780    next = n + NODE_SZ_STR(n);
2781    /* Now we can overwrite *n : */
2782    Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2783 #ifdef DEBUGGING
2784    stop = next - 1;
2785 #endif
2786    n = nnext;
2787    if (stopnow) break;
2788   }
2789
2790 #ifdef EXPERIMENTAL_INPLACESCAN
2791   if (flags && !NEXT_OFF(n)) {
2792    DEBUG_PEEP("atch", val, depth);
2793    if (reg_off_by_arg[OP(n)]) {
2794     ARG_SET(n, val - n);
2795    }
2796    else {
2797     NEXT_OFF(n) = val - n;
2798    }
2799    stopnow = 1;
2800   }
2801 #endif
2802  }
2803
2804  *min_subtract = 0;
2805  *has_exactf_sharp_s = FALSE;
2806
2807  /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2808  * can now analyze for sequences of problematic code points.  (Prior to
2809  * this final joining, sequences could have been split over boundaries, and
2810  * hence missed).  The sequences only happen in folding, hence for any
2811  * non-EXACT EXACTish node */
2812  if (OP(scan) != EXACT) {
2813   const U8 * const s0 = (U8*) STRING(scan);
2814   const U8 * s = s0;
2815   const U8 * const s_end = s0 + STR_LEN(scan);
2816
2817   /* One pass is made over the node's string looking for all the
2818   * possibilities.  to avoid some tests in the loop, there are two main
2819   * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2820   * non-UTF-8 */
2821   if (UTF) {
2822
2823    /* Examine the string for a multi-character fold sequence.  UTF-8
2824    * patterns have all characters pre-folded by the time this code is
2825    * executed */
2826    while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2827          length sequence we are looking for is 2 */
2828    {
2829     int count = 0;
2830     int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2831     if (! len) {    /* Not a multi-char fold: get next char */
2832      s += UTF8SKIP(s);
2833      continue;
2834     }
2835
2836     /* Nodes with 'ss' require special handling, except for EXACTFL
2837     * and EXACTFA for which there is no multi-char fold to this */
2838     if (len == 2 && *s == 's' && *(s+1) == 's'
2839      && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2840     {
2841      count = 2;
2842      OP(scan) = EXACTFU_SS;
2843      s += 2;
2844     }
2845     else if (len == 6   /* len is the same in both ASCII and EBCDIC for these */
2846       && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2847          COMBINING_DIAERESIS_UTF8
2848          COMBINING_ACUTE_ACCENT_UTF8,
2849         6)
2850        || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2851           COMBINING_DIAERESIS_UTF8
2852           COMBINING_ACUTE_ACCENT_UTF8,
2853          6)))
2854     {
2855      count = 3;
2856
2857      /* These two folds require special handling by trie's, so
2858      * change the node type to indicate this.  If EXACTFA and
2859      * EXACTFL were ever to be handled by trie's, this would
2860      * have to be changed.  If this node has already been
2861      * changed to EXACTFU_SS in this loop, leave it as is.  (I
2862      * (khw) think it doesn't matter in regexec.c for UTF
2863      * patterns, but no need to change it */
2864      if (OP(scan) == EXACTFU) {
2865       OP(scan) = EXACTFU_TRICKYFOLD;
2866      }
2867      s += 6;
2868     }
2869     else { /* Here is a generic multi-char fold. */
2870      const U8* multi_end  = s + len;
2871
2872      /* Count how many characters in it.  In the case of /l and
2873      * /aa, no folds which contain ASCII code points are
2874      * allowed, so check for those, and skip if found.  (In
2875      * EXACTFL, no folds are allowed to any Latin1 code point,
2876      * not just ASCII.  But there aren't any of these
2877      * currently, nor ever likely, so don't take the time to
2878      * test for them.  The code that generates the
2879      * is_MULTI_foo() macros croaks should one actually get put
2880      * into Unicode .) */
2881      if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2882       count = utf8_length(s, multi_end);
2883       s = multi_end;
2884      }
2885      else {
2886       while (s < multi_end) {
2887        if (isASCII(*s)) {
2888         s++;
2889         goto next_iteration;
2890        }
2891        else {
2892         s += UTF8SKIP(s);
2893        }
2894        count++;
2895       }
2896      }
2897     }
2898
2899     /* The delta is how long the sequence is minus 1 (1 is how long
2900     * the character that folds to the sequence is) */
2901     *min_subtract += count - 1;
2902    next_iteration: ;
2903    }
2904   }
2905   else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2906
2907    /* Here, the pattern is not UTF-8.  Look for the multi-char folds
2908    * that are all ASCII.  As in the above case, EXACTFL and EXACTFA
2909    * nodes can't have multi-char folds to this range (and there are
2910    * no existing ones in the upper latin1 range).  In the EXACTF
2911    * case we look also for the sharp s, which can be in the final
2912    * position.  Otherwise we can stop looking 1 byte earlier because
2913    * have to find at least two characters for a multi-fold */
2914    const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2915
2916    /* The below is perhaps overboard, but this allows us to save a
2917    * test each time through the loop at the expense of a mask.  This
2918    * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2919    * by a single bit.  On ASCII they are 32 apart; on EBCDIC, they
2920    * are 64.  This uses an exclusive 'or' to find that bit and then
2921    * inverts it to form a mask, with just a single 0, in the bit
2922    * position where 'S' and 's' differ. */
2923    const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2924    const U8 s_masked = 's' & S_or_s_mask;
2925
2926    while (s < upper) {
2927     int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2928     if (! len) {    /* Not a multi-char fold. */
2929      if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2930      {
2931       *has_exactf_sharp_s = TRUE;
2932      }
2933      s++;
2934      continue;
2935     }
2936
2937     if (len == 2
2938      && ((*s & S_or_s_mask) == s_masked)
2939      && ((*(s+1) & S_or_s_mask) == s_masked))
2940     {
2941
2942      /* EXACTF nodes need to know that the minimum length
2943      * changed so that a sharp s in the string can match this
2944      * ss in the pattern, but they remain EXACTF nodes, as they
2945      * won't match this unless the target string is is UTF-8,
2946      * which we don't know until runtime */
2947      if (OP(scan) != EXACTF) {
2948       OP(scan) = EXACTFU_SS;
2949      }
2950     }
2951
2952     *min_subtract += len - 1;
2953     s += len;
2954    }
2955   }
2956  }
2957
2958 #ifdef DEBUGGING
2959  /* Allow dumping but overwriting the collection of skipped
2960  * ops and/or strings with fake optimized ops */
2961  n = scan + NODE_SZ_STR(scan);
2962  while (n <= stop) {
2963   OP(n) = OPTIMIZED;
2964   FLAGS(n) = 0;
2965   NEXT_OFF(n) = 0;
2966   n++;
2967  }
2968 #endif
2969  DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2970  return stopnow;
2971 }
2972
2973 /* REx optimizer.  Converts nodes into quicker variants "in place".
2974    Finds fixed substrings.  */
2975
2976 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2977    to the position after last scanned or to NULL. */
2978
2979 #define INIT_AND_WITHP \
2980  assert(!and_withp); \
2981  Newx(and_withp,1,struct regnode_charclass_class); \
2982  SAVEFREEPV(and_withp)
2983
2984 /* this is a chain of data about sub patterns we are processing that
2985    need to be handled separately/specially in study_chunk. Its so
2986    we can simulate recursion without losing state.  */
2987 struct scan_frame;
2988 typedef struct scan_frame {
2989  regnode *last;  /* last node to process in this frame */
2990  regnode *next;  /* next node to process when last is reached */
2991  struct scan_frame *prev; /*previous frame*/
2992  I32 stop; /* what stopparen do we use */
2993 } scan_frame;
2994
2995
2996 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2997
2998 STATIC I32
2999 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3000       I32 *minlenp, I32 *deltap,
3001       regnode *last,
3002       scan_data_t *data,
3003       I32 stopparen,
3004       U8* recursed,
3005       struct regnode_charclass_class *and_withp,
3006       U32 flags, U32 depth)
3007       /* scanp: Start here (read-write). */
3008       /* deltap: Write maxlen-minlen here. */
3009       /* last: Stop before this one. */
3010       /* data: string data about the pattern */
3011       /* stopparen: treat close N as END */
3012       /* recursed: which subroutines have we recursed into */
3013       /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3014 {
3015  dVAR;
3016  I32 min = 0;    /* There must be at least this number of characters to match */
3017  I32 pars = 0, code;
3018  regnode *scan = *scanp, *next;
3019  I32 delta = 0;
3020  int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3021  int is_inf_internal = 0;  /* The studied chunk is infinite */
3022  I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3023  scan_data_t data_fake;
3024  SV *re_trie_maxbuff = NULL;
3025  regnode *first_non_open = scan;
3026  I32 stopmin = I32_MAX;
3027  scan_frame *frame = NULL;
3028  GET_RE_DEBUG_FLAGS_DECL;
3029
3030  PERL_ARGS_ASSERT_STUDY_CHUNK;
3031
3032 #ifdef DEBUGGING
3033  StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3034 #endif
3035
3036  if ( depth == 0 ) {
3037   while (first_non_open && OP(first_non_open) == OPEN)
3038    first_non_open=regnext(first_non_open);
3039  }
3040
3041
3042   fake_study_recurse:
3043  while ( scan && OP(scan) != END && scan < last ){
3044   UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3045         node length to get a real minimum (because
3046         the folded version may be shorter) */
3047   bool has_exactf_sharp_s = FALSE;
3048   /* Peephole optimizer: */
3049   DEBUG_STUDYDATA("Peep:", data,depth);
3050   DEBUG_PEEP("Peep",scan,depth);
3051
3052   /* Its not clear to khw or hv why this is done here, and not in the
3053   * clauses that deal with EXACT nodes.  khw's guess is that it's
3054   * because of a previous design */
3055   JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3056
3057   /* Follow the next-chain of the current node and optimize
3058   away all the NOTHINGs from it.  */
3059   if (OP(scan) != CURLYX) {
3060    const int max = (reg_off_by_arg[OP(scan)]
3061      ? I32_MAX
3062      /* I32 may be smaller than U16 on CRAYs! */
3063      : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3064    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3065    int noff;
3066    regnode *n = scan;
3067
3068    /* Skip NOTHING and LONGJMP. */
3069    while ((n = regnext(n))
3070     && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3071      || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3072     && off + noff < max)
3073     off += noff;
3074    if (reg_off_by_arg[OP(scan)])
3075     ARG(scan) = off;
3076    else
3077     NEXT_OFF(scan) = off;
3078   }
3079
3080
3081
3082   /* The principal pseudo-switch.  Cannot be a switch, since we
3083   look into several different things.  */
3084   if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3085     || OP(scan) == IFTHEN) {
3086    next = regnext(scan);
3087    code = OP(scan);
3088    /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3089
3090    if (OP(next) == code || code == IFTHEN) {
3091     /* NOTE - There is similar code to this block below for handling
3092     TRIE nodes on a re-study.  If you change stuff here check there
3093     too. */
3094     I32 max1 = 0, min1 = I32_MAX, num = 0;
3095     struct regnode_charclass_class accum;
3096     regnode * const startbranch=scan;
3097
3098     if (flags & SCF_DO_SUBSTR)
3099      SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3100     if (flags & SCF_DO_STCLASS)
3101      cl_init_zero(pRExC_state, &accum);
3102
3103     while (OP(scan) == code) {
3104      I32 deltanext, minnext, f = 0, fake;
3105      struct regnode_charclass_class this_class;
3106
3107      num++;
3108      data_fake.flags = 0;
3109      if (data) {
3110       data_fake.whilem_c = data->whilem_c;
3111       data_fake.last_closep = data->last_closep;
3112      }
3113      else
3114       data_fake.last_closep = &fake;
3115
3116      data_fake.pos_delta = delta;
3117      next = regnext(scan);
3118      scan = NEXTOPER(scan);
3119      if (code != BRANCH)
3120       scan = NEXTOPER(scan);
3121      if (flags & SCF_DO_STCLASS) {
3122       cl_init(pRExC_state, &this_class);
3123       data_fake.start_class = &this_class;
3124       f = SCF_DO_STCLASS_AND;
3125      }
3126      if (flags & SCF_WHILEM_VISITED_POS)
3127       f |= SCF_WHILEM_VISITED_POS;
3128
3129      /* we suppose the run is continuous, last=next...*/
3130      minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3131           next, &data_fake,
3132           stopparen, recursed, NULL, f,depth+1);
3133      if (min1 > minnext)
3134       min1 = minnext;
3135      if (deltanext == I32_MAX) {
3136       is_inf = is_inf_internal = 1;
3137       max1 = I32_MAX;
3138      } else if (max1 < minnext + deltanext)
3139       max1 = minnext + deltanext;
3140      scan = next;
3141      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3142       pars++;
3143      if (data_fake.flags & SCF_SEEN_ACCEPT) {
3144       if ( stopmin > minnext)
3145        stopmin = min + min1;
3146       flags &= ~SCF_DO_SUBSTR;
3147       if (data)
3148        data->flags |= SCF_SEEN_ACCEPT;
3149      }
3150      if (data) {
3151       if (data_fake.flags & SF_HAS_EVAL)
3152        data->flags |= SF_HAS_EVAL;
3153       data->whilem_c = data_fake.whilem_c;
3154      }
3155      if (flags & SCF_DO_STCLASS)
3156       cl_or(pRExC_state, &accum, &this_class);
3157     }
3158     if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3159      min1 = 0;
3160     if (flags & SCF_DO_SUBSTR) {
3161      data->pos_min += min1;
3162      if (data->pos_delta >= I32_MAX - (max1 - min1))
3163       data->pos_delta = I32_MAX;
3164      else
3165       data->pos_delta += max1 - min1;
3166      if (max1 != min1 || is_inf)
3167       data->longest = &(data->longest_float);
3168     }
3169     min += min1;
3170     if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0)
3171      delta = I32_MAX;
3172     else
3173      delta += max1 - min1;
3174     if (flags & SCF_DO_STCLASS_OR) {
3175      cl_or(pRExC_state, data->start_class, &accum);
3176      if (min1) {
3177       cl_and(data->start_class, and_withp);
3178       flags &= ~SCF_DO_STCLASS;
3179      }
3180     }
3181     else if (flags & SCF_DO_STCLASS_AND) {
3182      if (min1) {
3183       cl_and(data->start_class, &accum);
3184       flags &= ~SCF_DO_STCLASS;
3185      }
3186      else {
3187       /* Switch to OR mode: cache the old value of
3188       * data->start_class */
3189       INIT_AND_WITHP;
3190       StructCopy(data->start_class, and_withp,
3191         struct regnode_charclass_class);
3192       flags &= ~SCF_DO_STCLASS_AND;
3193       StructCopy(&accum, data->start_class,
3194         struct regnode_charclass_class);
3195       flags |= SCF_DO_STCLASS_OR;
3196       SET_SSC_EOS(data->start_class);
3197      }
3198     }
3199
3200     if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3201     /* demq.
3202
3203     Assuming this was/is a branch we are dealing with: 'scan' now
3204     points at the item that follows the branch sequence, whatever
3205     it is. We now start at the beginning of the sequence and look
3206     for subsequences of
3207
3208     BRANCH->EXACT=>x1
3209     BRANCH->EXACT=>x2
3210     tail
3211
3212     which would be constructed from a pattern like /A|LIST|OF|WORDS/
3213
3214     If we can find such a subsequence we need to turn the first
3215     element into a trie and then add the subsequent branch exact
3216     strings to the trie.
3217
3218     We have two cases
3219
3220      1. patterns where the whole set of branches can be converted.
3221
3222      2. patterns where only a subset can be converted.
3223
3224     In case 1 we can replace the whole set with a single regop
3225     for the trie. In case 2 we need to keep the start and end
3226     branches so
3227
3228      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3229      becomes BRANCH TRIE; BRANCH X;
3230
3231     There is an additional case, that being where there is a
3232     common prefix, which gets split out into an EXACT like node
3233     preceding the TRIE node.
3234
3235     If x(1..n)==tail then we can do a simple trie, if not we make
3236     a "jump" trie, such that when we match the appropriate word
3237     we "jump" to the appropriate tail node. Essentially we turn
3238     a nested if into a case structure of sorts.
3239
3240     */
3241
3242      int made=0;
3243      if (!re_trie_maxbuff) {
3244       re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3245       if (!SvIOK(re_trie_maxbuff))
3246        sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3247      }
3248      if ( SvIV(re_trie_maxbuff)>=0  ) {
3249       regnode *cur;
3250       regnode *first = (regnode *)NULL;
3251       regnode *last = (regnode *)NULL;
3252       regnode *tail = scan;
3253       U8 trietype = 0;
3254       U32 count=0;
3255
3256 #ifdef DEBUGGING
3257       SV * const mysv = sv_newmortal();       /* for dumping */
3258 #endif
3259       /* var tail is used because there may be a TAIL
3260       regop in the way. Ie, the exacts will point to the
3261       thing following the TAIL, but the last branch will
3262       point at the TAIL. So we advance tail. If we
3263       have nested (?:) we may have to move through several
3264       tails.
3265       */
3266
3267       while ( OP( tail ) == TAIL ) {
3268        /* this is the TAIL generated by (?:) */
3269        tail = regnext( tail );
3270       }
3271
3272
3273       DEBUG_TRIE_COMPILE_r({
3274        regprop(RExC_rx, mysv, tail );
3275        PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3276         (int)depth * 2 + 2, "",
3277         "Looking for TRIE'able sequences. Tail node is: ",
3278         SvPV_nolen_const( mysv )
3279        );
3280       });
3281
3282       /*
3283
3284        Step through the branches
3285         cur represents each branch,
3286         noper is the first thing to be matched as part of that branch
3287         noper_next is the regnext() of that node.
3288
3289        We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3290        via a "jump trie" but we also support building with NOJUMPTRIE,
3291        which restricts the trie logic to structures like /FOO|BAR/.
3292
3293        If noper is a trieable nodetype then the branch is a possible optimization
3294        target. If we are building under NOJUMPTRIE then we require that noper_next
3295        is the same as scan (our current position in the regex program).
3296
3297        Once we have two or more consecutive such branches we can create a
3298        trie of the EXACT's contents and stitch it in place into the program.
3299
3300        If the sequence represents all of the branches in the alternation we
3301        replace the entire thing with a single TRIE node.
3302
3303        Otherwise when it is a subsequence we need to stitch it in place and
3304        replace only the relevant branches. This means the first branch has
3305        to remain as it is used by the alternation logic, and its next pointer,
3306        and needs to be repointed at the item on the branch chain following
3307        the last branch we have optimized away.
3308
3309        This could be either a BRANCH, in which case the subsequence is internal,
3310        or it could be the item following the branch sequence in which case the
3311        subsequence is at the end (which does not necessarily mean the first node
3312        is the start of the alternation).
3313
3314        TRIE_TYPE(X) is a define which maps the optype to a trietype.
3315
3316         optype          |  trietype
3317         ----------------+-----------
3318         NOTHING         | NOTHING
3319         EXACT           | EXACT
3320         EXACTFU         | EXACTFU
3321         EXACTFU_SS      | EXACTFU
3322         EXACTFU_TRICKYFOLD | EXACTFU
3323         EXACTFA         | 0
3324
3325
3326       */
3327 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3328      ( EXACT == (X) )   ? EXACT :        \
3329      ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3330      0 )
3331
3332       /* dont use tail as the end marker for this traverse */
3333       for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3334        regnode * const noper = NEXTOPER( cur );
3335        U8 noper_type = OP( noper );
3336        U8 noper_trietype = TRIE_TYPE( noper_type );
3337 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3338        regnode * const noper_next = regnext( noper );
3339        U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3340        U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3341 #endif
3342
3343        DEBUG_TRIE_COMPILE_r({
3344         regprop(RExC_rx, mysv, cur);
3345         PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3346         (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3347
3348         regprop(RExC_rx, mysv, noper);
3349         PerlIO_printf( Perl_debug_log, " -> %s",
3350          SvPV_nolen_const(mysv));
3351
3352         if ( noper_next ) {
3353         regprop(RExC_rx, mysv, noper_next );
3354         PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3355          SvPV_nolen_const(mysv));
3356         }
3357         PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3358         REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3359         PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3360         );
3361        });
3362
3363        /* Is noper a trieable nodetype that can be merged with the
3364        * current trie (if there is one)? */
3365        if ( noper_trietype
3366         &&
3367         (
3368           ( noper_trietype == NOTHING)
3369           || ( trietype == NOTHING )
3370           || ( trietype == noper_trietype )
3371         )
3372 #ifdef NOJUMPTRIE
3373         && noper_next == tail
3374 #endif
3375         && count < U16_MAX)
3376        {
3377         /* Handle mergable triable node
3378         * Either we are the first node in a new trieable sequence,
3379         * in which case we do some bookkeeping, otherwise we update
3380         * the end pointer. */
3381         if ( !first ) {
3382          first = cur;
3383          if ( noper_trietype == NOTHING ) {
3384 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3385           regnode * const noper_next = regnext( noper );
3386           U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3387           U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3388 #endif
3389
3390           if ( noper_next_trietype ) {
3391            trietype = noper_next_trietype;
3392           } else if (noper_next_type)  {
3393            /* a NOTHING regop is 1 regop wide. We need at least two
3394            * for a trie so we can't merge this in */
3395            first = NULL;
3396           }
3397          } else {
3398           trietype = noper_trietype;
3399          }
3400         } else {
3401          if ( trietype == NOTHING )
3402           trietype = noper_trietype;
3403          last = cur;
3404         }
3405         if (first)
3406          count++;
3407        } /* end handle mergable triable node */
3408        else {
3409         /* handle unmergable node -
3410         * noper may either be a triable node which can not be tried
3411         * together with the current trie, or a non triable node */
3412         if ( last ) {
3413          /* If last is set and trietype is not NOTHING then we have found
3414          * at least two triable branch sequences in a row of a similar
3415          * trietype so we can turn them into a trie. If/when we
3416          * allow NOTHING to start a trie sequence this condition will be
3417          * required, and it isn't expensive so we leave it in for now. */
3418          if ( trietype && trietype != NOTHING )
3419           make_trie( pRExC_state,
3420             startbranch, first, cur, tail, count,
3421             trietype, depth+1 );
3422          last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3423         }
3424         if ( noper_trietype
3425 #ifdef NOJUMPTRIE
3426          && noper_next == tail
3427 #endif
3428         ){
3429          /* noper is triable, so we can start a new trie sequence */
3430          count = 1;
3431          first = cur;
3432          trietype = noper_trietype;
3433         } else if (first) {
3434          /* if we already saw a first but the current node is not triable then we have
3435          * to reset the first information. */
3436          count = 0;
3437          first = NULL;
3438          trietype = 0;
3439         }
3440        } /* end handle unmergable node */
3441       } /* loop over branches */
3442       DEBUG_TRIE_COMPILE_r({
3443        regprop(RExC_rx, mysv, cur);
3444        PerlIO_printf( Perl_debug_log,
3445        "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3446        "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3447
3448       });
3449       if ( last && trietype ) {
3450        if ( trietype != NOTHING ) {
3451         /* the last branch of the sequence was part of a trie,
3452         * so we have to construct it here outside of the loop
3453         */
3454         made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3455 #ifdef TRIE_STUDY_OPT
3456         if ( ((made == MADE_EXACT_TRIE &&
3457          startbranch == first)
3458          || ( first_non_open == first )) &&
3459          depth==0 ) {
3460          flags |= SCF_TRIE_RESTUDY;
3461          if ( startbranch == first
3462           && scan == tail )
3463          {
3464           RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3465          }
3466         }
3467 #endif
3468        } else {
3469         /* at this point we know whatever we have is a NOTHING sequence/branch
3470         * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3471         */
3472         if ( startbranch == first ) {
3473          regnode *opt;
3474          /* the entire thing is a NOTHING sequence, something like this:
3475          * (?:|) So we can turn it into a plain NOTHING op. */
3476          DEBUG_TRIE_COMPILE_r({
3477           regprop(RExC_rx, mysv, cur);
3478           PerlIO_printf( Perl_debug_log,
3479           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3480           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3481
3482          });
3483          OP(startbranch)= NOTHING;
3484          NEXT_OFF(startbranch)= tail - startbranch;
3485          for ( opt= startbranch + 1; opt < tail ; opt++ )
3486           OP(opt)= OPTIMIZED;
3487         }
3488        }
3489       } /* end if ( last) */
3490      } /* TRIE_MAXBUF is non zero */
3491
3492     } /* do trie */
3493
3494    }
3495    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3496     scan = NEXTOPER(NEXTOPER(scan));
3497    } else   /* single branch is optimized. */
3498     scan = NEXTOPER(scan);
3499    continue;
3500   } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3501    scan_frame *newframe = NULL;
3502    I32 paren;
3503    regnode *start;
3504    regnode *end;
3505
3506    if (OP(scan) != SUSPEND) {
3507    /* set the pointer */
3508     if (OP(scan) == GOSUB) {
3509      paren = ARG(scan);
3510      RExC_recurse[ARG2L(scan)] = scan;
3511      start = RExC_open_parens[paren-1];
3512      end   = RExC_close_parens[paren-1];
3513     } else {
3514      paren = 0;
3515      start = RExC_rxi->program + 1;
3516      end   = RExC_opend;
3517     }
3518     if (!recursed) {
3519      Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3520      SAVEFREEPV(recursed);
3521     }
3522     if (!PAREN_TEST(recursed,paren+1)) {
3523      PAREN_SET(recursed,paren+1);
3524      Newx(newframe,1,scan_frame);
3525     } else {
3526      if (flags & SCF_DO_SUBSTR) {
3527       SCAN_COMMIT(pRExC_state,data,minlenp);
3528       data->longest = &(data->longest_float);
3529      }
3530      is_inf = is_inf_internal = 1;
3531      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3532       cl_anything(pRExC_state, data->start_class);
3533      flags &= ~SCF_DO_STCLASS;
3534     }
3535    } else {
3536     Newx(newframe,1,scan_frame);
3537     paren = stopparen;
3538     start = scan+2;
3539     end = regnext(scan);
3540    }
3541    if (newframe) {
3542     assert(start);
3543     assert(end);
3544     SAVEFREEPV(newframe);
3545     newframe->next = regnext(scan);
3546     newframe->last = last;
3547     newframe->stop = stopparen;
3548     newframe->prev = frame;
3549
3550     frame = newframe;
3551     scan =  start;
3552     stopparen = paren;
3553     last = end;
3554
3555     continue;
3556    }
3557   }
3558   else if (OP(scan) == EXACT) {
3559    I32 l = STR_LEN(scan);
3560    UV uc;
3561    if (UTF) {
3562     const U8 * const s = (U8*)STRING(scan);
3563     uc = utf8_to_uvchr_buf(s, s + l, NULL);
3564     l = utf8_length(s, s + l);
3565    } else {
3566     uc = *((U8*)STRING(scan));
3567    }
3568    min += l;
3569    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3570     /* The code below prefers earlier match for fixed
3571     offset, later match for variable offset.  */
3572     if (data->last_end == -1) { /* Update the start info. */
3573      data->last_start_min = data->pos_min;
3574      data->last_start_max = is_inf
3575       ? I32_MAX : data->pos_min + data->pos_delta;
3576     }
3577     sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3578     if (UTF)
3579      SvUTF8_on(data->last_found);
3580     {
3581      SV * const sv = data->last_found;
3582      MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3583       mg_find(sv, PERL_MAGIC_utf8) : NULL;
3584      if (mg && mg->mg_len >= 0)
3585       mg->mg_len += utf8_length((U8*)STRING(scan),
3586             (U8*)STRING(scan)+STR_LEN(scan));
3587     }
3588     data->last_end = data->pos_min + l;
3589     data->pos_min += l; /* As in the first entry. */
3590     data->flags &= ~SF_BEFORE_EOL;
3591    }
3592    if (flags & SCF_DO_STCLASS_AND) {
3593     /* Check whether it is compatible with what we know already! */
3594     int compat = 1;
3595
3596
3597     /* If compatible, we or it in below.  It is compatible if is
3598     * in the bitmp and either 1) its bit or its fold is set, or 2)
3599     * it's for a locale.  Even if there isn't unicode semantics
3600     * here, at runtime there may be because of matching against a
3601     * utf8 string, so accept a possible false positive for
3602     * latin1-range folds */
3603     if (uc >= 0x100 ||
3604      (!(data->start_class->flags & ANYOF_LOCALE)
3605      && !ANYOF_BITMAP_TEST(data->start_class, uc)
3606      && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3607       || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3608      )
3609     {
3610      compat = 0;
3611     }
3612     ANYOF_CLASS_ZERO(data->start_class);
3613     ANYOF_BITMAP_ZERO(data->start_class);
3614     if (compat)
3615      ANYOF_BITMAP_SET(data->start_class, uc);
3616     else if (uc >= 0x100) {
3617      int i;
3618
3619      /* Some Unicode code points fold to the Latin1 range; as
3620      * XXX temporary code, instead of figuring out if this is
3621      * one, just assume it is and set all the start class bits
3622      * that could be some such above 255 code point's fold
3623      * which will generate fals positives.  As the code
3624      * elsewhere that does compute the fold settles down, it
3625      * can be extracted out and re-used here */
3626      for (i = 0; i < 256; i++){
3627       if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3628        ANYOF_BITMAP_SET(data->start_class, i);
3629       }
3630      }
3631     }
3632     CLEAR_SSC_EOS(data->start_class);
3633     if (uc < 0x100)
3634     data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3635    }
3636    else if (flags & SCF_DO_STCLASS_OR) {
3637     /* false positive possible if the class is case-folded */
3638     if (uc < 0x100)
3639      ANYOF_BITMAP_SET(data->start_class, uc);
3640     else
3641      data->start_class->flags |= ANYOF_UNICODE_ALL;
3642     CLEAR_SSC_EOS(data->start_class);
3643     cl_and(data->start_class, and_withp);
3644    }
3645    flags &= ~SCF_DO_STCLASS;
3646   }
3647   else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3648    I32 l = STR_LEN(scan);
3649    UV uc = *((U8*)STRING(scan));
3650
3651    /* Search for fixed substrings supports EXACT only. */
3652    if (flags & SCF_DO_SUBSTR) {
3653     assert(data);
3654     SCAN_COMMIT(pRExC_state, data, minlenp);
3655    }
3656    if (UTF) {
3657     const U8 * const s = (U8 *)STRING(scan);
3658     uc = utf8_to_uvchr_buf(s, s + l, NULL);
3659     l = utf8_length(s, s + l);
3660    }
3661    if (has_exactf_sharp_s) {
3662     RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3663    }
3664    min += l - min_subtract;
3665    assert (min >= 0);
3666    delta += min_subtract;
3667    if (flags & SCF_DO_SUBSTR) {
3668     data->pos_min += l - min_subtract;
3669     if (data->pos_min < 0) {
3670      data->pos_min = 0;
3671     }
3672     data->pos_delta += min_subtract;
3673     if (min_subtract) {
3674      data->longest = &(data->longest_float);
3675     }
3676    }
3677    if (flags & SCF_DO_STCLASS_AND) {
3678     /* Check whether it is compatible with what we know already! */
3679     int compat = 1;
3680     if (uc >= 0x100 ||
3681     (!(data->start_class->flags & ANYOF_LOCALE)
3682     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3683     && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3684     {
3685      compat = 0;
3686     }
3687     ANYOF_CLASS_ZERO(data->start_class);
3688     ANYOF_BITMAP_ZERO(data->start_class);
3689     if (compat) {
3690      ANYOF_BITMAP_SET(data->start_class, uc);
3691      CLEAR_SSC_EOS(data->start_class);
3692      if (OP(scan) == EXACTFL) {
3693       /* XXX This set is probably no longer necessary, and
3694       * probably wrong as LOCALE now is on in the initial
3695       * state */
3696       data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3697      }
3698      else {
3699
3700       /* Also set the other member of the fold pair.  In case
3701       * that unicode semantics is called for at runtime, use
3702       * the full latin1 fold.  (Can't do this for locale,
3703       * because not known until runtime) */
3704       ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3705
3706       /* All other (EXACTFL handled above) folds except under
3707       * /iaa that include s, S, and sharp_s also may include
3708       * the others */
3709       if (OP(scan) != EXACTFA) {
3710        if (uc == 's' || uc == 'S') {
3711         ANYOF_BITMAP_SET(data->start_class,
3712             LATIN_SMALL_LETTER_SHARP_S);
3713        }
3714        else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3715         ANYOF_BITMAP_SET(data->start_class, 's');
3716         ANYOF_BITMAP_SET(data->start_class, 'S');
3717        }
3718       }
3719      }
3720     }
3721     else if (uc >= 0x100) {
3722      int i;
3723      for (i = 0; i < 256; i++){
3724       if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3725        ANYOF_BITMAP_SET(data->start_class, i);
3726       }
3727      }
3728     }
3729    }
3730    else if (flags & SCF_DO_STCLASS_OR) {
3731     if (data->start_class->flags & ANYOF_LOC_FOLD) {
3732      /* false positive possible if the class is case-folded.
3733      Assume that the locale settings are the same... */
3734      if (uc < 0x100) {
3735       ANYOF_BITMAP_SET(data->start_class, uc);
3736       if (OP(scan) != EXACTFL) {
3737
3738        /* And set the other member of the fold pair, but
3739        * can't do that in locale because not known until
3740        * run-time */
3741        ANYOF_BITMAP_SET(data->start_class,
3742            PL_fold_latin1[uc]);
3743
3744        /* All folds except under /iaa that include s, S,
3745        * and sharp_s also may include the others */
3746        if (OP(scan) != EXACTFA) {
3747         if (uc == 's' || uc == 'S') {
3748          ANYOF_BITMAP_SET(data->start_class,
3749             LATIN_SMALL_LETTER_SHARP_S);
3750         }
3751         else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3752          ANYOF_BITMAP_SET(data->start_class, 's');
3753          ANYOF_BITMAP_SET(data->start_class, 'S');
3754         }
3755        }
3756       }
3757      }
3758      CLEAR_SSC_EOS(data->start_class);
3759     }
3760     cl_and(data->start_class, and_withp);
3761    }
3762    flags &= ~SCF_DO_STCLASS;
3763   }
3764   else if (REGNODE_VARIES(OP(scan))) {
3765    I32 mincount, maxcount, minnext, deltanext, fl = 0;
3766    I32 f = flags, pos_before = 0;
3767    regnode * const oscan = scan;
3768    struct regnode_charclass_class this_class;
3769    struct regnode_charclass_class *oclass = NULL;
3770    I32 next_is_eval = 0;
3771
3772    switch (PL_regkind[OP(scan)]) {
3773    case WHILEM:  /* End of (?:...)* . */
3774     scan = NEXTOPER(scan);
3775     goto finish;
3776    case PLUS:
3777     if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3778      next = NEXTOPER(scan);
3779      if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3780       mincount = 1;
3781       maxcount = REG_INFTY;
3782       next = regnext(scan);
3783       scan = NEXTOPER(scan);
3784       goto do_curly;
3785      }
3786     }
3787     if (flags & SCF_DO_SUBSTR)
3788      data->pos_min++;
3789     min++;
3790     /* Fall through. */
3791    case STAR:
3792     if (flags & SCF_DO_STCLASS) {
3793      mincount = 0;
3794      maxcount = REG_INFTY;
3795      next = regnext(scan);
3796      scan = NEXTOPER(scan);
3797      goto do_curly;
3798     }
3799     is_inf = is_inf_internal = 1;
3800     scan = regnext(scan);
3801     if (flags & SCF_DO_SUBSTR) {
3802      SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3803      data->longest = &(data->longest_float);
3804     }
3805     goto optimize_curly_tail;
3806    case CURLY:
3807     if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3808      && (scan->flags == stopparen))
3809     {
3810      mincount = 1;
3811      maxcount = 1;
3812     } else {
3813      mincount = ARG1(scan);
3814      maxcount = ARG2(scan);
3815     }
3816     next = regnext(scan);
3817     if (OP(scan) == CURLYX) {
3818      I32 lp = (data ? *(data->last_closep) : 0);
3819      scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3820     }
3821     scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3822     next_is_eval = (OP(scan) == EVAL);
3823    do_curly:
3824     if (flags & SCF_DO_SUBSTR) {
3825      if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3826      pos_before = data->pos_min;
3827     }
3828     if (data) {
3829      fl = data->flags;
3830      data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3831      if (is_inf)
3832       data->flags |= SF_IS_INF;
3833     }
3834     if (flags & SCF_DO_STCLASS) {
3835      cl_init(pRExC_state, &this_class);
3836      oclass = data->start_class;
3837      data->start_class = &this_class;
3838      f |= SCF_DO_STCLASS_AND;
3839      f &= ~SCF_DO_STCLASS_OR;
3840     }
3841     /* Exclude from super-linear cache processing any {n,m}
3842     regops for which the combination of input pos and regex
3843     pos is not enough information to determine if a match
3844     will be possible.
3845
3846     For example, in the regex /foo(bar\s*){4,8}baz/ with the
3847     regex pos at the \s*, the prospects for a match depend not
3848     only on the input position but also on how many (bar\s*)
3849     repeats into the {4,8} we are. */
3850    if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3851      f &= ~SCF_WHILEM_VISITED_POS;
3852
3853     /* This will finish on WHILEM, setting scan, or on NULL: */
3854     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3855          last, data, stopparen, recursed, NULL,
3856          (mincount == 0
3857           ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3858
3859     if (flags & SCF_DO_STCLASS)
3860      data->start_class = oclass;
3861     if (mincount == 0 || minnext == 0) {
3862      if (flags & SCF_DO_STCLASS_OR) {
3863       cl_or(pRExC_state, data->start_class, &this_class);
3864      }
3865      else if (flags & SCF_DO_STCLASS_AND) {
3866       /* Switch to OR mode: cache the old value of
3867       * data->start_class */
3868       INIT_AND_WITHP;
3869       StructCopy(data->start_class, and_withp,
3870         struct regnode_charclass_class);
3871       flags &= ~SCF_DO_STCLASS_AND;
3872       StructCopy(&this_class, data->start_class,
3873         struct regnode_charclass_class);
3874       flags |= SCF_DO_STCLASS_OR;
3875       SET_SSC_EOS(data->start_class);
3876      }
3877     } else {  /* Non-zero len */
3878      if (flags & SCF_DO_STCLASS_OR) {
3879       cl_or(pRExC_state, data->start_class, &this_class);
3880       cl_and(data->start_class, and_withp);
3881      }
3882      else if (flags & SCF_DO_STCLASS_AND)
3883       cl_and(data->start_class, &this_class);
3884      flags &= ~SCF_DO_STCLASS;
3885     }
3886     if (!scan)   /* It was not CURLYX, but CURLY. */
3887      scan = next;
3888     if ( /* ? quantifier ok, except for (?{ ... }) */
3889      (next_is_eval || !(mincount == 0 && maxcount == 1))
3890      && (minnext == 0) && (deltanext == 0)
3891      && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3892      && maxcount <= REG_INFTY/3) /* Complement check for big count */
3893     {
3894      /* Fatal warnings may leak the regexp without this: */
3895      SAVEFREESV(RExC_rx_sv);
3896      ckWARNreg(RExC_parse,
3897        "Quantifier unexpected on zero-length expression");
3898      (void)ReREFCNT_inc(RExC_rx_sv);
3899     }
3900
3901     min += minnext * mincount;
3902     is_inf_internal |= deltanext == I32_MAX
3903          || (maxcount == REG_INFTY && minnext + deltanext > 0);
3904     is_inf |= is_inf_internal;
3905     if (is_inf)
3906      delta = I32_MAX;
3907     else
3908      delta += (minnext + deltanext) * maxcount - minnext * mincount;
3909
3910     /* Try powerful optimization CURLYX => CURLYN. */
3911     if (  OP(oscan) == CURLYX && data
3912      && data->flags & SF_IN_PAR
3913      && !(data->flags & SF_HAS_EVAL)
3914      && !deltanext && minnext == 1 ) {
3915      /* Try to optimize to CURLYN.  */
3916      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3917      regnode * const nxt1 = nxt;
3918 #ifdef DEBUGGING
3919      regnode *nxt2;
3920 #endif
3921
3922      /* Skip open. */
3923      nxt = regnext(nxt);
3924      if (!REGNODE_SIMPLE(OP(nxt))
3925       && !(PL_regkind[OP(nxt)] == EXACT
3926        && STR_LEN(nxt) == 1))
3927       goto nogo;
3928 #ifdef DEBUGGING
3929      nxt2 = nxt;
3930 #endif
3931      nxt = regnext(nxt);
3932      if (OP(nxt) != CLOSE)
3933       goto nogo;
3934      if (RExC_open_parens) {
3935       RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3936       RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3937      }
3938      /* Now we know that nxt2 is the only contents: */
3939      oscan->flags = (U8)ARG(nxt);
3940      OP(oscan) = CURLYN;
3941      OP(nxt1) = NOTHING; /* was OPEN. */
3942
3943 #ifdef DEBUGGING
3944      OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3945      NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3946      NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3947      OP(nxt) = OPTIMIZED; /* was CLOSE. */
3948      OP(nxt + 1) = OPTIMIZED; /* was count. */
3949      NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3950 #endif
3951     }
3952    nogo:
3953
3954     /* Try optimization CURLYX => CURLYM. */
3955     if (  OP(oscan) == CURLYX && data
3956      && !(data->flags & SF_HAS_PAR)
3957      && !(data->flags & SF_HAS_EVAL)
3958      && !deltanext /* atom is fixed width */
3959      && minnext != 0 /* CURLYM can't handle zero width */
3960      && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3961     ) {
3962      /* XXXX How to optimize if data == 0? */
3963      /* Optimize to a simpler form.  */
3964      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3965      regnode *nxt2;
3966
3967      OP(oscan) = CURLYM;
3968      while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3969        && (OP(nxt2) != WHILEM))
3970       nxt = nxt2;
3971      OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3972      /* Need to optimize away parenths. */
3973      if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3974       /* Set the parenth number.  */
3975       regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3976
3977       oscan->flags = (U8)ARG(nxt);
3978       if (RExC_open_parens) {
3979        RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3980        RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3981       }
3982       OP(nxt1) = OPTIMIZED; /* was OPEN. */
3983       OP(nxt) = OPTIMIZED; /* was CLOSE. */
3984
3985 #ifdef DEBUGGING
3986       OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3987       OP(nxt + 1) = OPTIMIZED; /* was count. */
3988       NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3989       NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3990 #endif
3991 #if 0
3992       while ( nxt1 && (OP(nxt1) != WHILEM)) {
3993        regnode *nnxt = regnext(nxt1);
3994        if (nnxt == nxt) {
3995         if (reg_off_by_arg[OP(nxt1)])
3996          ARG_SET(nxt1, nxt2 - nxt1);
3997         else if (nxt2 - nxt1 < U16_MAX)
3998          NEXT_OFF(nxt1) = nxt2 - nxt1;
3999         else
4000          OP(nxt) = NOTHING; /* Cannot beautify */
4001        }
4002        nxt1 = nnxt;
4003       }
4004 #endif
4005       /* Optimize again: */
4006       study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4007          NULL, stopparen, recursed, NULL, 0,depth+1);
4008      }
4009      else
4010       oscan->flags = 0;
4011     }
4012     else if ((OP(oscan) == CURLYX)
4013       && (flags & SCF_WHILEM_VISITED_POS)
4014       /* See the comment on a similar expression above.
4015        However, this time it's not a subexpression
4016        we care about, but the expression itself. */
4017       && (maxcount == REG_INFTY)
4018       && data && ++data->whilem_c < 16) {
4019      /* This stays as CURLYX, we can put the count/of pair. */
4020      /* Find WHILEM (as in regexec.c) */
4021      regnode *nxt = oscan + NEXT_OFF(oscan);
4022
4023      if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4024       nxt += ARG(nxt);
4025      PREVOPER(nxt)->flags = (U8)(data->whilem_c
4026       | (RExC_whilem_seen << 4)); /* On WHILEM */
4027     }
4028     if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4029      pars++;
4030     if (flags & SCF_DO_SUBSTR) {
4031      SV *last_str = NULL;
4032      int counted = mincount != 0;
4033
4034      if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4035 #if defined(SPARC64_GCC_WORKAROUND)
4036       I32 b = 0;
4037       STRLEN l = 0;
4038       const char *s = NULL;
4039       I32 old = 0;
4040
4041       if (pos_before >= data->last_start_min)
4042        b = pos_before;
4043       else
4044        b = data->last_start_min;
4045
4046       l = 0;
4047       s = SvPV_const(data->last_found, l);
4048       old = b - data->last_start_min;
4049
4050 #else
4051       I32 b = pos_before >= data->last_start_min
4052        ? pos_before : data->last_start_min;
4053       STRLEN l;
4054       const char * const s = SvPV_const(data->last_found, l);
4055       I32 old = b - data->last_start_min;
4056 #endif
4057
4058       if (UTF)
4059        old = utf8_hop((U8*)s, old) - (U8*)s;
4060       l -= old;
4061       /* Get the added string: */
4062       last_str = newSVpvn_utf8(s  + old, l, UTF);
4063       if (deltanext == 0 && pos_before == b) {
4064        /* What was added is a constant string */
4065        if (mincount > 1) {
4066         SvGROW(last_str, (mincount * l) + 1);
4067         repeatcpy(SvPVX(last_str) + l,
4068           SvPVX_const(last_str), l, mincount - 1);
4069         SvCUR_set(last_str, SvCUR(last_str) * mincount);
4070         /* Add additional parts. */
4071         SvCUR_set(data->last_found,
4072           SvCUR(data->last_found) - l);
4073         sv_catsv(data->last_found, last_str);
4074         {
4075          SV * sv = data->last_found;
4076          MAGIC *mg =
4077           SvUTF8(sv) && SvMAGICAL(sv) ?
4078           mg_find(sv, PERL_MAGIC_utf8) : NULL;
4079          if (mg && mg->mg_len >= 0)
4080           mg->mg_len += CHR_SVLEN(last_str) - l;
4081         }
4082         data->last_end += l * (mincount - 1);
4083        }
4084       } else {
4085        /* start offset must point into the last copy */
4086        data->last_start_min += minnext * (mincount - 1);
4087        data->last_start_max += is_inf ? I32_MAX
4088         : (maxcount - 1) * (minnext + data->pos_delta);
4089       }
4090      }
4091      /* It is counted once already... */
4092      data->pos_min += minnext * (mincount - counted);
4093 #if 0
4094 PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n",
4095  counted, deltanext, I32_MAX, minnext, maxcount, mincount);
4096 if (deltanext != I32_MAX)
4097 PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta);
4098 #endif
4099      if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta)
4100       data->pos_delta = I32_MAX;
4101      else
4102       data->pos_delta += - counted * deltanext +
4103       (minnext + deltanext) * maxcount - minnext * mincount;
4104      if (mincount != maxcount) {
4105       /* Cannot extend fixed substrings found inside
4106        the group.  */
4107       SCAN_COMMIT(pRExC_state,data,minlenp);
4108       if (mincount && last_str) {
4109        SV * const sv = data->last_found;
4110        MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4111         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4112
4113        if (mg)
4114         mg->mg_len = -1;
4115        sv_setsv(sv, last_str);
4116        data->last_end = data->pos_min;
4117        data->last_start_min =
4118         data->pos_min - CHR_SVLEN(last_str);
4119        data->last_start_max = is_inf
4120         ? I32_MAX
4121         : data->pos_min + data->pos_delta
4122         - CHR_SVLEN(last_str);
4123       }
4124       data->longest = &(data->longest_float);
4125      }
4126      SvREFCNT_dec(last_str);
4127     }
4128     if (data && (fl & SF_HAS_EVAL))
4129      data->flags |= SF_HAS_EVAL;
4130    optimize_curly_tail:
4131     if (OP(oscan) != CURLYX) {
4132      while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4133       && NEXT_OFF(next))
4134       NEXT_OFF(oscan) += NEXT_OFF(next);
4135     }
4136     continue;
4137    default:   /* REF, and CLUMP only? */
4138     if (flags & SCF_DO_SUBSTR) {
4139      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4140      data->longest = &(data->longest_float);
4141     }
4142     is_inf = is_inf_internal = 1;
4143     if (flags & SCF_DO_STCLASS_OR)
4144      cl_anything(pRExC_state, data->start_class);
4145     flags &= ~SCF_DO_STCLASS;
4146     break;
4147    }
4148   }
4149   else if (OP(scan) == LNBREAK) {
4150    if (flags & SCF_DO_STCLASS) {
4151     int value = 0;
4152     CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4153      if (flags & SCF_DO_STCLASS_AND) {
4154      for (value = 0; value < 256; value++)
4155       if (!is_VERTWS_cp(value))
4156        ANYOF_BITMAP_CLEAR(data->start_class, value);
4157     }
4158     else {
4159      for (value = 0; value < 256; value++)
4160       if (is_VERTWS_cp(value))
4161        ANYOF_BITMAP_SET(data->start_class, value);
4162     }
4163     if (flags & SCF_DO_STCLASS_OR)
4164      cl_and(data->start_class, and_withp);
4165     flags &= ~SCF_DO_STCLASS;
4166    }
4167    min++;
4168    delta++;    /* Because of the 2 char string cr-lf */
4169    if (flags & SCF_DO_SUBSTR) {
4170      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4171      data->pos_min += 1;
4172     data->pos_delta += 1;
4173     data->longest = &(data->longest_float);
4174     }
4175   }
4176   else if (REGNODE_SIMPLE(OP(scan))) {
4177    int value = 0;
4178
4179    if (flags & SCF_DO_SUBSTR) {
4180     SCAN_COMMIT(pRExC_state,data,minlenp);
4181     data->pos_min++;
4182    }
4183    min++;
4184    if (flags & SCF_DO_STCLASS) {
4185     int loop_max = 256;
4186     CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4187
4188     /* Some of the logic below assumes that switching
4189     locale on will only add false positives. */
4190     switch (PL_regkind[OP(scan)]) {
4191      U8 classnum;
4192
4193     case SANY:
4194     default:
4195 #ifdef DEBUGGING
4196     Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4197 #endif
4198     do_default:
4199      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4200       cl_anything(pRExC_state, data->start_class);
4201      break;
4202     case REG_ANY:
4203      if (OP(scan) == SANY)
4204       goto do_default;
4205      if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4206       value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4207         || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4208       cl_anything(pRExC_state, data->start_class);
4209      }
4210      if (flags & SCF_DO_STCLASS_AND || !value)
4211       ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4212      break;
4213     case ANYOF:
4214      if (flags & SCF_DO_STCLASS_AND)
4215       cl_and(data->start_class,
4216        (struct regnode_charclass_class*)scan);
4217      else
4218       cl_or(pRExC_state, data->start_class,
4219        (struct regnode_charclass_class*)scan);
4220      break;
4221     case POSIXA:
4222      loop_max = 128;
4223      /* FALL THROUGH */
4224     case POSIXL:
4225     case POSIXD:
4226     case POSIXU:
4227      classnum = FLAGS(scan);
4228      if (flags & SCF_DO_STCLASS_AND) {
4229       if (!(data->start_class->flags & ANYOF_LOCALE)) {
4230        ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4231        for (value = 0; value < loop_max; value++) {
4232         if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4233          ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4234         }
4235        }
4236       }
4237      }
4238      else {
4239       if (data->start_class->flags & ANYOF_LOCALE) {
4240        ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4241       }
4242       else {
4243
4244       /* Even if under locale, set the bits for non-locale
4245       * in case it isn't a true locale-node.  This will
4246       * create false positives if it truly is locale */
4247       for (value = 0; value < loop_max; value++) {
4248        if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4249         ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4250        }
4251       }
4252       }
4253      }
4254      break;
4255     case NPOSIXA:
4256      loop_max = 128;
4257      /* FALL THROUGH */
4258     case NPOSIXL:
4259     case NPOSIXU:
4260     case NPOSIXD:
4261      classnum = FLAGS(scan);
4262      if (flags & SCF_DO_STCLASS_AND) {
4263       if (!(data->start_class->flags & ANYOF_LOCALE)) {
4264        ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4265        for (value = 0; value < loop_max; value++) {
4266         if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4267          ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4268         }
4269        }
4270       }
4271      }
4272      else {
4273       if (data->start_class->flags & ANYOF_LOCALE) {
4274        ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4275       }
4276       else {
4277
4278       /* Even if under locale, set the bits for non-locale in
4279       * case it isn't a true locale-node.  This will create
4280       * false positives if it truly is locale */
4281       for (value = 0; value < loop_max; value++) {
4282        if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4283         ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4284        }
4285       }
4286       if (PL_regkind[OP(scan)] == NPOSIXD) {
4287        data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4288       }
4289       }
4290      }
4291      break;
4292     }
4293     if (flags & SCF_DO_STCLASS_OR)
4294      cl_and(data->start_class, and_withp);
4295     flags &= ~SCF_DO_STCLASS;
4296    }
4297   }
4298   else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4299    data->flags |= (OP(scan) == MEOL
4300        ? SF_BEFORE_MEOL
4301        : SF_BEFORE_SEOL);
4302    SCAN_COMMIT(pRExC_state, data, minlenp);
4303
4304   }
4305   else if (  PL_regkind[OP(scan)] == BRANCHJ
4306     /* Lookbehind, or need to calculate parens/evals/stclass: */
4307     && (scan->flags || data || (flags & SCF_DO_STCLASS))
4308     && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4309    if ( OP(scan) == UNLESSM &&
4310     scan->flags == 0 &&
4311     OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4312     OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4313    ) {
4314     regnode *opt;
4315     regnode *upto= regnext(scan);
4316     DEBUG_PARSE_r({
4317      SV * const mysv_val=sv_newmortal();
4318      DEBUG_STUDYDATA("OPFAIL",data,depth);
4319
4320      /*DEBUG_PARSE_MSG("opfail");*/
4321      regprop(RExC_rx, mysv_val, upto);
4322      PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4323         SvPV_nolen_const(mysv_val),
4324         (IV)REG_NODE_NUM(upto),
4325         (IV)(upto - scan)
4326      );
4327     });
4328     OP(scan) = OPFAIL;
4329     NEXT_OFF(scan) = upto - scan;
4330     for (opt= scan + 1; opt < upto ; opt++)
4331      OP(opt) = OPTIMIZED;
4332     scan= upto;
4333     continue;
4334    }
4335    if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4336     || OP(scan) == UNLESSM )
4337    {
4338     /* Negative Lookahead/lookbehind
4339     In this case we can't do fixed string optimisation.
4340     */
4341
4342     I32 deltanext, minnext, fake = 0;
4343     regnode *nscan;
4344     struct regnode_charclass_class intrnl;
4345     int f = 0;
4346
4347     data_fake.flags = 0;
4348     if (data) {
4349      data_fake.whilem_c = data->whilem_c;
4350      data_fake.last_closep = data->last_closep;
4351     }
4352     else
4353      data_fake.last_closep = &fake;
4354     data_fake.pos_delta = delta;
4355     if ( flags & SCF_DO_STCLASS && !scan->flags
4356      && OP(scan) == IFMATCH ) { /* Lookahead */
4357      cl_init(pRExC_state, &intrnl);
4358      data_fake.start_class = &intrnl;
4359      f |= SCF_DO_STCLASS_AND;
4360     }
4361     if (flags & SCF_WHILEM_VISITED_POS)
4362      f |= SCF_WHILEM_VISITED_POS;
4363     next = regnext(scan);
4364     nscan = NEXTOPER(NEXTOPER(scan));
4365     minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4366      last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4367     if (scan->flags) {
4368      if (deltanext) {
4369       FAIL("Variable length lookbehind not implemented");
4370      }
4371      else if (minnext > (I32)U8_MAX) {
4372       FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4373      }
4374      scan->flags = (U8)minnext;
4375     }
4376     if (data) {
4377      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4378       pars++;
4379      if (data_fake.flags & SF_HAS_EVAL)
4380       data->flags |= SF_HAS_EVAL;
4381      data->whilem_c = data_fake.whilem_c;
4382     }
4383     if (f & SCF_DO_STCLASS_AND) {
4384      if (flags & SCF_DO_STCLASS_OR) {
4385       /* OR before, AND after: ideally we would recurse with
4386       * data_fake to get the AND applied by study of the
4387       * remainder of the pattern, and then derecurse;
4388       * *** HACK *** for now just treat as "no information".
4389       * See [perl #56690].
4390       */
4391       cl_init(pRExC_state, data->start_class);
4392      }  else {
4393       /* AND before and after: combine and continue */
4394       const int was = TEST_SSC_EOS(data->start_class);
4395
4396       cl_and(data->start_class, &intrnl);
4397       if (was)
4398        SET_SSC_EOS(data->start_class);
4399      }
4400     }
4401    }
4402 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4403    else {
4404     /* Positive Lookahead/lookbehind
4405     In this case we can do fixed string optimisation,
4406     but we must be careful about it. Note in the case of
4407     lookbehind the positions will be offset by the minimum
4408     length of the pattern, something we won't know about
4409     until after the recurse.
4410     */
4411     I32 deltanext, fake = 0;
4412     regnode *nscan;
4413     struct regnode_charclass_class intrnl;
4414     int f = 0;
4415     /* We use SAVEFREEPV so that when the full compile
4416      is finished perl will clean up the allocated
4417      minlens when it's all done. This way we don't
4418      have to worry about freeing them when we know
4419      they wont be used, which would be a pain.
4420     */
4421     I32 *minnextp;
4422     Newx( minnextp, 1, I32 );
4423     SAVEFREEPV(minnextp);
4424
4425     if (data) {
4426      StructCopy(data, &data_fake, scan_data_t);
4427      if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4428       f |= SCF_DO_SUBSTR;
4429       if (scan->flags)
4430        SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4431       data_fake.last_found=newSVsv(data->last_found);
4432      }
4433     }
4434     else
4435      data_fake.last_closep = &fake;
4436     data_fake.flags = 0;
4437     data_fake.pos_delta = delta;
4438     if (is_inf)
4439      data_fake.flags |= SF_IS_INF;
4440     if ( flags & SCF_DO_STCLASS && !scan->flags
4441      && OP(scan) == IFMATCH ) { /* Lookahead */
4442      cl_init(pRExC_state, &intrnl);
4443      data_fake.start_class = &intrnl;
4444      f |= SCF_DO_STCLASS_AND;
4445     }
4446     if (flags & SCF_WHILEM_VISITED_POS)
4447      f |= SCF_WHILEM_VISITED_POS;
4448     next = regnext(scan);
4449     nscan = NEXTOPER(NEXTOPER(scan));
4450
4451     *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4452      last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4453     if (scan->flags) {
4454      if (deltanext) {
4455       FAIL("Variable length lookbehind not implemented");
4456      }
4457      else if (*minnextp > (I32)U8_MAX) {
4458       FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4459      }
4460      scan->flags = (U8)*minnextp;
4461     }
4462
4463     *minnextp += min;
4464
4465     if (f & SCF_DO_STCLASS_AND) {
4466      const int was = TEST_SSC_EOS(data.start_class);
4467
4468      cl_and(data->start_class, &intrnl);
4469      if (was)
4470       SET_SSC_EOS(data->start_class);
4471     }
4472     if (data) {
4473      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4474       pars++;
4475      if (data_fake.flags & SF_HAS_EVAL)
4476       data->flags |= SF_HAS_EVAL;
4477      data->whilem_c = data_fake.whilem_c;
4478      if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4479       if (RExC_rx->minlen<*minnextp)
4480        RExC_rx->minlen=*minnextp;
4481       SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4482       SvREFCNT_dec_NN(data_fake.last_found);
4483
4484       if ( data_fake.minlen_fixed != minlenp )
4485       {
4486        data->offset_fixed= data_fake.offset_fixed;
4487        data->minlen_fixed= data_fake.minlen_fixed;
4488        data->lookbehind_fixed+= scan->flags;
4489       }
4490       if ( data_fake.minlen_float != minlenp )
4491       {
4492        data->minlen_float= data_fake.minlen_float;
4493        data->offset_float_min=data_fake.offset_float_min;
4494        data->offset_float_max=data_fake.offset_float_max;
4495        data->lookbehind_float+= scan->flags;
4496       }
4497      }
4498     }
4499    }
4500 #endif
4501   }
4502   else if (OP(scan) == OPEN) {
4503    if (stopparen != (I32)ARG(scan))
4504     pars++;
4505   }
4506   else if (OP(scan) == CLOSE) {
4507    if (stopparen == (I32)ARG(scan)) {
4508     break;
4509    }
4510    if ((I32)ARG(scan) == is_par) {
4511     next = regnext(scan);
4512
4513     if ( next && (OP(next) != WHILEM) && next < last)
4514      is_par = 0;  /* Disable optimization */
4515    }
4516    if (data)
4517     *(data->last_closep) = ARG(scan);
4518   }
4519   else if (OP(scan) == EVAL) {
4520     if (data)
4521      data->flags |= SF_HAS_EVAL;
4522   }
4523   else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4524    if (flags & SCF_DO_SUBSTR) {
4525     SCAN_COMMIT(pRExC_state,data,minlenp);
4526     flags &= ~SCF_DO_SUBSTR;
4527    }
4528    if (data && OP(scan)==ACCEPT) {
4529     data->flags |= SCF_SEEN_ACCEPT;
4530     if (stopmin > min)
4531      stopmin = min;
4532    }
4533   }
4534   else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4535   {
4536     if (flags & SCF_DO_SUBSTR) {
4537      SCAN_COMMIT(pRExC_state,data,minlenp);
4538      data->longest = &(data->longest_float);
4539     }
4540     is_inf = is_inf_internal = 1;
4541     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4542      cl_anything(pRExC_state, data->start_class);
4543     flags &= ~SCF_DO_STCLASS;
4544   }
4545   else if (OP(scan) == GPOS) {
4546    if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4547     !(delta || is_inf || (data && data->pos_delta)))
4548    {
4549     if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4550      RExC_rx->extflags |= RXf_ANCH_GPOS;
4551     if (RExC_rx->gofs < (U32)min)
4552      RExC_rx->gofs = min;
4553    } else {
4554     RExC_rx->extflags |= RXf_GPOS_FLOAT;
4555     RExC_rx->gofs = 0;
4556    }
4557   }
4558 #ifdef TRIE_STUDY_OPT
4559 #ifdef FULL_TRIE_STUDY
4560   else if (PL_regkind[OP(scan)] == TRIE) {
4561    /* NOTE - There is similar code to this block above for handling
4562    BRANCH nodes on the initial study.  If you change stuff here
4563    check there too. */
4564    regnode *trie_node= scan;
4565    regnode *tail= regnext(scan);
4566    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4567    I32 max1 = 0, min1 = I32_MAX;
4568    struct regnode_charclass_class accum;
4569
4570    if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4571     SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4572    if (flags & SCF_DO_STCLASS)
4573     cl_init_zero(pRExC_state, &accum);
4574
4575    if (!trie->jump) {
4576     min1= trie->minlen;
4577     max1= trie->maxlen;
4578    } else {
4579     const regnode *nextbranch= NULL;
4580     U32 word;
4581
4582     for ( word=1 ; word <= trie->wordcount ; word++)
4583     {
4584      I32 deltanext=0, minnext=0, f = 0, fake;
4585      struct regnode_charclass_class this_class;
4586
4587      data_fake.flags = 0;
4588      if (data) {
4589       data_fake.whilem_c = data->whilem_c;
4590       data_fake.last_closep = data->last_closep;
4591      }
4592      else
4593       data_fake.last_closep = &fake;
4594      data_fake.pos_delta = delta;
4595      if (flags & SCF_DO_STCLASS) {
4596       cl_init(pRExC_state, &this_class);
4597       data_fake.start_class = &this_class;
4598       f = SCF_DO_STCLASS_AND;
4599      }
4600      if (flags & SCF_WHILEM_VISITED_POS)
4601       f |= SCF_WHILEM_VISITED_POS;
4602
4603      if (trie->jump[word]) {
4604       if (!nextbranch)
4605        nextbranch = trie_node + trie->jump[0];
4606       scan= trie_node + trie->jump[word];
4607       /* We go from the jump point to the branch that follows
4608       it. Note this means we need the vestigal unused branches
4609       even though they arent otherwise used.
4610       */
4611       minnext = study_chunk(pRExC_state, &scan, minlenp,
4612        &deltanext, (regnode *)nextbranch, &data_fake,
4613        stopparen, recursed, NULL, f,depth+1);
4614      }
4615      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4616       nextbranch= regnext((regnode*)nextbranch);
4617
4618      if (min1 > (I32)(minnext + trie->minlen))
4619       min1 = minnext + trie->minlen;
4620      if (deltanext == I32_MAX) {
4621       is_inf = is_inf_internal = 1;
4622       max1 = I32_MAX;
4623      } else if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4624       max1 = minnext + deltanext + trie->maxlen;
4625
4626      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4627       pars++;
4628      if (data_fake.flags & SCF_SEEN_ACCEPT) {
4629       if ( stopmin > min + min1)
4630        stopmin = min + min1;
4631       flags &= ~SCF_DO_SUBSTR;
4632       if (data)
4633        data->flags |= SCF_SEEN_ACCEPT;
4634      }
4635      if (data) {
4636       if (data_fake.flags & SF_HAS_EVAL)
4637        data->flags |= SF_HAS_EVAL;
4638       data->whilem_c = data_fake.whilem_c;
4639      }
4640      if (flags & SCF_DO_STCLASS)
4641       cl_or(pRExC_state, &accum, &this_class);
4642     }
4643    }
4644    if (flags & SCF_DO_SUBSTR) {
4645     data->pos_min += min1;
4646     data->pos_delta += max1 - min1;
4647     if (max1 != min1 || is_inf)
4648      data->longest = &(data->longest_float);
4649    }
4650    min += min1;
4651    delta += max1 - min1;
4652    if (flags & SCF_DO_STCLASS_OR) {
4653     cl_or(pRExC_state, data->start_class, &accum);
4654     if (min1) {
4655      cl_and(data->start_class, and_withp);
4656      flags &= ~SCF_DO_STCLASS;
4657     }
4658    }
4659    else if (flags & SCF_DO_STCLASS_AND) {
4660     if (min1) {
4661      cl_and(data->start_class, &accum);
4662      flags &= ~SCF_DO_STCLASS;
4663     }
4664     else {
4665      /* Switch to OR mode: cache the old value of
4666      * data->start_class */
4667      INIT_AND_WITHP;
4668      StructCopy(data->start_class, and_withp,
4669        struct regnode_charclass_class);
4670      flags &= ~SCF_DO_STCLASS_AND;
4671      StructCopy(&accum, data->start_class,
4672        struct regnode_charclass_class);
4673      flags |= SCF_DO_STCLASS_OR;
4674      SET_SSC_EOS(data->start_class);
4675     }
4676    }
4677    scan= tail;
4678    continue;
4679   }
4680 #else
4681   else if (PL_regkind[OP(scan)] == TRIE) {
4682    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4683    U8*bang=NULL;
4684
4685    min += trie->minlen;
4686    delta += (trie->maxlen - trie->minlen);
4687    flags &= ~SCF_DO_STCLASS; /* xxx */
4688    if (flags & SCF_DO_SUBSTR) {
4689      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4690      data->pos_min += trie->minlen;
4691      data->pos_delta += (trie->maxlen - trie->minlen);
4692     if (trie->maxlen != trie->minlen)
4693      data->longest = &(data->longest_float);
4694     }
4695     if (trie->jump) /* no more substrings -- for now /grr*/
4696      flags &= ~SCF_DO_SUBSTR;
4697   }
4698 #endif /* old or new */
4699 #endif /* TRIE_STUDY_OPT */
4700
4701   /* Else: zero-length, ignore. */
4702   scan = regnext(scan);
4703  }
4704  if (frame) {
4705   last = frame->last;
4706   scan = frame->next;
4707   stopparen = frame->stop;
4708   frame = frame->prev;
4709   goto fake_study_recurse;
4710  }
4711
4712   finish:
4713  assert(!frame);
4714  DEBUG_STUDYDATA("pre-fin:",data,depth);
4715
4716  *scanp = scan;
4717  *deltap = is_inf_internal ? I32_MAX : delta;
4718  if (flags & SCF_DO_SUBSTR && is_inf)
4719   data->pos_delta = I32_MAX - data->pos_min;
4720  if (is_par > (I32)U8_MAX)
4721   is_par = 0;
4722  if (is_par && pars==1 && data) {
4723   data->flags |= SF_IN_PAR;
4724   data->flags &= ~SF_HAS_PAR;
4725  }
4726  else if (pars && data) {
4727   data->flags |= SF_HAS_PAR;
4728   data->flags &= ~SF_IN_PAR;
4729  }
4730  if (flags & SCF_DO_STCLASS_OR)
4731   cl_and(data->start_class, and_withp);
4732  if (flags & SCF_TRIE_RESTUDY)
4733   data->flags |=  SCF_TRIE_RESTUDY;
4734
4735  DEBUG_STUDYDATA("post-fin:",data,depth);
4736
4737  return min < stopmin ? min : stopmin;
4738 }
4739
4740 STATIC U32
4741 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4742 {
4743  U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4744
4745  PERL_ARGS_ASSERT_ADD_DATA;
4746
4747  Renewc(RExC_rxi->data,
4748   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4749   char, struct reg_data);
4750  if(count)
4751   Renew(RExC_rxi->data->what, count + n, U8);
4752  else
4753   Newx(RExC_rxi->data->what, n, U8);
4754  RExC_rxi->data->count = count + n;
4755  Copy(s, RExC_rxi->data->what + count, n, U8);
4756  return count;
4757 }
4758
4759 /*XXX: todo make this not included in a non debugging perl */
4760 #ifndef PERL_IN_XSUB_RE
4761 void
4762 Perl_reginitcolors(pTHX)
4763 {
4764  dVAR;
4765  const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4766  if (s) {
4767   char *t = savepv(s);
4768   int i = 0;
4769   PL_colors[0] = t;
4770   while (++i < 6) {
4771    t = strchr(t, '\t');
4772    if (t) {
4773     *t = '\0';
4774     PL_colors[i] = ++t;
4775    }
4776    else
4777     PL_colors[i] = t = (char *)"";
4778   }
4779  } else {
4780   int i = 0;
4781   while (i < 6)
4782    PL_colors[i++] = (char *)"";
4783  }
4784  PL_colorset = 1;
4785 }
4786 #endif
4787
4788
4789 #ifdef TRIE_STUDY_OPT
4790 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
4791  STMT_START {                                            \
4792   if (                                                \
4793    (data.flags & SCF_TRIE_RESTUDY)               \
4794    && ! restudied++                              \
4795   ) {                                                 \
4796    dOsomething;                                    \
4797    goto reStudy;                                   \
4798   }                                                   \
4799  } STMT_END
4800 #else
4801 #define CHECK_RESTUDY_GOTO_butfirst
4802 #endif
4803
4804 /*
4805  * pregcomp - compile a regular expression into internal code
4806  *
4807  * Decides which engine's compiler to call based on the hint currently in
4808  * scope
4809  */
4810
4811 #ifndef PERL_IN_XSUB_RE
4812
4813 /* return the currently in-scope regex engine (or the default if none)  */
4814
4815 regexp_engine const *
4816 Perl_current_re_engine(pTHX)
4817 {
4818  dVAR;
4819
4820  if (IN_PERL_COMPILETIME) {
4821   HV * const table = GvHV(PL_hintgv);
4822   SV **ptr;
4823
4824   if (!table)
4825    return &reh_regexp_engine;
4826   ptr = hv_fetchs(table, "regcomp", FALSE);
4827   if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4828    return &reh_regexp_engine;
4829   return INT2PTR(regexp_engine*,SvIV(*ptr));
4830  }
4831  else {
4832   SV *ptr;
4833   if (!PL_curcop->cop_hints_hash)
4834    return &reh_regexp_engine;
4835   ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4836   if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4837    return &reh_regexp_engine;
4838   return INT2PTR(regexp_engine*,SvIV(ptr));
4839  }
4840 }
4841
4842
4843 REGEXP *
4844 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4845 {
4846  dVAR;
4847  regexp_engine const *eng = current_re_engine();
4848  GET_RE_DEBUG_FLAGS_DECL;
4849
4850  PERL_ARGS_ASSERT_PREGCOMP;
4851
4852  /* Dispatch a request to compile a regexp to correct regexp engine. */
4853  DEBUG_COMPILE_r({
4854   PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4855       PTR2UV(eng));
4856  });
4857  return CALLREGCOMP_ENG(eng, pattern, flags);
4858 }
4859 #endif
4860
4861 /* public(ish) entry point for the perl core's own regex compiling code.
4862  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4863  * pattern rather than a list of OPs, and uses the internal engine rather
4864  * than the current one */
4865
4866 REGEXP *
4867 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4868 {
4869  SV *pat = pattern; /* defeat constness! */
4870  PERL_ARGS_ASSERT_RE_COMPILE;
4871  return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4872 #ifdef PERL_IN_XSUB_RE
4873         &my_reg_engine,
4874 #else
4875         &reh_regexp_engine,
4876 #endif
4877         NULL, NULL, rx_flags, 0);
4878 }
4879
4880
4881 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
4882  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
4883  * point to the realloced string and length.
4884  *
4885  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
4886  * stuff added */
4887
4888 static void
4889 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
4890      char **pat_p, STRLEN *plen_p, int num_code_blocks)
4891 {
4892  U8 *const src = (U8*)*pat_p;
4893  U8 *dst;
4894  int n=0;
4895  STRLEN s = 0, d = 0;
4896  bool do_end = 0;
4897  GET_RE_DEBUG_FLAGS_DECL;
4898
4899  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4900   "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4901
4902  Newx(dst, *plen_p * 2 + 1, U8);
4903
4904  while (s < *plen_p) {
4905   const UV uv = NATIVE_TO_ASCII(src[s]);
4906   if (UNI_IS_INVARIANT(uv))
4907    dst[d]   = (U8)UTF_TO_NATIVE(uv);
4908   else {
4909    dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
4910    dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
4911   }
4912   if (n < num_code_blocks) {
4913    if (!do_end && pRExC_state->code_blocks[n].start == s) {
4914     pRExC_state->code_blocks[n].start = d;
4915     assert(dst[d] == '(');
4916     do_end = 1;
4917    }
4918    else if (do_end && pRExC_state->code_blocks[n].end == s) {
4919     pRExC_state->code_blocks[n].end = d;
4920     assert(dst[d] == ')');
4921     do_end = 0;
4922     n++;
4923    }
4924   }
4925   s++;
4926   d++;
4927  }
4928  dst[d] = '\0';
4929  *plen_p = d;
4930  *pat_p = (char*) dst;
4931  SAVEFREEPV(*pat_p);
4932  RExC_orig_utf8 = RExC_utf8 = 1;
4933 }
4934
4935
4936
4937 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
4938  * while recording any code block indices, and handling overloading,
4939  * nested qr// objects etc.  If pat is null, it will allocate a new
4940  * string, or just return the first arg, if there's only one.
4941  *
4942  * Returns the malloced/updated pat.
4943  * patternp and pat_count is the array of SVs to be concatted;
4944  * oplist is the optional list of ops that generated the SVs;
4945  * recompile_p is a pointer to a boolean that will be set if
4946  *   the regex will need to be recompiled.
4947  * delim, if non-null is an SV that will be inserted between each element
4948  */
4949
4950 static SV*
4951 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
4952     SV *pat, SV ** const patternp, int pat_count,
4953     OP *oplist, bool *recompile_p, SV *delim)
4954 {
4955  SV **svp;
4956  int n = 0;
4957  bool use_delim = FALSE;
4958  bool alloced = FALSE;
4959
4960  /* if we know we have at least two args, create an empty string,
4961  * then concatenate args to that. For no args, return an empty string */
4962  if (!pat && pat_count != 1) {
4963   pat = newSVpvn("", 0);
4964   SAVEFREESV(pat);
4965   alloced = TRUE;
4966  }
4967
4968  for (svp = patternp; svp < patternp + pat_count; svp++) {
4969   SV *sv;
4970   SV *rx  = NULL;
4971   STRLEN orig_patlen = 0;
4972   bool code = 0;
4973   SV *msv = use_delim ? delim : *svp;
4974
4975   /* if we've got a delimiter, we go round the loop twice for each
4976   * svp slot (except the last), using the delimiter the second
4977   * time round */
4978   if (use_delim) {
4979    svp--;
4980    use_delim = FALSE;
4981   }
4982   else if (delim)
4983    use_delim = TRUE;
4984
4985   if (SvTYPE(msv) == SVt_PVAV) {
4986    /* we've encountered an interpolated array within
4987    * the pattern, e.g. /...@a..../. Expand the list of elements,
4988    * then recursively append elements.
4989    * The code in this block is based on S_pushav() */
4990
4991    AV *const av = (AV*)msv;
4992    const I32 maxarg = AvFILL(av) + 1;
4993    SV **array;
4994
4995    if (oplist) {
4996     assert(oplist->op_type == OP_PADAV
4997      || oplist->op_type == OP_RV2AV);
4998     oplist = oplist->op_sibling;;
4999    }
5000
5001    if (SvRMAGICAL(av)) {
5002     U32 i;
5003
5004     Newx(array, maxarg, SV*);
5005     SAVEFREEPV(array);
5006     for (i=0; i < (U32)maxarg; i++) {
5007      SV ** const svp = av_fetch(av, i, FALSE);
5008      array[i] = svp ? *svp : &PL_sv_undef;
5009     }
5010    }
5011    else
5012     array = AvARRAY(av);
5013
5014    pat = S_concat_pat(aTHX_ pRExC_state, pat,
5015         array, maxarg, NULL, recompile_p,
5016         /* $" */
5017         GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5018
5019    continue;
5020   }
5021
5022
5023   /* we make the assumption here that each op in the list of
5024   * op_siblings maps to one SV pushed onto the stack,
5025   * except for code blocks, with have both an OP_NULL and
5026   * and OP_CONST.
5027   * This allows us to match up the list of SVs against the
5028   * list of OPs to find the next code block.
5029   *
5030   * Note that       PUSHMARK PADSV PADSV ..
5031   * is optimised to
5032   *                 PADRANGE PADSV  PADSV  ..
5033   * so the alignment still works. */
5034
5035   if (oplist) {
5036    if (oplist->op_type == OP_NULL
5037     && (oplist->op_flags & OPf_SPECIAL))
5038    {
5039     assert(n < pRExC_state->num_code_blocks);
5040     pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5041     pRExC_state->code_blocks[n].block = oplist;
5042     pRExC_state->code_blocks[n].src_regex = NULL;
5043     n++;
5044     code = 1;
5045     oplist = oplist->op_sibling; /* skip CONST */
5046     assert(oplist);
5047    }
5048    oplist = oplist->op_sibling;;
5049   }
5050
5051   /* apply magic and QR overloading to arg */
5052
5053   SvGETMAGIC(msv);
5054   if (SvROK(msv) && SvAMAGIC(msv)) {
5055    SV *sv = AMG_CALLunary(msv, regexp_amg);
5056    if (sv) {
5057     if (SvROK(sv))
5058      sv = SvRV(sv);
5059     if (SvTYPE(sv) != SVt_REGEXP)
5060      Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5061     msv = sv;
5062    }
5063   }
5064
5065   /* try concatenation overload ... */
5066   if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5067     (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5068   {
5069    sv_setsv(pat, sv);
5070    /* overloading involved: all bets are off over literal
5071    * code. Pretend we haven't seen it */
5072    pRExC_state->num_code_blocks -= n;
5073    n = 0;
5074   }
5075   else  {
5076    /* ... or failing that, try "" overload */
5077    while (SvAMAGIC(msv)
5078      && (sv = AMG_CALLunary(msv, string_amg))
5079      && sv != msv
5080      &&  !(   SvROK(msv)
5081       && SvROK(sv)
5082       && SvRV(msv) == SvRV(sv))
5083    ) {
5084     msv = sv;
5085     SvGETMAGIC(msv);
5086    }
5087    if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5088     msv = SvRV(msv);
5089
5090    if (pat) {
5091     /* this is a partially unrolled
5092     *     sv_catsv_nomg(pat, msv);
5093     * that allows us to adjust code block indices if
5094     * needed */
5095     STRLEN dlen;
5096     char *dst = SvPV_force_nomg(pat, dlen);
5097     orig_patlen = dlen;
5098     if (SvUTF8(msv) && !SvUTF8(pat)) {
5099      S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5100      sv_setpvn(pat, dst, dlen);
5101      SvUTF8_on(pat);
5102     }
5103     sv_catsv_nomg(pat, msv);
5104     rx = msv;
5105    }
5106    else
5107     pat = msv;
5108
5109    if (code)
5110     pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5111   }
5112
5113   /* extract any code blocks within any embedded qr//'s */
5114   if (rx && SvTYPE(rx) == SVt_REGEXP
5115    && RX_ENGINE((REGEXP*)rx)->op_comp)
5116   {
5117
5118    RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5119    if (ri->num_code_blocks) {
5120     int i;
5121     /* the presence of an embedded qr// with code means
5122     * we should always recompile: the text of the
5123     * qr// may not have changed, but it may be a
5124     * different closure than last time */
5125     *recompile_p = 1;
5126     Renew(pRExC_state->code_blocks,
5127      pRExC_state->num_code_blocks + ri->num_code_blocks,
5128      struct reg_code_block);
5129     pRExC_state->num_code_blocks += ri->num_code_blocks;
5130
5131     for (i=0; i < ri->num_code_blocks; i++) {
5132      struct reg_code_block *src, *dst;
5133      STRLEN offset =  orig_patlen
5134       + ReANY((REGEXP *)rx)->pre_prefix;
5135      assert(n < pRExC_state->num_code_blocks);
5136      src = &ri->code_blocks[i];
5137      dst = &pRExC_state->code_blocks[n];
5138      dst->start     = src->start + offset;
5139      dst->end     = src->end   + offset;
5140      dst->block     = src->block;
5141      dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5142            src->src_regex
5143             ? src->src_regex
5144             : (REGEXP*)rx);
5145      n++;
5146     }
5147    }
5148   }
5149  }
5150  /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5151  if (alloced)
5152   SvSETMAGIC(pat);
5153
5154  return pat;
5155 }
5156
5157
5158
5159 /* see if there are any run-time code blocks in the pattern.
5160  * False positives are allowed */
5161
5162 static bool
5163 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5164      char *pat, STRLEN plen)
5165 {
5166  int n = 0;
5167  STRLEN s;
5168
5169  for (s = 0; s < plen; s++) {
5170   if (n < pRExC_state->num_code_blocks
5171    && s == pRExC_state->code_blocks[n].start)
5172   {
5173    s = pRExC_state->code_blocks[n].end;
5174    n++;
5175    continue;
5176   }
5177   /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5178   * positives here */
5179   if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5180    (pat[s+2] == '{'
5181     || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5182   )
5183    return 1;
5184  }
5185  return 0;
5186 }
5187
5188 /* Handle run-time code blocks. We will already have compiled any direct
5189  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5190  * copy of it, but with any literal code blocks blanked out and
5191  * appropriate chars escaped; then feed it into
5192  *
5193  *    eval "qr'modified_pattern'"
5194  *
5195  * For example,
5196  *
5197  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5198  *
5199  * becomes
5200  *
5201  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5202  *
5203  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5204  * and merge them with any code blocks of the original regexp.
5205  *
5206  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5207  * instead, just save the qr and return FALSE; this tells our caller that
5208  * the original pattern needs upgrading to utf8.
5209  */
5210
5211 static bool
5212 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5213  char *pat, STRLEN plen)
5214 {
5215  SV *qr;
5216
5217  GET_RE_DEBUG_FLAGS_DECL;
5218
5219  if (pRExC_state->runtime_code_qr) {
5220   /* this is the second time we've been called; this should
5221   * only happen if the main pattern got upgraded to utf8
5222   * during compilation; re-use the qr we compiled first time
5223   * round (which should be utf8 too)
5224   */
5225   qr = pRExC_state->runtime_code_qr;
5226   pRExC_state->runtime_code_qr = NULL;
5227   assert(RExC_utf8 && SvUTF8(qr));
5228  }
5229  else {
5230   int n = 0;
5231   STRLEN s;
5232   char *p, *newpat;
5233   int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5234   SV *sv, *qr_ref;
5235   dSP;
5236
5237   /* determine how many extra chars we need for ' and \ escaping */
5238   for (s = 0; s < plen; s++) {
5239    if (pat[s] == '\'' || pat[s] == '\\')
5240     newlen++;
5241   }
5242
5243   Newx(newpat, newlen, char);
5244   p = newpat;
5245   *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5246
5247   for (s = 0; s < plen; s++) {
5248    if (n < pRExC_state->num_code_blocks
5249     && s == pRExC_state->code_blocks[n].start)
5250    {
5251     /* blank out literal code block */
5252     assert(pat[s] == '(');
5253     while (s <= pRExC_state->code_blocks[n].end) {
5254      *p++ = '_';
5255      s++;
5256     }
5257     s--;
5258     n++;
5259     continue;
5260    }
5261    if (pat[s] == '\'' || pat[s] == '\\')
5262     *p++ = '\\';
5263    *p++ = pat[s];
5264   }
5265   *p++ = '\'';
5266   if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5267    *p++ = 'x';
5268   *p++ = '\0';
5269   DEBUG_COMPILE_r({
5270    PerlIO_printf(Perl_debug_log,
5271     "%sre-parsing pattern for runtime code:%s %s\n",
5272     PL_colors[4],PL_colors[5],newpat);
5273   });
5274
5275   sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5276   Safefree(newpat);
5277
5278   ENTER;
5279   SAVETMPS;
5280   save_re_context();
5281   PUSHSTACKi(PERLSI_REQUIRE);
5282   /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5283   * parsing qr''; normally only q'' does this. It also alters
5284   * hints handling */
5285   eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5286   SvREFCNT_dec_NN(sv);
5287   SPAGAIN;
5288   qr_ref = POPs;
5289   PUTBACK;
5290   {
5291    SV * const errsv = ERRSV;
5292    if (SvTRUE_NN(errsv))
5293    {
5294     Safefree(pRExC_state->code_blocks);
5295     /* use croak_sv ? */
5296     Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
5297    }
5298   }
5299   assert(SvROK(qr_ref));
5300   qr = SvRV(qr_ref);
5301   assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5302   /* the leaving below frees the tmp qr_ref.
5303   * Give qr a life of its own */
5304   SvREFCNT_inc(qr);
5305   POPSTACK;
5306   FREETMPS;
5307   LEAVE;
5308
5309  }
5310
5311  if (!RExC_utf8 && SvUTF8(qr)) {
5312   /* first time through; the pattern got upgraded; save the
5313   * qr for the next time through */
5314   assert(!pRExC_state->runtime_code_qr);
5315   pRExC_state->runtime_code_qr = qr;
5316   return 0;
5317  }
5318
5319
5320  /* extract any code blocks within the returned qr//  */
5321
5322
5323  /* merge the main (r1) and run-time (r2) code blocks into one */
5324  {
5325   RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5326   struct reg_code_block *new_block, *dst;
5327   RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5328   int i1 = 0, i2 = 0;
5329
5330   if (!r2->num_code_blocks) /* we guessed wrong */
5331   {
5332    SvREFCNT_dec_NN(qr);
5333    return 1;
5334   }
5335
5336   Newx(new_block,
5337    r1->num_code_blocks + r2->num_code_blocks,
5338    struct reg_code_block);
5339   dst = new_block;
5340
5341   while (    i1 < r1->num_code_blocks
5342     || i2 < r2->num_code_blocks)
5343   {
5344    struct reg_code_block *src;
5345    bool is_qr = 0;
5346
5347    if (i1 == r1->num_code_blocks) {
5348     src = &r2->code_blocks[i2++];
5349     is_qr = 1;
5350    }
5351    else if (i2 == r2->num_code_blocks)
5352     src = &r1->code_blocks[i1++];
5353    else if (  r1->code_blocks[i1].start
5354      < r2->code_blocks[i2].start)
5355    {
5356     src = &r1->code_blocks[i1++];
5357     assert(src->end < r2->code_blocks[i2].start);
5358    }
5359    else {
5360     assert(  r1->code_blocks[i1].start
5361      > r2->code_blocks[i2].start);
5362     src = &r2->code_blocks[i2++];
5363     is_qr = 1;
5364     assert(src->end < r1->code_blocks[i1].start);
5365    }
5366
5367    assert(pat[src->start] == '(');
5368    assert(pat[src->end]   == ')');
5369    dst->start     = src->start;
5370    dst->end     = src->end;
5371    dst->block     = src->block;
5372    dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5373          : src->src_regex;
5374    dst++;
5375   }
5376   r1->num_code_blocks += r2->num_code_blocks;
5377   Safefree(r1->code_blocks);
5378   r1->code_blocks = new_block;
5379  }
5380
5381  SvREFCNT_dec_NN(qr);
5382  return 1;
5383 }
5384
5385
5386 STATIC bool
5387 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5388 {
5389  /* This is the common code for setting up the floating and fixed length
5390  * string data extracted from Perl_re_op_compile() below.  Returns a boolean
5391  * as to whether succeeded or not */
5392
5393  I32 t,ml;
5394
5395  if (! (longest_length
5396   || (eol /* Can't have SEOL and MULTI */
5397    && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5398   )
5399    /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5400   || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5401  {
5402   return FALSE;
5403  }
5404
5405  /* copy the information about the longest from the reg_scan_data
5406   over to the program. */
5407  if (SvUTF8(sv_longest)) {
5408   *rx_utf8 = sv_longest;
5409   *rx_substr = NULL;
5410  } else {
5411   *rx_substr = sv_longest;
5412   *rx_utf8 = NULL;
5413  }
5414  /* end_shift is how many chars that must be matched that
5415   follow this item. We calculate it ahead of time as once the
5416   lookbehind offset is added in we lose the ability to correctly
5417   calculate it.*/
5418  ml = minlen ? *(minlen) : (I32)longest_length;
5419  *rx_end_shift = ml - offset
5420   - longest_length + (SvTAIL(sv_longest) != 0)
5421   + lookbehind;
5422
5423  t = (eol/* Can't have SEOL and MULTI */
5424   && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5425  fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5426
5427  return TRUE;
5428 }
5429
5430 /*
5431  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5432  * regular expression into internal code.
5433  * The pattern may be passed either as:
5434  *    a list of SVs (patternp plus pat_count)
5435  *    a list of OPs (expr)
5436  * If both are passed, the SV list is used, but the OP list indicates
5437  * which SVs are actually pre-compiled code blocks
5438  *
5439  * The SVs in the list have magic and qr overloading applied to them (and
5440  * the list may be modified in-place with replacement SVs in the latter
5441  * case).
5442  *
5443  * If the pattern hasn't changed from old_re, then old_re will be
5444  * returned.
5445  *
5446  * eng is the current engine. If that engine has an op_comp method, then
5447  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5448  * do the initial concatenation of arguments and pass on to the external
5449  * engine.
5450  *
5451  * If is_bare_re is not null, set it to a boolean indicating whether the
5452  * arg list reduced (after overloading) to a single bare regex which has
5453  * been returned (i.e. /$qr/).
5454  *
5455  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5456  *
5457  * pm_flags contains the PMf_* flags, typically based on those from the
5458  * pm_flags field of the related PMOP. Currently we're only interested in
5459  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5460  *
5461  * We can't allocate space until we know how big the compiled form will be,
5462  * but we can't compile it (and thus know how big it is) until we've got a
5463  * place to put the code.  So we cheat:  we compile it twice, once with code
5464  * generation turned off and size counting turned on, and once "for real".
5465  * This also means that we don't allocate space until we are sure that the
5466  * thing really will compile successfully, and we never have to move the
5467  * code and thus invalidate pointers into it.  (Note that it has to be in
5468  * one piece because free() must be able to free it all.) [NB: not true in perl]
5469  *
5470  * Beware that the optimization-preparation code in here knows about some
5471  * of the structure of the compiled regexp.  [I'll say.]
5472  */
5473
5474 REGEXP *
5475 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5476      OP *expr, const regexp_engine* eng, REGEXP *old_re,
5477      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5478 {
5479  dVAR;
5480  REGEXP *rx;
5481  struct regexp *r;
5482  regexp_internal *ri;
5483  STRLEN plen;
5484  char *exp;
5485  regnode *scan;
5486  I32 flags;
5487  I32 minlen = 0;
5488  U32 rx_flags;
5489  SV *pat;
5490  SV *code_blocksv = NULL;
5491  SV** new_patternp = patternp;
5492
5493  /* these are all flags - maybe they should be turned
5494  * into a single int with different bit masks */
5495  I32 sawlookahead = 0;
5496  I32 sawplus = 0;
5497  I32 sawopen = 0;
5498  regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5499  bool recompile = 0;
5500  bool runtime_code = 0;
5501  scan_data_t data;
5502  RExC_state_t RExC_state;
5503  RExC_state_t * const pRExC_state = &RExC_state;
5504 #ifdef TRIE_STUDY_OPT
5505  int restudied = 0;
5506  RExC_state_t copyRExC_state;
5507 #endif
5508  GET_RE_DEBUG_FLAGS_DECL;
5509
5510  PERL_ARGS_ASSERT_RE_OP_COMPILE;
5511
5512  DEBUG_r(if (!PL_colorset) reginitcolors());
5513
5514 #ifndef PERL_IN_XSUB_RE
5515  /* Initialize these here instead of as-needed, as is quick and avoids
5516  * having to test them each time otherwise */
5517  if (! PL_AboveLatin1) {
5518   PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5519   PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5520   PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5521
5522   PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5523         = _new_invlist_C_array(L1PosixAlnum_invlist);
5524   PL_Posix_ptrs[_CC_ALPHANUMERIC]
5525         = _new_invlist_C_array(PosixAlnum_invlist);
5526
5527   PL_L1Posix_ptrs[_CC_ALPHA]
5528         = _new_invlist_C_array(L1PosixAlpha_invlist);
5529   PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5530
5531   PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5532   PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5533
5534   /* Cased is the same as Alpha in the ASCII range */
5535   PL_L1Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(L1Cased_invlist);
5536   PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PosixAlpha_invlist);
5537
5538   PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5539   PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5540
5541   PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5542   PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5543
5544   PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5545   PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5546
5547   PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5548   PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5549
5550   PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5551   PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5552
5553   PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5554   PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5555
5556   PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5557   PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5558   PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5559   PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5560
5561   PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5562   PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5563
5564   PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5565
5566   PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5567   PL_L1Posix_ptrs[_CC_WORDCHAR]
5568         = _new_invlist_C_array(L1PosixWord_invlist);
5569
5570   PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5571   PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5572
5573   PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5574  }
5575 #endif
5576
5577  pRExC_state->code_blocks = NULL;
5578  pRExC_state->num_code_blocks = 0;
5579
5580  if (is_bare_re)
5581   *is_bare_re = FALSE;
5582
5583  if (expr && (expr->op_type == OP_LIST ||
5584     (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5585   /* allocate code_blocks if needed */
5586   OP *o;
5587   int ncode = 0;
5588
5589   for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
5590    if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5591     ncode++; /* count of DO blocks */
5592   if (ncode) {
5593    pRExC_state->num_code_blocks = ncode;
5594    Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5595   }
5596  }
5597
5598  if (!pat_count) {
5599   /* compile-time pattern with just OP_CONSTs and DO blocks */
5600
5601   int n;
5602   OP *o;
5603
5604   /* find how many CONSTs there are */
5605   assert(expr);
5606   n = 0;
5607   if (expr->op_type == OP_CONST)
5608    n = 1;
5609   else
5610    for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5611     if (o->op_type == OP_CONST)
5612      n++;
5613    }
5614
5615   /* fake up an SV array */
5616
5617   assert(!new_patternp);
5618   Newx(new_patternp, n, SV*);
5619   SAVEFREEPV(new_patternp);
5620   pat_count = n;
5621
5622   n = 0;
5623   if (expr->op_type == OP_CONST)
5624    new_patternp[n] = cSVOPx_sv(expr);
5625   else
5626    for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5627     if (o->op_type == OP_CONST)
5628      new_patternp[n++] = cSVOPo_sv;
5629    }
5630
5631  }
5632
5633  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5634   "Assembling pattern from %d elements%s\n", pat_count,
5635    orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5636
5637  /* set expr to the first arg op */
5638
5639  if (pRExC_state->num_code_blocks
5640   && expr->op_type != OP_CONST)
5641  {
5642    expr = cLISTOPx(expr)->op_first;
5643    assert(   expr->op_type == OP_PUSHMARK
5644     || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
5645     || expr->op_type == OP_PADRANGE);
5646    expr = expr->op_sibling;
5647  }
5648
5649  pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
5650       expr, &recompile, NULL);
5651
5652  /* handle bare (possibly after overloading) regex: foo =~ $re */
5653  {
5654   SV *re = pat;
5655   if (SvROK(re))
5656    re = SvRV(re);
5657   if (SvTYPE(re) == SVt_REGEXP) {
5658    if (is_bare_re)
5659     *is_bare_re = TRUE;
5660    SvREFCNT_inc(re);
5661    Safefree(pRExC_state->code_blocks);
5662    DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5663     "Precompiled pattern%s\n",
5664      orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5665
5666    return (REGEXP*)re;
5667   }
5668  }
5669
5670  exp = SvPV_nomg(pat, plen);
5671
5672  if (!eng->op_comp) {
5673   if ((SvUTF8(pat) && IN_BYTES)
5674     || SvGMAGICAL(pat) || SvAMAGIC(pat))
5675   {
5676    /* make a temporary copy; either to convert to bytes,
5677    * or to avoid repeating get-magic / overloaded stringify */
5678    pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5679           (IN_BYTES ? 0 : SvUTF8(pat)));
5680   }
5681   Safefree(pRExC_state->code_blocks);
5682   return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5683  }
5684
5685  /* ignore the utf8ness if the pattern is 0 length */
5686  RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5687  RExC_uni_semantics = 0;
5688  RExC_contains_locale = 0;
5689  pRExC_state->runtime_code_qr = NULL;
5690
5691  DEBUG_COMPILE_r({
5692    SV *dsv= sv_newmortal();
5693    RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
5694    PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5695       PL_colors[4],PL_colors[5],s);
5696   });
5697
5698   redo_first_pass:
5699  /* we jump here if we upgrade the pattern to utf8 and have to
5700  * recompile */
5701
5702  if ((pm_flags & PMf_USE_RE_EVAL)
5703     /* this second condition covers the non-regex literal case,
5704     * i.e.  $foo =~ '(?{})'. */
5705     || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
5706  )
5707   runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
5708
5709  /* return old regex if pattern hasn't changed */
5710  /* XXX: note in the below we have to check the flags as well as the pattern.
5711  *
5712  * Things get a touch tricky as we have to compare the utf8 flag independently
5713  * from the compile flags.
5714  */
5715
5716  if (   old_re
5717   && !recompile
5718   && !!RX_UTF8(old_re) == !!RExC_utf8
5719   && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
5720   && RX_PRECOMP(old_re)
5721   && RX_PRELEN(old_re) == plen
5722   && memEQ(RX_PRECOMP(old_re), exp, plen)
5723   && !runtime_code /* with runtime code, always recompile */ )
5724  {
5725   Safefree(pRExC_state->code_blocks);
5726   return old_re;
5727  }
5728
5729  rx_flags = orig_rx_flags;
5730
5731  if (initial_charset == REGEX_LOCALE_CHARSET) {
5732   RExC_contains_locale = 1;
5733  }
5734  else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5735
5736   /* Set to use unicode semantics if the pattern is in utf8 and has the
5737   * 'depends' charset specified, as it means unicode when utf8  */
5738   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5739  }
5740
5741  RExC_precomp = exp;
5742  RExC_flags = rx_flags;
5743  RExC_pm_flags = pm_flags;
5744
5745  if (runtime_code) {
5746   if (TAINTING_get && TAINT_get)
5747    Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5748
5749   if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5750    /* whoops, we have a non-utf8 pattern, whilst run-time code
5751    * got compiled as utf8. Try again with a utf8 pattern */
5752    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5753          pRExC_state->num_code_blocks);
5754    goto redo_first_pass;
5755   }
5756  }
5757  assert(!pRExC_state->runtime_code_qr);
5758
5759  RExC_sawback = 0;
5760
5761  RExC_seen = 0;
5762  RExC_in_lookbehind = 0;
5763  RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5764  RExC_extralen = 0;
5765  RExC_override_recoding = 0;
5766  RExC_in_multi_char_class = 0;
5767
5768  /* First pass: determine size, legality. */
5769  RExC_parse = exp;
5770  RExC_start = exp;
5771  RExC_end = exp + plen;
5772  RExC_naughty = 0;
5773  RExC_npar = 1;
5774  RExC_nestroot = 0;
5775  RExC_size = 0L;
5776  RExC_emit = &PL_regdummy;
5777  RExC_whilem_seen = 0;
5778  RExC_open_parens = NULL;
5779  RExC_close_parens = NULL;
5780  RExC_opend = NULL;
5781  RExC_paren_names = NULL;
5782 #ifdef DEBUGGING
5783  RExC_paren_name_list = NULL;
5784 #endif
5785  RExC_recurse = NULL;
5786  RExC_recurse_count = 0;
5787  pRExC_state->code_index = 0;
5788
5789 #if 0 /* REGC() is (currently) a NOP at the first pass.
5790  * Clever compilers notice this and complain. --jhi */
5791  REGC((U8)REG_MAGIC, (char*)RExC_emit);
5792 #endif
5793  DEBUG_PARSE_r(
5794   PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5795   RExC_lastnum=0;
5796   RExC_lastparse=NULL;
5797  );
5798  /* reg may croak on us, not giving us a chance to free
5799  pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
5800  need it to survive as long as the regexp (qr/(?{})/).
5801  We must check that code_blocksv is not already set, because we may
5802  have jumped back to restart the sizing pass. */
5803  if (pRExC_state->code_blocks && !code_blocksv) {
5804   code_blocksv = newSV_type(SVt_PV);
5805   SAVEFREESV(code_blocksv);
5806   SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5807   SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5808  }
5809  if (reg(pRExC_state, 0, &flags,1) == NULL) {
5810   /* It's possible to write a regexp in ascii that represents Unicode
5811   codepoints outside of the byte range, such as via \x{100}. If we
5812   detect such a sequence we have to convert the entire pattern to utf8
5813   and then recompile, as our sizing calculation will have been based
5814   on 1 byte == 1 character, but we will need to use utf8 to encode
5815   at least some part of the pattern, and therefore must convert the whole
5816   thing.
5817   -- dmq */
5818   if (flags & RESTART_UTF8) {
5819    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5820          pRExC_state->num_code_blocks);
5821    goto redo_first_pass;
5822   }
5823   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#X", flags);
5824  }
5825  if (code_blocksv)
5826   SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5827
5828  DEBUG_PARSE_r({
5829   PerlIO_printf(Perl_debug_log,
5830    "Required size %"IVdf" nodes\n"
5831    "Starting second pass (creation)\n",
5832    (IV)RExC_size);
5833   RExC_lastnum=0;
5834   RExC_lastparse=NULL;
5835  });
5836
5837  /* The first pass could have found things that force Unicode semantics */
5838  if ((RExC_utf8 || RExC_uni_semantics)
5839   && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5840  {
5841   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5842  }
5843
5844  /* Small enough for pointer-storage convention?
5845  If extralen==0, this means that we will not need long jumps. */
5846  if (RExC_size >= 0x10000L && RExC_extralen)
5847   RExC_size += RExC_extralen;
5848  else
5849   RExC_extralen = 0;
5850  if (RExC_whilem_seen > 15)
5851   RExC_whilem_seen = 15;
5852
5853  /* Allocate space and zero-initialize. Note, the two step process
5854  of zeroing when in debug mode, thus anything assigned has to
5855  happen after that */
5856  rx = (REGEXP*) newSV_type(SVt_REGEXP);
5857  r = ReANY(rx);
5858  Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5859   char, regexp_internal);
5860  if ( r == NULL || ri == NULL )
5861   FAIL("Regexp out of space");
5862 #ifdef DEBUGGING
5863  /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5864  Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5865 #else
5866  /* bulk initialize base fields with 0. */
5867  Zero(ri, sizeof(regexp_internal), char);
5868 #endif
5869
5870  /* non-zero initialization begins here */
5871  RXi_SET( r, ri );
5872  r->engine= eng;
5873  r->extflags = rx_flags;
5874  RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
5875
5876  if (pm_flags & PMf_IS_QR) {
5877   ri->code_blocks = pRExC_state->code_blocks;
5878   ri->num_code_blocks = pRExC_state->num_code_blocks;
5879  }
5880  else
5881  {
5882   int n;
5883   for (n = 0; n < pRExC_state->num_code_blocks; n++)
5884    if (pRExC_state->code_blocks[n].src_regex)
5885     SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5886   SAVEFREEPV(pRExC_state->code_blocks);
5887  }
5888
5889  {
5890   bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5891   bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5892
5893   /* The caret is output if there are any defaults: if not all the STD
5894   * flags are set, or if no character set specifier is needed */
5895   bool has_default =
5896      (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5897      || ! has_charset);
5898   bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5899   U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5900        >> RXf_PMf_STD_PMMOD_SHIFT);
5901   const char *fptr = STD_PAT_MODS;        /*"msix"*/
5902   char *p;
5903   /* Allocate for the worst case, which is all the std flags are turned
5904   * on.  If more precision is desired, we could do a population count of
5905   * the flags set.  This could be done with a small lookup table, or by
5906   * shifting, masking and adding, or even, when available, assembly
5907   * language for a machine-language population count.
5908   * We never output a minus, as all those are defaults, so are
5909   * covered by the caret */
5910   const STRLEN wraplen = plen + has_p + has_runon
5911    + has_default       /* If needs a caret */
5912
5913     /* If needs a character set specifier */
5914    + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5915    + (sizeof(STD_PAT_MODS) - 1)
5916    + (sizeof("(?:)") - 1);
5917
5918   Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5919   r->xpv_len_u.xpvlenu_pv = p;
5920   if (RExC_utf8)
5921    SvFLAGS(rx) |= SVf_UTF8;
5922   *p++='('; *p++='?';
5923
5924   /* If a default, cover it using the caret */
5925   if (has_default) {
5926    *p++= DEFAULT_PAT_MOD;
5927   }
5928   if (has_charset) {
5929    STRLEN len;
5930    const char* const name = get_regex_charset_name(r->extflags, &len);
5931    Copy(name, p, len, char);
5932    p += len;
5933   }
5934   if (has_p)
5935    *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5936   {
5937    char ch;
5938    while((ch = *fptr++)) {
5939     if(reganch & 1)
5940      *p++ = ch;
5941     reganch >>= 1;
5942    }
5943   }
5944
5945   *p++ = ':';
5946   Copy(RExC_precomp, p, plen, char);
5947   assert ((RX_WRAPPED(rx) - p) < 16);
5948   r->pre_prefix = p - RX_WRAPPED(rx);
5949   p += plen;
5950   if (has_runon)
5951    *p++ = '\n';
5952   *p++ = ')';
5953   *p = 0;
5954   SvCUR_set(rx, p - RX_WRAPPED(rx));
5955  }
5956
5957  r->intflags = 0;
5958  r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5959
5960  if (RExC_seen & REG_SEEN_RECURSE) {
5961   Newxz(RExC_open_parens, RExC_npar,regnode *);
5962   SAVEFREEPV(RExC_open_parens);
5963   Newxz(RExC_close_parens,RExC_npar,regnode *);
5964   SAVEFREEPV(RExC_close_parens);
5965  }
5966
5967  /* Useful during FAIL. */
5968 #ifdef RE_TRACK_PATTERN_OFFSETS
5969  Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5970  DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5971       "%s %"UVuf" bytes for offset annotations.\n",
5972       ri->u.offsets ? "Got" : "Couldn't get",
5973       (UV)((2*RExC_size+1) * sizeof(U32))));
5974 #endif
5975  SetProgLen(ri,RExC_size);
5976  RExC_rx_sv = rx;
5977  RExC_rx = r;
5978  RExC_rxi = ri;
5979  REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
5980
5981  /* Second pass: emit code. */
5982  RExC_flags = rx_flags; /* don't let top level (?i) bleed */
5983  RExC_pm_flags = pm_flags;
5984  RExC_parse = exp;
5985  RExC_end = exp + plen;
5986  RExC_naughty = 0;
5987  RExC_npar = 1;
5988  RExC_emit_start = ri->program;
5989  RExC_emit = ri->program;
5990  RExC_emit_bound = ri->program + RExC_size + 1;
5991  pRExC_state->code_index = 0;
5992
5993  REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5994  if (reg(pRExC_state, 0, &flags,1) == NULL) {
5995   ReREFCNT_dec(rx);
5996   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#X", flags);
5997  }
5998  /* XXXX To minimize changes to RE engine we always allocate
5999  3-units-long substrs field. */
6000  Newx(r->substrs, 1, struct reg_substr_data);
6001  if (RExC_recurse_count) {
6002   Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6003   SAVEFREEPV(RExC_recurse);
6004  }
6005
6006 reStudy:
6007  r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
6008  Zero(r->substrs, 1, struct reg_substr_data);
6009
6010 #ifdef TRIE_STUDY_OPT
6011  if (!restudied) {
6012   StructCopy(&zero_scan_data, &data, scan_data_t);
6013   copyRExC_state = RExC_state;
6014  } else {
6015   U32 seen=RExC_seen;
6016   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6017
6018   RExC_state = copyRExC_state;
6019   if (seen & REG_TOP_LEVEL_BRANCHES)
6020    RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6021   else
6022    RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6023   StructCopy(&zero_scan_data, &data, scan_data_t);
6024  }
6025 #else
6026  StructCopy(&zero_scan_data, &data, scan_data_t);
6027 #endif
6028
6029  /* Dig out information for optimizations. */
6030  r->extflags = RExC_flags; /* was pm_op */
6031  /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6032
6033  if (UTF)
6034   SvUTF8_on(rx); /* Unicode in it? */
6035  ri->regstclass = NULL;
6036  if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6037   r->intflags |= PREGf_NAUGHTY;
6038  scan = ri->program + 1;  /* First BRANCH. */
6039
6040  /* testing for BRANCH here tells us whether there is "must appear"
6041  data in the pattern. If there is then we can use it for optimisations */
6042  if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
6043   I32 fake;
6044   STRLEN longest_float_length, longest_fixed_length;
6045   struct regnode_charclass_class ch_class; /* pointed to by data */
6046   int stclass_flag;
6047   I32 last_close = 0; /* pointed to by data */
6048   regnode *first= scan;
6049   regnode *first_next= regnext(first);
6050   /*
6051   * Skip introductions and multiplicators >= 1
6052   * so that we can extract the 'meat' of the pattern that must
6053   * match in the large if() sequence following.
6054   * NOTE that EXACT is NOT covered here, as it is normally
6055   * picked up by the optimiser separately.
6056   *
6057   * This is unfortunate as the optimiser isnt handling lookahead
6058   * properly currently.
6059   *
6060   */
6061   while ((OP(first) == OPEN && (sawopen = 1)) ||
6062    /* An OR of *one* alternative - should not happen now. */
6063    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6064    /* for now we can't handle lookbehind IFMATCH*/
6065    (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6066    (OP(first) == PLUS) ||
6067    (OP(first) == MINMOD) ||
6068    /* An {n,m} with n>0 */
6069    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6070    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6071   {
6072     /*
6073     * the only op that could be a regnode is PLUS, all the rest
6074     * will be regnode_1 or regnode_2.
6075     *
6076     */
6077     if (OP(first) == PLUS)
6078      sawplus = 1;
6079     else
6080      first += regarglen[OP(first)];
6081
6082     first = NEXTOPER(first);
6083     first_next= regnext(first);
6084   }
6085
6086   /* Starting-point info. */
6087  again:
6088   DEBUG_PEEP("first:",first,0);
6089   /* Ignore EXACT as we deal with it later. */
6090   if (PL_regkind[OP(first)] == EXACT) {
6091    if (OP(first) == EXACT)
6092     NOOP; /* Empty, get anchored substr later. */
6093    else
6094     ri->regstclass = first;
6095   }
6096 #ifdef TRIE_STCLASS
6097   else if (PL_regkind[OP(first)] == TRIE &&
6098     ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6099   {
6100    regnode *trie_op;
6101    /* this can happen only on restudy */
6102    if ( OP(first) == TRIE ) {
6103     struct regnode_1 *trieop = (struct regnode_1 *)
6104      PerlMemShared_calloc(1, sizeof(struct regnode_1));
6105     StructCopy(first,trieop,struct regnode_1);
6106     trie_op=(regnode *)trieop;
6107    } else {
6108     struct regnode_charclass *trieop = (struct regnode_charclass *)
6109      PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6110     StructCopy(first,trieop,struct regnode_charclass);
6111     trie_op=(regnode *)trieop;
6112    }
6113    OP(trie_op)+=2;
6114    make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6115    ri->regstclass = trie_op;
6116   }
6117 #endif
6118   else if (REGNODE_SIMPLE(OP(first)))
6119    ri->regstclass = first;
6120   else if (PL_regkind[OP(first)] == BOUND ||
6121     PL_regkind[OP(first)] == NBOUND)
6122    ri->regstclass = first;
6123   else if (PL_regkind[OP(first)] == BOL) {
6124    r->extflags |= (OP(first) == MBOL
6125       ? RXf_ANCH_MBOL
6126       : (OP(first) == SBOL
6127        ? RXf_ANCH_SBOL
6128        : RXf_ANCH_BOL));
6129    first = NEXTOPER(first);
6130    goto again;
6131   }
6132   else if (OP(first) == GPOS) {
6133    r->extflags |= RXf_ANCH_GPOS;
6134    first = NEXTOPER(first);
6135    goto again;
6136   }
6137   else if ((!sawopen || !RExC_sawback) &&
6138    (OP(first) == STAR &&
6139    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6140    !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6141   {
6142    /* turn .* into ^.* with an implied $*=1 */
6143    const int type =
6144     (OP(NEXTOPER(first)) == REG_ANY)
6145      ? RXf_ANCH_MBOL
6146      : RXf_ANCH_SBOL;
6147    r->extflags |= type;
6148    r->intflags |= PREGf_IMPLICIT;
6149    first = NEXTOPER(first);
6150    goto again;
6151   }
6152   if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6153    && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6154    /* x+ must match at the 1st pos of run of x's */
6155    r->intflags |= PREGf_SKIP;
6156
6157   /* Scan is after the zeroth branch, first is atomic matcher. */
6158 #ifdef TRIE_STUDY_OPT
6159   DEBUG_PARSE_r(
6160    if (!restudied)
6161     PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6162        (IV)(first - scan + 1))
6163   );
6164 #else
6165   DEBUG_PARSE_r(
6166    PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6167     (IV)(first - scan + 1))
6168   );
6169 #endif
6170
6171
6172   /*
6173   * If there's something expensive in the r.e., find the
6174   * longest literal string that must appear and make it the
6175   * regmust.  Resolve ties in favor of later strings, since
6176   * the regstart check works with the beginning of the r.e.
6177   * and avoiding duplication strengthens checking.  Not a
6178   * strong reason, but sufficient in the absence of others.
6179   * [Now we resolve ties in favor of the earlier string if
6180   * it happens that c_offset_min has been invalidated, since the
6181   * earlier string may buy us something the later one won't.]
6182   */
6183
6184   data.longest_fixed = newSVpvs("");
6185   data.longest_float = newSVpvs("");
6186   data.last_found = newSVpvs("");
6187   data.longest = &(data.longest_fixed);
6188   ENTER_with_name("study_chunk");
6189   SAVEFREESV(data.longest_fixed);
6190   SAVEFREESV(data.longest_float);
6191   SAVEFREESV(data.last_found);
6192   first = scan;
6193   if (!ri->regstclass) {
6194    cl_init(pRExC_state, &ch_class);
6195    data.start_class = &ch_class;
6196    stclass_flag = SCF_DO_STCLASS_AND;
6197   } else    /* XXXX Check for BOUND? */
6198    stclass_flag = 0;
6199   data.last_closep = &last_close;
6200
6201   minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6202    &data, -1, NULL, NULL,
6203    SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6204
6205
6206   CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6207
6208
6209   if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6210    && data.last_start_min == 0 && data.last_end > 0
6211    && !RExC_seen_zerolen
6212    && !(RExC_seen & REG_SEEN_VERBARG)
6213    && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6214    r->extflags |= RXf_CHECK_ALL;
6215   scan_commit(pRExC_state, &data,&minlen,0);
6216
6217   longest_float_length = CHR_SVLEN(data.longest_float);
6218
6219   if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6220     && data.offset_fixed == data.offset_float_min
6221     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6222    && S_setup_longest (aTHX_ pRExC_state,
6223          data.longest_float,
6224          &(r->float_utf8),
6225          &(r->float_substr),
6226          &(r->float_end_shift),
6227          data.lookbehind_float,
6228          data.offset_float_min,
6229          data.minlen_float,
6230          longest_float_length,
6231          cBOOL(data.flags & SF_FL_BEFORE_EOL),
6232          cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6233   {
6234    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6235    r->float_max_offset = data.offset_float_max;
6236    if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6237     r->float_max_offset -= data.lookbehind_float;
6238    SvREFCNT_inc_simple_void_NN(data.longest_float);
6239   }
6240   else {
6241    r->float_substr = r->float_utf8 = NULL;
6242    longest_float_length = 0;
6243   }
6244
6245   longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6246
6247   if (S_setup_longest (aTHX_ pRExC_state,
6248         data.longest_fixed,
6249         &(r->anchored_utf8),
6250         &(r->anchored_substr),
6251         &(r->anchored_end_shift),
6252         data.lookbehind_fixed,
6253         data.offset_fixed,
6254         data.minlen_fixed,
6255         longest_fixed_length,
6256         cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6257         cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6258   {
6259    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6260    SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6261   }
6262   else {
6263    r->anchored_substr = r->anchored_utf8 = NULL;
6264    longest_fixed_length = 0;
6265   }
6266   LEAVE_with_name("study_chunk");
6267
6268   if (ri->regstclass
6269    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6270    ri->regstclass = NULL;
6271
6272   if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6273    && stclass_flag
6274    && ! TEST_SSC_EOS(data.start_class)
6275    && !cl_is_anything(data.start_class))
6276   {
6277    const U32 n = add_data(pRExC_state, 1, "f");
6278    OP(data.start_class) = ANYOF_SYNTHETIC;
6279
6280    Newx(RExC_rxi->data->data[n], 1,
6281     struct regnode_charclass_class);
6282    StructCopy(data.start_class,
6283      (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6284      struct regnode_charclass_class);
6285    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6286    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6287    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6288      regprop(r, sv, (regnode*)data.start_class);
6289      PerlIO_printf(Perl_debug_log,
6290          "synthetic stclass \"%s\".\n",
6291          SvPVX_const(sv));});
6292   }
6293
6294   /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6295   if (longest_fixed_length > longest_float_length) {
6296    r->check_end_shift = r->anchored_end_shift;
6297    r->check_substr = r->anchored_substr;
6298    r->check_utf8 = r->anchored_utf8;
6299    r->check_offset_min = r->check_offset_max = r->anchored_offset;
6300    if (r->extflags & RXf_ANCH_SINGLE)
6301     r->extflags |= RXf_NOSCAN;
6302   }
6303   else {
6304    r->check_end_shift = r->float_end_shift;
6305    r->check_substr = r->float_substr;
6306    r->check_utf8 = r->float_utf8;
6307    r->check_offset_min = r->float_min_offset;
6308    r->check_offset_max = r->float_max_offset;
6309   }
6310   /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6311   This should be changed ASAP!  */
6312   if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6313    r->extflags |= RXf_USE_INTUIT;
6314    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6315     r->extflags |= RXf_INTUIT_TAIL;
6316   }
6317   /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6318   if ( (STRLEN)minlen < longest_float_length )
6319    minlen= longest_float_length;
6320   if ( (STRLEN)minlen < longest_fixed_length )
6321    minlen= longest_fixed_length;
6322   */
6323  }
6324  else {
6325   /* Several toplevels. Best we can is to set minlen. */
6326   I32 fake;
6327   struct regnode_charclass_class ch_class;
6328   I32 last_close = 0;
6329
6330   DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6331
6332   scan = ri->program + 1;
6333   cl_init(pRExC_state, &ch_class);
6334   data.start_class = &ch_class;
6335   data.last_closep = &last_close;
6336
6337
6338   minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6339    &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6340
6341   CHECK_RESTUDY_GOTO_butfirst(NOOP);
6342
6343   r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6344     = r->float_substr = r->float_utf8 = NULL;
6345
6346   if (! TEST_SSC_EOS(data.start_class)
6347    && !cl_is_anything(data.start_class))
6348   {
6349    const U32 n = add_data(pRExC_state, 1, "f");
6350    OP(data.start_class) = ANYOF_SYNTHETIC;
6351
6352    Newx(RExC_rxi->data->data[n], 1,
6353     struct regnode_charclass_class);
6354    StructCopy(data.start_class,
6355      (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6356      struct regnode_charclass_class);
6357    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6358    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6359    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6360      regprop(r, sv, (regnode*)data.start_class);
6361      PerlIO_printf(Perl_debug_log,
6362          "synthetic stclass \"%s\".\n",
6363          SvPVX_const(sv));});
6364   }
6365  }
6366
6367  /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6368  the "real" pattern. */
6369  DEBUG_OPTIMISE_r({
6370   PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6371      (IV)minlen, (IV)r->minlen);
6372  });
6373  r->minlenret = minlen;
6374  if (r->minlen < minlen)
6375   r->minlen = minlen;
6376
6377  if (RExC_seen & REG_SEEN_GPOS)
6378   r->extflags |= RXf_GPOS_SEEN;
6379  if (RExC_seen & REG_SEEN_LOOKBEHIND)
6380   r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
6381  if (pRExC_state->num_code_blocks)
6382   r->extflags |= RXf_EVAL_SEEN;
6383  if (RExC_seen & REG_SEEN_CANY)
6384   r->extflags |= RXf_CANY_SEEN;
6385  if (RExC_seen & REG_SEEN_VERBARG)
6386  {
6387   r->intflags |= PREGf_VERBARG_SEEN;
6388   r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
6389  }
6390  if (RExC_seen & REG_SEEN_CUTGROUP)
6391   r->intflags |= PREGf_CUTGROUP_SEEN;
6392  if (pm_flags & PMf_USE_RE_EVAL)
6393   r->intflags |= PREGf_USE_RE_EVAL;
6394  if (RExC_paren_names)
6395   RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6396  else
6397   RXp_PAREN_NAMES(r) = NULL;
6398
6399  {
6400   regnode *first = ri->program + 1;
6401   U8 fop = OP(first);
6402   regnode *next = NEXTOPER(first);
6403   U8 nop = OP(next);
6404
6405   if (PL_regkind[fop] == NOTHING && nop == END)
6406    r->extflags |= RXf_NULL;
6407   else if (PL_regkind[fop] == BOL && nop == END)
6408    r->extflags |= RXf_START_ONLY;
6409   else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
6410    r->extflags |= RXf_WHITE;
6411   else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
6412    r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6413
6414  }
6415 #ifdef DEBUGGING
6416  if (RExC_paren_names) {
6417   ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6418   ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6419  } else
6420 #endif
6421   ri->name_list_idx = 0;
6422
6423  if (RExC_recurse_count) {
6424   for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6425    const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6426    ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6427   }
6428  }
6429  Newxz(r->offs, RExC_npar, regexp_paren_pair);
6430  /* assume we don't need to swap parens around before we match */
6431
6432  DEBUG_DUMP_r({
6433   PerlIO_printf(Perl_debug_log,"Final program:\n");
6434   regdump(r);
6435  });
6436 #ifdef RE_TRACK_PATTERN_OFFSETS
6437  DEBUG_OFFSETS_r(if (ri->u.offsets) {
6438   const U32 len = ri->u.offsets[0];
6439   U32 i;
6440   GET_RE_DEBUG_FLAGS_DECL;
6441   PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6442   for (i = 1; i <= len; i++) {
6443    if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6444     PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6445     (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6446    }
6447   PerlIO_printf(Perl_debug_log, "\n");
6448  });
6449 #endif
6450
6451 #ifdef USE_ITHREADS
6452  /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6453  * by setting the regexp SV to readonly-only instead. If the
6454  * pattern's been recompiled, the USEDness should remain. */
6455  if (old_re && SvREADONLY(old_re))
6456   SvREADONLY_on(rx);
6457 #endif
6458  return rx;
6459 }
6460
6461
6462 SV*
6463 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6464      const U32 flags)
6465 {
6466  PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6467
6468  PERL_UNUSED_ARG(value);
6469
6470  if (flags & RXapif_FETCH) {
6471   return reg_named_buff_fetch(rx, key, flags);
6472  } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6473   Perl_croak_no_modify();
6474   return NULL;
6475  } else if (flags & RXapif_EXISTS) {
6476   return reg_named_buff_exists(rx, key, flags)
6477    ? &PL_sv_yes
6478    : &PL_sv_no;
6479  } else if (flags & RXapif_REGNAMES) {
6480   return reg_named_buff_all(rx, flags);
6481  } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6482   return reg_named_buff_scalar(rx, flags);
6483  } else {
6484   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6485   return NULL;
6486  }
6487 }
6488
6489 SV*
6490 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6491       const U32 flags)
6492 {
6493  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6494  PERL_UNUSED_ARG(lastkey);
6495
6496  if (flags & RXapif_FIRSTKEY)
6497   return reg_named_buff_firstkey(rx, flags);
6498  else if (flags & RXapif_NEXTKEY)
6499   return reg_named_buff_nextkey(rx, flags);
6500  else {
6501   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6502   return NULL;
6503  }
6504 }
6505
6506 SV*
6507 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6508       const U32 flags)
6509 {
6510  AV *retarray = NULL;
6511  SV *ret;
6512  struct regexp *const rx = ReANY(r);
6513
6514  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6515
6516  if (flags & RXapif_ALL)
6517   retarray=newAV();
6518
6519  if (rx && RXp_PAREN_NAMES(rx)) {
6520   HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6521   if (he_str) {
6522    IV i;
6523    SV* sv_dat=HeVAL(he_str);
6524    I32 *nums=(I32*)SvPVX(sv_dat);
6525    for ( i=0; i<SvIVX(sv_dat); i++ ) {
6526     if ((I32)(rx->nparens) >= nums[i]
6527      && rx->offs[nums[i]].start != -1
6528      && rx->offs[nums[i]].end != -1)
6529     {
6530      ret = newSVpvs("");
6531      CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6532      if (!retarray)
6533       return ret;
6534     } else {
6535      if (retarray)
6536       ret = newSVsv(&PL_sv_undef);
6537     }
6538     if (retarray)
6539      av_push(retarray, ret);
6540    }
6541    if (retarray)
6542     return newRV_noinc(MUTABLE_SV(retarray));
6543   }
6544  }
6545  return NULL;
6546 }
6547
6548 bool
6549 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6550       const U32 flags)
6551 {
6552  struct regexp *const rx = ReANY(r);
6553
6554  PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6555
6556  if (rx && RXp_PAREN_NAMES(rx)) {
6557   if (flags & RXapif_ALL) {
6558    return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6559   } else {
6560    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6561    if (sv) {
6562     SvREFCNT_dec_NN(sv);
6563     return TRUE;
6564    } else {
6565     return FALSE;
6566    }
6567   }
6568  } else {
6569   return FALSE;
6570  }
6571 }
6572
6573 SV*
6574 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6575 {
6576  struct regexp *const rx = ReANY(r);
6577
6578  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6579
6580  if ( rx && RXp_PAREN_NAMES(rx) ) {
6581   (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6582
6583   return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6584  } else {
6585   return FALSE;
6586  }
6587 }
6588
6589 SV*
6590 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6591 {
6592  struct regexp *const rx = ReANY(r);
6593  GET_RE_DEBUG_FLAGS_DECL;
6594
6595  PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6596
6597  if (rx && RXp_PAREN_NAMES(rx)) {
6598   HV *hv = RXp_PAREN_NAMES(rx);
6599   HE *temphe;
6600   while ( (temphe = hv_iternext_flags(hv,0)) ) {
6601    IV i;
6602    IV parno = 0;
6603    SV* sv_dat = HeVAL(temphe);
6604    I32 *nums = (I32*)SvPVX(sv_dat);
6605    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6606     if ((I32)(rx->lastparen) >= nums[i] &&
6607      rx->offs[nums[i]].start != -1 &&
6608      rx->offs[nums[i]].end != -1)
6609     {
6610      parno = nums[i];
6611      break;
6612     }
6613    }
6614    if (parno || flags & RXapif_ALL) {
6615     return newSVhek(HeKEY_hek(temphe));
6616    }
6617   }
6618  }
6619  return NULL;
6620 }
6621
6622 SV*
6623 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6624 {
6625  SV *ret;
6626  AV *av;
6627  I32 length;
6628  struct regexp *const rx = ReANY(r);
6629
6630  PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6631
6632  if (rx && RXp_PAREN_NAMES(rx)) {
6633   if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6634    return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6635   } else if (flags & RXapif_ONE) {
6636    ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6637    av = MUTABLE_AV(SvRV(ret));
6638    length = av_len(av);
6639    SvREFCNT_dec_NN(ret);
6640    return newSViv(length + 1);
6641   } else {
6642    Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6643    return NULL;
6644   }
6645  }
6646  return &PL_sv_undef;
6647 }
6648
6649 SV*
6650 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6651 {
6652  struct regexp *const rx = ReANY(r);
6653  AV *av = newAV();
6654
6655  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6656
6657  if (rx && RXp_PAREN_NAMES(rx)) {
6658   HV *hv= RXp_PAREN_NAMES(rx);
6659   HE *temphe;
6660   (void)hv_iterinit(hv);
6661   while ( (temphe = hv_iternext_flags(hv,0)) ) {
6662    IV i;
6663    IV parno = 0;
6664    SV* sv_dat = HeVAL(temphe);
6665    I32 *nums = (I32*)SvPVX(sv_dat);
6666    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6667     if ((I32)(rx->lastparen) >= nums[i] &&
6668      rx->offs[nums[i]].start != -1 &&
6669      rx->offs[nums[i]].end != -1)
6670     {
6671      parno = nums[i];
6672      break;
6673     }
6674    }
6675    if (parno || flags & RXapif_ALL) {
6676     av_push(av, newSVhek(HeKEY_hek(temphe)));
6677    }
6678   }
6679  }
6680
6681  return newRV_noinc(MUTABLE_SV(av));
6682 }
6683
6684 void
6685 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6686        SV * const sv)
6687 {
6688  struct regexp *const rx = ReANY(r);
6689  char *s = NULL;
6690  I32 i = 0;
6691  I32 s1, t1;
6692  I32 n = paren;
6693
6694  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6695
6696  if (      n == RX_BUFF_IDX_CARET_PREMATCH
6697   || n == RX_BUFF_IDX_CARET_FULLMATCH
6698   || n == RX_BUFF_IDX_CARET_POSTMATCH
6699  )
6700  {
6701   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
6702   if (!keepcopy) {
6703    /* on something like
6704    *    $r = qr/.../;
6705    *    /$qr/p;
6706    * the KEEPCOPY is set on the PMOP rather than the regex */
6707    if (PL_curpm && r == PM_GETRE(PL_curpm))
6708     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
6709   }
6710   if (!keepcopy)
6711    goto ret_undef;
6712  }
6713
6714  if (!rx->subbeg)
6715   goto ret_undef;
6716
6717  if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6718   /* no need to distinguish between them any more */
6719   n = RX_BUFF_IDX_FULLMATCH;
6720
6721  if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6722   && rx->offs[0].start != -1)
6723  {
6724   /* $`, ${^PREMATCH} */
6725   i = rx->offs[0].start;
6726   s = rx->subbeg;
6727  }
6728  else
6729  if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6730   && rx->offs[0].end != -1)
6731  {
6732   /* $', ${^POSTMATCH} */
6733   s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6734   i = rx->sublen + rx->suboffset - rx->offs[0].end;
6735  }
6736  else
6737  if ( 0 <= n && n <= (I32)rx->nparens &&
6738   (s1 = rx->offs[n].start) != -1 &&
6739   (t1 = rx->offs[n].end) != -1)
6740  {
6741   /* $&, ${^MATCH},  $1 ... */
6742   i = t1 - s1;
6743   s = rx->subbeg + s1 - rx->suboffset;
6744  } else {
6745   goto ret_undef;
6746  }
6747
6748  assert(s >= rx->subbeg);
6749  assert(rx->sublen >= (s - rx->subbeg) + i );
6750  if (i >= 0) {
6751 #if NO_TAINT_SUPPORT
6752   sv_setpvn(sv, s, i);
6753 #else
6754   const int oldtainted = TAINT_get;
6755   TAINT_NOT;
6756   sv_setpvn(sv, s, i);
6757   TAINT_set(oldtainted);
6758 #endif
6759   if ( (rx->extflags & RXf_CANY_SEEN)
6760    ? (RXp_MATCH_UTF8(rx)
6761       && (!i || is_utf8_string((U8*)s, i)))
6762    : (RXp_MATCH_UTF8(rx)) )
6763   {
6764    SvUTF8_on(sv);
6765   }
6766   else
6767    SvUTF8_off(sv);
6768   if (TAINTING_get) {
6769    if (RXp_MATCH_TAINTED(rx)) {
6770     if (SvTYPE(sv) >= SVt_PVMG) {
6771      MAGIC* const mg = SvMAGIC(sv);
6772      MAGIC* mgt;
6773      TAINT;
6774      SvMAGIC_set(sv, mg->mg_moremagic);
6775      SvTAINT(sv);
6776      if ((mgt = SvMAGIC(sv))) {
6777       mg->mg_moremagic = mgt;
6778       SvMAGIC_set(sv, mg);
6779      }
6780     } else {
6781      TAINT;
6782      SvTAINT(sv);
6783     }
6784    } else
6785     SvTAINTED_off(sv);
6786   }
6787  } else {
6788  ret_undef:
6789   sv_setsv(sv,&PL_sv_undef);
6790   return;
6791  }
6792 }
6793
6794 void
6795 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6796               SV const * const value)
6797 {
6798  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6799
6800  PERL_UNUSED_ARG(rx);
6801  PERL_UNUSED_ARG(paren);
6802  PERL_UNUSED_ARG(value);
6803
6804  if (!PL_localizing)
6805   Perl_croak_no_modify();
6806 }
6807
6808 I32
6809 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6810        const I32 paren)
6811 {
6812  struct regexp *const rx = ReANY(r);
6813  I32 i;
6814  I32 s1, t1;
6815
6816  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6817
6818  if (   paren == RX_BUFF_IDX_CARET_PREMATCH
6819   || paren == RX_BUFF_IDX_CARET_FULLMATCH
6820   || paren == RX_BUFF_IDX_CARET_POSTMATCH
6821  )
6822  {
6823   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
6824   if (!keepcopy) {
6825    /* on something like
6826    *    $r = qr/.../;
6827    *    /$qr/p;
6828    * the KEEPCOPY is set on the PMOP rather than the regex */
6829    if (PL_curpm && r == PM_GETRE(PL_curpm))
6830     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
6831   }
6832   if (!keepcopy)
6833    goto warn_undef;
6834  }
6835
6836  /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6837  switch (paren) {
6838  case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6839  case RX_BUFF_IDX_PREMATCH:       /* $` */
6840   if (rx->offs[0].start != -1) {
6841       i = rx->offs[0].start;
6842       if (i > 0) {
6843         s1 = 0;
6844         t1 = i;
6845         goto getlen;
6846       }
6847    }
6848   return 0;
6849
6850  case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6851  case RX_BUFF_IDX_POSTMATCH:       /* $' */
6852    if (rx->offs[0].end != -1) {
6853       i = rx->sublen - rx->offs[0].end;
6854       if (i > 0) {
6855         s1 = rx->offs[0].end;
6856         t1 = rx->sublen;
6857         goto getlen;
6858       }
6859    }
6860   return 0;
6861
6862  default: /* $& / ${^MATCH}, $1, $2, ... */
6863    if (paren <= (I32)rx->nparens &&
6864    (s1 = rx->offs[paren].start) != -1 &&
6865    (t1 = rx->offs[paren].end) != -1)
6866    {
6867    i = t1 - s1;
6868    goto getlen;
6869   } else {
6870   warn_undef:
6871    if (ckWARN(WARN_UNINITIALIZED))
6872     report_uninit((const SV *)sv);
6873    return 0;
6874   }
6875  }
6876   getlen:
6877  if (i > 0 && RXp_MATCH_UTF8(rx)) {
6878   const char * const s = rx->subbeg - rx->suboffset + s1;
6879   const U8 *ep;
6880   STRLEN el;
6881
6882   i = t1 - s1;
6883   if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6884       i = el;
6885  }
6886  return i;
6887 }
6888
6889 SV*
6890 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6891 {
6892  PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6893   PERL_UNUSED_ARG(rx);
6894   if (0)
6895    return NULL;
6896   else
6897    return newSVpvs("Regexp");
6898 }
6899
6900 /* Scans the name of a named buffer from the pattern.
6901  * If flags is REG_RSN_RETURN_NULL returns null.
6902  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6903  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6904  * to the parsed name as looked up in the RExC_paren_names hash.
6905  * If there is an error throws a vFAIL().. type exception.
6906  */
6907
6908 #define REG_RSN_RETURN_NULL    0
6909 #define REG_RSN_RETURN_NAME    1
6910 #define REG_RSN_RETURN_DATA    2
6911
6912 STATIC SV*
6913 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6914 {
6915  char *name_start = RExC_parse;
6916
6917  PERL_ARGS_ASSERT_REG_SCAN_NAME;
6918
6919  if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6920   /* skip IDFIRST by using do...while */
6921   if (UTF)
6922    do {
6923     RExC_parse += UTF8SKIP(RExC_parse);
6924    } while (isWORDCHAR_utf8((U8*)RExC_parse));
6925   else
6926    do {
6927     RExC_parse++;
6928    } while (isWORDCHAR(*RExC_parse));
6929  } else {
6930   RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6931   vFAIL("Group name must start with a non-digit word character");
6932  }
6933  if ( flags ) {
6934   SV* sv_name
6935    = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6936        SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6937   if ( flags == REG_RSN_RETURN_NAME)
6938    return sv_name;
6939   else if (flags==REG_RSN_RETURN_DATA) {
6940    HE *he_str = NULL;
6941    SV *sv_dat = NULL;
6942    if ( ! sv_name )      /* should not happen*/
6943     Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6944    if (RExC_paren_names)
6945     he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6946    if ( he_str )
6947     sv_dat = HeVAL(he_str);
6948    if ( ! sv_dat )
6949     vFAIL("Reference to nonexistent named group");
6950    return sv_dat;
6951   }
6952   else {
6953    Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6954      (unsigned long) flags);
6955   }
6956   assert(0); /* NOT REACHED */
6957  }
6958  return NULL;
6959 }
6960
6961 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6962  int rem=(int)(RExC_end - RExC_parse);                       \
6963  int cut;                                                    \
6964  int num;                                                    \
6965  int iscut=0;                                                \
6966  if (rem>10) {                                               \
6967   rem=10;                                                 \
6968   iscut=1;                                                \
6969  }                                                           \
6970  cut=10-rem;                                                 \
6971  if (RExC_lastparse!=RExC_parse)                             \
6972   PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6973    rem, RExC_parse,                                    \
6974    cut + 4,                                            \
6975    iscut ? "..." : "<"                                 \
6976   );                                                      \
6977  else                                                        \
6978   PerlIO_printf(Perl_debug_log,"%16s","");                \
6979                 \
6980  if (SIZE_ONLY)                                              \
6981  num = RExC_size + 1;                                     \
6982  else                                                        \
6983  num=REG_NODE_NUM(RExC_emit);                             \
6984  if (RExC_lastnum!=num)                                      \
6985  PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6986  else                                                        \
6987  PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6988  PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6989   (int)((depth*2)), "",                                   \
6990   (funcname)                                              \
6991  );                                                          \
6992  RExC_lastnum=num;                                           \
6993  RExC_lastparse=RExC_parse;                                  \
6994 })
6995
6996
6997
6998 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6999  DEBUG_PARSE_MSG((funcname));                            \
7000  PerlIO_printf(Perl_debug_log,"%4s","\n");               \
7001 })
7002 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7003  DEBUG_PARSE_MSG((funcname));                            \
7004  PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7005 })
7006
7007 /* This section of code defines the inversion list object and its methods.  The
7008  * interfaces are highly subject to change, so as much as possible is static to
7009  * this file.  An inversion list is here implemented as a malloc'd C UV array
7010  * with some added info that is placed as UVs at the beginning in a header
7011  * portion.  An inversion list for Unicode is an array of code points, sorted
7012  * by ordinal number.  The zeroth element is the first code point in the list.
7013  * The 1th element is the first element beyond that not in the list.  In other
7014  * words, the first range is
7015  *  invlist[0]..(invlist[1]-1)
7016  * The other ranges follow.  Thus every element whose index is divisible by two
7017  * marks the beginning of a range that is in the list, and every element not
7018  * divisible by two marks the beginning of a range not in the list.  A single
7019  * element inversion list that contains the single code point N generally
7020  * consists of two elements
7021  *  invlist[0] == N
7022  *  invlist[1] == N+1
7023  * (The exception is when N is the highest representable value on the
7024  * machine, in which case the list containing just it would be a single
7025  * element, itself.  By extension, if the last range in the list extends to
7026  * infinity, then the first element of that range will be in the inversion list
7027  * at a position that is divisible by two, and is the final element in the
7028  * list.)
7029  * Taking the complement (inverting) an inversion list is quite simple, if the
7030  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7031  * This implementation reserves an element at the beginning of each inversion
7032  * list to contain 0 when the list contains 0, and contains 1 otherwise.  The
7033  * actual beginning of the list is either that element if 0, or the next one if
7034  * 1.
7035  *
7036  * More about inversion lists can be found in "Unicode Demystified"
7037  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7038  * More will be coming when functionality is added later.
7039  *
7040  * The inversion list data structure is currently implemented as an SV pointing
7041  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7042  * array of UV whose memory management is automatically handled by the existing
7043  * facilities for SV's.
7044  *
7045  * Some of the methods should always be private to the implementation, and some
7046  * should eventually be made public */
7047
7048 /* The header definitions are in F<inline_invlist.c> */
7049 #define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV))
7050 #define FROM_INTERNAL_SIZE(x) (((x)/ sizeof(UV)) - HEADER_LENGTH)
7051
7052 #define INVLIST_INITIAL_LEN 10
7053
7054 PERL_STATIC_INLINE UV*
7055 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7056 {
7057  /* Returns a pointer to the first element in the inversion list's array.
7058  * This is called upon initialization of an inversion list.  Where the
7059  * array begins depends on whether the list has the code point U+0000
7060  * in it or not.  The other parameter tells it whether the code that
7061  * follows this call is about to put a 0 in the inversion list or not.
7062  * The first element is either the element with 0, if 0, or the next one,
7063  * if 1 */
7064
7065  UV* zero = get_invlist_zero_addr(invlist);
7066
7067  PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7068
7069  /* Must be empty */
7070  assert(! *_get_invlist_len_addr(invlist));
7071
7072  /* 1^1 = 0; 1^0 = 1 */
7073  *zero = 1 ^ will_have_0;
7074  return zero + *zero;
7075 }
7076
7077 PERL_STATIC_INLINE UV*
7078 S_invlist_array(pTHX_ SV* const invlist)
7079 {
7080  /* Returns the pointer to the inversion list's array.  Every time the
7081  * length changes, this needs to be called in case malloc or realloc moved
7082  * it */
7083
7084  PERL_ARGS_ASSERT_INVLIST_ARRAY;
7085
7086  /* Must not be empty.  If these fail, you probably didn't check for <len>
7087  * being non-zero before trying to get the array */
7088  assert(*_get_invlist_len_addr(invlist));
7089  assert(*get_invlist_zero_addr(invlist) == 0
7090   || *get_invlist_zero_addr(invlist) == 1);
7091
7092  /* The array begins either at the element reserved for zero if the
7093  * list contains 0 (that element will be set to 0), or otherwise the next
7094  * element (in which case the reserved element will be set to 1). */
7095  return (UV *) (get_invlist_zero_addr(invlist)
7096     + *get_invlist_zero_addr(invlist));
7097 }
7098
7099 PERL_STATIC_INLINE void
7100 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7101 {
7102  /* Sets the current number of elements stored in the inversion list */
7103
7104  PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7105
7106  *_get_invlist_len_addr(invlist) = len;
7107
7108  assert(len <= SvLEN(invlist));
7109
7110  SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7111  /* If the list contains U+0000, that element is part of the header,
7112  * and should not be counted as part of the array.  It will contain
7113  * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
7114  * subtract:
7115  * SvCUR_set(invlist,
7116  *    TO_INTERNAL_SIZE(len
7117  *       - (*get_invlist_zero_addr(inv_list) ^ 1)));
7118  * But, this is only valid if len is not 0.  The consequences of not doing
7119  * this is that the memory allocation code may think that 1 more UV is
7120  * being used than actually is, and so might do an unnecessary grow.  That
7121  * seems worth not bothering to make this the precise amount.
7122  *
7123  * Note that when inverting, SvCUR shouldn't change */
7124 }
7125
7126 PERL_STATIC_INLINE IV*
7127 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7128 {
7129  /* Return the address of the UV that is reserved to hold the cached index
7130  * */
7131
7132  PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7133
7134  return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7135 }
7136
7137 PERL_STATIC_INLINE IV
7138 S_invlist_previous_index(pTHX_ SV* const invlist)
7139 {
7140  /* Returns cached index of previous search */
7141
7142  PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7143
7144  return *get_invlist_previous_index_addr(invlist);
7145 }
7146
7147 PERL_STATIC_INLINE void
7148 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7149 {
7150  /* Caches <index> for later retrieval */
7151
7152  PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7153
7154  assert(index == 0 || index < (int) _invlist_len(invlist));
7155
7156  *get_invlist_previous_index_addr(invlist) = index;
7157 }
7158
7159 PERL_STATIC_INLINE UV
7160 S_invlist_max(pTHX_ SV* const invlist)
7161 {
7162  /* Returns the maximum number of elements storable in the inversion list's
7163  * array, without having to realloc() */
7164
7165  PERL_ARGS_ASSERT_INVLIST_MAX;
7166
7167  return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7168   ? _invlist_len(invlist)
7169   : FROM_INTERNAL_SIZE(SvLEN(invlist));
7170 }
7171
7172 PERL_STATIC_INLINE UV*
7173 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7174 {
7175  /* Return the address of the UV that is reserved to hold 0 if the inversion
7176  * list contains 0.  This has to be the last element of the heading, as the
7177  * list proper starts with either it if 0, or the next element if not.
7178  * (But we force it to contain either 0 or 1) */
7179
7180  PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7181
7182  return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7183 }
7184
7185 #ifndef PERL_IN_XSUB_RE
7186 SV*
7187 Perl__new_invlist(pTHX_ IV initial_size)
7188 {
7189
7190  /* Return a pointer to a newly constructed inversion list, with enough
7191  * space to store 'initial_size' elements.  If that number is negative, a
7192  * system default is used instead */
7193
7194  SV* new_list;
7195
7196  if (initial_size < 0) {
7197   initial_size = INVLIST_INITIAL_LEN;
7198  }
7199
7200  /* Allocate the initial space */
7201  new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7202  invlist_set_len(new_list, 0);
7203
7204  /* Force iterinit() to be used to get iteration to work */
7205  *get_invlist_iter_addr(new_list) = UV_MAX;
7206
7207  /* This should force a segfault if a method doesn't initialize this
7208  * properly */
7209  *get_invlist_zero_addr(new_list) = UV_MAX;
7210
7211  *get_invlist_previous_index_addr(new_list) = 0;
7212  *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7213 #if HEADER_LENGTH != 5
7214 #   error Need to regenerate INVLIST_VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7215 #endif
7216
7217  return new_list;
7218 }
7219 #endif
7220
7221 STATIC SV*
7222 S__new_invlist_C_array(pTHX_ UV* list)
7223 {
7224  /* Return a pointer to a newly constructed inversion list, initialized to
7225  * point to <list>, which has to be in the exact correct inversion list
7226  * form, including internal fields.  Thus this is a dangerous routine that
7227  * should not be used in the wrong hands */
7228
7229  SV* invlist = newSV_type(SVt_PV);
7230
7231  PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7232
7233  SvPV_set(invlist, (char *) list);
7234  SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7235        shouldn't touch it */
7236  SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7237
7238  if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7239   Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7240  }
7241
7242  /* Initialize the iteration pointer.
7243  * XXX This could be done at compile time in charclass_invlists.h, but I
7244  * (khw) am not confident that the suffixes for specifying the C constant
7245  * UV_MAX are portable, e.g.  'ull' on a 32 bit machine that is configured
7246  * to use 64 bits; might need a Configure probe */
7247  invlist_iterfinish(invlist);
7248
7249  return invlist;
7250 }
7251
7252 STATIC void
7253 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7254 {
7255  /* Grow the maximum size of an inversion list */
7256
7257  PERL_ARGS_ASSERT_INVLIST_EXTEND;
7258
7259  SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7260 }
7261
7262 PERL_STATIC_INLINE void
7263 S_invlist_trim(pTHX_ SV* const invlist)
7264 {
7265  PERL_ARGS_ASSERT_INVLIST_TRIM;
7266
7267  /* Change the length of the inversion list to how many entries it currently
7268  * has */
7269
7270  SvPV_shrink_to_cur((SV *) invlist);
7271 }
7272
7273 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7274
7275 STATIC void
7276 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7277 {
7278    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7279  * the end of the inversion list.  The range must be above any existing
7280  * ones. */
7281
7282  UV* array;
7283  UV max = invlist_max(invlist);
7284  UV len = _invlist_len(invlist);
7285
7286  PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7287
7288  if (len == 0) { /* Empty lists must be initialized */
7289   array = _invlist_array_init(invlist, start == 0);
7290  }
7291  else {
7292   /* Here, the existing list is non-empty. The current max entry in the
7293   * list is generally the first value not in the set, except when the
7294   * set extends to the end of permissible values, in which case it is
7295   * the first entry in that final set, and so this call is an attempt to
7296   * append out-of-order */
7297
7298   UV final_element = len - 1;
7299   array = invlist_array(invlist);
7300   if (array[final_element] > start
7301    || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7302   {
7303    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7304      array[final_element], start,
7305      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7306   }
7307
7308   /* Here, it is a legal append.  If the new range begins with the first
7309   * value not in the set, it is extending the set, so the new first
7310   * value not in the set is one greater than the newly extended range.
7311   * */
7312   if (array[final_element] == start) {
7313    if (end != UV_MAX) {
7314     array[final_element] = end + 1;
7315    }
7316    else {
7317     /* But if the end is the maximum representable on the machine,
7318     * just let the range that this would extend to have no end */
7319     invlist_set_len(invlist, len - 1);
7320    }
7321    return;
7322   }
7323  }
7324
7325  /* Here the new range doesn't extend any existing set.  Add it */
7326
7327  len += 2; /* Includes an element each for the start and end of range */
7328
7329  /* If overflows the existing space, extend, which may cause the array to be
7330  * moved */
7331  if (max < len) {
7332   invlist_extend(invlist, len);
7333   invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7334           failure in invlist_array() */
7335   array = invlist_array(invlist);
7336  }
7337  else {
7338   invlist_set_len(invlist, len);
7339  }
7340
7341  /* The next item on the list starts the range, the one after that is
7342  * one past the new range.  */
7343  array[len - 2] = start;
7344  if (end != UV_MAX) {
7345   array[len - 1] = end + 1;
7346  }
7347  else {
7348   /* But if the end is the maximum representable on the machine, just let
7349   * the range have no end */
7350   invlist_set_len(invlist, len - 1);
7351  }
7352 }
7353
7354 #ifndef PERL_IN_XSUB_RE
7355
7356 IV
7357 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7358 {
7359  /* Searches the inversion list for the entry that contains the input code
7360  * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7361  * return value is the index into the list's array of the range that
7362  * contains <cp> */
7363
7364  IV low = 0;
7365  IV mid;
7366  IV high = _invlist_len(invlist);
7367  const IV highest_element = high - 1;
7368  const UV* array;
7369
7370  PERL_ARGS_ASSERT__INVLIST_SEARCH;
7371
7372  /* If list is empty, return failure. */
7373  if (high == 0) {
7374   return -1;
7375  }
7376
7377  /* (We can't get the array unless we know the list is non-empty) */
7378  array = invlist_array(invlist);
7379
7380  mid = invlist_previous_index(invlist);
7381  assert(mid >=0 && mid <= highest_element);
7382
7383  /* <mid> contains the cache of the result of the previous call to this
7384  * function (0 the first time).  See if this call is for the same result,
7385  * or if it is for mid-1.  This is under the theory that calls to this
7386  * function will often be for related code points that are near each other.
7387  * And benchmarks show that caching gives better results.  We also test
7388  * here if the code point is within the bounds of the list.  These tests
7389  * replace others that would have had to be made anyway to make sure that
7390  * the array bounds were not exceeded, and these give us extra information
7391  * at the same time */
7392  if (cp >= array[mid]) {
7393   if (cp >= array[highest_element]) {
7394    return highest_element;
7395   }
7396
7397   /* Here, array[mid] <= cp < array[highest_element].  This means that
7398   * the final element is not the answer, so can exclude it; it also
7399   * means that <mid> is not the final element, so can refer to 'mid + 1'
7400   * safely */
7401   if (cp < array[mid + 1]) {
7402    return mid;
7403   }
7404   high--;
7405   low = mid + 1;
7406  }
7407  else { /* cp < aray[mid] */
7408   if (cp < array[0]) { /* Fail if outside the array */
7409    return -1;
7410   }
7411   high = mid;
7412   if (cp >= array[mid - 1]) {
7413    goto found_entry;
7414   }
7415  }
7416
7417  /* Binary search.  What we are looking for is <i> such that
7418  * array[i] <= cp < array[i+1]
7419  * The loop below converges on the i+1.  Note that there may not be an
7420  * (i+1)th element in the array, and things work nonetheless */
7421  while (low < high) {
7422   mid = (low + high) / 2;
7423   assert(mid <= highest_element);
7424   if (array[mid] <= cp) { /* cp >= array[mid] */
7425    low = mid + 1;
7426
7427    /* We could do this extra test to exit the loop early.
7428    if (cp < array[low]) {
7429     return mid;
7430    }
7431    */
7432   }
7433   else { /* cp < array[mid] */
7434    high = mid;
7435   }
7436  }
7437
7438   found_entry:
7439  high--;
7440  invlist_set_previous_index(invlist, high);
7441  return high;
7442 }
7443
7444 void
7445 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7446 {
7447  /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7448  * but is used when the swash has an inversion list.  This makes this much
7449  * faster, as it uses a binary search instead of a linear one.  This is
7450  * intimately tied to that function, and perhaps should be in utf8.c,
7451  * except it is intimately tied to inversion lists as well.  It assumes
7452  * that <swatch> is all 0's on input */
7453
7454  UV current = start;
7455  const IV len = _invlist_len(invlist);
7456  IV i;
7457  const UV * array;
7458
7459  PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7460
7461  if (len == 0) { /* Empty inversion list */
7462   return;
7463  }
7464
7465  array = invlist_array(invlist);
7466
7467  /* Find which element it is */
7468  i = _invlist_search(invlist, start);
7469
7470  /* We populate from <start> to <end> */
7471  while (current < end) {
7472   UV upper;
7473
7474   /* The inversion list gives the results for every possible code point
7475   * after the first one in the list.  Only those ranges whose index is
7476   * even are ones that the inversion list matches.  For the odd ones,
7477   * and if the initial code point is not in the list, we have to skip
7478   * forward to the next element */
7479   if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7480    i++;
7481    if (i >= len) { /* Finished if beyond the end of the array */
7482     return;
7483    }
7484    current = array[i];
7485    if (current >= end) {   /* Finished if beyond the end of what we
7486          are populating */
7487     if (LIKELY(end < UV_MAX)) {
7488      return;
7489     }
7490
7491     /* We get here when the upper bound is the maximum
7492     * representable on the machine, and we are looking for just
7493     * that code point.  Have to special case it */
7494     i = len;
7495     goto join_end_of_list;
7496    }
7497   }
7498   assert(current >= start);
7499
7500   /* The current range ends one below the next one, except don't go past
7501   * <end> */
7502   i++;
7503   upper = (i < len && array[i] < end) ? array[i] : end;
7504
7505   /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7506   * for each code point in it */
7507   for (; current < upper; current++) {
7508    const STRLEN offset = (STRLEN)(current - start);
7509    swatch[offset >> 3] |= 1 << (offset & 7);
7510   }
7511
7512  join_end_of_list:
7513
7514   /* Quit if at the end of the list */
7515   if (i >= len) {
7516
7517    /* But first, have to deal with the highest possible code point on
7518    * the platform.  The previous code assumes that <end> is one
7519    * beyond where we want to populate, but that is impossible at the
7520    * platform's infinity, so have to handle it specially */
7521    if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7522    {
7523     const STRLEN offset = (STRLEN)(end - start);
7524     swatch[offset >> 3] |= 1 << (offset & 7);
7525    }
7526    return;
7527   }
7528
7529   /* Advance to the next range, which will be for code points not in the
7530   * inversion list */
7531   current = array[i];
7532  }
7533
7534  return;
7535 }
7536
7537 void
7538 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7539 {
7540  /* Take the union of two inversion lists and point <output> to it.  *output
7541  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7542  * the reference count to that list will be decremented.  The first list,
7543  * <a>, may be NULL, in which case a copy of the second list is returned.
7544  * If <complement_b> is TRUE, the union is taken of the complement
7545  * (inversion) of <b> instead of b itself.
7546  *
7547  * The basis for this comes from "Unicode Demystified" Chapter 13 by
7548  * Richard Gillam, published by Addison-Wesley, and explained at some
7549  * length there.  The preface says to incorporate its examples into your
7550  * code at your own risk.
7551  *
7552  * The algorithm is like a merge sort.
7553  *
7554  * XXX A potential performance improvement is to keep track as we go along
7555  * if only one of the inputs contributes to the result, meaning the other
7556  * is a subset of that one.  In that case, we can skip the final copy and
7557  * return the larger of the input lists, but then outside code might need
7558  * to keep track of whether to free the input list or not */
7559
7560  UV* array_a;    /* a's array */
7561  UV* array_b;
7562  UV len_a;     /* length of a's array */
7563  UV len_b;
7564
7565  SV* u;   /* the resulting union */
7566  UV* array_u;
7567  UV len_u;
7568
7569  UV i_a = 0;      /* current index into a's array */
7570  UV i_b = 0;
7571  UV i_u = 0;
7572
7573  /* running count, as explained in the algorithm source book; items are
7574  * stopped accumulating and are output when the count changes to/from 0.
7575  * The count is incremented when we start a range that's in the set, and
7576  * decremented when we start a range that's not in the set.  So its range
7577  * is 0 to 2.  Only when the count is zero is something not in the set.
7578  */
7579  UV count = 0;
7580
7581  PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7582  assert(a != b);
7583
7584  /* If either one is empty, the union is the other one */
7585  if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7586   if (*output == a) {
7587    if (a != NULL) {
7588     SvREFCNT_dec_NN(a);
7589    }
7590   }
7591   if (*output != b) {
7592    *output = invlist_clone(b);
7593    if (complement_b) {
7594     _invlist_invert(*output);
7595    }
7596   } /* else *output already = b; */
7597   return;
7598  }
7599  else if ((len_b = _invlist_len(b)) == 0) {
7600   if (*output == b) {
7601    SvREFCNT_dec_NN(b);
7602   }
7603
7604   /* The complement of an empty list is a list that has everything in it,
7605   * so the union with <a> includes everything too */
7606   if (complement_b) {
7607    if (a == *output) {
7608     SvREFCNT_dec_NN(a);
7609    }
7610    *output = _new_invlist(1);
7611    _append_range_to_invlist(*output, 0, UV_MAX);
7612   }
7613   else if (*output != a) {
7614    *output = invlist_clone(a);
7615   }
7616   /* else *output already = a; */
7617   return;
7618  }
7619
7620  /* Here both lists exist and are non-empty */
7621  array_a = invlist_array(a);
7622  array_b = invlist_array(b);
7623
7624  /* If are to take the union of 'a' with the complement of b, set it
7625  * up so are looking at b's complement. */
7626  if (complement_b) {
7627
7628   /* To complement, we invert: if the first element is 0, remove it.  To
7629   * do this, we just pretend the array starts one later, and clear the
7630   * flag as we don't have to do anything else later */
7631   if (array_b[0] == 0) {
7632    array_b++;
7633    len_b--;
7634    complement_b = FALSE;
7635   }
7636   else {
7637
7638    /* But if the first element is not zero, we unshift a 0 before the
7639    * array.  The data structure reserves a space for that 0 (which
7640    * should be a '1' right now), so physical shifting is unneeded,
7641    * but temporarily change that element to 0.  Before exiting the
7642    * routine, we must restore the element to '1' */
7643    array_b--;
7644    len_b++;
7645    array_b[0] = 0;
7646   }
7647  }
7648
7649  /* Size the union for the worst case: that the sets are completely
7650  * disjoint */
7651  u = _new_invlist(len_a + len_b);
7652
7653  /* Will contain U+0000 if either component does */
7654  array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7655          || (len_b > 0 && array_b[0] == 0));
7656
7657  /* Go through each list item by item, stopping when exhausted one of
7658  * them */
7659  while (i_a < len_a && i_b < len_b) {
7660   UV cp;     /* The element to potentially add to the union's array */
7661   bool cp_in_set;   /* is it in the the input list's set or not */
7662
7663   /* We need to take one or the other of the two inputs for the union.
7664   * Since we are merging two sorted lists, we take the smaller of the
7665   * next items.  In case of a tie, we take the one that is in its set
7666   * first.  If we took one not in the set first, it would decrement the
7667   * count, possibly to 0 which would cause it to be output as ending the
7668   * range, and the next time through we would take the same number, and
7669   * output it again as beginning the next range.  By doing it the
7670   * opposite way, there is no possibility that the count will be
7671   * momentarily decremented to 0, and thus the two adjoining ranges will
7672   * be seamlessly merged.  (In a tie and both are in the set or both not
7673   * in the set, it doesn't matter which we take first.) */
7674   if (array_a[i_a] < array_b[i_b]
7675    || (array_a[i_a] == array_b[i_b]
7676     && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7677   {
7678    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7679    cp= array_a[i_a++];
7680   }
7681   else {
7682    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7683    cp = array_b[i_b++];
7684   }
7685
7686   /* Here, have chosen which of the two inputs to look at.  Only output
7687   * if the running count changes to/from 0, which marks the
7688   * beginning/end of a range in that's in the set */
7689   if (cp_in_set) {
7690    if (count == 0) {
7691     array_u[i_u++] = cp;
7692    }
7693    count++;
7694   }
7695   else {
7696    count--;
7697    if (count == 0) {
7698     array_u[i_u++] = cp;
7699    }
7700   }
7701  }
7702
7703  /* Here, we are finished going through at least one of the lists, which
7704  * means there is something remaining in at most one.  We check if the list
7705  * that hasn't been exhausted is positioned such that we are in the middle
7706  * of a range in its set or not.  (i_a and i_b point to the element beyond
7707  * the one we care about.) If in the set, we decrement 'count'; if 0, there
7708  * is potentially more to output.
7709  * There are four cases:
7710  * 1) Both weren't in their sets, count is 0, and remains 0.  What's left
7711  *    in the union is entirely from the non-exhausted set.
7712  * 2) Both were in their sets, count is 2.  Nothing further should
7713  *    be output, as everything that remains will be in the exhausted
7714  *    list's set, hence in the union; decrementing to 1 but not 0 insures
7715  *    that
7716  * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7717  *    Nothing further should be output because the union includes
7718  *    everything from the exhausted set.  Not decrementing ensures that.
7719  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7720  *    decrementing to 0 insures that we look at the remainder of the
7721  *    non-exhausted set */
7722  if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7723   || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7724  {
7725   count--;
7726  }
7727
7728  /* The final length is what we've output so far, plus what else is about to
7729  * be output.  (If 'count' is non-zero, then the input list we exhausted
7730  * has everything remaining up to the machine's limit in its set, and hence
7731  * in the union, so there will be no further output. */
7732  len_u = i_u;
7733  if (count == 0) {
7734   /* At most one of the subexpressions will be non-zero */
7735   len_u += (len_a - i_a) + (len_b - i_b);
7736  }
7737
7738  /* Set result to final length, which can change the pointer to array_u, so
7739  * re-find it */
7740  if (len_u != _invlist_len(u)) {
7741   invlist_set_len(u, len_u);
7742   invlist_trim(u);
7743   array_u = invlist_array(u);
7744  }
7745
7746  /* When 'count' is 0, the list that was exhausted (if one was shorter than
7747  * the other) ended with everything above it not in its set.  That means
7748  * that the remaining part of the union is precisely the same as the
7749  * non-exhausted list, so can just copy it unchanged.  (If both list were
7750  * exhausted at the same time, then the operations below will be both 0.)
7751  */
7752  if (count == 0) {
7753   IV copy_count; /* At most one will have a non-zero copy count */
7754   if ((copy_count = len_a - i_a) > 0) {
7755    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7756   }
7757   else if ((copy_count = len_b - i_b) > 0) {
7758    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7759   }
7760  }
7761
7762  /* If we've changed b, restore it */
7763  if (complement_b) {
7764   array_b[0] = 1;
7765  }
7766
7767  /*  We may be removing a reference to one of the inputs */
7768  if (a == *output || b == *output) {
7769   assert(! invlist_is_iterating(*output));
7770   SvREFCNT_dec_NN(*output);
7771  }
7772
7773  *output = u;
7774  return;
7775 }
7776
7777 void
7778 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7779 {
7780  /* Take the intersection of two inversion lists and point <i> to it.  *i
7781  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7782  * the reference count to that list will be decremented.
7783  * If <complement_b> is TRUE, the result will be the intersection of <a>
7784  * and the complement (or inversion) of <b> instead of <b> directly.
7785  *
7786  * The basis for this comes from "Unicode Demystified" Chapter 13 by
7787  * Richard Gillam, published by Addison-Wesley, and explained at some
7788  * length there.  The preface says to incorporate its examples into your
7789  * code at your own risk.  In fact, it had bugs
7790  *
7791  * The algorithm is like a merge sort, and is essentially the same as the
7792  * union above
7793  */
7794
7795  UV* array_a;  /* a's array */
7796  UV* array_b;
7797  UV len_a; /* length of a's array */
7798  UV len_b;
7799
7800  SV* r;       /* the resulting intersection */
7801  UV* array_r;
7802  UV len_r;
7803
7804  UV i_a = 0;      /* current index into a's array */
7805  UV i_b = 0;
7806  UV i_r = 0;
7807
7808  /* running count, as explained in the algorithm source book; items are
7809  * stopped accumulating and are output when the count changes to/from 2.
7810  * The count is incremented when we start a range that's in the set, and
7811  * decremented when we start a range that's not in the set.  So its range
7812  * is 0 to 2.  Only when the count is 2 is something in the intersection.
7813  */
7814  UV count = 0;
7815
7816  PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7817  assert(a != b);
7818
7819  /* Special case if either one is empty */
7820  len_a = _invlist_len(a);
7821  if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7822
7823   if (len_a != 0 && complement_b) {
7824
7825    /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7826    * be empty.  Here, also we are using 'b's complement, which hence
7827    * must be every possible code point.  Thus the intersection is
7828    * simply 'a'. */
7829    if (*i != a) {
7830     *i = invlist_clone(a);
7831
7832     if (*i == b) {
7833      SvREFCNT_dec_NN(b);
7834     }
7835    }
7836    /* else *i is already 'a' */
7837    return;
7838   }
7839
7840   /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7841   * intersection must be empty */
7842   if (*i == a) {
7843    SvREFCNT_dec_NN(a);
7844   }
7845   else if (*i == b) {
7846    SvREFCNT_dec_NN(b);
7847   }
7848   *i = _new_invlist(0);
7849   return;
7850  }
7851
7852  /* Here both lists exist and are non-empty */
7853  array_a = invlist_array(a);
7854  array_b = invlist_array(b);
7855
7856  /* If are to take the intersection of 'a' with the complement of b, set it
7857  * up so are looking at b's complement. */
7858  if (complement_b) {
7859
7860   /* To complement, we invert: if the first element is 0, remove it.  To
7861   * do this, we just pretend the array starts one later, and clear the
7862   * flag as we don't have to do anything else later */
7863   if (array_b[0] == 0) {
7864    array_b++;
7865    len_b--;
7866    complement_b = FALSE;
7867   }
7868   else {
7869
7870    /* But if the first element is not zero, we unshift a 0 before the
7871    * array.  The data structure reserves a space for that 0 (which
7872    * should be a '1' right now), so physical shifting is unneeded,
7873    * but temporarily change that element to 0.  Before exiting the
7874    * routine, we must restore the element to '1' */
7875    array_b--;
7876    len_b++;
7877    array_b[0] = 0;
7878   }
7879  }
7880
7881  /* Size the intersection for the worst case: that the intersection ends up
7882  * fragmenting everything to be completely disjoint */
7883  r= _new_invlist(len_a + len_b);
7884
7885  /* Will contain U+0000 iff both components do */
7886  array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7887          && len_b > 0 && array_b[0] == 0);
7888
7889  /* Go through each list item by item, stopping when exhausted one of
7890  * them */
7891  while (i_a < len_a && i_b < len_b) {
7892   UV cp;     /* The element to potentially add to the intersection's
7893      array */
7894   bool cp_in_set; /* Is it in the input list's set or not */
7895
7896   /* We need to take one or the other of the two inputs for the
7897   * intersection.  Since we are merging two sorted lists, we take the
7898   * smaller of the next items.  In case of a tie, we take the one that
7899   * is not in its set first (a difference from the union algorithm).  If
7900   * we took one in the set first, it would increment the count, possibly
7901   * to 2 which would cause it to be output as starting a range in the
7902   * intersection, and the next time through we would take that same
7903   * number, and output it again as ending the set.  By doing it the
7904   * opposite of this, there is no possibility that the count will be
7905   * momentarily incremented to 2.  (In a tie and both are in the set or
7906   * both not in the set, it doesn't matter which we take first.) */
7907   if (array_a[i_a] < array_b[i_b]
7908    || (array_a[i_a] == array_b[i_b]
7909     && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7910   {
7911    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7912    cp= array_a[i_a++];
7913   }
7914   else {
7915    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7916    cp= array_b[i_b++];
7917   }
7918
7919   /* Here, have chosen which of the two inputs to look at.  Only output
7920   * if the running count changes to/from 2, which marks the
7921   * beginning/end of a range that's in the intersection */
7922   if (cp_in_set) {
7923    count++;
7924    if (count == 2) {
7925     array_r[i_r++] = cp;
7926    }
7927   }
7928   else {
7929    if (count == 2) {
7930     array_r[i_r++] = cp;
7931    }
7932    count--;
7933   }
7934  }
7935
7936  /* Here, we are finished going through at least one of the lists, which
7937  * means there is something remaining in at most one.  We check if the list
7938  * that has been exhausted is positioned such that we are in the middle
7939  * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7940  * the ones we care about.)  There are four cases:
7941  * 1) Both weren't in their sets, count is 0, and remains 0.  There's
7942  *    nothing left in the intersection.
7943  * 2) Both were in their sets, count is 2 and perhaps is incremented to
7944  *    above 2.  What should be output is exactly that which is in the
7945  *    non-exhausted set, as everything it has is also in the intersection
7946  *    set, and everything it doesn't have can't be in the intersection
7947  * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7948  *    gets incremented to 2.  Like the previous case, the intersection is
7949  *    everything that remains in the non-exhausted set.
7950  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7951  *    remains 1.  And the intersection has nothing more. */
7952  if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7953   || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7954  {
7955   count++;
7956  }
7957
7958  /* The final length is what we've output so far plus what else is in the
7959  * intersection.  At most one of the subexpressions below will be non-zero */
7960  len_r = i_r;
7961  if (count >= 2) {
7962   len_r += (len_a - i_a) + (len_b - i_b);
7963  }
7964
7965  /* Set result to final length, which can change the pointer to array_r, so
7966  * re-find it */
7967  if (len_r != _invlist_len(r)) {
7968   invlist_set_len(r, len_r);
7969   invlist_trim(r);
7970   array_r = invlist_array(r);
7971  }
7972
7973  /* Finish outputting any remaining */
7974  if (count >= 2) { /* At most one will have a non-zero copy count */
7975   IV copy_count;
7976   if ((copy_count = len_a - i_a) > 0) {
7977    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7978   }
7979   else if ((copy_count = len_b - i_b) > 0) {
7980    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7981   }
7982  }
7983
7984  /* If we've changed b, restore it */
7985  if (complement_b) {
7986   array_b[0] = 1;
7987  }
7988
7989  /*  We may be removing a reference to one of the inputs */
7990  if (a == *i || b == *i) {
7991   assert(! invlist_is_iterating(*i));
7992   SvREFCNT_dec_NN(*i);
7993  }
7994
7995  *i = r;
7996  return;
7997 }
7998
7999 SV*
8000 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8001 {
8002  /* Add the range from 'start' to 'end' inclusive to the inversion list's
8003  * set.  A pointer to the inversion list is returned.  This may actually be
8004  * a new list, in which case the passed in one has been destroyed.  The
8005  * passed in inversion list can be NULL, in which case a new one is created
8006  * with just the one range in it */
8007
8008  SV* range_invlist;
8009  UV len;
8010
8011  if (invlist == NULL) {
8012   invlist = _new_invlist(2);
8013   len = 0;
8014  }
8015  else {
8016   len = _invlist_len(invlist);
8017  }
8018
8019  /* If comes after the final entry actually in the list, can just append it
8020  * to the end, */
8021  if (len == 0
8022   || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8023    && start >= invlist_array(invlist)[len - 1]))
8024  {
8025   _append_range_to_invlist(invlist, start, end);
8026   return invlist;
8027  }
8028
8029  /* Here, can't just append things, create and return a new inversion list
8030  * which is the union of this range and the existing inversion list */
8031  range_invlist = _new_invlist(2);
8032  _append_range_to_invlist(range_invlist, start, end);
8033
8034  _invlist_union(invlist, range_invlist, &invlist);
8035
8036  /* The temporary can be freed */
8037  SvREFCNT_dec_NN(range_invlist);
8038
8039  return invlist;
8040 }
8041
8042 #endif
8043
8044 PERL_STATIC_INLINE SV*
8045 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8046  return _add_range_to_invlist(invlist, cp, cp);
8047 }
8048
8049 #ifndef PERL_IN_XSUB_RE
8050 void
8051 Perl__invlist_invert(pTHX_ SV* const invlist)
8052 {
8053  /* Complement the input inversion list.  This adds a 0 if the list didn't
8054  * have a zero; removes it otherwise.  As described above, the data
8055  * structure is set up so that this is very efficient */
8056
8057  UV* len_pos = _get_invlist_len_addr(invlist);
8058
8059  PERL_ARGS_ASSERT__INVLIST_INVERT;
8060
8061  assert(! invlist_is_iterating(invlist));
8062
8063  /* The inverse of matching nothing is matching everything */
8064  if (*len_pos == 0) {
8065   _append_range_to_invlist(invlist, 0, UV_MAX);
8066   return;
8067  }
8068
8069  /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
8070  * zero element was a 0, so it is being removed, so the length decrements
8071  * by 1; and vice-versa.  SvCUR is unaffected */
8072  if (*get_invlist_zero_addr(invlist) ^= 1) {
8073   (*len_pos)--;
8074  }
8075  else {
8076   (*len_pos)++;
8077  }
8078 }
8079
8080 void
8081 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8082 {
8083  /* Complement the input inversion list (which must be a Unicode property,
8084  * all of which don't match above the Unicode maximum code point.)  And
8085  * Perl has chosen to not have the inversion match above that either.  This
8086  * adds a 0x110000 if the list didn't end with it, and removes it if it did
8087  */
8088
8089  UV len;
8090  UV* array;
8091
8092  PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8093
8094  _invlist_invert(invlist);
8095
8096  len = _invlist_len(invlist);
8097
8098  if (len != 0) { /* If empty do nothing */
8099   array = invlist_array(invlist);
8100   if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8101    /* Add 0x110000.  First, grow if necessary */
8102    len++;
8103    if (invlist_max(invlist) < len) {
8104     invlist_extend(invlist, len);
8105     array = invlist_array(invlist);
8106    }
8107    invlist_set_len(invlist, len);
8108    array[len - 1] = PERL_UNICODE_MAX + 1;
8109   }
8110   else {  /* Remove the 0x110000 */
8111    invlist_set_len(invlist, len - 1);
8112   }
8113  }
8114
8115  return;
8116 }
8117 #endif
8118
8119 PERL_STATIC_INLINE SV*
8120 S_invlist_clone(pTHX_ SV* const invlist)
8121 {
8122
8123  /* Return a new inversion list that is a copy of the input one, which is
8124  * unchanged */
8125
8126  /* Need to allocate extra space to accommodate Perl's addition of a
8127  * trailing NUL to SvPV's, since it thinks they are always strings */
8128  SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8129  STRLEN length = SvCUR(invlist);
8130
8131  PERL_ARGS_ASSERT_INVLIST_CLONE;
8132
8133  SvCUR_set(new_invlist, length); /* This isn't done automatically */
8134  Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8135
8136  return new_invlist;
8137 }
8138
8139 PERL_STATIC_INLINE UV*
8140 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8141 {
8142  /* Return the address of the UV that contains the current iteration
8143  * position */
8144
8145  PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8146
8147  return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8148 }
8149
8150 PERL_STATIC_INLINE UV*
8151 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8152 {
8153  /* Return the address of the UV that contains the version id. */
8154
8155  PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8156
8157  return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8158 }
8159
8160 PERL_STATIC_INLINE void
8161 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8162 {
8163  PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8164
8165  *get_invlist_iter_addr(invlist) = 0;
8166 }
8167
8168 PERL_STATIC_INLINE void
8169 S_invlist_iterfinish(pTHX_ SV* invlist)
8170 {
8171  /* Terminate iterator for invlist.  This is to catch development errors.
8172  * Any iteration that is interrupted before completed should call this
8173  * function.  Functions that add code points anywhere else but to the end
8174  * of an inversion list assert that they are not in the middle of an
8175  * iteration.  If they were, the addition would make the iteration
8176  * problematical: if the iteration hadn't reached the place where things
8177  * were being added, it would be ok */
8178
8179  PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8180
8181  *get_invlist_iter_addr(invlist) = UV_MAX;
8182 }
8183
8184 STATIC bool
8185 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8186 {
8187  /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8188  * This call sets in <*start> and <*end>, the next range in <invlist>.
8189  * Returns <TRUE> if successful and the next call will return the next
8190  * range; <FALSE> if was already at the end of the list.  If the latter,
8191  * <*start> and <*end> are unchanged, and the next call to this function
8192  * will start over at the beginning of the list */
8193
8194  UV* pos = get_invlist_iter_addr(invlist);
8195  UV len = _invlist_len(invlist);
8196  UV *array;
8197
8198  PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8199
8200  if (*pos >= len) {
8201   *pos = UV_MAX; /* Force iterinit() to be required next time */
8202   return FALSE;
8203  }
8204
8205  array = invlist_array(invlist);
8206
8207  *start = array[(*pos)++];
8208
8209  if (*pos >= len) {
8210   *end = UV_MAX;
8211  }
8212  else {
8213   *end = array[(*pos)++] - 1;
8214  }
8215
8216  return TRUE;
8217 }
8218
8219 PERL_STATIC_INLINE bool
8220 S_invlist_is_iterating(pTHX_ SV* const invlist)
8221 {
8222  PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8223
8224  return *(get_invlist_iter_addr(invlist)) < UV_MAX;
8225 }
8226
8227 PERL_STATIC_INLINE UV
8228 S_invlist_highest(pTHX_ SV* const invlist)
8229 {
8230  /* Returns the highest code point that matches an inversion list.  This API
8231  * has an ambiguity, as it returns 0 under either the highest is actually
8232  * 0, or if the list is empty.  If this distinction matters to you, check
8233  * for emptiness before calling this function */
8234
8235  UV len = _invlist_len(invlist);
8236  UV *array;
8237
8238  PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8239
8240  if (len == 0) {
8241   return 0;
8242  }
8243
8244  array = invlist_array(invlist);
8245
8246  /* The last element in the array in the inversion list always starts a
8247  * range that goes to infinity.  That range may be for code points that are
8248  * matched in the inversion list, or it may be for ones that aren't
8249  * matched.  In the latter case, the highest code point in the set is one
8250  * less than the beginning of this range; otherwise it is the final element
8251  * of this range: infinity */
8252  return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8253   ? UV_MAX
8254   : array[len - 1] - 1;
8255 }
8256
8257 #ifndef PERL_IN_XSUB_RE
8258 SV *
8259 Perl__invlist_contents(pTHX_ SV* const invlist)
8260 {
8261  /* Get the contents of an inversion list into a string SV so that they can
8262  * be printed out.  It uses the format traditionally done for debug tracing
8263  */
8264
8265  UV start, end;
8266  SV* output = newSVpvs("\n");
8267
8268  PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8269
8270  assert(! invlist_is_iterating(invlist));
8271
8272  invlist_iterinit(invlist);
8273  while (invlist_iternext(invlist, &start, &end)) {
8274   if (end == UV_MAX) {
8275    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8276   }
8277   else if (end != start) {
8278    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8279      start,       end);
8280   }
8281   else {
8282    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8283   }
8284  }
8285
8286  return output;
8287 }
8288 #endif
8289
8290 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8291 void
8292 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8293 {
8294  /* Dumps out the ranges in an inversion list.  The string 'header'
8295  * if present is output on a line before the first range */
8296
8297  UV start, end;
8298
8299  PERL_ARGS_ASSERT__INVLIST_DUMP;
8300
8301  if (header && strlen(header)) {
8302   PerlIO_printf(Perl_debug_log, "%s\n", header);
8303  }
8304  if (invlist_is_iterating(invlist)) {
8305   PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
8306   return;
8307  }
8308
8309  invlist_iterinit(invlist);
8310  while (invlist_iternext(invlist, &start, &end)) {
8311   if (end == UV_MAX) {
8312    PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8313   }
8314   else if (end != start) {
8315    PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8316             start,         end);
8317   }
8318   else {
8319    PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8320   }
8321  }
8322 }
8323 #endif
8324
8325 #if 0
8326 bool
8327 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8328 {
8329  /* Return a boolean as to if the two passed in inversion lists are
8330  * identical.  The final argument, if TRUE, says to take the complement of
8331  * the second inversion list before doing the comparison */
8332
8333  UV* array_a = invlist_array(a);
8334  UV* array_b = invlist_array(b);
8335  UV len_a = _invlist_len(a);
8336  UV len_b = _invlist_len(b);
8337
8338  UV i = 0;      /* current index into the arrays */
8339  bool retval = TRUE;     /* Assume are identical until proven otherwise */
8340
8341  PERL_ARGS_ASSERT__INVLISTEQ;
8342
8343  /* If are to compare 'a' with the complement of b, set it
8344  * up so are looking at b's complement. */
8345  if (complement_b) {
8346
8347   /* The complement of nothing is everything, so <a> would have to have
8348   * just one element, starting at zero (ending at infinity) */
8349   if (len_b == 0) {
8350    return (len_a == 1 && array_a[0] == 0);
8351   }
8352   else if (array_b[0] == 0) {
8353
8354    /* Otherwise, to complement, we invert.  Here, the first element is
8355    * 0, just remove it.  To do this, we just pretend the array starts
8356    * one later, and clear the flag as we don't have to do anything
8357    * else later */
8358
8359    array_b++;
8360    len_b--;
8361    complement_b = FALSE;
8362   }
8363   else {
8364
8365    /* But if the first element is not zero, we unshift a 0 before the
8366    * array.  The data structure reserves a space for that 0 (which
8367    * should be a '1' right now), so physical shifting is unneeded,
8368    * but temporarily change that element to 0.  Before exiting the
8369    * routine, we must restore the element to '1' */
8370    array_b--;
8371    len_b++;
8372    array_b[0] = 0;
8373   }
8374  }
8375
8376  /* Make sure that the lengths are the same, as well as the final element
8377  * before looping through the remainder.  (Thus we test the length, final,
8378  * and first elements right off the bat) */
8379  if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8380   retval = FALSE;
8381  }
8382  else for (i = 0; i < len_a - 1; i++) {
8383   if (array_a[i] != array_b[i]) {
8384    retval = FALSE;
8385    break;
8386   }
8387  }
8388
8389  if (complement_b) {
8390   array_b[0] = 1;
8391  }
8392  return retval;
8393 }
8394 #endif
8395
8396 #undef HEADER_LENGTH
8397 #undef INVLIST_INITIAL_LENGTH
8398 #undef TO_INTERNAL_SIZE
8399 #undef FROM_INTERNAL_SIZE
8400 #undef INVLIST_LEN_OFFSET
8401 #undef INVLIST_ZERO_OFFSET
8402 #undef INVLIST_ITER_OFFSET
8403 #undef INVLIST_VERSION_ID
8404 #undef INVLIST_PREVIOUS_INDEX_OFFSET
8405
8406 /* End of inversion list object */
8407
8408 STATIC void
8409 S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
8410 {
8411  /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8412  * constructs, and updates RExC_flags with them.  On input, RExC_parse
8413  * should point to the first flag; it is updated on output to point to the
8414  * final ')' or ':'.  There needs to be at least one flag, or this will
8415  * abort */
8416
8417  /* for (?g), (?gc), and (?o) warnings; warning
8418  about (?c) will warn about (?g) -- japhy    */
8419
8420 #define WASTED_O  0x01
8421 #define WASTED_G  0x02
8422 #define WASTED_C  0x04
8423 #define WASTED_GC (0x02|0x04)
8424  I32 wastedflags = 0x00;
8425  U32 posflags = 0, negflags = 0;
8426  U32 *flagsp = &posflags;
8427  char has_charset_modifier = '\0';
8428  regex_charset cs;
8429  bool has_use_defaults = FALSE;
8430  const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8431
8432  PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8433
8434  /* '^' as an initial flag sets certain defaults */
8435  if (UCHARAT(RExC_parse) == '^') {
8436   RExC_parse++;
8437   has_use_defaults = TRUE;
8438   STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8439   set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8440           ? REGEX_UNICODE_CHARSET
8441           : REGEX_DEPENDS_CHARSET);
8442  }
8443
8444  cs = get_regex_charset(RExC_flags);
8445  if (cs == REGEX_DEPENDS_CHARSET
8446   && (RExC_utf8 || RExC_uni_semantics))
8447  {
8448   cs = REGEX_UNICODE_CHARSET;
8449  }
8450
8451  while (*RExC_parse) {
8452   /* && strchr("iogcmsx", *RExC_parse) */
8453   /* (?g), (?gc) and (?o) are useless here
8454   and must be globally applied -- japhy */
8455   switch (*RExC_parse) {
8456
8457    /* Code for the imsx flags */
8458    CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8459
8460    case LOCALE_PAT_MOD:
8461     if (has_charset_modifier) {
8462      goto excess_modifier;
8463     }
8464     else if (flagsp == &negflags) {
8465      goto neg_modifier;
8466     }
8467     cs = REGEX_LOCALE_CHARSET;
8468     has_charset_modifier = LOCALE_PAT_MOD;
8469     RExC_contains_locale = 1;
8470     break;
8471    case UNICODE_PAT_MOD:
8472     if (has_charset_modifier) {
8473      goto excess_modifier;
8474     }
8475     else if (flagsp == &negflags) {
8476      goto neg_modifier;
8477     }
8478     cs = REGEX_UNICODE_CHARSET;
8479     has_charset_modifier = UNICODE_PAT_MOD;
8480     break;
8481    case ASCII_RESTRICT_PAT_MOD:
8482     if (flagsp == &negflags) {
8483      goto neg_modifier;
8484     }
8485     if (has_charset_modifier) {
8486      if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8487       goto excess_modifier;
8488      }
8489      /* Doubled modifier implies more restricted */
8490      cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8491     }
8492     else {
8493      cs = REGEX_ASCII_RESTRICTED_CHARSET;
8494     }
8495     has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8496     break;
8497    case DEPENDS_PAT_MOD:
8498     if (has_use_defaults) {
8499      goto fail_modifiers;
8500     }
8501     else if (flagsp == &negflags) {
8502      goto neg_modifier;
8503     }
8504     else if (has_charset_modifier) {
8505      goto excess_modifier;
8506     }
8507
8508     /* The dual charset means unicode semantics if the
8509     * pattern (or target, not known until runtime) are
8510     * utf8, or something in the pattern indicates unicode
8511     * semantics */
8512     cs = (RExC_utf8 || RExC_uni_semantics)
8513      ? REGEX_UNICODE_CHARSET
8514      : REGEX_DEPENDS_CHARSET;
8515     has_charset_modifier = DEPENDS_PAT_MOD;
8516     break;
8517    excess_modifier:
8518     RExC_parse++;
8519     if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8520      vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8521     }
8522     else if (has_charset_modifier == *(RExC_parse - 1)) {
8523      vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8524     }
8525     else {
8526      vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8527     }
8528     /*NOTREACHED*/
8529    neg_modifier:
8530     RExC_parse++;
8531     vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8532     /*NOTREACHED*/
8533    case ONCE_PAT_MOD: /* 'o' */
8534    case GLOBAL_PAT_MOD: /* 'g' */
8535     if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8536      const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8537      if (! (wastedflags & wflagbit) ) {
8538       wastedflags |= wflagbit;
8539       vWARN5(
8540        RExC_parse + 1,
8541        "Useless (%s%c) - %suse /%c modifier",
8542        flagsp == &negflags ? "?-" : "?",
8543        *RExC_parse,
8544        flagsp == &negflags ? "don't " : "",
8545        *RExC_parse
8546       );
8547      }
8548     }
8549     break;
8550
8551    case CONTINUE_PAT_MOD: /* 'c' */
8552     if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8553      if (! (wastedflags & WASTED_C) ) {
8554       wastedflags |= WASTED_GC;
8555       vWARN3(
8556        RExC_parse + 1,
8557        "Useless (%sc) - %suse /gc modifier",
8558        flagsp == &negflags ? "?-" : "?",
8559        flagsp == &negflags ? "don't " : ""
8560       );
8561      }
8562     }
8563     break;
8564    case KEEPCOPY_PAT_MOD: /* 'p' */
8565     if (flagsp == &negflags) {
8566      if (SIZE_ONLY)
8567       ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8568     } else {
8569      *flagsp |= RXf_PMf_KEEPCOPY;
8570     }
8571     break;
8572    case '-':
8573     /* A flag is a default iff it is following a minus, so
8574     * if there is a minus, it means will be trying to
8575     * re-specify a default which is an error */
8576     if (has_use_defaults || flagsp == &negflags) {
8577      goto fail_modifiers;
8578     }
8579     flagsp = &negflags;
8580     wastedflags = 0;  /* reset so (?g-c) warns twice */
8581     break;
8582    case ':':
8583    case ')':
8584     RExC_flags |= posflags;
8585     RExC_flags &= ~negflags;
8586     set_regex_charset(&RExC_flags, cs);
8587     return;
8588     /*NOTREACHED*/
8589    default:
8590    fail_modifiers:
8591     RExC_parse++;
8592     vFAIL3("Sequence (%.*s...) not recognized",
8593      RExC_parse-seqstart, seqstart);
8594     /*NOTREACHED*/
8595   }
8596
8597   ++RExC_parse;
8598  }
8599 }
8600
8601 /*
8602  - reg - regular expression, i.e. main body or parenthesized thing
8603  *
8604  * Caller must absorb opening parenthesis.
8605  *
8606  * Combining parenthesis handling with the base level of regular expression
8607  * is a trifle forced, but the need to tie the tails of the branches to what
8608  * follows makes it hard to avoid.
8609  */
8610 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8611 #ifdef DEBUGGING
8612 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8613 #else
8614 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8615 #endif
8616
8617 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
8618    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
8619    needs to be restarted.
8620    Otherwise would only return NULL if regbranch() returns NULL, which
8621    cannot happen.  */
8622 STATIC regnode *
8623 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8624  /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
8625  * 2 is like 1, but indicates that nextchar() has been called to advance
8626  * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
8627  * this flag alerts us to the need to check for that */
8628 {
8629  dVAR;
8630  regnode *ret;  /* Will be the head of the group. */
8631  regnode *br;
8632  regnode *lastbr;
8633  regnode *ender = NULL;
8634  I32 parno = 0;
8635  I32 flags;
8636  U32 oregflags = RExC_flags;
8637  bool have_branch = 0;
8638  bool is_open = 0;
8639  I32 freeze_paren = 0;
8640  I32 after_freeze = 0;
8641
8642  char * parse_start = RExC_parse; /* MJD */
8643  char * const oregcomp_parse = RExC_parse;
8644
8645  GET_RE_DEBUG_FLAGS_DECL;
8646
8647  PERL_ARGS_ASSERT_REG;
8648  DEBUG_PARSE("reg ");
8649
8650  *flagp = 0;    /* Tentatively. */
8651
8652
8653  /* Make an OPEN node, if parenthesized. */
8654  if (paren) {
8655
8656   /* Under /x, space and comments can be gobbled up between the '(' and
8657   * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
8658   * intervening space, as the sequence is a token, and a token should be
8659   * indivisible */
8660   bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
8661
8662   if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8663    char *start_verb = RExC_parse;
8664    STRLEN verb_len = 0;
8665    char *start_arg = NULL;
8666    unsigned char op = 0;
8667    int argok = 1;
8668    int internal_argval = 0; /* internal_argval is only useful if !argok */
8669
8670    if (has_intervening_patws && SIZE_ONLY) {
8671     ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
8672    }
8673    while ( *RExC_parse && *RExC_parse != ')' ) {
8674     if ( *RExC_parse == ':' ) {
8675      start_arg = RExC_parse + 1;
8676      break;
8677     }
8678     RExC_parse++;
8679    }
8680    ++start_verb;
8681    verb_len = RExC_parse - start_verb;
8682    if ( start_arg ) {
8683     RExC_parse++;
8684     while ( *RExC_parse && *RExC_parse != ')' )
8685      RExC_parse++;
8686     if ( *RExC_parse != ')' )
8687      vFAIL("Unterminated verb pattern argument");
8688     if ( RExC_parse == start_arg )
8689      start_arg = NULL;
8690    } else {
8691     if ( *RExC_parse != ')' )
8692      vFAIL("Unterminated verb pattern");
8693    }
8694
8695    switch ( *start_verb ) {
8696    case 'A':  /* (*ACCEPT) */
8697     if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8698      op = ACCEPT;
8699      internal_argval = RExC_nestroot;
8700     }
8701     break;
8702    case 'C':  /* (*COMMIT) */
8703     if ( memEQs(start_verb,verb_len,"COMMIT") )
8704      op = COMMIT;
8705     break;
8706    case 'F':  /* (*FAIL) */
8707     if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8708      op = OPFAIL;
8709      argok = 0;
8710     }
8711     break;
8712    case ':':  /* (*:NAME) */
8713    case 'M':  /* (*MARK:NAME) */
8714     if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8715      op = MARKPOINT;
8716      argok = -1;
8717     }
8718     break;
8719    case 'P':  /* (*PRUNE) */
8720     if ( memEQs(start_verb,verb_len,"PRUNE") )
8721      op = PRUNE;
8722     break;
8723    case 'S':   /* (*SKIP) */
8724     if ( memEQs(start_verb,verb_len,"SKIP") )
8725      op = SKIP;
8726     break;
8727    case 'T':  /* (*THEN) */
8728     /* [19:06] <TimToady> :: is then */
8729     if ( memEQs(start_verb,verb_len,"THEN") ) {
8730      op = CUTGROUP;
8731      RExC_seen |= REG_SEEN_CUTGROUP;
8732     }
8733     break;
8734    }
8735    if ( ! op ) {
8736     RExC_parse++;
8737     vFAIL3("Unknown verb pattern '%.*s'",
8738      verb_len, start_verb);
8739    }
8740    if ( argok ) {
8741     if ( start_arg && internal_argval ) {
8742      vFAIL3("Verb pattern '%.*s' may not have an argument",
8743       verb_len, start_verb);
8744     } else if ( argok < 0 && !start_arg ) {
8745      vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8746       verb_len, start_verb);
8747     } else {
8748      ret = reganode(pRExC_state, op, internal_argval);
8749      if ( ! internal_argval && ! SIZE_ONLY ) {
8750       if (start_arg) {
8751        SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8752        ARG(ret) = add_data( pRExC_state, 1, "S" );
8753        RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8754        ret->flags = 0;
8755       } else {
8756        ret->flags = 1;
8757       }
8758      }
8759     }
8760     if (!internal_argval)
8761      RExC_seen |= REG_SEEN_VERBARG;
8762    } else if ( start_arg ) {
8763     vFAIL3("Verb pattern '%.*s' may not have an argument",
8764       verb_len, start_verb);
8765    } else {
8766     ret = reg_node(pRExC_state, op);
8767    }
8768    nextchar(pRExC_state);
8769    return ret;
8770   } else
8771   if (*RExC_parse == '?') { /* (?...) */
8772    bool is_logical = 0;
8773    const char * const seqstart = RExC_parse;
8774    if (has_intervening_patws && SIZE_ONLY) {
8775     ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
8776    }
8777
8778    RExC_parse++;
8779    paren = *RExC_parse++;
8780    ret = NULL;   /* For look-ahead/behind. */
8781    switch (paren) {
8782
8783    case 'P': /* (?P...) variants for those used to PCRE/Python */
8784     paren = *RExC_parse++;
8785     if ( paren == '<')         /* (?P<...>) named capture */
8786      goto named_capture;
8787     else if (paren == '>') {   /* (?P>name) named recursion */
8788      goto named_recursion;
8789     }
8790     else if (paren == '=') {   /* (?P=...)  named backref */
8791      /* this pretty much dupes the code for \k<NAME> in regatom(), if
8792      you change this make sure you change that */
8793      char* name_start = RExC_parse;
8794      U32 num = 0;
8795      SV *sv_dat = reg_scan_name(pRExC_state,
8796       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8797      if (RExC_parse == name_start || *RExC_parse != ')')
8798       vFAIL2("Sequence %.3s... not terminated",parse_start);
8799
8800      if (!SIZE_ONLY) {
8801       num = add_data( pRExC_state, 1, "S" );
8802       RExC_rxi->data->data[num]=(void*)sv_dat;
8803       SvREFCNT_inc_simple_void(sv_dat);
8804      }
8805      RExC_sawback = 1;
8806      ret = reganode(pRExC_state,
8807         ((! FOLD)
8808          ? NREF
8809          : (ASCII_FOLD_RESTRICTED)
8810          ? NREFFA
8811          : (AT_LEAST_UNI_SEMANTICS)
8812           ? NREFFU
8813           : (LOC)
8814           ? NREFFL
8815           : NREFF),
8816          num);
8817      *flagp |= HASWIDTH;
8818
8819      Set_Node_Offset(ret, parse_start+1);
8820      Set_Node_Cur_Length(ret); /* MJD */
8821
8822      nextchar(pRExC_state);
8823      return ret;
8824     }
8825     RExC_parse++;
8826     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8827     /*NOTREACHED*/
8828    case '<':           /* (?<...) */
8829     if (*RExC_parse == '!')
8830      paren = ',';
8831     else if (*RExC_parse != '=')
8832    named_capture:
8833     {               /* (?<...>) */
8834      char *name_start;
8835      SV *svname;
8836      paren= '>';
8837    case '\'':          /* (?'...') */
8838       name_start= RExC_parse;
8839       svname = reg_scan_name(pRExC_state,
8840        SIZE_ONLY ?  /* reverse test from the others */
8841        REG_RSN_RETURN_NAME :
8842        REG_RSN_RETURN_NULL);
8843      if (RExC_parse == name_start) {
8844       RExC_parse++;
8845       vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8846       /*NOTREACHED*/
8847      }
8848      if (*RExC_parse != paren)
8849       vFAIL2("Sequence (?%c... not terminated",
8850        paren=='>' ? '<' : paren);
8851      if (SIZE_ONLY) {
8852       HE *he_str;
8853       SV *sv_dat = NULL;
8854       if (!svname) /* shouldn't happen */
8855        Perl_croak(aTHX_
8856         "panic: reg_scan_name returned NULL");
8857       if (!RExC_paren_names) {
8858        RExC_paren_names= newHV();
8859        sv_2mortal(MUTABLE_SV(RExC_paren_names));
8860 #ifdef DEBUGGING
8861        RExC_paren_name_list= newAV();
8862        sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8863 #endif
8864       }
8865       he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8866       if ( he_str )
8867        sv_dat = HeVAL(he_str);
8868       if ( ! sv_dat ) {
8869        /* croak baby croak */
8870        Perl_croak(aTHX_
8871         "panic: paren_name hash element allocation failed");
8872       } else if ( SvPOK(sv_dat) ) {
8873        /* (?|...) can mean we have dupes so scan to check
8874        its already been stored. Maybe a flag indicating
8875        we are inside such a construct would be useful,
8876        but the arrays are likely to be quite small, so
8877        for now we punt -- dmq */
8878        IV count = SvIV(sv_dat);
8879        I32 *pv = (I32*)SvPVX(sv_dat);
8880        IV i;
8881        for ( i = 0 ; i < count ; i++ ) {
8882         if ( pv[i] == RExC_npar ) {
8883          count = 0;
8884          break;
8885         }
8886        }
8887        if ( count ) {
8888         pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8889         SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8890         pv[count] = RExC_npar;
8891         SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8892        }
8893       } else {
8894        (void)SvUPGRADE(sv_dat,SVt_PVNV);
8895        sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8896        SvIOK_on(sv_dat);
8897        SvIV_set(sv_dat, 1);
8898       }
8899 #ifdef DEBUGGING
8900       /* Yes this does cause a memory leak in debugging Perls */
8901       if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8902        SvREFCNT_dec_NN(svname);
8903 #endif
8904
8905       /*sv_dump(sv_dat);*/
8906      }
8907      nextchar(pRExC_state);
8908      paren = 1;
8909      goto capturing_parens;
8910     }
8911     RExC_seen |= REG_SEEN_LOOKBEHIND;
8912     RExC_in_lookbehind++;
8913     RExC_parse++;
8914    case '=':           /* (?=...) */
8915     RExC_seen_zerolen++;
8916     break;
8917    case '!':           /* (?!...) */
8918     RExC_seen_zerolen++;
8919     if (*RExC_parse == ')') {
8920      ret=reg_node(pRExC_state, OPFAIL);
8921      nextchar(pRExC_state);
8922      return ret;
8923     }
8924     break;
8925    case '|':           /* (?|...) */
8926     /* branch reset, behave like a (?:...) except that
8927     buffers in alternations share the same numbers */
8928     paren = ':';
8929     after_freeze = freeze_paren = RExC_npar;
8930     break;
8931    case ':':           /* (?:...) */
8932    case '>':           /* (?>...) */
8933     break;
8934    case '$':           /* (?$...) */
8935    case '@':           /* (?@...) */
8936     vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8937     break;
8938    case '#':           /* (?#...) */
8939     /* XXX As soon as we disallow separating the '?' and '*' (by
8940     * spaces or (?#...) comment), it is believed that this case
8941     * will be unreachable and can be removed.  See
8942     * [perl #117327] */
8943     while (*RExC_parse && *RExC_parse != ')')
8944      RExC_parse++;
8945     if (*RExC_parse != ')')
8946      FAIL("Sequence (?#... not terminated");
8947     nextchar(pRExC_state);
8948     *flagp = TRYAGAIN;
8949     return NULL;
8950    case '0' :           /* (?0) */
8951    case 'R' :           /* (?R) */
8952     if (*RExC_parse != ')')
8953      FAIL("Sequence (?R) not terminated");
8954     ret = reg_node(pRExC_state, GOSTART);
8955     *flagp |= POSTPONED;
8956     nextchar(pRExC_state);
8957     return ret;
8958     /*notreached*/
8959    { /* named and numeric backreferences */
8960     I32 num;
8961    case '&':            /* (?&NAME) */
8962     parse_start = RExC_parse - 1;
8963    named_recursion:
8964     {
8965       SV *sv_dat = reg_scan_name(pRExC_state,
8966        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8967       num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8968     }
8969     goto gen_recurse_regop;
8970     assert(0); /* NOT REACHED */
8971    case '+':
8972     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8973      RExC_parse++;
8974      vFAIL("Illegal pattern");
8975     }
8976     goto parse_recursion;
8977     /* NOT REACHED*/
8978    case '-': /* (?-1) */
8979     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8980      RExC_parse--; /* rewind to let it be handled later */
8981      goto parse_flags;
8982     }
8983     /*FALLTHROUGH */
8984    case '1': case '2': case '3': case '4': /* (?1) */
8985    case '5': case '6': case '7': case '8': case '9':
8986     RExC_parse--;
8987    parse_recursion:
8988     num = atoi(RExC_parse);
8989     parse_start = RExC_parse - 1; /* MJD */
8990     if (*RExC_parse == '-')
8991      RExC_parse++;
8992     while (isDIGIT(*RExC_parse))
8993       RExC_parse++;
8994     if (*RExC_parse!=')')
8995      vFAIL("Expecting close bracket");
8996
8997    gen_recurse_regop:
8998     if ( paren == '-' ) {
8999      /*
9000      Diagram of capture buffer numbering.
9001      Top line is the normal capture buffer numbers
9002      Bottom line is the negative indexing as from
9003      the X (the (?-2))
9004
9005      +   1 2    3 4 5 X          6 7
9006      /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9007      -   5 4    3 2 1 X          x x
9008
9009      */
9010      num = RExC_npar + num;
9011      if (num < 1)  {
9012       RExC_parse++;
9013       vFAIL("Reference to nonexistent group");
9014      }
9015     } else if ( paren == '+' ) {
9016      num = RExC_npar + num - 1;
9017     }
9018
9019     ret = reganode(pRExC_state, GOSUB, num);
9020     if (!SIZE_ONLY) {
9021      if (num > (I32)RExC_rx->nparens) {
9022       RExC_parse++;
9023       vFAIL("Reference to nonexistent group");
9024      }
9025      ARG2L_SET( ret, RExC_recurse_count++);
9026      RExC_emit++;
9027      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9028       "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
9029     } else {
9030      RExC_size++;
9031      }
9032      RExC_seen |= REG_SEEN_RECURSE;
9033     Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9034     Set_Node_Offset(ret, parse_start); /* MJD */
9035
9036     *flagp |= POSTPONED;
9037     nextchar(pRExC_state);
9038     return ret;
9039    } /* named and numeric backreferences */
9040    assert(0); /* NOT REACHED */
9041
9042    case '?':           /* (??...) */
9043     is_logical = 1;
9044     if (*RExC_parse != '{') {
9045      RExC_parse++;
9046      vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9047      /*NOTREACHED*/
9048     }
9049     *flagp |= POSTPONED;
9050     paren = *RExC_parse++;
9051     /* FALL THROUGH */
9052    case '{':           /* (?{...}) */
9053    {
9054     U32 n = 0;
9055     struct reg_code_block *cb;
9056
9057     RExC_seen_zerolen++;
9058
9059     if (   !pRExC_state->num_code_blocks
9060      || pRExC_state->code_index >= pRExC_state->num_code_blocks
9061      || pRExC_state->code_blocks[pRExC_state->code_index].start
9062       != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9063        - RExC_start)
9064     ) {
9065      if (RExC_pm_flags & PMf_USE_RE_EVAL)
9066       FAIL("panic: Sequence (?{...}): no code block found\n");
9067      FAIL("Eval-group not allowed at runtime, use re 'eval'");
9068     }
9069     /* this is a pre-compiled code block (?{...}) */
9070     cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9071     RExC_parse = RExC_start + cb->end;
9072     if (!SIZE_ONLY) {
9073      OP *o = cb->block;
9074      if (cb->src_regex) {
9075       n = add_data(pRExC_state, 2, "rl");
9076       RExC_rxi->data->data[n] =
9077        (void*)SvREFCNT_inc((SV*)cb->src_regex);
9078       RExC_rxi->data->data[n+1] = (void*)o;
9079      }
9080      else {
9081       n = add_data(pRExC_state, 1,
9082        (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
9083       RExC_rxi->data->data[n] = (void*)o;
9084      }
9085     }
9086     pRExC_state->code_index++;
9087     nextchar(pRExC_state);
9088
9089     if (is_logical) {
9090      regnode *eval;
9091      ret = reg_node(pRExC_state, LOGICAL);
9092      eval = reganode(pRExC_state, EVAL, n);
9093      if (!SIZE_ONLY) {
9094       ret->flags = 2;
9095       /* for later propagation into (??{}) return value */
9096       eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9097      }
9098      REGTAIL(pRExC_state, ret, eval);
9099      /* deal with the length of this later - MJD */
9100      return ret;
9101     }
9102     ret = reganode(pRExC_state, EVAL, n);
9103     Set_Node_Length(ret, RExC_parse - parse_start + 1);
9104     Set_Node_Offset(ret, parse_start);
9105     return ret;
9106    }
9107    case '(':           /* (?(?{...})...) and (?(?=...)...) */
9108    {
9109     int is_define= 0;
9110     if (RExC_parse[0] == '?') {        /* (?(?...)) */
9111      if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9112       || RExC_parse[1] == '<'
9113       || RExC_parse[1] == '{') { /* Lookahead or eval. */
9114       I32 flag;
9115       regnode *tail;
9116
9117       ret = reg_node(pRExC_state, LOGICAL);
9118       if (!SIZE_ONLY)
9119        ret->flags = 1;
9120
9121       tail = reg(pRExC_state, 1, &flag, depth+1);
9122       if (flag & RESTART_UTF8) {
9123        *flagp = RESTART_UTF8;
9124        return NULL;
9125       }
9126       REGTAIL(pRExC_state, ret, tail);
9127       goto insert_if;
9128      }
9129     }
9130     else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9131       || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9132     {
9133      char ch = RExC_parse[0] == '<' ? '>' : '\'';
9134      char *name_start= RExC_parse++;
9135      U32 num = 0;
9136      SV *sv_dat=reg_scan_name(pRExC_state,
9137       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9138      if (RExC_parse == name_start || *RExC_parse != ch)
9139       vFAIL2("Sequence (?(%c... not terminated",
9140        (ch == '>' ? '<' : ch));
9141      RExC_parse++;
9142      if (!SIZE_ONLY) {
9143       num = add_data( pRExC_state, 1, "S" );
9144       RExC_rxi->data->data[num]=(void*)sv_dat;
9145       SvREFCNT_inc_simple_void(sv_dat);
9146      }
9147      ret = reganode(pRExC_state,NGROUPP,num);
9148      goto insert_if_check_paren;
9149     }
9150     else if (RExC_parse[0] == 'D' &&
9151       RExC_parse[1] == 'E' &&
9152       RExC_parse[2] == 'F' &&
9153       RExC_parse[3] == 'I' &&
9154       RExC_parse[4] == 'N' &&
9155       RExC_parse[5] == 'E')
9156     {
9157      ret = reganode(pRExC_state,DEFINEP,0);
9158      RExC_parse +=6 ;
9159      is_define = 1;
9160      goto insert_if_check_paren;
9161     }
9162     else if (RExC_parse[0] == 'R') {
9163      RExC_parse++;
9164      parno = 0;
9165      if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9166       parno = atoi(RExC_parse++);
9167       while (isDIGIT(*RExC_parse))
9168        RExC_parse++;
9169      } else if (RExC_parse[0] == '&') {
9170       SV *sv_dat;
9171       RExC_parse++;
9172       sv_dat = reg_scan_name(pRExC_state,
9173         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9174        parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9175      }
9176      ret = reganode(pRExC_state,INSUBP,parno);
9177      goto insert_if_check_paren;
9178     }
9179     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9180      /* (?(1)...) */
9181      char c;
9182      parno = atoi(RExC_parse++);
9183
9184      while (isDIGIT(*RExC_parse))
9185       RExC_parse++;
9186      ret = reganode(pRExC_state, GROUPP, parno);
9187
9188     insert_if_check_paren:
9189      if ((c = *nextchar(pRExC_state)) != ')')
9190       vFAIL("Switch condition not recognized");
9191     insert_if:
9192      REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9193      br = regbranch(pRExC_state, &flags, 1,depth+1);
9194      if (br == NULL) {
9195       if (flags & RESTART_UTF8) {
9196        *flagp = RESTART_UTF8;
9197        return NULL;
9198       }
9199       FAIL2("panic: regbranch returned NULL, flags=%#X",
9200        flags);
9201      } else
9202       REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9203      c = *nextchar(pRExC_state);
9204      if (flags&HASWIDTH)
9205       *flagp |= HASWIDTH;
9206      if (c == '|') {
9207       if (is_define)
9208        vFAIL("(?(DEFINE)....) does not allow branches");
9209       lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9210       if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9211        if (flags & RESTART_UTF8) {
9212         *flagp = RESTART_UTF8;
9213         return NULL;
9214        }
9215        FAIL2("panic: regbranch returned NULL, flags=%#X",
9216         flags);
9217       }
9218       REGTAIL(pRExC_state, ret, lastbr);
9219       if (flags&HASWIDTH)
9220        *flagp |= HASWIDTH;
9221       c = *nextchar(pRExC_state);
9222      }
9223      else
9224       lastbr = NULL;
9225      if (c != ')')
9226       vFAIL("Switch (?(condition)... contains too many branches");
9227      ender = reg_node(pRExC_state, TAIL);
9228      REGTAIL(pRExC_state, br, ender);
9229      if (lastbr) {
9230       REGTAIL(pRExC_state, lastbr, ender);
9231       REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9232      }
9233      else
9234       REGTAIL(pRExC_state, ret, ender);
9235      RExC_size++; /* XXX WHY do we need this?!!
9236          For large programs it seems to be required
9237          but I can't figure out why. -- dmq*/
9238      return ret;
9239     }
9240     else {
9241      vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
9242     }
9243    }
9244    case '[':           /* (?[ ... ]) */
9245     return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9246           oregcomp_parse);
9247    case 0:
9248     RExC_parse--; /* for vFAIL to print correctly */
9249     vFAIL("Sequence (? incomplete");
9250     break;
9251    default: /* e.g., (?i) */
9252     --RExC_parse;
9253    parse_flags:
9254     parse_lparen_question_flags(pRExC_state);
9255     if (UCHARAT(RExC_parse) != ':') {
9256      nextchar(pRExC_state);
9257      *flagp = TRYAGAIN;
9258      return NULL;
9259     }
9260     paren = ':';
9261     nextchar(pRExC_state);
9262     ret = NULL;
9263     goto parse_rest;
9264    } /* end switch */
9265   }
9266   else {                  /* (...) */
9267   capturing_parens:
9268    parno = RExC_npar;
9269    RExC_npar++;
9270
9271    ret = reganode(pRExC_state, OPEN, parno);
9272    if (!SIZE_ONLY ){
9273     if (!RExC_nestroot)
9274      RExC_nestroot = parno;
9275     if (RExC_seen & REG_SEEN_RECURSE
9276      && !RExC_open_parens[parno-1])
9277     {
9278      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9279       "Setting open paren #%"IVdf" to %d\n",
9280       (IV)parno, REG_NODE_NUM(ret)));
9281      RExC_open_parens[parno-1]= ret;
9282     }
9283    }
9284    Set_Node_Length(ret, 1); /* MJD */
9285    Set_Node_Offset(ret, RExC_parse); /* MJD */
9286    is_open = 1;
9287   }
9288  }
9289  else                        /* ! paren */
9290   ret = NULL;
9291
9292    parse_rest:
9293  /* Pick up the branches, linking them together. */
9294  parse_start = RExC_parse;   /* MJD */
9295  br = regbranch(pRExC_state, &flags, 1,depth+1);
9296
9297  /*     branch_len = (paren != 0); */
9298
9299  if (br == NULL) {
9300   if (flags & RESTART_UTF8) {
9301    *flagp = RESTART_UTF8;
9302    return NULL;
9303   }
9304   FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
9305  }
9306  if (*RExC_parse == '|') {
9307   if (!SIZE_ONLY && RExC_extralen) {
9308    reginsert(pRExC_state, BRANCHJ, br, depth+1);
9309   }
9310   else {                  /* MJD */
9311    reginsert(pRExC_state, BRANCH, br, depth+1);
9312    Set_Node_Length(br, paren != 0);
9313    Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9314   }
9315   have_branch = 1;
9316   if (SIZE_ONLY)
9317    RExC_extralen += 1;  /* For BRANCHJ-BRANCH. */
9318  }
9319  else if (paren == ':') {
9320   *flagp |= flags&SIMPLE;
9321  }
9322  if (is_open) {    /* Starts with OPEN. */
9323   REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9324  }
9325  else if (paren != '?')  /* Not Conditional */
9326   ret = br;
9327  *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9328  lastbr = br;
9329  while (*RExC_parse == '|') {
9330   if (!SIZE_ONLY && RExC_extralen) {
9331    ender = reganode(pRExC_state, LONGJMP,0);
9332    REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9333   }
9334   if (SIZE_ONLY)
9335    RExC_extralen += 2;  /* Account for LONGJMP. */
9336   nextchar(pRExC_state);
9337   if (freeze_paren) {
9338    if (RExC_npar > after_freeze)
9339     after_freeze = RExC_npar;
9340    RExC_npar = freeze_paren;
9341   }
9342   br = regbranch(pRExC_state, &flags, 0, depth+1);
9343
9344   if (br == NULL) {
9345    if (flags & RESTART_UTF8) {
9346     *flagp = RESTART_UTF8;
9347     return NULL;
9348    }
9349    FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
9350   }
9351   REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9352   lastbr = br;
9353   *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9354  }
9355
9356  if (have_branch || paren != ':') {
9357   /* Make a closing node, and hook it on the end. */
9358   switch (paren) {
9359   case ':':
9360    ender = reg_node(pRExC_state, TAIL);
9361    break;
9362   case 1: case 2:
9363    ender = reganode(pRExC_state, CLOSE, parno);
9364    if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9365     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9366       "Setting close paren #%"IVdf" to %d\n",
9367       (IV)parno, REG_NODE_NUM(ender)));
9368     RExC_close_parens[parno-1]= ender;
9369     if (RExC_nestroot == parno)
9370      RExC_nestroot = 0;
9371    }
9372    Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9373    Set_Node_Length(ender,1); /* MJD */
9374    break;
9375   case '<':
9376   case ',':
9377   case '=':
9378   case '!':
9379    *flagp &= ~HASWIDTH;
9380    /* FALL THROUGH */
9381   case '>':
9382    ender = reg_node(pRExC_state, SUCCEED);
9383    break;
9384   case 0:
9385    ender = reg_node(pRExC_state, END);
9386    if (!SIZE_ONLY) {
9387     assert(!RExC_opend); /* there can only be one! */
9388     RExC_opend = ender;
9389    }
9390    break;
9391   }
9392   DEBUG_PARSE_r(if (!SIZE_ONLY) {
9393    SV * const mysv_val1=sv_newmortal();
9394    SV * const mysv_val2=sv_newmortal();
9395    DEBUG_PARSE_MSG("lsbr");
9396    regprop(RExC_rx, mysv_val1, lastbr);
9397    regprop(RExC_rx, mysv_val2, ender);
9398    PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9399       SvPV_nolen_const(mysv_val1),
9400       (IV)REG_NODE_NUM(lastbr),
9401       SvPV_nolen_const(mysv_val2),
9402       (IV)REG_NODE_NUM(ender),
9403       (IV)(ender - lastbr)
9404    );
9405   });
9406   REGTAIL(pRExC_state, lastbr, ender);
9407
9408   if (have_branch && !SIZE_ONLY) {
9409    char is_nothing= 1;
9410    if (depth==1)
9411     RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9412
9413    /* Hook the tails of the branches to the closing node. */
9414    for (br = ret; br; br = regnext(br)) {
9415     const U8 op = PL_regkind[OP(br)];
9416     if (op == BRANCH) {
9417      REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9418      if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9419       is_nothing= 0;
9420     }
9421     else if (op == BRANCHJ) {
9422      REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9423      /* for now we always disable this optimisation * /
9424      if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9425      */
9426       is_nothing= 0;
9427     }
9428    }
9429    if (is_nothing) {
9430     br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9431     DEBUG_PARSE_r(if (!SIZE_ONLY) {
9432      SV * const mysv_val1=sv_newmortal();
9433      SV * const mysv_val2=sv_newmortal();
9434      DEBUG_PARSE_MSG("NADA");
9435      regprop(RExC_rx, mysv_val1, ret);
9436      regprop(RExC_rx, mysv_val2, ender);
9437      PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9438         SvPV_nolen_const(mysv_val1),
9439         (IV)REG_NODE_NUM(ret),
9440         SvPV_nolen_const(mysv_val2),
9441         (IV)REG_NODE_NUM(ender),
9442         (IV)(ender - ret)
9443      );
9444     });
9445     OP(br)= NOTHING;
9446     if (OP(ender) == TAIL) {
9447      NEXT_OFF(br)= 0;
9448      RExC_emit= br + 1;
9449     } else {
9450      regnode *opt;
9451      for ( opt= br + 1; opt < ender ; opt++ )
9452       OP(opt)= OPTIMIZED;
9453      NEXT_OFF(br)= ender - br;
9454     }
9455    }
9456   }
9457  }
9458
9459  {
9460   const char *p;
9461   static const char parens[] = "=!<,>";
9462
9463   if (paren && (p = strchr(parens, paren))) {
9464    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9465    int flag = (p - parens) > 1;
9466
9467    if (paren == '>')
9468     node = SUSPEND, flag = 0;
9469    reginsert(pRExC_state, node,ret, depth+1);
9470    Set_Node_Cur_Length(ret);
9471    Set_Node_Offset(ret, parse_start + 1);
9472    ret->flags = flag;
9473    REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9474   }
9475  }
9476
9477  /* Check for proper termination. */
9478  if (paren) {
9479   /* restore original flags, but keep (?p) */
9480   RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
9481   if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9482    RExC_parse = oregcomp_parse;
9483    vFAIL("Unmatched (");
9484   }
9485  }
9486  else if (!paren && RExC_parse < RExC_end) {
9487   if (*RExC_parse == ')') {
9488    RExC_parse++;
9489    vFAIL("Unmatched )");
9490   }
9491   else
9492    FAIL("Junk on end of regexp"); /* "Can't happen". */
9493   assert(0); /* NOTREACHED */
9494  }
9495
9496  if (RExC_in_lookbehind) {
9497   RExC_in_lookbehind--;
9498  }
9499  if (after_freeze > RExC_npar)
9500   RExC_npar = after_freeze;
9501  return(ret);
9502 }
9503
9504 /*
9505  - regbranch - one alternative of an | operator
9506  *
9507  * Implements the concatenation operator.
9508  *
9509  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9510  * restarted.
9511  */
9512 STATIC regnode *
9513 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9514 {
9515  dVAR;
9516  regnode *ret;
9517  regnode *chain = NULL;
9518  regnode *latest;
9519  I32 flags = 0, c = 0;
9520  GET_RE_DEBUG_FLAGS_DECL;
9521
9522  PERL_ARGS_ASSERT_REGBRANCH;
9523
9524  DEBUG_PARSE("brnc");
9525
9526  if (first)
9527   ret = NULL;
9528  else {
9529   if (!SIZE_ONLY && RExC_extralen)
9530    ret = reganode(pRExC_state, BRANCHJ,0);
9531   else {
9532    ret = reg_node(pRExC_state, BRANCH);
9533    Set_Node_Length(ret, 1);
9534   }
9535  }
9536
9537  if (!first && SIZE_ONLY)
9538   RExC_extralen += 1;   /* BRANCHJ */
9539
9540  *flagp = WORST;   /* Tentatively. */
9541
9542  RExC_parse--;
9543  nextchar(pRExC_state);
9544  while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9545   flags &= ~TRYAGAIN;
9546   latest = regpiece(pRExC_state, &flags,depth+1);
9547   if (latest == NULL) {
9548    if (flags & TRYAGAIN)
9549     continue;
9550    if (flags & RESTART_UTF8) {
9551     *flagp = RESTART_UTF8;
9552     return NULL;
9553    }
9554    FAIL2("panic: regpiece returned NULL, flags=%#X", flags);
9555   }
9556   else if (ret == NULL)
9557    ret = latest;
9558   *flagp |= flags&(HASWIDTH|POSTPONED);
9559   if (chain == NULL)  /* First piece. */
9560    *flagp |= flags&SPSTART;
9561   else {
9562    RExC_naughty++;
9563    REGTAIL(pRExC_state, chain, latest);
9564   }
9565   chain = latest;
9566   c++;
9567  }
9568  if (chain == NULL) { /* Loop ran zero times. */
9569   chain = reg_node(pRExC_state, NOTHING);
9570   if (ret == NULL)
9571    ret = chain;
9572  }
9573  if (c == 1) {
9574   *flagp |= flags&SIMPLE;
9575  }
9576
9577  return ret;
9578 }
9579
9580 /*
9581  - regpiece - something followed by possible [*+?]
9582  *
9583  * Note that the branching code sequences used for ? and the general cases
9584  * of * and + are somewhat optimized:  they use the same NOTHING node as
9585  * both the endmarker for their branch list and the body of the last branch.
9586  * It might seem that this node could be dispensed with entirely, but the
9587  * endmarker role is not redundant.
9588  *
9589  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
9590  * TRYAGAIN.
9591  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9592  * restarted.
9593  */
9594 STATIC regnode *
9595 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9596 {
9597  dVAR;
9598  regnode *ret;
9599  char op;
9600  char *next;
9601  I32 flags;
9602  const char * const origparse = RExC_parse;
9603  I32 min;
9604  I32 max = REG_INFTY;
9605 #ifdef RE_TRACK_PATTERN_OFFSETS
9606  char *parse_start;
9607 #endif
9608  const char *maxpos = NULL;
9609
9610  /* Save the original in case we change the emitted regop to a FAIL. */
9611  regnode * const orig_emit = RExC_emit;
9612
9613  GET_RE_DEBUG_FLAGS_DECL;
9614
9615  PERL_ARGS_ASSERT_REGPIECE;
9616
9617  DEBUG_PARSE("piec");
9618
9619  ret = regatom(pRExC_state, &flags,depth+1);
9620  if (ret == NULL) {
9621   if (flags & (TRYAGAIN|RESTART_UTF8))
9622    *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
9623   else
9624    FAIL2("panic: regatom returned NULL, flags=%#X", flags);
9625   return(NULL);
9626  }
9627
9628  op = *RExC_parse;
9629
9630  if (op == '{' && regcurly(RExC_parse, FALSE)) {
9631   maxpos = NULL;
9632 #ifdef RE_TRACK_PATTERN_OFFSETS
9633   parse_start = RExC_parse; /* MJD */
9634 #endif
9635   next = RExC_parse + 1;
9636   while (isDIGIT(*next) || *next == ',') {
9637    if (*next == ',') {
9638     if (maxpos)
9639      break;
9640     else
9641      maxpos = next;
9642    }
9643    next++;
9644   }
9645   if (*next == '}') {  /* got one */
9646    if (!maxpos)
9647     maxpos = next;
9648    RExC_parse++;
9649    min = atoi(RExC_parse);
9650    if (*maxpos == ',')
9651     maxpos++;
9652    else
9653     maxpos = RExC_parse;
9654    max = atoi(maxpos);
9655    if (!max && *maxpos != '0')
9656     max = REG_INFTY;  /* meaning "infinity" */
9657    else if (max >= REG_INFTY)
9658     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9659    RExC_parse = next;
9660    nextchar(pRExC_state);
9661    if (max < min) {    /* If can't match, warn and optimize to fail
9662         unconditionally */
9663     if (SIZE_ONLY) {
9664      ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9665
9666      /* We can't back off the size because we have to reserve
9667      * enough space for all the things we are about to throw
9668      * away, but we can shrink it by the ammount we are about
9669      * to re-use here */
9670      RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9671     }
9672     else {
9673      RExC_emit = orig_emit;
9674     }
9675     ret = reg_node(pRExC_state, OPFAIL);
9676     return ret;
9677    }
9678
9679   do_curly:
9680    if ((flags&SIMPLE)) {
9681     RExC_naughty += 2 + RExC_naughty / 2;
9682     reginsert(pRExC_state, CURLY, ret, depth+1);
9683     Set_Node_Offset(ret, parse_start+1); /* MJD */
9684     Set_Node_Cur_Length(ret);
9685    }
9686    else {
9687     regnode * const w = reg_node(pRExC_state, WHILEM);
9688
9689     w->flags = 0;
9690     REGTAIL(pRExC_state, ret, w);
9691     if (!SIZE_ONLY && RExC_extralen) {
9692      reginsert(pRExC_state, LONGJMP,ret, depth+1);
9693      reginsert(pRExC_state, NOTHING,ret, depth+1);
9694      NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9695     }
9696     reginsert(pRExC_state, CURLYX,ret, depth+1);
9697         /* MJD hk */
9698     Set_Node_Offset(ret, parse_start+1);
9699     Set_Node_Length(ret,
9700         op == '{' ? (RExC_parse - parse_start) : 1);
9701
9702     if (!SIZE_ONLY && RExC_extralen)
9703      NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9704     REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9705     if (SIZE_ONLY)
9706      RExC_whilem_seen++, RExC_extralen += 3;
9707     RExC_naughty += 4 + RExC_naughty; /* compound interest */
9708    }
9709    ret->flags = 0;
9710
9711    if (min > 0)
9712     *flagp = WORST;
9713    if (max > 0)
9714     *flagp |= HASWIDTH;
9715    if (!SIZE_ONLY) {
9716     ARG1_SET(ret, (U16)min);
9717     ARG2_SET(ret, (U16)max);
9718    }
9719
9720    goto nest_check;
9721   }
9722  }
9723
9724  if (!ISMULT1(op)) {
9725   *flagp = flags;
9726   return(ret);
9727  }
9728
9729 #if 0    /* Now runtime fix should be reliable. */
9730
9731  /* if this is reinstated, don't forget to put this back into perldiag:
9732
9733    =item Regexp *+ operand could be empty at {#} in regex m/%s/
9734
9735   (F) The part of the regexp subject to either the * or + quantifier
9736   could match an empty string. The {#} shows in the regular
9737   expression about where the problem was discovered.
9738
9739  */
9740
9741  if (!(flags&HASWIDTH) && op != '?')
9742  vFAIL("Regexp *+ operand could be empty");
9743 #endif
9744
9745 #ifdef RE_TRACK_PATTERN_OFFSETS
9746  parse_start = RExC_parse;
9747 #endif
9748  nextchar(pRExC_state);
9749
9750  *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9751
9752  if (op == '*' && (flags&SIMPLE)) {
9753   reginsert(pRExC_state, STAR, ret, depth+1);
9754   ret->flags = 0;
9755   RExC_naughty += 4;
9756  }
9757  else if (op == '*') {
9758   min = 0;
9759   goto do_curly;
9760  }
9761  else if (op == '+' && (flags&SIMPLE)) {
9762   reginsert(pRExC_state, PLUS, ret, depth+1);
9763   ret->flags = 0;
9764   RExC_naughty += 3;
9765  }
9766  else if (op == '+') {
9767   min = 1;
9768   goto do_curly;
9769  }
9770  else if (op == '?') {
9771   min = 0; max = 1;
9772   goto do_curly;
9773  }
9774   nest_check:
9775  if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9776   SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9777   ckWARN3reg(RExC_parse,
9778     "%.*s matches null string many times",
9779     (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9780     origparse);
9781   (void)ReREFCNT_inc(RExC_rx_sv);
9782  }
9783
9784  if (RExC_parse < RExC_end && *RExC_parse == '?') {
9785   nextchar(pRExC_state);
9786   reginsert(pRExC_state, MINMOD, ret, depth+1);
9787   REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9788  }
9789 #ifndef REG_ALLOW_MINMOD_SUSPEND
9790  else
9791 #endif
9792  if (RExC_parse < RExC_end && *RExC_parse == '+') {
9793   regnode *ender;
9794   nextchar(pRExC_state);
9795   ender = reg_node(pRExC_state, SUCCEED);
9796   REGTAIL(pRExC_state, ret, ender);
9797   reginsert(pRExC_state, SUSPEND, ret, depth+1);
9798   ret->flags = 0;
9799   ender = reg_node(pRExC_state, TAIL);
9800   REGTAIL(pRExC_state, ret, ender);
9801   /*ret= ender;*/
9802  }
9803
9804  if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9805   RExC_parse++;
9806   vFAIL("Nested quantifiers");
9807  }
9808
9809  return(ret);
9810 }
9811
9812 STATIC bool
9813 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9814   const bool strict   /* Apply stricter parsing rules? */
9815  )
9816 {
9817
9818  /* This is expected to be called by a parser routine that has recognized '\N'
9819    and needs to handle the rest. RExC_parse is expected to point at the first
9820    char following the N at the time of the call.  On successful return,
9821    RExC_parse has been updated to point to just after the sequence identified
9822    by this routine, and <*flagp> has been updated.
9823
9824    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9825    character class.
9826
9827    \N may begin either a named sequence, or if outside a character class, mean
9828    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9829    attempted to decide which, and in the case of a named sequence, converted it
9830    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9831    where c1... are the characters in the sequence.  For single-quoted regexes,
9832    the tokenizer passes the \N sequence through unchanged; this code will not
9833    attempt to determine this nor expand those, instead raising a syntax error.
9834    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9835    or there is no '}', it signals that this \N occurrence means to match a
9836    non-newline.
9837
9838    Only the \N{U+...} form should occur in a character class, for the same
9839    reason that '.' inside a character class means to just match a period: it
9840    just doesn't make sense.
9841
9842    The function raises an error (via vFAIL), and doesn't return for various
9843    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
9844    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
9845    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
9846    only possible if node_p is non-NULL.
9847
9848
9849    If <valuep> is non-null, it means the caller can accept an input sequence
9850    consisting of a just a single code point; <*valuep> is set to that value
9851    if the input is such.
9852
9853    If <node_p> is non-null it signifies that the caller can accept any other
9854    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
9855    is set as follows:
9856  1) \N means not-a-NL: points to a newly created REG_ANY node;
9857  2) \N{}:              points to a new NOTHING node;
9858  3) otherwise:         points to a new EXACT node containing the resolved
9859       string.
9860    Note that FALSE is returned for single code point sequences if <valuep> is
9861    null.
9862  */
9863
9864  char * endbrace;    /* '}' following the name */
9865  char* p;
9866  char *endchar; /* Points to '.' or '}' ending cur char in the input
9867       stream */
9868  bool has_multiple_chars; /* true if the input stream contains a sequence of
9869         more than one character */
9870
9871  GET_RE_DEBUG_FLAGS_DECL;
9872
9873  PERL_ARGS_ASSERT_GROK_BSLASH_N;
9874
9875  GET_RE_DEBUG_FLAGS;
9876
9877  assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
9878
9879  /* The [^\n] meaning of \N ignores spaces and comments under the /x
9880  * modifier.  The other meaning does not */
9881  p = (RExC_flags & RXf_PMf_EXTENDED)
9882   ? regwhite( pRExC_state, RExC_parse )
9883   : RExC_parse;
9884
9885  /* Disambiguate between \N meaning a named character versus \N meaning
9886  * [^\n].  The former is assumed when it can't be the latter. */
9887  if (*p != '{' || regcurly(p, FALSE)) {
9888   RExC_parse = p;
9889   if (! node_p) {
9890    /* no bare \N in a charclass */
9891    if (in_char_class) {
9892     vFAIL("\\N in a character class must be a named character: \\N{...}");
9893    }
9894    return FALSE;
9895   }
9896   nextchar(pRExC_state);
9897   *node_p = reg_node(pRExC_state, REG_ANY);
9898   *flagp |= HASWIDTH|SIMPLE;
9899   RExC_naughty++;
9900   RExC_parse--;
9901   Set_Node_Length(*node_p, 1); /* MJD */
9902   return TRUE;
9903  }
9904
9905  /* Here, we have decided it should be a named character or sequence */
9906
9907  /* The test above made sure that the next real character is a '{', but
9908  * under the /x modifier, it could be separated by space (or a comment and
9909  * \n) and this is not allowed (for consistency with \x{...} and the
9910  * tokenizer handling of \N{NAME}). */
9911  if (*RExC_parse != '{') {
9912   vFAIL("Missing braces on \\N{}");
9913  }
9914
9915  RExC_parse++; /* Skip past the '{' */
9916
9917  if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9918   || ! (endbrace == RExC_parse  /* nothing between the {} */
9919    || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9920     && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9921  {
9922   if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9923   vFAIL("\\N{NAME} must be resolved by the lexer");
9924  }
9925
9926  if (endbrace == RExC_parse) {   /* empty: \N{} */
9927   bool ret = TRUE;
9928   if (node_p) {
9929    *node_p = reg_node(pRExC_state,NOTHING);
9930   }
9931   else if (in_char_class) {
9932    if (SIZE_ONLY && in_char_class) {
9933     if (strict) {
9934      RExC_parse++;   /* Position after the "}" */
9935      vFAIL("Zero length \\N{}");
9936     }
9937     else {
9938      ckWARNreg(RExC_parse,
9939        "Ignoring zero length \\N{} in character class");
9940     }
9941    }
9942    ret = FALSE;
9943   }
9944   else {
9945    return FALSE;
9946   }
9947   nextchar(pRExC_state);
9948   return ret;
9949  }
9950
9951  RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9952  RExC_parse += 2; /* Skip past the 'U+' */
9953
9954  endchar = RExC_parse + strcspn(RExC_parse, ".}");
9955
9956  /* Code points are separated by dots.  If none, there is only one code
9957  * point, and is terminated by the brace */
9958  has_multiple_chars = (endchar < endbrace);
9959
9960  if (valuep && (! has_multiple_chars || in_char_class)) {
9961   /* We only pay attention to the first char of
9962   multichar strings being returned in char classes. I kinda wonder
9963   if this makes sense as it does change the behaviour
9964   from earlier versions, OTOH that behaviour was broken
9965   as well. XXX Solution is to recharacterize as
9966   [rest-of-class]|multi1|multi2... */
9967
9968   STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9969   I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9970    | PERL_SCAN_DISALLOW_PREFIX
9971    | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9972
9973   *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9974
9975   /* The tokenizer should have guaranteed validity, but it's possible to
9976   * bypass it by using single quoting, so check */
9977   if (length_of_hex == 0
9978    || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9979   {
9980    RExC_parse += length_of_hex; /* Includes all the valid */
9981    RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9982        ? UTF8SKIP(RExC_parse)
9983        : 1;
9984    /* Guard against malformed utf8 */
9985    if (RExC_parse >= endchar) {
9986     RExC_parse = endchar;
9987    }
9988    vFAIL("Invalid hexadecimal number in \\N{U+...}");
9989   }
9990
9991   if (in_char_class && has_multiple_chars) {
9992    if (strict) {
9993     RExC_parse = endbrace;
9994     vFAIL("\\N{} in character class restricted to one character");
9995    }
9996    else {
9997     ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9998    }
9999   }
10000
10001   RExC_parse = endbrace + 1;
10002  }
10003  else if (! node_p || ! has_multiple_chars) {
10004
10005   /* Here, the input is legal, but not according to the caller's
10006   * options.  We fail without advancing the parse, so that the
10007   * caller can try again */
10008   RExC_parse = p;
10009   return FALSE;
10010  }
10011  else {
10012
10013   /* What is done here is to convert this to a sub-pattern of the form
10014   * (?:\x{char1}\x{char2}...)
10015   * and then call reg recursively.  That way, it retains its atomicness,
10016   * while not having to worry about special handling that some code
10017   * points may have.  toke.c has converted the original Unicode values
10018   * to native, so that we can just pass on the hex values unchanged.  We
10019   * do have to set a flag to keep recoding from happening in the
10020   * recursion */
10021
10022   SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10023   STRLEN len;
10024   char *orig_end = RExC_end;
10025   I32 flags;
10026
10027   while (RExC_parse < endbrace) {
10028
10029    /* Convert to notation the rest of the code understands */
10030    sv_catpv(substitute_parse, "\\x{");
10031    sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10032    sv_catpv(substitute_parse, "}");
10033
10034    /* Point to the beginning of the next character in the sequence. */
10035    RExC_parse = endchar + 1;
10036    endchar = RExC_parse + strcspn(RExC_parse, ".}");
10037   }
10038   sv_catpv(substitute_parse, ")");
10039
10040   RExC_parse = SvPV(substitute_parse, len);
10041
10042   /* Don't allow empty number */
10043   if (len < 8) {
10044    vFAIL("Invalid hexadecimal number in \\N{U+...}");
10045   }
10046   RExC_end = RExC_parse + len;
10047
10048   /* The values are Unicode, and therefore not subject to recoding */
10049   RExC_override_recoding = 1;
10050
10051   if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10052    if (flags & RESTART_UTF8) {
10053     *flagp = RESTART_UTF8;
10054     return FALSE;
10055    }
10056    FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#X",
10057     flags);
10058   }
10059   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10060
10061   RExC_parse = endbrace;
10062   RExC_end = orig_end;
10063   RExC_override_recoding = 0;
10064
10065   nextchar(pRExC_state);
10066  }
10067
10068  return TRUE;
10069 }
10070
10071
10072 /*
10073  * reg_recode
10074  *
10075  * It returns the code point in utf8 for the value in *encp.
10076  *    value: a code value in the source encoding
10077  *    encp:  a pointer to an Encode object
10078  *
10079  * If the result from Encode is not a single character,
10080  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10081  */
10082 STATIC UV
10083 S_reg_recode(pTHX_ const char value, SV **encp)
10084 {
10085  STRLEN numlen = 1;
10086  SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10087  const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10088  const STRLEN newlen = SvCUR(sv);
10089  UV uv = UNICODE_REPLACEMENT;
10090
10091  PERL_ARGS_ASSERT_REG_RECODE;
10092
10093  if (newlen)
10094   uv = SvUTF8(sv)
10095    ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10096    : *(U8*)s;
10097
10098  if (!newlen || numlen != newlen) {
10099   uv = UNICODE_REPLACEMENT;
10100   *encp = NULL;
10101  }
10102  return uv;
10103 }
10104
10105 PERL_STATIC_INLINE U8
10106 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10107 {
10108  U8 op;
10109
10110  PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10111
10112  if (! FOLD) {
10113   return EXACT;
10114  }
10115
10116  op = get_regex_charset(RExC_flags);
10117  if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10118   op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10119     been, so there is no hole */
10120  }
10121
10122  return op + EXACTF;
10123 }
10124
10125 PERL_STATIC_INLINE void
10126 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10127 {
10128  /* This knows the details about sizing an EXACTish node, setting flags for
10129  * it (by setting <*flagp>, and potentially populating it with a single
10130  * character.
10131  *
10132  * If <len> (the length in bytes) is non-zero, this function assumes that
10133  * the node has already been populated, and just does the sizing.  In this
10134  * case <code_point> should be the final code point that has already been
10135  * placed into the node.  This value will be ignored except that under some
10136  * circumstances <*flagp> is set based on it.
10137  *
10138  * If <len> is zero, the function assumes that the node is to contain only
10139  * the single character given by <code_point> and calculates what <len>
10140  * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10141  * additionally will populate the node's STRING with <code_point>, if <len>
10142  * is 0.  In both cases <*flagp> is appropriately set
10143  *
10144  * It knows that under FOLD, the Latin Sharp S and UTF characters above
10145  * 255, must be folded (the former only when the rules indicate it can
10146  * match 'ss') */
10147
10148  bool len_passed_in = cBOOL(len != 0);
10149  U8 character[UTF8_MAXBYTES_CASE+1];
10150
10151  PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10152
10153  if (! len_passed_in) {
10154   if (UTF) {
10155    if (FOLD && (! LOC || code_point > 255)) {
10156     _to_uni_fold_flags(NATIVE_TO_UNI(code_point),
10157         character,
10158         &len,
10159         FOLD_FLAGS_FULL | ((LOC)
10160              ? FOLD_FLAGS_LOCALE
10161              : (ASCII_FOLD_RESTRICTED)
10162              ? FOLD_FLAGS_NOMIX_ASCII
10163              : 0));
10164    }
10165    else {
10166     uvchr_to_utf8( character, code_point);
10167     len = UTF8SKIP(character);
10168    }
10169   }
10170   else if (! FOLD
10171     || code_point != LATIN_SMALL_LETTER_SHARP_S
10172     || ASCII_FOLD_RESTRICTED
10173     || ! AT_LEAST_UNI_SEMANTICS)
10174   {
10175    *character = (U8) code_point;
10176    len = 1;
10177   }
10178   else {
10179    *character = 's';
10180    *(character + 1) = 's';
10181    len = 2;
10182   }
10183  }
10184
10185  if (SIZE_ONLY) {
10186   RExC_size += STR_SZ(len);
10187  }
10188  else {
10189   RExC_emit += STR_SZ(len);
10190   STR_LEN(node) = len;
10191   if (! len_passed_in) {
10192    Copy((char *) character, STRING(node), len, char);
10193   }
10194  }
10195
10196  *flagp |= HASWIDTH;
10197
10198  /* A single character node is SIMPLE, except for the special-cased SHARP S
10199  * under /di. */
10200  if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10201   && (code_point != LATIN_SMALL_LETTER_SHARP_S
10202    || ! FOLD || ! DEPENDS_SEMANTICS))
10203  {
10204   *flagp |= SIMPLE;
10205  }
10206 }
10207
10208 /*
10209  - regatom - the lowest level
10210
10211    Try to identify anything special at the start of the pattern. If there
10212    is, then handle it as required. This may involve generating a single regop,
10213    such as for an assertion; or it may involve recursing, such as to
10214    handle a () structure.
10215
10216    If the string doesn't start with something special then we gobble up
10217    as much literal text as we can.
10218
10219    Once we have been able to handle whatever type of thing started the
10220    sequence, we return.
10221
10222    Note: we have to be careful with escapes, as they can be both literal
10223    and special, and in the case of \10 and friends, context determines which.
10224
10225    A summary of the code structure is:
10226
10227    switch (first_byte) {
10228   cases for each special:
10229    handle this special;
10230    break;
10231   case '\\':
10232    switch (2nd byte) {
10233     cases for each unambiguous special:
10234      handle this special;
10235      break;
10236     cases for each ambigous special/literal:
10237      disambiguate;
10238      if (special)  handle here
10239      else goto defchar;
10240     default: // unambiguously literal:
10241      goto defchar;
10242    }
10243   default:  // is a literal char
10244    // FALL THROUGH
10245   defchar:
10246    create EXACTish node for literal;
10247    while (more input and node isn't full) {
10248     switch (input_byte) {
10249     cases for each special;
10250      make sure parse pointer is set so that the next call to
10251       regatom will see this special first
10252      goto loopdone; // EXACTish node terminated by prev. char
10253     default:
10254      append char to EXACTISH node;
10255     }
10256     get next input byte;
10257    }
10258   loopdone:
10259    }
10260    return the generated node;
10261
10262    Specifically there are two separate switches for handling
10263    escape sequences, with the one for handling literal escapes requiring
10264    a dummy entry for all of the special escapes that are actually handled
10265    by the other.
10266
10267    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10268    TRYAGAIN.
10269    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10270    restarted.
10271    Otherwise does not return NULL.
10272 */
10273
10274 STATIC regnode *
10275 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10276 {
10277  dVAR;
10278  regnode *ret = NULL;
10279  I32 flags = 0;
10280  char *parse_start = RExC_parse;
10281  U8 op;
10282  int invert = 0;
10283
10284  GET_RE_DEBUG_FLAGS_DECL;
10285
10286  *flagp = WORST;  /* Tentatively. */
10287
10288  DEBUG_PARSE("atom");
10289
10290  PERL_ARGS_ASSERT_REGATOM;
10291
10292 tryagain:
10293  switch ((U8)*RExC_parse) {
10294  case '^':
10295   RExC_seen_zerolen++;
10296   nextchar(pRExC_state);
10297   if (RExC_flags & RXf_PMf_MULTILINE)
10298    ret = reg_node(pRExC_state, MBOL);
10299   else if (RExC_flags & RXf_PMf_SINGLELINE)
10300    ret = reg_node(pRExC_state, SBOL);
10301   else
10302    ret = reg_node(pRExC_state, BOL);
10303   Set_Node_Length(ret, 1); /* MJD */
10304   break;
10305  case '$':
10306   nextchar(pRExC_state);
10307   if (*RExC_parse)
10308    RExC_seen_zerolen++;
10309   if (RExC_flags & RXf_PMf_MULTILINE)
10310    ret = reg_node(pRExC_state, MEOL);
10311   else if (RExC_flags & RXf_PMf_SINGLELINE)
10312    ret = reg_node(pRExC_state, SEOL);
10313   else
10314    ret = reg_node(pRExC_state, EOL);
10315   Set_Node_Length(ret, 1); /* MJD */
10316   break;
10317  case '.':
10318   nextchar(pRExC_state);
10319   if (RExC_flags & RXf_PMf_SINGLELINE)
10320    ret = reg_node(pRExC_state, SANY);
10321   else
10322    ret = reg_node(pRExC_state, REG_ANY);
10323   *flagp |= HASWIDTH|SIMPLE;
10324   RExC_naughty++;
10325   Set_Node_Length(ret, 1); /* MJD */
10326   break;
10327  case '[':
10328  {
10329   char * const oregcomp_parse = ++RExC_parse;
10330   ret = regclass(pRExC_state, flagp,depth+1,
10331      FALSE, /* means parse the whole char class */
10332      TRUE, /* allow multi-char folds */
10333      FALSE, /* don't silence non-portable warnings. */
10334      NULL);
10335   if (*RExC_parse != ']') {
10336    RExC_parse = oregcomp_parse;
10337    vFAIL("Unmatched [");
10338   }
10339   if (ret == NULL) {
10340    if (*flagp & RESTART_UTF8)
10341     return NULL;
10342    FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
10343     *flagp);
10344   }
10345   nextchar(pRExC_state);
10346   Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10347   break;
10348  }
10349  case '(':
10350   nextchar(pRExC_state);
10351   ret = reg(pRExC_state, 2, &flags,depth+1);
10352   if (ret == NULL) {
10353     if (flags & TRYAGAIN) {
10354      if (RExC_parse == RExC_end) {
10355       /* Make parent create an empty node if needed. */
10356       *flagp |= TRYAGAIN;
10357       return(NULL);
10358      }
10359      goto tryagain;
10360     }
10361     if (flags & RESTART_UTF8) {
10362      *flagp = RESTART_UTF8;
10363      return NULL;
10364     }
10365     FAIL2("panic: reg returned NULL to regatom, flags=%#X", flags);
10366   }
10367   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10368   break;
10369  case '|':
10370  case ')':
10371   if (flags & TRYAGAIN) {
10372    *flagp |= TRYAGAIN;
10373    return NULL;
10374   }
10375   vFAIL("Internal urp");
10376         /* Supposed to be caught earlier. */
10377   break;
10378  case '{':
10379   if (!regcurly(RExC_parse, FALSE)) {
10380    RExC_parse++;
10381    goto defchar;
10382   }
10383   /* FALL THROUGH */
10384  case '?':
10385  case '+':
10386  case '*':
10387   RExC_parse++;
10388   vFAIL("Quantifier follows nothing");
10389   break;
10390  case '\\':
10391   /* Special Escapes
10392
10393   This switch handles escape sequences that resolve to some kind
10394   of special regop and not to literal text. Escape sequnces that
10395   resolve to literal text are handled below in the switch marked
10396   "Literal Escapes".
10397
10398   Every entry in this switch *must* have a corresponding entry
10399   in the literal escape switch. However, the opposite is not
10400   required, as the default for this switch is to jump to the
10401   literal text handling code.
10402   */
10403   switch ((U8)*++RExC_parse) {
10404    U8 arg;
10405   /* Special Escapes */
10406   case 'A':
10407    RExC_seen_zerolen++;
10408    ret = reg_node(pRExC_state, SBOL);
10409    *flagp |= SIMPLE;
10410    goto finish_meta_pat;
10411   case 'G':
10412    ret = reg_node(pRExC_state, GPOS);
10413    RExC_seen |= REG_SEEN_GPOS;
10414    *flagp |= SIMPLE;
10415    goto finish_meta_pat;
10416   case 'K':
10417    RExC_seen_zerolen++;
10418    ret = reg_node(pRExC_state, KEEPS);
10419    *flagp |= SIMPLE;
10420    /* XXX:dmq : disabling in-place substitution seems to
10421    * be necessary here to avoid cases of memory corruption, as
10422    * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10423    */
10424    RExC_seen |= REG_SEEN_LOOKBEHIND;
10425    goto finish_meta_pat;
10426   case 'Z':
10427    ret = reg_node(pRExC_state, SEOL);
10428    *flagp |= SIMPLE;
10429    RExC_seen_zerolen++;  /* Do not optimize RE away */
10430    goto finish_meta_pat;
10431   case 'z':
10432    ret = reg_node(pRExC_state, EOS);
10433    *flagp |= SIMPLE;
10434    RExC_seen_zerolen++;  /* Do not optimize RE away */
10435    goto finish_meta_pat;
10436   case 'C':
10437    ret = reg_node(pRExC_state, CANY);
10438    RExC_seen |= REG_SEEN_CANY;
10439    *flagp |= HASWIDTH|SIMPLE;
10440    goto finish_meta_pat;
10441   case 'X':
10442    ret = reg_node(pRExC_state, CLUMP);
10443    *flagp |= HASWIDTH;
10444    goto finish_meta_pat;
10445
10446   case 'W':
10447    invert = 1;
10448    /* FALLTHROUGH */
10449   case 'w':
10450    arg = ANYOF_WORDCHAR;
10451    goto join_posix;
10452
10453   case 'b':
10454    RExC_seen_zerolen++;
10455    RExC_seen |= REG_SEEN_LOOKBEHIND;
10456    op = BOUND + get_regex_charset(RExC_flags);
10457    if (op > BOUNDA) {  /* /aa is same as /a */
10458     op = BOUNDA;
10459    }
10460    ret = reg_node(pRExC_state, op);
10461    FLAGS(ret) = get_regex_charset(RExC_flags);
10462    *flagp |= SIMPLE;
10463    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10464     ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10465    }
10466    goto finish_meta_pat;
10467   case 'B':
10468    RExC_seen_zerolen++;
10469    RExC_seen |= REG_SEEN_LOOKBEHIND;
10470    op = NBOUND + get_regex_charset(RExC_flags);
10471    if (op > NBOUNDA) { /* /aa is same as /a */
10472     op = NBOUNDA;
10473    }
10474    ret = reg_node(pRExC_state, op);
10475    FLAGS(ret) = get_regex_charset(RExC_flags);
10476    *flagp |= SIMPLE;
10477    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10478     ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10479    }
10480    goto finish_meta_pat;
10481
10482   case 'D':
10483    invert = 1;
10484    /* FALLTHROUGH */
10485   case 'd':
10486    arg = ANYOF_DIGIT;
10487    goto join_posix;
10488
10489   case 'R':
10490    ret = reg_node(pRExC_state, LNBREAK);
10491    *flagp |= HASWIDTH|SIMPLE;
10492    goto finish_meta_pat;
10493
10494   case 'H':
10495    invert = 1;
10496    /* FALLTHROUGH */
10497   case 'h':
10498    arg = ANYOF_BLANK;
10499    op = POSIXU;
10500    goto join_posix_op_known;
10501
10502   case 'V':
10503    invert = 1;
10504    /* FALLTHROUGH */
10505   case 'v':
10506    arg = ANYOF_VERTWS;
10507    op = POSIXU;
10508    goto join_posix_op_known;
10509
10510   case 'S':
10511    invert = 1;
10512    /* FALLTHROUGH */
10513   case 's':
10514    arg = ANYOF_SPACE;
10515
10516   join_posix:
10517
10518    op = POSIXD + get_regex_charset(RExC_flags);
10519    if (op > POSIXA) {  /* /aa is same as /a */
10520     op = POSIXA;
10521    }
10522
10523   join_posix_op_known:
10524
10525    if (invert) {
10526     op += NPOSIXD - POSIXD;
10527    }
10528
10529    ret = reg_node(pRExC_state, op);
10530    if (! SIZE_ONLY) {
10531     FLAGS(ret) = namedclass_to_classnum(arg);
10532    }
10533
10534    *flagp |= HASWIDTH|SIMPLE;
10535    /* FALL THROUGH */
10536
10537   finish_meta_pat:
10538    nextchar(pRExC_state);
10539    Set_Node_Length(ret, 2); /* MJD */
10540    break;
10541   case 'p':
10542   case 'P':
10543    {
10544 #ifdef DEBUGGING
10545     char* parse_start = RExC_parse - 2;
10546 #endif
10547
10548     RExC_parse--;
10549
10550     ret = regclass(pRExC_state, flagp,depth+1,
10551        TRUE, /* means just parse this element */
10552        FALSE, /* don't allow multi-char folds */
10553        FALSE, /* don't silence non-portable warnings.
10554           It would be a bug if these returned
10555           non-portables */
10556        NULL);
10557     /* regclass() can only return RESTART_UTF8 if multi-char folds
10558     are allowed.  */
10559     if (!ret)
10560      FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
10561       *flagp);
10562
10563     RExC_parse--;
10564
10565     Set_Node_Offset(ret, parse_start + 2);
10566     Set_Node_Cur_Length(ret);
10567     nextchar(pRExC_state);
10568    }
10569    break;
10570   case 'N':
10571    /* Handle \N and \N{NAME} with multiple code points here and not
10572    * below because it can be multicharacter. join_exact() will join
10573    * them up later on.  Also this makes sure that things like
10574    * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10575    * The options to the grok function call causes it to fail if the
10576    * sequence is just a single code point.  We then go treat it as
10577    * just another character in the current EXACT node, and hence it
10578    * gets uniform treatment with all the other characters.  The
10579    * special treatment for quantifiers is not needed for such single
10580    * character sequences */
10581    ++RExC_parse;
10582    if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10583         FALSE /* not strict */ )) {
10584     if (*flagp & RESTART_UTF8)
10585      return NULL;
10586     RExC_parse--;
10587     goto defchar;
10588    }
10589    break;
10590   case 'k':    /* Handle \k<NAME> and \k'NAME' */
10591   parse_named_seq:
10592   {
10593    char ch= RExC_parse[1];
10594    if (ch != '<' && ch != '\'' && ch != '{') {
10595     RExC_parse++;
10596     vFAIL2("Sequence %.2s... not terminated",parse_start);
10597    } else {
10598     /* this pretty much dupes the code for (?P=...) in reg(), if
10599     you change this make sure you change that */
10600     char* name_start = (RExC_parse += 2);
10601     U32 num = 0;
10602     SV *sv_dat = reg_scan_name(pRExC_state,
10603      SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10604     ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10605     if (RExC_parse == name_start || *RExC_parse != ch)
10606      vFAIL2("Sequence %.3s... not terminated",parse_start);
10607
10608     if (!SIZE_ONLY) {
10609      num = add_data( pRExC_state, 1, "S" );
10610      RExC_rxi->data->data[num]=(void*)sv_dat;
10611      SvREFCNT_inc_simple_void(sv_dat);
10612     }
10613
10614     RExC_sawback = 1;
10615     ret = reganode(pRExC_state,
10616        ((! FOLD)
10617         ? NREF
10618         : (ASCII_FOLD_RESTRICTED)
10619         ? NREFFA
10620         : (AT_LEAST_UNI_SEMANTICS)
10621          ? NREFFU
10622          : (LOC)
10623          ? NREFFL
10624          : NREFF),
10625         num);
10626     *flagp |= HASWIDTH;
10627
10628     /* override incorrect value set in reganode MJD */
10629     Set_Node_Offset(ret, parse_start+1);
10630     Set_Node_Cur_Length(ret); /* MJD */
10631     nextchar(pRExC_state);
10632
10633    }
10634    break;
10635   }
10636   case 'g':
10637   case '1': case '2': case '3': case '4':
10638   case '5': case '6': case '7': case '8': case '9':
10639    {
10640     I32 num;
10641     bool isg = *RExC_parse == 'g';
10642     bool isrel = 0;
10643     bool hasbrace = 0;
10644     if (isg) {
10645      RExC_parse++;
10646      if (*RExC_parse == '{') {
10647       RExC_parse++;
10648       hasbrace = 1;
10649      }
10650      if (*RExC_parse == '-') {
10651       RExC_parse++;
10652       isrel = 1;
10653      }
10654      if (hasbrace && !isDIGIT(*RExC_parse)) {
10655       if (isrel) RExC_parse--;
10656       RExC_parse -= 2;
10657       goto parse_named_seq;
10658     }   }
10659     num = atoi(RExC_parse);
10660     if (isg && num == 0)
10661      vFAIL("Reference to invalid group 0");
10662     if (isrel) {
10663      num = RExC_npar - num;
10664      if (num < 1)
10665       vFAIL("Reference to nonexistent or unclosed group");
10666     }
10667     if (!isg && num > 9 && num >= RExC_npar)
10668      /* Probably a character specified in octal, e.g. \35 */
10669      goto defchar;
10670     else {
10671      char * const parse_start = RExC_parse - 1; /* MJD */
10672      while (isDIGIT(*RExC_parse))
10673       RExC_parse++;
10674      if (parse_start == RExC_parse - 1)
10675       vFAIL("Unterminated \\g... pattern");
10676      if (hasbrace) {
10677       if (*RExC_parse != '}')
10678        vFAIL("Unterminated \\g{...} pattern");
10679       RExC_parse++;
10680      }
10681      if (!SIZE_ONLY) {
10682       if (num > (I32)RExC_rx->nparens)
10683        vFAIL("Reference to nonexistent group");
10684      }
10685      RExC_sawback = 1;
10686      ret = reganode(pRExC_state,
10687         ((! FOLD)
10688          ? REF
10689          : (ASCII_FOLD_RESTRICTED)
10690          ? REFFA
10691          : (AT_LEAST_UNI_SEMANTICS)
10692           ? REFFU
10693           : (LOC)
10694           ? REFFL
10695           : REFF),
10696          num);
10697      *flagp |= HASWIDTH;
10698
10699      /* override incorrect value set in reganode MJD */
10700      Set_Node_Offset(ret, parse_start+1);
10701      Set_Node_Cur_Length(ret); /* MJD */
10702      RExC_parse--;
10703      nextchar(pRExC_state);
10704     }
10705    }
10706    break;
10707   case '\0':
10708    if (RExC_parse >= RExC_end)
10709     FAIL("Trailing \\");
10710    /* FALL THROUGH */
10711   default:
10712    /* Do not generate "unrecognized" warnings here, we fall
10713    back into the quick-grab loop below */
10714    parse_start--;
10715    goto defchar;
10716   }
10717   break;
10718
10719  case '#':
10720   if (RExC_flags & RXf_PMf_EXTENDED) {
10721    if ( reg_skipcomment( pRExC_state ) )
10722     goto tryagain;
10723   }
10724   /* FALL THROUGH */
10725
10726  default:
10727
10728    parse_start = RExC_parse - 1;
10729
10730    RExC_parse++;
10731
10732   defchar: {
10733    STRLEN len = 0;
10734    UV ender;
10735    char *p;
10736    char *s;
10737 #define MAX_NODE_STRING_SIZE 127
10738    char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10739    char *s0;
10740    U8 upper_parse = MAX_NODE_STRING_SIZE;
10741    STRLEN foldlen;
10742    U8 node_type;
10743    bool next_is_quantifier;
10744    char * oldp = NULL;
10745
10746    /* If a folding node contains only code points that don't
10747    * participate in folds, it can be changed into an EXACT node,
10748    * which allows the optimizer more things to look for */
10749    bool maybe_exact;
10750
10751    ender = 0;
10752    node_type = compute_EXACTish(pRExC_state);
10753    ret = reg_node(pRExC_state, node_type);
10754
10755    /* In pass1, folded, we use a temporary buffer instead of the
10756    * actual node, as the node doesn't exist yet */
10757    s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10758
10759    s0 = s;
10760
10761   reparse:
10762
10763    /* We do the EXACTFish to EXACT node only if folding, and not if in
10764    * locale, as whether a character folds or not isn't known until
10765    * runtime */
10766    maybe_exact = FOLD && ! LOC;
10767
10768    /* XXX The node can hold up to 255 bytes, yet this only goes to
10769    * 127.  I (khw) do not know why.  Keeping it somewhat less than
10770    * 255 allows us to not have to worry about overflow due to
10771    * converting to utf8 and fold expansion, but that value is
10772    * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10773    * split up by this limit into a single one using the real max of
10774    * 255.  Even at 127, this breaks under rare circumstances.  If
10775    * folding, we do not want to split a node at a character that is a
10776    * non-final in a multi-char fold, as an input string could just
10777    * happen to want to match across the node boundary.  The join
10778    * would solve that problem if the join actually happens.  But a
10779    * series of more than two nodes in a row each of 127 would cause
10780    * the first join to succeed to get to 254, but then there wouldn't
10781    * be room for the next one, which could at be one of those split
10782    * multi-char folds.  I don't know of any fool-proof solution.  One
10783    * could back off to end with only a code point that isn't such a
10784    * non-final, but it is possible for there not to be any in the
10785    * entire node. */
10786    for (p = RExC_parse - 1;
10787     len < upper_parse && p < RExC_end;
10788     len++)
10789    {
10790     oldp = p;
10791
10792     if (RExC_flags & RXf_PMf_EXTENDED)
10793      p = regwhite( pRExC_state, p );
10794     switch ((U8)*p) {
10795     case '^':
10796     case '$':
10797     case '.':
10798     case '[':
10799     case '(':
10800     case ')':
10801     case '|':
10802      goto loopdone;
10803     case '\\':
10804      /* Literal Escapes Switch
10805
10806      This switch is meant to handle escape sequences that
10807      resolve to a literal character.
10808
10809      Every escape sequence that represents something
10810      else, like an assertion or a char class, is handled
10811      in the switch marked 'Special Escapes' above in this
10812      routine, but also has an entry here as anything that
10813      isn't explicitly mentioned here will be treated as
10814      an unescaped equivalent literal.
10815      */
10816
10817      switch ((U8)*++p) {
10818      /* These are all the special escapes. */
10819      case 'A':             /* Start assertion */
10820      case 'b': case 'B':   /* Word-boundary assertion*/
10821      case 'C':             /* Single char !DANGEROUS! */
10822      case 'd': case 'D':   /* digit class */
10823      case 'g': case 'G':   /* generic-backref, pos assertion */
10824      case 'h': case 'H':   /* HORIZWS */
10825      case 'k': case 'K':   /* named backref, keep marker */
10826      case 'p': case 'P':   /* Unicode property */
10827        case 'R':   /* LNBREAK */
10828      case 's': case 'S':   /* space class */
10829      case 'v': case 'V':   /* VERTWS */
10830      case 'w': case 'W':   /* word class */
10831      case 'X':             /* eXtended Unicode "combining character sequence" */
10832      case 'z': case 'Z':   /* End of line/string assertion */
10833       --p;
10834       goto loopdone;
10835
10836      /* Anything after here is an escape that resolves to a
10837      literal. (Except digits, which may or may not)
10838      */
10839      case 'n':
10840       ender = '\n';
10841       p++;
10842       break;
10843      case 'N': /* Handle a single-code point named character. */
10844       /* The options cause it to fail if a multiple code
10845       * point sequence.  Handle those in the switch() above
10846       * */
10847       RExC_parse = p + 1;
10848       if (! grok_bslash_N(pRExC_state, NULL, &ender,
10849            flagp, depth, FALSE,
10850            FALSE /* not strict */ ))
10851       {
10852        if (*flagp & RESTART_UTF8)
10853         FAIL("panic: grok_bslash_N set RESTART_UTF8");
10854        RExC_parse = p = oldp;
10855        goto loopdone;
10856       }
10857       p = RExC_parse;
10858       if (ender > 0xff) {
10859        REQUIRE_UTF8;
10860       }
10861       break;
10862      case 'r':
10863       ender = '\r';
10864       p++;
10865       break;
10866      case 't':
10867       ender = '\t';
10868       p++;
10869       break;
10870      case 'f':
10871       ender = '\f';
10872       p++;
10873       break;
10874      case 'e':
10875       ender = ASCII_TO_NATIVE('\033');
10876       p++;
10877       break;
10878      case 'a':
10879       ender = ASCII_TO_NATIVE('\007');
10880       p++;
10881       break;
10882      case 'o':
10883       {
10884        UV result;
10885        const char* error_msg;
10886
10887        bool valid = grok_bslash_o(&p,
10888              &result,
10889              &error_msg,
10890              TRUE, /* out warnings */
10891              FALSE, /* not strict */
10892              TRUE, /* Output warnings
10893                 for non-
10894                 portables */
10895              UTF);
10896        if (! valid) {
10897         RExC_parse = p; /* going to die anyway; point
10898             to exact spot of failure */
10899         vFAIL(error_msg);
10900        }
10901        ender = result;
10902        if (PL_encoding && ender < 0x100) {
10903         goto recode_encoding;
10904        }
10905        if (ender > 0xff) {
10906         REQUIRE_UTF8;
10907        }
10908        break;
10909       }
10910      case 'x':
10911       {
10912        UV result = UV_MAX; /* initialize to erroneous
10913             value */
10914        const char* error_msg;
10915
10916        bool valid = grok_bslash_x(&p,
10917              &result,
10918              &error_msg,
10919              TRUE, /* out warnings */
10920              FALSE, /* not strict */
10921              TRUE, /* Output warnings
10922                 for non-
10923                 portables */
10924              UTF);
10925        if (! valid) {
10926         RExC_parse = p; /* going to die anyway; point
10927             to exact spot of failure */
10928         vFAIL(error_msg);
10929        }
10930        ender = result;
10931
10932        if (PL_encoding && ender < 0x100) {
10933         goto recode_encoding;
10934        }
10935        if (ender > 0xff) {
10936         REQUIRE_UTF8;
10937        }
10938        break;
10939       }
10940      case 'c':
10941       p++;
10942       ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10943       break;
10944      case '0': case '1': case '2': case '3':case '4':
10945      case '5': case '6': case '7':
10946       if (*p == '0' ||
10947        (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10948       {
10949        I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10950        STRLEN numlen = 3;
10951        ender = grok_oct(p, &numlen, &flags, NULL);
10952        if (ender > 0xff) {
10953         REQUIRE_UTF8;
10954        }
10955        p += numlen;
10956        if (SIZE_ONLY   /* like \08, \178 */
10957         && numlen < 3
10958         && p < RExC_end
10959         && isDIGIT(*p) && ckWARN(WARN_REGEXP))
10960        {
10961         reg_warn_non_literal_string(
10962           p + 1,
10963           form_short_octal_warning(p, numlen));
10964        }
10965       }
10966       else {  /* Not to be treated as an octal constant, go
10967         find backref */
10968        --p;
10969        goto loopdone;
10970       }
10971       if (PL_encoding && ender < 0x100)
10972        goto recode_encoding;
10973       break;
10974      case '8': case '9': /* These are illegal unless backrefs */
10975       if (atoi(p) <= RExC_npar) {
10976        --p;   /* backup to backslash; handle as backref */
10977        goto loopdone;
10978       }
10979       goto unrecognized;
10980      recode_encoding:
10981       if (! RExC_override_recoding) {
10982        SV* enc = PL_encoding;
10983        ender = reg_recode((const char)(U8)ender, &enc);
10984        if (!enc && SIZE_ONLY)
10985         ckWARNreg(p, "Invalid escape in the specified encoding");
10986        REQUIRE_UTF8;
10987       }
10988       break;
10989      case '\0':
10990       if (p >= RExC_end)
10991        FAIL("Trailing \\");
10992       /* FALL THROUGH */
10993      default:
10994      unrecognized:
10995       if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
10996        /* Include any { following the alpha to emphasize
10997        * that it could be part of an escape at some point
10998        * in the future */
10999        int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11000        ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11001       }
11002       goto normal_default;
11003      } /* End of switch on '\' */
11004      break;
11005     default:    /* A literal character */
11006
11007      if (! SIZE_ONLY
11008       && RExC_flags & RXf_PMf_EXTENDED
11009       && ckWARN(WARN_DEPRECATED)
11010       && is_PATWS_non_low(p, UTF))
11011      {
11012       vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11013         "Escape literal pattern white space under /x");
11014      }
11015
11016     normal_default:
11017      if (UTF8_IS_START(*p) && UTF) {
11018       STRLEN numlen;
11019       ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11020            &numlen, UTF8_ALLOW_DEFAULT);
11021       p += numlen;
11022      }
11023      else
11024       ender = (U8) *p++;
11025      break;
11026     } /* End of switch on the literal */
11027
11028     /* Here, have looked at the literal character and <ender>
11029     * contains its ordinal, <p> points to the character after it
11030     */
11031
11032     if ( RExC_flags & RXf_PMf_EXTENDED)
11033      p = regwhite( pRExC_state, p );
11034
11035     /* If the next thing is a quantifier, it applies to this
11036     * character only, which means that this character has to be in
11037     * its own node and can't just be appended to the string in an
11038     * existing node, so if there are already other characters in
11039     * the node, close the node with just them, and set up to do
11040     * this character again next time through, when it will be the
11041     * only thing in its new node */
11042     if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11043     {
11044      p = oldp;
11045      goto loopdone;
11046     }
11047
11048     if (FOLD) {
11049      if (UTF
11050        /* See comments for join_exact() as to why we fold
11051        * this non-UTF at compile time */
11052       || (node_type == EXACTFU
11053        && ender == LATIN_SMALL_LETTER_SHARP_S))
11054      {
11055
11056
11057       /* Prime the casefolded buffer.  Locale rules, which
11058       * apply only to code points < 256, aren't known until
11059       * execution, so for them, just output the original
11060       * character using utf8.  If we start to fold non-UTF
11061       * patterns, be sure to update join_exact() */
11062       if (LOC && ender < 256) {
11063        if (UNI_IS_INVARIANT(ender)) {
11064         *s = (U8) ender;
11065         foldlen = 1;
11066        } else {
11067         *s = UTF8_TWO_BYTE_HI(ender);
11068         *(s + 1) = UTF8_TWO_BYTE_LO(ender);
11069         foldlen = 2;
11070        }
11071       }
11072       else {
11073        UV folded = _to_uni_fold_flags(
11074           ender,
11075           (U8 *) s,
11076           &foldlen,
11077           FOLD_FLAGS_FULL
11078           | ((LOC) ?  FOLD_FLAGS_LOCALE
11079              : (ASCII_FOLD_RESTRICTED)
11080              ? FOLD_FLAGS_NOMIX_ASCII
11081              : 0)
11082            );
11083
11084        /* If this node only contains non-folding code
11085        * points so far, see if this new one is also
11086        * non-folding */
11087        if (maybe_exact) {
11088         if (folded != ender) {
11089          maybe_exact = FALSE;
11090         }
11091         else {
11092          /* Here the fold is the original; we have
11093          * to check further to see if anything
11094          * folds to it */
11095          if (! PL_utf8_foldable) {
11096           SV* swash = swash_init("utf8",
11097               "_Perl_Any_Folds",
11098               &PL_sv_undef, 1, 0);
11099           PL_utf8_foldable =
11100              _get_swash_invlist(swash);
11101           SvREFCNT_dec_NN(swash);
11102          }
11103          if (_invlist_contains_cp(PL_utf8_foldable,
11104                ender))
11105          {
11106           maybe_exact = FALSE;
11107          }
11108         }
11109        }
11110        ender = folded;
11111       }
11112       s += foldlen;
11113
11114       /* The loop increments <len> each time, as all but this
11115       * path (and the one just below for UTF) through it add
11116       * a single byte to the EXACTish node.  But this one
11117       * has changed len to be the correct final value, so
11118       * subtract one to cancel out the increment that
11119       * follows */
11120       len += foldlen - 1;
11121      }
11122      else {
11123       *(s++) = (char) ender;
11124       maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
11125      }
11126     }
11127     else if (UTF) {
11128      const STRLEN unilen = reguni(pRExC_state, ender, s);
11129      if (unilen > 0) {
11130      s   += unilen;
11131      len += unilen;
11132      }
11133
11134      /* See comment just above for - 1 */
11135      len--;
11136     }
11137     else {
11138      REGC((char)ender, s++);
11139     }
11140
11141     if (next_is_quantifier) {
11142
11143      /* Here, the next input is a quantifier, and to get here,
11144      * the current character is the only one in the node.
11145      * Also, here <len> doesn't include the final byte for this
11146      * character */
11147      len++;
11148      goto loopdone;
11149     }
11150
11151    } /* End of loop through literal characters */
11152
11153    /* Here we have either exhausted the input or ran out of room in
11154    * the node.  (If we encountered a character that can't be in the
11155    * node, transfer is made directly to <loopdone>, and so we
11156    * wouldn't have fallen off the end of the loop.)  In the latter
11157    * case, we artificially have to split the node into two, because
11158    * we just don't have enough space to hold everything.  This
11159    * creates a problem if the final character participates in a
11160    * multi-character fold in the non-final position, as a match that
11161    * should have occurred won't, due to the way nodes are matched,
11162    * and our artificial boundary.  So back off until we find a non-
11163    * problematic character -- one that isn't at the beginning or
11164    * middle of such a fold.  (Either it doesn't participate in any
11165    * folds, or appears only in the final position of all the folds it
11166    * does participate in.)  A better solution with far fewer false
11167    * positives, and that would fill the nodes more completely, would
11168    * be to actually have available all the multi-character folds to
11169    * test against, and to back-off only far enough to be sure that
11170    * this node isn't ending with a partial one.  <upper_parse> is set
11171    * further below (if we need to reparse the node) to include just
11172    * up through that final non-problematic character that this code
11173    * identifies, so when it is set to less than the full node, we can
11174    * skip the rest of this */
11175    if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11176
11177     const STRLEN full_len = len;
11178
11179     assert(len >= MAX_NODE_STRING_SIZE);
11180
11181     /* Here, <s> points to the final byte of the final character.
11182     * Look backwards through the string until find a non-
11183     * problematic character */
11184
11185     if (! UTF) {
11186
11187      /* These two have no multi-char folds to non-UTF characters
11188      */
11189      if (ASCII_FOLD_RESTRICTED || LOC) {
11190       goto loopdone;
11191      }
11192
11193      while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11194      len = s - s0 + 1;
11195     }
11196     else {
11197      if (!  PL_NonL1NonFinalFold) {
11198       PL_NonL1NonFinalFold = _new_invlist_C_array(
11199           NonL1_Perl_Non_Final_Folds_invlist);
11200      }
11201
11202      /* Point to the first byte of the final character */
11203      s = (char *) utf8_hop((U8 *) s, -1);
11204
11205      while (s >= s0) {   /* Search backwards until find
11206           non-problematic char */
11207       if (UTF8_IS_INVARIANT(*s)) {
11208
11209        /* There are no ascii characters that participate
11210        * in multi-char folds under /aa.  In EBCDIC, the
11211        * non-ascii invariants are all control characters,
11212        * so don't ever participate in any folds. */
11213        if (ASCII_FOLD_RESTRICTED
11214         || ! IS_NON_FINAL_FOLD(*s))
11215        {
11216         break;
11217        }
11218       }
11219       else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11220
11221        /* No Latin1 characters participate in multi-char
11222        * folds under /l */
11223        if (LOC
11224         || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
11225                 *s, *(s+1))))
11226        {
11227         break;
11228        }
11229       }
11230       else if (! _invlist_contains_cp(
11231           PL_NonL1NonFinalFold,
11232           valid_utf8_to_uvchr((U8 *) s, NULL)))
11233       {
11234        break;
11235       }
11236
11237       /* Here, the current character is problematic in that
11238       * it does occur in the non-final position of some
11239       * fold, so try the character before it, but have to
11240       * special case the very first byte in the string, so
11241       * we don't read outside the string */
11242       s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11243      } /* End of loop backwards through the string */
11244
11245      /* If there were only problematic characters in the string,
11246      * <s> will point to before s0, in which case the length
11247      * should be 0, otherwise include the length of the
11248      * non-problematic character just found */
11249      len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11250     }
11251
11252     /* Here, have found the final character, if any, that is
11253     * non-problematic as far as ending the node without splitting
11254     * it across a potential multi-char fold.  <len> contains the
11255     * number of bytes in the node up-to and including that
11256     * character, or is 0 if there is no such character, meaning
11257     * the whole node contains only problematic characters.  In
11258     * this case, give up and just take the node as-is.  We can't
11259     * do any better */
11260     if (len == 0) {
11261      len = full_len;
11262     } else {
11263
11264      /* Here, the node does contain some characters that aren't
11265      * problematic.  If one such is the final character in the
11266      * node, we are done */
11267      if (len == full_len) {
11268       goto loopdone;
11269      }
11270      else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11271
11272       /* If the final character is problematic, but the
11273       * penultimate is not, back-off that last character to
11274       * later start a new node with it */
11275       p = oldp;
11276       goto loopdone;
11277      }
11278
11279      /* Here, the final non-problematic character is earlier
11280      * in the input than the penultimate character.  What we do
11281      * is reparse from the beginning, going up only as far as
11282      * this final ok one, thus guaranteeing that the node ends
11283      * in an acceptable character.  The reason we reparse is
11284      * that we know how far in the character is, but we don't
11285      * know how to correlate its position with the input parse.
11286      * An alternate implementation would be to build that
11287      * correlation as we go along during the original parse,
11288      * but that would entail extra work for every node, whereas
11289      * this code gets executed only when the string is too
11290      * large for the node, and the final two characters are
11291      * problematic, an infrequent occurrence.  Yet another
11292      * possible strategy would be to save the tail of the
11293      * string, and the next time regatom is called, initialize
11294      * with that.  The problem with this is that unless you
11295      * back off one more character, you won't be guaranteed
11296      * regatom will get called again, unless regbranch,
11297      * regpiece ... are also changed.  If you do back off that
11298      * extra character, so that there is input guaranteed to
11299      * force calling regatom, you can't handle the case where
11300      * just the first character in the node is acceptable.  I
11301      * (khw) decided to try this method which doesn't have that
11302      * pitfall; if performance issues are found, we can do a
11303      * combination of the current approach plus that one */
11304      upper_parse = len;
11305      len = 0;
11306      s = s0;
11307      goto reparse;
11308     }
11309    }   /* End of verifying node ends with an appropriate char */
11310
11311   loopdone:   /* Jumped to when encounters something that shouldn't be in
11312      the node */
11313
11314    /* If 'maybe_exact' is still set here, means there are no
11315    * code points in the node that participate in folds */
11316    if (FOLD && maybe_exact) {
11317     OP(ret) = EXACT;
11318    }
11319
11320    /* I (khw) don't know if you can get here with zero length, but the
11321    * old code handled this situation by creating a zero-length EXACT
11322    * node.  Might as well be NOTHING instead */
11323    if (len == 0) {
11324     OP(ret) = NOTHING;
11325    }
11326    else{
11327     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11328    }
11329
11330    RExC_parse = p - 1;
11331    Set_Node_Cur_Length(ret); /* MJD */
11332    nextchar(pRExC_state);
11333    {
11334     /* len is STRLEN which is unsigned, need to copy to signed */
11335     IV iv = len;
11336     if (iv < 0)
11337      vFAIL("Internal disaster");
11338    }
11339
11340   } /* End of label 'defchar:' */
11341   break;
11342  } /* End of giant switch on input character */
11343
11344  return(ret);
11345 }
11346
11347 STATIC char *
11348 S_regwhite( RExC_state_t *pRExC_state, char *p )
11349 {
11350  const char *e = RExC_end;
11351
11352  PERL_ARGS_ASSERT_REGWHITE;
11353
11354  while (p < e) {
11355   if (isSPACE(*p))
11356    ++p;
11357   else if (*p == '#') {
11358    bool ended = 0;
11359    do {
11360     if (*p++ == '\n') {
11361      ended = 1;
11362      break;
11363     }
11364    } while (p < e);
11365    if (!ended)
11366     RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11367   }
11368   else
11369    break;
11370  }
11371  return p;
11372 }
11373
11374 STATIC char *
11375 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11376 {
11377  /* Returns the next non-pattern-white space, non-comment character (the
11378  * latter only if 'recognize_comment is true) in the string p, which is
11379  * ended by RExC_end.  If there is no line break ending a comment,
11380  * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11381  const char *e = RExC_end;
11382
11383  PERL_ARGS_ASSERT_REGPATWS;
11384
11385  while (p < e) {
11386   STRLEN len;
11387   if ((len = is_PATWS_safe(p, e, UTF))) {
11388    p += len;
11389   }
11390   else if (recognize_comment && *p == '#') {
11391    bool ended = 0;
11392    do {
11393     p++;
11394     if (is_LNBREAK_safe(p, e, UTF)) {
11395      ended = 1;
11396      break;
11397     }
11398    } while (p < e);
11399    if (!ended)
11400     RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11401   }
11402   else
11403    break;
11404  }
11405  return p;
11406 }
11407
11408 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11409    Character classes ([:foo:]) can also be negated ([:^foo:]).
11410    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11411    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11412    but trigger failures because they are currently unimplemented. */
11413
11414 #define POSIXCC_DONE(c)   ((c) == ':')
11415 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11416 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11417
11418 PERL_STATIC_INLINE I32
11419 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
11420 {
11421  dVAR;
11422  I32 namedclass = OOB_NAMEDCLASS;
11423
11424  PERL_ARGS_ASSERT_REGPPOSIXCC;
11425
11426  if (value == '[' && RExC_parse + 1 < RExC_end &&
11427   /* I smell either [: or [= or [. -- POSIX has been here, right? */
11428   POSIXCC(UCHARAT(RExC_parse)))
11429  {
11430   const char c = UCHARAT(RExC_parse);
11431   char* const s = RExC_parse++;
11432
11433   while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11434    RExC_parse++;
11435   if (RExC_parse == RExC_end) {
11436    if (strict) {
11437
11438     /* Try to give a better location for the error (than the end of
11439     * the string) by looking for the matching ']' */
11440     RExC_parse = s;
11441     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11442      RExC_parse++;
11443     }
11444     vFAIL2("Unmatched '%c' in POSIX class", c);
11445    }
11446    /* Grandfather lone [:, [=, [. */
11447    RExC_parse = s;
11448   }
11449   else {
11450    const char* const t = RExC_parse++; /* skip over the c */
11451    assert(*t == c);
11452
11453    if (UCHARAT(RExC_parse) == ']') {
11454     const char *posixcc = s + 1;
11455     RExC_parse++; /* skip over the ending ] */
11456
11457     if (*s == ':') {
11458      const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11459      const I32 skip = t - posixcc;
11460
11461      /* Initially switch on the length of the name.  */
11462      switch (skip) {
11463      case 4:
11464       if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11465               this is the Perl \w
11466               */
11467        namedclass = ANYOF_WORDCHAR;
11468       break;
11469      case 5:
11470       /* Names all of length 5.  */
11471       /* alnum alpha ascii blank cntrl digit graph lower
11472       print punct space upper  */
11473       /* Offset 4 gives the best switch position.  */
11474       switch (posixcc[4]) {
11475       case 'a':
11476        if (memEQ(posixcc, "alph", 4)) /* alpha */
11477         namedclass = ANYOF_ALPHA;
11478        break;
11479       case 'e':
11480        if (memEQ(posixcc, "spac", 4)) /* space */
11481         namedclass = ANYOF_PSXSPC;
11482        break;
11483       case 'h':
11484        if (memEQ(posixcc, "grap", 4)) /* graph */
11485         namedclass = ANYOF_GRAPH;
11486        break;
11487       case 'i':
11488        if (memEQ(posixcc, "asci", 4)) /* ascii */
11489         namedclass = ANYOF_ASCII;
11490        break;
11491       case 'k':
11492        if (memEQ(posixcc, "blan", 4)) /* blank */
11493         namedclass = ANYOF_BLANK;
11494        break;
11495       case 'l':
11496        if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11497         namedclass = ANYOF_CNTRL;
11498        break;
11499       case 'm':
11500        if (memEQ(posixcc, "alnu", 4)) /* alnum */
11501         namedclass = ANYOF_ALPHANUMERIC;
11502        break;
11503       case 'r':
11504        if (memEQ(posixcc, "lowe", 4)) /* lower */
11505         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11506        else if (memEQ(posixcc, "uppe", 4)) /* upper */
11507         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11508        break;
11509       case 't':
11510        if (memEQ(posixcc, "digi", 4)) /* digit */
11511         namedclass = ANYOF_DIGIT;
11512        else if (memEQ(posixcc, "prin", 4)) /* print */
11513         namedclass = ANYOF_PRINT;
11514        else if (memEQ(posixcc, "punc", 4)) /* punct */
11515         namedclass = ANYOF_PUNCT;
11516        break;
11517       }
11518       break;
11519      case 6:
11520       if (memEQ(posixcc, "xdigit", 6))
11521        namedclass = ANYOF_XDIGIT;
11522       break;
11523      }
11524
11525      if (namedclass == OOB_NAMEDCLASS)
11526       Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11527          t - s - 1, s + 1);
11528
11529      /* The #defines are structured so each complement is +1 to
11530      * the normal one */
11531      if (complement) {
11532       namedclass++;
11533      }
11534      assert (posixcc[skip] == ':');
11535      assert (posixcc[skip+1] == ']');
11536     } else if (!SIZE_ONLY) {
11537      /* [[=foo=]] and [[.foo.]] are still future. */
11538
11539      /* adjust RExC_parse so the warning shows after
11540      the class closes */
11541      while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11542       RExC_parse++;
11543      vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11544     }
11545    } else {
11546     /* Maternal grandfather:
11547     * "[:" ending in ":" but not in ":]" */
11548     if (strict) {
11549      vFAIL("Unmatched '[' in POSIX class");
11550     }
11551
11552     /* Grandfather lone [:, [=, [. */
11553     RExC_parse = s;
11554    }
11555   }
11556  }
11557
11558  return namedclass;
11559 }
11560
11561 STATIC bool
11562 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
11563 {
11564  /* This applies some heuristics at the current parse position (which should
11565  * be at a '[') to see if what follows might be intended to be a [:posix:]
11566  * class.  It returns true if it really is a posix class, of course, but it
11567  * also can return true if it thinks that what was intended was a posix
11568  * class that didn't quite make it.
11569  *
11570  * It will return true for
11571  *      [:alphanumerics:
11572  *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
11573  *                         ')' indicating the end of the (?[
11574  *      [:any garbage including %^&$ punctuation:]
11575  *
11576  * This is designed to be called only from S_handle_regex_sets; it could be
11577  * easily adapted to be called from the spot at the beginning of regclass()
11578  * that checks to see in a normal bracketed class if the surrounding []
11579  * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
11580  * change long-standing behavior, so I (khw) didn't do that */
11581  char* p = RExC_parse + 1;
11582  char first_char = *p;
11583
11584  PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
11585
11586  assert(*(p - 1) == '[');
11587
11588  if (! POSIXCC(first_char)) {
11589   return FALSE;
11590  }
11591
11592  p++;
11593  while (p < RExC_end && isWORDCHAR(*p)) p++;
11594
11595  if (p >= RExC_end) {
11596   return FALSE;
11597  }
11598
11599  if (p - RExC_parse > 2    /* Got at least 1 word character */
11600   && (*p == first_char
11601    || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11602  {
11603   return TRUE;
11604  }
11605
11606  p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11607
11608  return (p
11609    && p - RExC_parse > 2 /* [:] evaluates to colon;
11610          [::] is a bad posix class. */
11611    && first_char == *(p - 1));
11612 }
11613
11614 STATIC regnode *
11615 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
11616     char * const oregcomp_parse)
11617 {
11618  /* Handle the (?[...]) construct to do set operations */
11619
11620  U8 curchar;
11621  UV start, end; /* End points of code point ranges */
11622  SV* result_string;
11623  char *save_end, *save_parse;
11624  SV* final;
11625  STRLEN len;
11626  regnode* node;
11627  AV* stack;
11628  const bool save_fold = FOLD;
11629
11630  GET_RE_DEBUG_FLAGS_DECL;
11631
11632  PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
11633
11634  if (LOC) {
11635   vFAIL("(?[...]) not valid in locale");
11636  }
11637  RExC_uni_semantics = 1;
11638
11639  /* This will return only an ANYOF regnode, or (unlikely) something smaller
11640  * (such as EXACT).  Thus we can skip most everything if just sizing.  We
11641  * call regclass to handle '[]' so as to not have to reinvent its parsing
11642  * rules here (throwing away the size it computes each time).  And, we exit
11643  * upon an unescaped ']' that isn't one ending a regclass.  To do both
11644  * these things, we need to realize that something preceded by a backslash
11645  * is escaped, so we have to keep track of backslashes */
11646  if (SIZE_ONLY) {
11647
11648   Perl_ck_warner_d(aTHX_
11649    packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11650    "The regex_sets feature is experimental" REPORT_LOCATION,
11651    (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
11652
11653   while (RExC_parse < RExC_end) {
11654    SV* current = NULL;
11655    RExC_parse = regpatws(pRExC_state, RExC_parse,
11656         TRUE); /* means recognize comments */
11657    switch (*RExC_parse) {
11658     default:
11659      break;
11660     case '\\':
11661      /* Skip the next byte (which could cause us to end up in
11662      * the middle of a UTF-8 character, but since none of those
11663      * are confusable with anything we currently handle in this
11664      * switch (invariants all), it's safe.  We'll just hit the
11665      * default: case next time and keep on incrementing until
11666      * we find one of the invariants we do handle. */
11667      RExC_parse++;
11668      break;
11669     case '[':
11670     {
11671      /* If this looks like it is a [:posix:] class, leave the
11672      * parse pointer at the '[' to fool regclass() into
11673      * thinking it is part of a '[[:posix:]]'.  That function
11674      * will use strict checking to force a syntax error if it
11675      * doesn't work out to a legitimate class */
11676      bool is_posix_class
11677          = could_it_be_a_POSIX_class(pRExC_state);
11678      if (! is_posix_class) {
11679       RExC_parse++;
11680      }
11681
11682      /* regclass() can only return RESTART_UTF8 if multi-char
11683      folds are allowed.  */
11684      if (!regclass(pRExC_state, flagp,depth+1,
11685         is_posix_class, /* parse the whole char
11686              class only if not a
11687              posix class */
11688         FALSE, /* don't allow multi-char folds */
11689         TRUE, /* silence non-portable warnings. */
11690         &current))
11691       FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11692        *flagp);
11693
11694      /* function call leaves parse pointing to the ']', except
11695      * if we faked it */
11696      if (is_posix_class) {
11697       RExC_parse--;
11698      }
11699
11700      SvREFCNT_dec(current);   /* In case it returned something */
11701      break;
11702     }
11703
11704     case ']':
11705      RExC_parse++;
11706      if (RExC_parse < RExC_end
11707       && *RExC_parse == ')')
11708      {
11709       node = reganode(pRExC_state, ANYOF, 0);
11710       RExC_size += ANYOF_SKIP;
11711       nextchar(pRExC_state);
11712       Set_Node_Length(node,
11713         RExC_parse - oregcomp_parse + 1); /* MJD */
11714       return node;
11715      }
11716      goto no_close;
11717    }
11718    RExC_parse++;
11719   }
11720
11721   no_close:
11722   FAIL("Syntax error in (?[...])");
11723  }
11724
11725  /* Pass 2 only after this.  Everything in this construct is a
11726  * metacharacter.  Operands begin with either a '\' (for an escape
11727  * sequence), or a '[' for a bracketed character class.  Any other
11728  * character should be an operator, or parenthesis for grouping.  Both
11729  * types of operands are handled by calling regclass() to parse them.  It
11730  * is called with a parameter to indicate to return the computed inversion
11731  * list.  The parsing here is implemented via a stack.  Each entry on the
11732  * stack is a single character representing one of the operators, or the
11733  * '('; or else a pointer to an operand inversion list. */
11734
11735 #define IS_OPERAND(a)  (! SvIOK(a))
11736
11737  /* The stack starts empty.  It is a syntax error if the first thing parsed
11738  * is a binary operator; everything else is pushed on the stack.  When an
11739  * operand is parsed, the top of the stack is examined.  If it is a binary
11740  * operator, the item before it should be an operand, and both are replaced
11741  * by the result of doing that operation on the new operand and the one on
11742  * the stack.   Thus a sequence of binary operands is reduced to a single
11743  * one before the next one is parsed.
11744  *
11745  * A unary operator may immediately follow a binary in the input, for
11746  * example
11747  *      [a] + ! [b]
11748  * When an operand is parsed and the top of the stack is a unary operator,
11749  * the operation is performed, and then the stack is rechecked to see if
11750  * this new operand is part of a binary operation; if so, it is handled as
11751  * above.
11752  *
11753  * A '(' is simply pushed on the stack; it is valid only if the stack is
11754  * empty, or the top element of the stack is an operator or another '('
11755  * (for which the parenthesized expression will become an operand).  By the
11756  * time the corresponding ')' is parsed everything in between should have
11757  * been parsed and evaluated to a single operand (or else is a syntax
11758  * error), and is handled as a regular operand */
11759
11760  sv_2mortal((SV *)(stack = newAV()));
11761
11762  while (RExC_parse < RExC_end) {
11763   I32 top_index = av_tindex(stack);
11764   SV** top_ptr;
11765   SV* current = NULL;
11766
11767   /* Skip white space */
11768   RExC_parse = regpatws(pRExC_state, RExC_parse,
11769         TRUE); /* means recognize comments */
11770   if (RExC_parse >= RExC_end) {
11771    Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
11772   }
11773   if ((curchar = UCHARAT(RExC_parse)) == ']') {
11774    break;
11775   }
11776
11777   switch (curchar) {
11778
11779    case '?':
11780     if (av_tindex(stack) >= 0   /* This makes sure that we can
11781            safely subtract 1 from
11782            RExC_parse in the next clause.
11783            If we have something on the
11784            stack, we have parsed something
11785            */
11786      && UCHARAT(RExC_parse - 1) == '('
11787      && RExC_parse < RExC_end)
11788     {
11789      /* If is a '(?', could be an embedded '(?flags:(?[...])'.
11790      * This happens when we have some thing like
11791      *
11792      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
11793      *   ...
11794      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
11795      *
11796      * Here we would be handling the interpolated
11797      * '$thai_or_lao'.  We handle this by a recursive call to
11798      * ourselves which returns the inversion list the
11799      * interpolated expression evaluates to.  We use the flags
11800      * from the interpolated pattern. */
11801      U32 save_flags = RExC_flags;
11802      const char * const save_parse = ++RExC_parse;
11803
11804      parse_lparen_question_flags(pRExC_state);
11805
11806      if (RExC_parse == save_parse  /* Makes sure there was at
11807              least one flag (or this
11808              embedding wasn't compiled)
11809             */
11810       || RExC_parse >= RExC_end - 4
11811       || UCHARAT(RExC_parse) != ':'
11812       || UCHARAT(++RExC_parse) != '('
11813       || UCHARAT(++RExC_parse) != '?'
11814       || UCHARAT(++RExC_parse) != '[')
11815      {
11816
11817       /* In combination with the above, this moves the
11818       * pointer to the point just after the first erroneous
11819       * character (or if there are no flags, to where they
11820       * should have been) */
11821       if (RExC_parse >= RExC_end - 4) {
11822        RExC_parse = RExC_end;
11823       }
11824       else if (RExC_parse != save_parse) {
11825        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11826       }
11827       vFAIL("Expecting '(?flags:(?[...'");
11828      }
11829      RExC_parse++;
11830      (void) handle_regex_sets(pRExC_state, &current, flagp,
11831              depth+1, oregcomp_parse);
11832
11833      /* Here, 'current' contains the embedded expression's
11834      * inversion list, and RExC_parse points to the trailing
11835      * ']'; the next character should be the ')' which will be
11836      * paired with the '(' that has been put on the stack, so
11837      * the whole embedded expression reduces to '(operand)' */
11838      RExC_parse++;
11839
11840      RExC_flags = save_flags;
11841      goto handle_operand;
11842     }
11843     /* FALL THROUGH */
11844
11845    default:
11846     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11847     vFAIL("Unexpected character");
11848
11849    case '\\':
11850     /* regclass() can only return RESTART_UTF8 if multi-char
11851     folds are allowed.  */
11852     if (!regclass(pRExC_state, flagp,depth+1,
11853        TRUE, /* means parse just the next thing */
11854        FALSE, /* don't allow multi-char folds */
11855        FALSE, /* don't silence non-portable warnings.  */
11856        &current))
11857      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11858       *flagp);
11859     /* regclass() will return with parsing just the \ sequence,
11860     * leaving the parse pointer at the next thing to parse */
11861     RExC_parse--;
11862     goto handle_operand;
11863
11864    case '[':   /* Is a bracketed character class */
11865    {
11866     bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
11867
11868     if (! is_posix_class) {
11869      RExC_parse++;
11870     }
11871
11872     /* regclass() can only return RESTART_UTF8 if multi-char
11873     folds are allowed.  */
11874     if(!regclass(pRExC_state, flagp,depth+1,
11875        is_posix_class, /* parse the whole char class
11876             only if not a posix class */
11877        FALSE, /* don't allow multi-char folds */
11878        FALSE, /* don't silence non-portable warnings.  */
11879        &current))
11880      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11881       *flagp);
11882     /* function call leaves parse pointing to the ']', except if we
11883     * faked it */
11884     if (is_posix_class) {
11885      RExC_parse--;
11886     }
11887
11888     goto handle_operand;
11889    }
11890
11891    case '&':
11892    case '|':
11893    case '+':
11894    case '-':
11895    case '^':
11896     if (top_index < 0
11897      || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
11898      || ! IS_OPERAND(*top_ptr))
11899     {
11900      RExC_parse++;
11901      vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
11902     }
11903     av_push(stack, newSVuv(curchar));
11904     break;
11905
11906    case '!':
11907     av_push(stack, newSVuv(curchar));
11908     break;
11909
11910    case '(':
11911     if (top_index >= 0) {
11912      top_ptr = av_fetch(stack, top_index, FALSE);
11913      assert(top_ptr);
11914      if (IS_OPERAND(*top_ptr)) {
11915       RExC_parse++;
11916       vFAIL("Unexpected '(' with no preceding operator");
11917      }
11918     }
11919     av_push(stack, newSVuv(curchar));
11920     break;
11921
11922    case ')':
11923    {
11924     SV* lparen;
11925     if (top_index < 1
11926      || ! (current = av_pop(stack))
11927      || ! IS_OPERAND(current)
11928      || ! (lparen = av_pop(stack))
11929      || IS_OPERAND(lparen)
11930      || SvUV(lparen) != '(')
11931     {
11932      SvREFCNT_dec(current);
11933      RExC_parse++;
11934      vFAIL("Unexpected ')'");
11935     }
11936     top_index -= 2;
11937     SvREFCNT_dec_NN(lparen);
11938
11939     /* FALL THROUGH */
11940    }
11941
11942    handle_operand:
11943
11944     /* Here, we have an operand to process, in 'current' */
11945
11946     if (top_index < 0) {    /* Just push if stack is empty */
11947      av_push(stack, current);
11948     }
11949     else {
11950      SV* top = av_pop(stack);
11951      SV *prev = NULL;
11952      char current_operator;
11953
11954      if (IS_OPERAND(top)) {
11955       SvREFCNT_dec_NN(top);
11956       SvREFCNT_dec_NN(current);
11957       vFAIL("Operand with no preceding operator");
11958      }
11959      current_operator = (char) SvUV(top);
11960      switch (current_operator) {
11961       case '(':   /* Push the '(' back on followed by the new
11962          operand */
11963        av_push(stack, top);
11964        av_push(stack, current);
11965        SvREFCNT_inc(top);  /* Counters the '_dec' done
11966             just after the 'break', so
11967             it doesn't get wrongly freed
11968             */
11969        break;
11970
11971       case '!':
11972        _invlist_invert(current);
11973
11974        /* Unlike binary operators, the top of the stack,
11975        * now that this unary one has been popped off, may
11976        * legally be an operator, and we now have operand
11977        * for it. */
11978        top_index--;
11979        SvREFCNT_dec_NN(top);
11980        goto handle_operand;
11981
11982       case '&':
11983        prev = av_pop(stack);
11984        _invlist_intersection(prev,
11985             current,
11986             &current);
11987        av_push(stack, current);
11988        break;
11989
11990       case '|':
11991       case '+':
11992        prev = av_pop(stack);
11993        _invlist_union(prev, current, &current);
11994        av_push(stack, current);
11995        break;
11996
11997       case '-':
11998        prev = av_pop(stack);;
11999        _invlist_subtract(prev, current, &current);
12000        av_push(stack, current);
12001        break;
12002
12003       case '^':   /* The union minus the intersection */
12004       {
12005        SV* i = NULL;
12006        SV* u = NULL;
12007        SV* element;
12008
12009        prev = av_pop(stack);
12010        _invlist_union(prev, current, &u);
12011        _invlist_intersection(prev, current, &i);
12012        /* _invlist_subtract will overwrite current
12013         without freeing what it already contains */
12014        element = current;
12015        _invlist_subtract(u, i, &current);
12016        av_push(stack, current);
12017        SvREFCNT_dec_NN(i);
12018        SvREFCNT_dec_NN(u);
12019        SvREFCNT_dec_NN(element);
12020        break;
12021       }
12022
12023       default:
12024        Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
12025     }
12026     SvREFCNT_dec_NN(top);
12027     SvREFCNT_dec(prev);
12028    }
12029   }
12030
12031   RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12032  }
12033
12034  if (av_tindex(stack) < 0   /* Was empty */
12035   || ((final = av_pop(stack)) == NULL)
12036   || ! IS_OPERAND(final)
12037   || av_tindex(stack) >= 0)  /* More left on stack */
12038  {
12039   vFAIL("Incomplete expression within '(?[ ])'");
12040  }
12041
12042  /* Here, 'final' is the resultant inversion list from evaluating the
12043  * expression.  Return it if so requested */
12044  if (return_invlist) {
12045   *return_invlist = final;
12046   return END;
12047  }
12048
12049  /* Otherwise generate a resultant node, based on 'final'.  regclass() is
12050  * expecting a string of ranges and individual code points */
12051  invlist_iterinit(final);
12052  result_string = newSVpvs("");
12053  while (invlist_iternext(final, &start, &end)) {
12054   if (start == end) {
12055    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
12056   }
12057   else {
12058    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
12059              start,          end);
12060   }
12061  }
12062
12063  save_parse = RExC_parse;
12064  RExC_parse = SvPV(result_string, len);
12065  save_end = RExC_end;
12066  RExC_end = RExC_parse + len;
12067
12068  /* We turn off folding around the call, as the class we have constructed
12069  * already has all folding taken into consideration, and we don't want
12070  * regclass() to add to that */
12071  RExC_flags &= ~RXf_PMf_FOLD;
12072  /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
12073  */
12074  node = regclass(pRExC_state, flagp,depth+1,
12075      FALSE, /* means parse the whole char class */
12076      FALSE, /* don't allow multi-char folds */
12077      TRUE, /* silence non-portable warnings.  The above may very
12078        well have generated non-portable code points, but
12079        they're valid on this machine */
12080      NULL);
12081  if (!node)
12082   FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
12083      PTR2UV(flagp));
12084  if (save_fold) {
12085   RExC_flags |= RXf_PMf_FOLD;
12086  }
12087  RExC_parse = save_parse + 1;
12088  RExC_end = save_end;
12089  SvREFCNT_dec_NN(final);
12090  SvREFCNT_dec_NN(result_string);
12091
12092  nextchar(pRExC_state);
12093  Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
12094  return node;
12095 }
12096 #undef IS_OPERAND
12097
12098 /* The names of properties whose definitions are not known at compile time are
12099  * stored in this SV, after a constant heading.  So if the length has been
12100  * changed since initialization, then there is a run-time definition. */
12101 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12102
12103 STATIC regnode *
12104 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12105     const bool stop_at_1,  /* Just parse the next thing, don't
12106           look for a full character class */
12107     bool allow_multi_folds,
12108     const bool silence_non_portable,   /* Don't output warnings
12109              about too large
12110              characters */
12111     SV** ret_invlist)  /* Return an inversion list, not a node */
12112 {
12113  /* parse a bracketed class specification.  Most of these will produce an
12114  * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12115  * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
12116  * under /i with multi-character folds: it will be rewritten following the
12117  * paradigm of this example, where the <multi-fold>s are characters which
12118  * fold to multiple character sequences:
12119  *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12120  * gets effectively rewritten as:
12121  *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12122  * reg() gets called (recursively) on the rewritten version, and this
12123  * function will return what it constructs.  (Actually the <multi-fold>s
12124  * aren't physically removed from the [abcdefghi], it's just that they are
12125  * ignored in the recursion by means of a flag:
12126  * <RExC_in_multi_char_class>.)
12127  *
12128  * ANYOF nodes contain a bit map for the first 256 characters, with the
12129  * corresponding bit set if that character is in the list.  For characters
12130  * above 255, a range list or swash is used.  There are extra bits for \w,
12131  * etc. in locale ANYOFs, as what these match is not determinable at
12132  * compile time
12133  *
12134  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12135  * to be restarted.  This can only happen if ret_invlist is non-NULL.
12136  */
12137
12138  dVAR;
12139  UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12140  IV range = 0;
12141  UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12142  regnode *ret;
12143  STRLEN numlen;
12144  IV namedclass = OOB_NAMEDCLASS;
12145  char *rangebegin = NULL;
12146  bool need_class = 0;
12147  SV *listsv = NULL;
12148  STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12149          than just initialized.  */
12150  SV* properties = NULL;    /* Code points that match \p{} \P{} */
12151  SV* posixes = NULL;     /* Code points that match classes like, [:word:],
12152        extended beyond the Latin1 range */
12153  UV element_count = 0;   /* Number of distinct elements in the class.
12154        Optimizations may be possible if this is tiny */
12155  AV * multi_char_matches = NULL; /* Code points that fold to more than one
12156          character; used under /i */
12157  UV n;
12158  char * stop_ptr = RExC_end;    /* where to stop parsing */
12159  const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12160             space? */
12161  const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12162
12163  /* Unicode properties are stored in a swash; this holds the current one
12164  * being parsed.  If this swash is the only above-latin1 component of the
12165  * character class, an optimization is to pass it directly on to the
12166  * execution engine.  Otherwise, it is set to NULL to indicate that there
12167  * are other things in the class that have to be dealt with at execution
12168  * time */
12169  SV* swash = NULL;  /* Code points that match \p{} \P{} */
12170
12171  /* Set if a component of this character class is user-defined; just passed
12172  * on to the engine */
12173  bool has_user_defined_property = FALSE;
12174
12175  /* inversion list of code points this node matches only when the target
12176  * string is in UTF-8.  (Because is under /d) */
12177  SV* depends_list = NULL;
12178
12179  /* inversion list of code points this node matches.  For much of the
12180  * function, it includes only those that match regardless of the utf8ness
12181  * of the target string */
12182  SV* cp_list = NULL;
12183
12184 #ifdef EBCDIC
12185  /* In a range, counts how many 0-2 of the ends of it came from literals,
12186  * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
12187  UV literal_endpoint = 0;
12188 #endif
12189  bool invert = FALSE;    /* Is this class to be complemented */
12190
12191  /* Is there any thing like \W or [:^digit:] that matches above the legal
12192  * Unicode range? */
12193  bool runtime_posix_matches_above_Unicode = FALSE;
12194
12195  regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12196   case we need to change the emitted regop to an EXACT. */
12197  const char * orig_parse = RExC_parse;
12198  const I32 orig_size = RExC_size;
12199  GET_RE_DEBUG_FLAGS_DECL;
12200
12201  PERL_ARGS_ASSERT_REGCLASS;
12202 #ifndef DEBUGGING
12203  PERL_UNUSED_ARG(depth);
12204 #endif
12205
12206  DEBUG_PARSE("clas");
12207
12208  /* Assume we are going to generate an ANYOF node. */
12209  ret = reganode(pRExC_state, ANYOF, 0);
12210
12211  if (SIZE_ONLY) {
12212   RExC_size += ANYOF_SKIP;
12213   listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12214  }
12215  else {
12216   ANYOF_FLAGS(ret) = 0;
12217
12218   RExC_emit += ANYOF_SKIP;
12219   if (LOC) {
12220    ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12221   }
12222   listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12223   initial_listsv_len = SvCUR(listsv);
12224   SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
12225  }
12226
12227  if (skip_white) {
12228   RExC_parse = regpatws(pRExC_state, RExC_parse,
12229        FALSE /* means don't recognize comments */);
12230  }
12231
12232  if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
12233   RExC_parse++;
12234   invert = TRUE;
12235   allow_multi_folds = FALSE;
12236   RExC_naughty++;
12237   if (skip_white) {
12238    RExC_parse = regpatws(pRExC_state, RExC_parse,
12239         FALSE /* means don't recognize comments */);
12240   }
12241  }
12242
12243  /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12244  if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12245   const char *s = RExC_parse;
12246   const char  c = *s++;
12247
12248   while (isWORDCHAR(*s))
12249    s++;
12250   if (*s && c == *s && s[1] == ']') {
12251    SAVEFREESV(RExC_rx_sv);
12252    ckWARN3reg(s+2,
12253      "POSIX syntax [%c %c] belongs inside character classes",
12254      c, c);
12255    (void)ReREFCNT_inc(RExC_rx_sv);
12256   }
12257  }
12258
12259  /* If the caller wants us to just parse a single element, accomplish this
12260  * by faking the loop ending condition */
12261  if (stop_at_1 && RExC_end > RExC_parse) {
12262   stop_ptr = RExC_parse + 1;
12263  }
12264
12265  /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12266  if (UCHARAT(RExC_parse) == ']')
12267   goto charclassloop;
12268
12269 parseit:
12270  while (1) {
12271   if  (RExC_parse >= stop_ptr) {
12272    break;
12273   }
12274
12275   if (skip_white) {
12276    RExC_parse = regpatws(pRExC_state, RExC_parse,
12277         FALSE /* means don't recognize comments */);
12278   }
12279
12280   if  (UCHARAT(RExC_parse) == ']') {
12281    break;
12282   }
12283
12284  charclassloop:
12285
12286   namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12287   save_value = value;
12288   save_prevvalue = prevvalue;
12289
12290   if (!range) {
12291    rangebegin = RExC_parse;
12292    element_count++;
12293   }
12294   if (UTF) {
12295    value = utf8n_to_uvchr((U8*)RExC_parse,
12296         RExC_end - RExC_parse,
12297         &numlen, UTF8_ALLOW_DEFAULT);
12298    RExC_parse += numlen;
12299   }
12300   else
12301    value = UCHARAT(RExC_parse++);
12302
12303   if (value == '['
12304    && RExC_parse < RExC_end
12305    && POSIXCC(UCHARAT(RExC_parse)))
12306   {
12307    namedclass = regpposixcc(pRExC_state, value, strict);
12308   }
12309   else if (value == '\\') {
12310    if (UTF) {
12311     value = utf8n_to_uvchr((U8*)RExC_parse,
12312         RExC_end - RExC_parse,
12313         &numlen, UTF8_ALLOW_DEFAULT);
12314     RExC_parse += numlen;
12315    }
12316    else
12317     value = UCHARAT(RExC_parse++);
12318
12319    /* Some compilers cannot handle switching on 64-bit integer
12320    * values, therefore value cannot be an UV.  Yes, this will
12321    * be a problem later if we want switch on Unicode.
12322    * A similar issue a little bit later when switching on
12323    * namedclass. --jhi */
12324
12325    /* If the \ is escaping white space when white space is being
12326    * skipped, it means that that white space is wanted literally, and
12327    * is already in 'value'.  Otherwise, need to translate the escape
12328    * into what it signifies. */
12329    if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12330
12331    case 'w': namedclass = ANYOF_WORDCHAR; break;
12332    case 'W': namedclass = ANYOF_NWORDCHAR; break;
12333    case 's': namedclass = ANYOF_SPACE; break;
12334    case 'S': namedclass = ANYOF_NSPACE; break;
12335    case 'd': namedclass = ANYOF_DIGIT; break;
12336    case 'D': namedclass = ANYOF_NDIGIT; break;
12337    case 'v': namedclass = ANYOF_VERTWS; break;
12338    case 'V': namedclass = ANYOF_NVERTWS; break;
12339    case 'h': namedclass = ANYOF_HORIZWS; break;
12340    case 'H': namedclass = ANYOF_NHORIZWS; break;
12341    case 'N':  /* Handle \N{NAME} in class */
12342     {
12343      /* We only pay attention to the first char of
12344      multichar strings being returned. I kinda wonder
12345      if this makes sense as it does change the behaviour
12346      from earlier versions, OTOH that behaviour was broken
12347      as well. */
12348      if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12349          TRUE, /* => charclass */
12350          strict))
12351      {
12352       if (*flagp & RESTART_UTF8)
12353        FAIL("panic: grok_bslash_N set RESTART_UTF8");
12354       goto parseit;
12355      }
12356     }
12357     break;
12358    case 'p':
12359    case 'P':
12360     {
12361     char *e;
12362
12363     /* We will handle any undefined properties ourselves */
12364     U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12365
12366     if (RExC_parse >= RExC_end)
12367      vFAIL2("Empty \\%c{}", (U8)value);
12368     if (*RExC_parse == '{') {
12369      const U8 c = (U8)value;
12370      e = strchr(RExC_parse++, '}');
12371      if (!e)
12372       vFAIL2("Missing right brace on \\%c{}", c);
12373      while (isSPACE(UCHARAT(RExC_parse)))
12374       RExC_parse++;
12375      if (e == RExC_parse)
12376       vFAIL2("Empty \\%c{}", c);
12377      n = e - RExC_parse;
12378      while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12379       n--;
12380     }
12381     else {
12382      e = RExC_parse;
12383      n = 1;
12384     }
12385     if (!SIZE_ONLY) {
12386      SV* invlist;
12387      char* name;
12388
12389      if (UCHARAT(RExC_parse) == '^') {
12390       RExC_parse++;
12391       n--;
12392       /* toggle.  (The rhs xor gets the single bit that
12393       * differs between P and p; the other xor inverts just
12394       * that bit) */
12395       value ^= 'P' ^ 'p';
12396
12397       while (isSPACE(UCHARAT(RExC_parse))) {
12398        RExC_parse++;
12399        n--;
12400       }
12401      }
12402      /* Try to get the definition of the property into
12403      * <invlist>.  If /i is in effect, the effective property
12404      * will have its name be <__NAME_i>.  The design is
12405      * discussed in commit
12406      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12407      Newx(name, n + sizeof("_i__\n"), char);
12408
12409      sprintf(name, "%s%.*s%s\n",
12410          (FOLD) ? "__" : "",
12411          (int)n,
12412          RExC_parse,
12413          (FOLD) ? "_i" : ""
12414      );
12415
12416      /* Look up the property name, and get its swash and
12417      * inversion list, if the property is found  */
12418      if (swash) {
12419       SvREFCNT_dec_NN(swash);
12420      }
12421      swash = _core_swash_init("utf8", name, &PL_sv_undef,
12422            1, /* binary */
12423            0, /* not tr/// */
12424            NULL, /* No inversion list */
12425            &swash_init_flags
12426            );
12427      if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12428       if (swash) {
12429        SvREFCNT_dec_NN(swash);
12430        swash = NULL;
12431       }
12432
12433       /* Here didn't find it.  It could be a user-defined
12434       * property that will be available at run-time.  If we
12435       * accept only compile-time properties, is an error;
12436       * otherwise add it to the list for run-time look up */
12437       if (ret_invlist) {
12438        RExC_parse = e + 1;
12439        vFAIL3("Property '%.*s' is unknown", (int) n, name);
12440       }
12441       Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
12442           (value == 'p' ? '+' : '!'),
12443           name);
12444       has_user_defined_property = TRUE;
12445
12446       /* We don't know yet, so have to assume that the
12447       * property could match something in the Latin1 range,
12448       * hence something that isn't utf8.  Note that this
12449       * would cause things in <depends_list> to match
12450       * inappropriately, except that any \p{}, including
12451       * this one forces Unicode semantics, which means there
12452       * is <no depends_list> */
12453       ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12454      }
12455      else {
12456
12457       /* Here, did get the swash and its inversion list.  If
12458       * the swash is from a user-defined property, then this
12459       * whole character class should be regarded as such */
12460       has_user_defined_property =
12461          (swash_init_flags
12462          & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12463
12464       /* Invert if asking for the complement */
12465       if (value == 'P') {
12466        _invlist_union_complement_2nd(properties,
12467               invlist,
12468               &properties);
12469
12470        /* The swash can't be used as-is, because we've
12471        * inverted things; delay removing it to here after
12472        * have copied its invlist above */
12473        SvREFCNT_dec_NN(swash);
12474        swash = NULL;
12475       }
12476       else {
12477        _invlist_union(properties, invlist, &properties);
12478       }
12479      }
12480      Safefree(name);
12481     }
12482     RExC_parse = e + 1;
12483     namedclass = ANYOF_UNIPROP;  /* no official name, but it's
12484             named */
12485
12486     /* \p means they want Unicode semantics */
12487     RExC_uni_semantics = 1;
12488     }
12489     break;
12490    case 'n': value = '\n';   break;
12491    case 'r': value = '\r';   break;
12492    case 't': value = '\t';   break;
12493    case 'f': value = '\f';   break;
12494    case 'b': value = '\b';   break;
12495    case 'e': value = ASCII_TO_NATIVE('\033');break;
12496    case 'a': value = ASCII_TO_NATIVE('\007');break;
12497    case 'o':
12498     RExC_parse--; /* function expects to be pointed at the 'o' */
12499     {
12500      const char* error_msg;
12501      bool valid = grok_bslash_o(&RExC_parse,
12502            &value,
12503            &error_msg,
12504            SIZE_ONLY,   /* warnings in pass
12505                1 only */
12506            strict,
12507            silence_non_portable,
12508            UTF);
12509      if (! valid) {
12510       vFAIL(error_msg);
12511      }
12512     }
12513     if (PL_encoding && value < 0x100) {
12514      goto recode_encoding;
12515     }
12516     break;
12517    case 'x':
12518     RExC_parse--; /* function expects to be pointed at the 'x' */
12519     {
12520      const char* error_msg;
12521      bool valid = grok_bslash_x(&RExC_parse,
12522            &value,
12523            &error_msg,
12524            TRUE, /* Output warnings */
12525            strict,
12526            silence_non_portable,
12527            UTF);
12528      if (! valid) {
12529       vFAIL(error_msg);
12530      }
12531     }
12532     if (PL_encoding && value < 0x100)
12533      goto recode_encoding;
12534     break;
12535    case 'c':
12536     value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12537     break;
12538    case '0': case '1': case '2': case '3': case '4':
12539    case '5': case '6': case '7':
12540     {
12541      /* Take 1-3 octal digits */
12542      I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12543      numlen = (strict) ? 4 : 3;
12544      value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12545      RExC_parse += numlen;
12546      if (numlen != 3) {
12547       if (strict) {
12548        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12549        vFAIL("Need exactly 3 octal digits");
12550       }
12551       else if (! SIZE_ONLY /* like \08, \178 */
12552         && numlen < 3
12553         && RExC_parse < RExC_end
12554         && isDIGIT(*RExC_parse)
12555         && ckWARN(WARN_REGEXP))
12556       {
12557        SAVEFREESV(RExC_rx_sv);
12558        reg_warn_non_literal_string(
12559         RExC_parse + 1,
12560         form_short_octal_warning(RExC_parse, numlen));
12561        (void)ReREFCNT_inc(RExC_rx_sv);
12562       }
12563      }
12564      if (PL_encoding && value < 0x100)
12565       goto recode_encoding;
12566      break;
12567     }
12568    recode_encoding:
12569     if (! RExC_override_recoding) {
12570      SV* enc = PL_encoding;
12571      value = reg_recode((const char)(U8)value, &enc);
12572      if (!enc) {
12573       if (strict) {
12574        vFAIL("Invalid escape in the specified encoding");
12575       }
12576       else if (SIZE_ONLY) {
12577        ckWARNreg(RExC_parse,
12578         "Invalid escape in the specified encoding");
12579       }
12580      }
12581      break;
12582     }
12583    default:
12584     /* Allow \_ to not give an error */
12585     if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12586      if (strict) {
12587       vFAIL2("Unrecognized escape \\%c in character class",
12588        (int)value);
12589      }
12590      else {
12591       SAVEFREESV(RExC_rx_sv);
12592       ckWARN2reg(RExC_parse,
12593        "Unrecognized escape \\%c in character class passed through",
12594        (int)value);
12595       (void)ReREFCNT_inc(RExC_rx_sv);
12596      }
12597     }
12598     break;
12599    }   /* End of switch on char following backslash */
12600   } /* end of handling backslash escape sequences */
12601 #ifdef EBCDIC
12602   else
12603    literal_endpoint++;
12604 #endif
12605
12606   /* Here, we have the current token in 'value' */
12607
12608   /* What matches in a locale is not known until runtime.  This includes
12609   * what the Posix classes (like \w, [:space:]) match.  Room must be
12610   * reserved (one time per class) to store such classes, either if Perl
12611   * is compiled so that locale nodes always should have this space, or
12612   * if there is such class info to be stored.  The space will contain a
12613   * bit for each named class that is to be matched against.  This isn't
12614   * needed for \p{} and pseudo-classes, as they are not affected by
12615   * locale, and hence are dealt with separately */
12616   if (LOC
12617    && ! need_class
12618    && (ANYOF_LOCALE == ANYOF_CLASS
12619     || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12620   {
12621    need_class = 1;
12622    if (SIZE_ONLY) {
12623     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12624    }
12625    else {
12626     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12627     ANYOF_CLASS_ZERO(ret);
12628    }
12629    ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12630   }
12631
12632   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12633
12634    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
12635    * literal, as is the character that began the false range, i.e.
12636    * the 'a' in the examples */
12637    if (range) {
12638     if (!SIZE_ONLY) {
12639      const int w = (RExC_parse >= rangebegin)
12640         ? RExC_parse - rangebegin
12641         : 0;
12642      if (strict) {
12643       vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
12644      }
12645      else {
12646       SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12647       ckWARN4reg(RExC_parse,
12648         "False [] range \"%*.*s\"",
12649         w, w, rangebegin);
12650       (void)ReREFCNT_inc(RExC_rx_sv);
12651       cp_list = add_cp_to_invlist(cp_list, '-');
12652       cp_list = add_cp_to_invlist(cp_list, prevvalue);
12653      }
12654     }
12655
12656     range = 0; /* this was not a true range */
12657     element_count += 2; /* So counts for three values */
12658    }
12659
12660    if (! SIZE_ONLY) {
12661     U8 classnum = namedclass_to_classnum(namedclass);
12662     if (namedclass >= ANYOF_MAX) {  /* If a special class */
12663      if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12664
12665       /* Here, should be \h, \H, \v, or \V.  Neither /d nor
12666       * /l make a difference in what these match.  There
12667       * would be problems if these characters had folds
12668       * other than themselves, as cp_list is subject to
12669       * folding. */
12670       if (classnum != _CC_VERTSPACE) {
12671        assert(   namedclass == ANYOF_HORIZWS
12672         || namedclass == ANYOF_NHORIZWS);
12673
12674        /* It turns out that \h is just a synonym for
12675        * XPosixBlank */
12676        classnum = _CC_BLANK;
12677       }
12678
12679       _invlist_union_maybe_complement_2nd(
12680         cp_list,
12681         PL_XPosix_ptrs[classnum],
12682         cBOOL(namedclass % 2), /* Complement if odd
12683               (NHORIZWS, NVERTWS)
12684               */
12685         &cp_list);
12686      }
12687     }
12688     else if (classnum == _CC_ASCII) {
12689 #ifdef HAS_ISASCII
12690      if (LOC) {
12691       ANYOF_CLASS_SET(ret, namedclass);
12692      }
12693      else
12694 #endif  /* Not isascii(); just use the hard-coded definition for it */
12695      {
12696       _invlist_union_maybe_complement_2nd(
12697         posixes,
12698         PL_ASCII,
12699         cBOOL(namedclass % 2), /* Complement if odd
12700               (NASCII) */
12701         &posixes);
12702
12703       /* The code points 128-255 added above will be
12704       * subtracted out below under /d, so the flag needs to
12705       * be set */
12706       if (namedclass == ANYOF_NASCII && DEPENDS_SEMANTICS) {
12707        ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12708       }
12709      }
12710     }
12711     else {  /* Garden variety class */
12712
12713      /* The ascii range inversion list */
12714      SV* ascii_source = PL_Posix_ptrs[classnum];
12715
12716      /* The full Latin1 range inversion list */
12717      SV* l1_source = PL_L1Posix_ptrs[classnum];
12718
12719      /* This code is structured into two major clauses.  The
12720      * first is for classes whose complete definitions may not
12721      * already be known.  It not, the Latin1 definition
12722      * (guaranteed to already known) is used plus code is
12723      * generated to load the rest at run-time (only if needed).
12724      * If the complete definition is known, it drops down to
12725      * the second clause, where the complete definition is
12726      * known */
12727
12728      if (classnum < _FIRST_NON_SWASH_CC) {
12729
12730       /* Here, the class has a swash, which may or not
12731       * already be loaded */
12732
12733       /* The name of the property to use to match the full
12734       * eXtended Unicode range swash for this character
12735       * class */
12736       const char *Xname = swash_property_names[classnum];
12737
12738       /* If returning the inversion list, we can't defer
12739       * getting this until runtime */
12740       if (ret_invlist && !  PL_utf8_swash_ptrs[classnum]) {
12741        PL_utf8_swash_ptrs[classnum] =
12742         _core_swash_init("utf8", Xname, &PL_sv_undef,
12743            1, /* binary */
12744            0, /* not tr/// */
12745            NULL, /* No inversion list */
12746            NULL  /* No flags */
12747            );
12748        assert(PL_utf8_swash_ptrs[classnum]);
12749       }
12750       if ( !  PL_utf8_swash_ptrs[classnum]) {
12751        if (namedclass % 2 == 0) { /* A non-complemented
12752               class */
12753         /* If not /a matching, there are code points we
12754         * don't know at compile time.  Arrange for the
12755         * unknown matches to be loaded at run-time, if
12756         * needed */
12757         if (! AT_LEAST_ASCII_RESTRICTED) {
12758          Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12759                 Xname);
12760         }
12761         if (LOC) {  /* Under locale, set run-time
12762            lookup */
12763          ANYOF_CLASS_SET(ret, namedclass);
12764         }
12765         else {
12766          /* Add the current class's code points to
12767          * the running total */
12768          _invlist_union(posixes,
12769             (AT_LEAST_ASCII_RESTRICTED)
12770               ? ascii_source
12771               : l1_source,
12772             &posixes);
12773         }
12774        }
12775        else {  /* A complemented class */
12776         if (AT_LEAST_ASCII_RESTRICTED) {
12777          /* Under /a should match everything above
12778          * ASCII, plus the complement of the set's
12779          * ASCII matches */
12780          _invlist_union_complement_2nd(posixes,
12781                 ascii_source,
12782                 &posixes);
12783         }
12784         else {
12785          /* Arrange for the unknown matches to be
12786          * loaded at run-time, if needed */
12787          Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12788                 Xname);
12789          runtime_posix_matches_above_Unicode = TRUE;
12790          if (LOC) {
12791           ANYOF_CLASS_SET(ret, namedclass);
12792          }
12793          else {
12794
12795           /* We want to match everything in
12796           * Latin1, except those things that
12797           * l1_source matches */
12798           SV* scratch_list = NULL;
12799           _invlist_subtract(PL_Latin1, l1_source,
12800               &scratch_list);
12801
12802           /* Add the list from this class to the
12803           * running total */
12804           if (! posixes) {
12805            posixes = scratch_list;
12806           }
12807           else {
12808            _invlist_union(posixes,
12809               scratch_list,
12810               &posixes);
12811            SvREFCNT_dec_NN(scratch_list);
12812           }
12813           if (DEPENDS_SEMANTICS) {
12814            ANYOF_FLAGS(ret)
12815             |= ANYOF_NON_UTF8_LATIN1_ALL;
12816           }
12817          }
12818         }
12819        }
12820        goto namedclass_done;
12821       }
12822
12823       /* Here, there is a swash loaded for the class.  If no
12824       * inversion list for it yet, get it */
12825       if (! PL_XPosix_ptrs[classnum]) {
12826        PL_XPosix_ptrs[classnum]
12827        = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12828       }
12829      }
12830
12831      /* Here there is an inversion list already loaded for the
12832      * entire class */
12833
12834      if (namedclass % 2 == 0) {  /* A non-complemented class,
12835             like ANYOF_PUNCT */
12836       if (! LOC) {
12837        /* For non-locale, just add it to any existing list
12838        * */
12839        _invlist_union(posixes,
12840           (AT_LEAST_ASCII_RESTRICTED)
12841            ? ascii_source
12842            : PL_XPosix_ptrs[classnum],
12843           &posixes);
12844       }
12845       else {  /* Locale */
12846        SV* scratch_list = NULL;
12847
12848        /* For above Latin1 code points, we use the full
12849        * Unicode range */
12850        _invlist_intersection(PL_AboveLatin1,
12851             PL_XPosix_ptrs[classnum],
12852             &scratch_list);
12853        /* And set the output to it, adding instead if
12854        * there already is an output.  Checking if
12855        * 'posixes' is NULL first saves an extra clone.
12856        * Its reference count will be decremented at the
12857        * next union, etc, or if this is the only
12858        * instance, at the end of the routine */
12859        if (! posixes) {
12860         posixes = scratch_list;
12861        }
12862        else {
12863         _invlist_union(posixes, scratch_list, &posixes);
12864         SvREFCNT_dec_NN(scratch_list);
12865        }
12866
12867 #ifndef HAS_ISBLANK
12868        if (namedclass != ANYOF_BLANK) {
12869 #endif
12870         /* Set this class in the node for runtime
12871         * matching */
12872         ANYOF_CLASS_SET(ret, namedclass);
12873 #ifndef HAS_ISBLANK
12874        }
12875        else {
12876         /* No isblank(), use the hard-coded ASCII-range
12877         * blanks, adding them to the running total. */
12878
12879         _invlist_union(posixes, ascii_source, &posixes);
12880        }
12881 #endif
12882       }
12883      }
12884      else {  /* A complemented class, like ANYOF_NPUNCT */
12885       if (! LOC) {
12886        _invlist_union_complement_2nd(
12887             posixes,
12888             (AT_LEAST_ASCII_RESTRICTED)
12889              ? ascii_source
12890              : PL_XPosix_ptrs[classnum],
12891             &posixes);
12892        /* Under /d, everything in the upper half of the
12893        * Latin1 range matches this complement */
12894        if (DEPENDS_SEMANTICS) {
12895         ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12896        }
12897       }
12898       else {  /* Locale */
12899        SV* scratch_list = NULL;
12900        _invlist_subtract(PL_AboveLatin1,
12901            PL_XPosix_ptrs[classnum],
12902            &scratch_list);
12903        if (! posixes) {
12904         posixes = scratch_list;
12905        }
12906        else {
12907         _invlist_union(posixes, scratch_list, &posixes);
12908         SvREFCNT_dec_NN(scratch_list);
12909        }
12910 #ifndef HAS_ISBLANK
12911        if (namedclass != ANYOF_NBLANK) {
12912 #endif
12913         ANYOF_CLASS_SET(ret, namedclass);
12914 #ifndef HAS_ISBLANK
12915        }
12916        else {
12917         /* Get the list of all code points in Latin1
12918         * that are not ASCII blanks, and add them to
12919         * the running total */
12920         _invlist_subtract(PL_Latin1, ascii_source,
12921             &scratch_list);
12922         _invlist_union(posixes, scratch_list, &posixes);
12923         SvREFCNT_dec_NN(scratch_list);
12924        }
12925 #endif
12926       }
12927      }
12928     }
12929    namedclass_done:
12930     continue;   /* Go get next character */
12931    }
12932   } /* end of namedclass \blah */
12933
12934   /* Here, we have a single value.  If 'range' is set, it is the ending
12935   * of a range--check its validity.  Later, we will handle each
12936   * individual code point in the range.  If 'range' isn't set, this
12937   * could be the beginning of a range, so check for that by looking
12938   * ahead to see if the next real character to be processed is the range
12939   * indicator--the minus sign */
12940
12941   if (skip_white) {
12942    RExC_parse = regpatws(pRExC_state, RExC_parse,
12943         FALSE /* means don't recognize comments */);
12944   }
12945
12946   if (range) {
12947    if (prevvalue > value) /* b-a */ {
12948     const int w = RExC_parse - rangebegin;
12949     Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12950     range = 0; /* not a valid range */
12951    }
12952   }
12953   else {
12954    prevvalue = value; /* save the beginning of the potential range */
12955    if (! stop_at_1     /* Can't be a range if parsing just one thing */
12956     && *RExC_parse == '-')
12957    {
12958     char* next_char_ptr = RExC_parse + 1;
12959     if (skip_white) {   /* Get the next real char after the '-' */
12960      next_char_ptr = regpatws(pRExC_state,
12961            RExC_parse + 1,
12962            FALSE); /* means don't recognize
12963               comments */
12964     }
12965
12966     /* If the '-' is at the end of the class (just before the ']',
12967     * it is a literal minus; otherwise it is a range */
12968     if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
12969      RExC_parse = next_char_ptr;
12970
12971      /* a bad range like \w-, [:word:]- ? */
12972      if (namedclass > OOB_NAMEDCLASS) {
12973       if (strict || ckWARN(WARN_REGEXP)) {
12974        const int w =
12975         RExC_parse >= rangebegin ?
12976         RExC_parse - rangebegin : 0;
12977        if (strict) {
12978         vFAIL4("False [] range \"%*.*s\"",
12979          w, w, rangebegin);
12980        }
12981        else {
12982         vWARN4(RExC_parse,
12983          "False [] range \"%*.*s\"",
12984          w, w, rangebegin);
12985        }
12986       }
12987       if (!SIZE_ONLY) {
12988        cp_list = add_cp_to_invlist(cp_list, '-');
12989       }
12990       element_count++;
12991      } else
12992       range = 1; /* yeah, it's a range! */
12993      continue; /* but do it the next time */
12994     }
12995    }
12996   }
12997
12998   /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12999   * if not */
13000
13001   /* non-Latin1 code point implies unicode semantics.  Must be set in
13002   * pass1 so is there for the whole of pass 2 */
13003   if (value > 255) {
13004    RExC_uni_semantics = 1;
13005   }
13006
13007   /* Ready to process either the single value, or the completed range.
13008   * For single-valued non-inverted ranges, we consider the possibility
13009   * of multi-char folds.  (We made a conscious decision to not do this
13010   * for the other cases because it can often lead to non-intuitive
13011   * results.  For example, you have the peculiar case that:
13012   *  "s s" =~ /^[^\xDF]+$/i => Y
13013   *  "ss"  =~ /^[^\xDF]+$/i => N
13014   *
13015   * See [perl #89750] */
13016   if (FOLD && allow_multi_folds && value == prevvalue) {
13017    if (value == LATIN_SMALL_LETTER_SHARP_S
13018     || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13019               value)))
13020    {
13021     /* Here <value> is indeed a multi-char fold.  Get what it is */
13022
13023     U8 foldbuf[UTF8_MAXBYTES_CASE];
13024     STRLEN foldlen;
13025
13026     UV folded = _to_uni_fold_flags(
13027         value,
13028         foldbuf,
13029         &foldlen,
13030         FOLD_FLAGS_FULL
13031         | ((LOC) ?  FOLD_FLAGS_LOCALE
13032            : (ASCII_FOLD_RESTRICTED)
13033            ? FOLD_FLAGS_NOMIX_ASCII
13034            : 0)
13035         );
13036
13037     /* Here, <folded> should be the first character of the
13038     * multi-char fold of <value>, with <foldbuf> containing the
13039     * whole thing.  But, if this fold is not allowed (because of
13040     * the flags), <fold> will be the same as <value>, and should
13041     * be processed like any other character, so skip the special
13042     * handling */
13043     if (folded != value) {
13044
13045      /* Skip if we are recursed, currently parsing the class
13046      * again.  Otherwise add this character to the list of
13047      * multi-char folds. */
13048      if (! RExC_in_multi_char_class) {
13049       AV** this_array_ptr;
13050       AV* this_array;
13051       STRLEN cp_count = utf8_length(foldbuf,
13052              foldbuf + foldlen);
13053       SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13054
13055       Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13056
13057
13058       if (! multi_char_matches) {
13059        multi_char_matches = newAV();
13060       }
13061
13062       /* <multi_char_matches> is actually an array of arrays.
13063       * There will be one or two top-level elements: [2],
13064       * and/or [3].  The [2] element is an array, each
13065       * element thereof is a character which folds to two
13066       * characters; likewise for [3].  (Unicode guarantees a
13067       * maximum of 3 characters in any fold.)  When we
13068       * rewrite the character class below, we will do so
13069       * such that the longest folds are written first, so
13070       * that it prefers the longest matching strings first.
13071       * This is done even if it turns out that any
13072       * quantifier is non-greedy, out of programmer
13073       * laziness.  Tom Christiansen has agreed that this is
13074       * ok.  This makes the test for the ligature 'ffi' come
13075       * before the test for 'ff' */
13076       if (av_exists(multi_char_matches, cp_count)) {
13077        this_array_ptr = (AV**) av_fetch(multi_char_matches,
13078                cp_count, FALSE);
13079        this_array = *this_array_ptr;
13080       }
13081       else {
13082        this_array = newAV();
13083        av_store(multi_char_matches, cp_count,
13084          (SV*) this_array);
13085       }
13086       av_push(this_array, multi_fold);
13087      }
13088
13089      /* This element should not be processed further in this
13090      * class */
13091      element_count--;
13092      value = save_value;
13093      prevvalue = save_prevvalue;
13094      continue;
13095     }
13096    }
13097   }
13098
13099   /* Deal with this element of the class */
13100   if (! SIZE_ONLY) {
13101 #ifndef EBCDIC
13102    cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
13103 #else
13104    SV* this_range = _new_invlist(1);
13105    _append_range_to_invlist(this_range, prevvalue, value);
13106
13107    /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
13108    * If this range was specified using something like 'i-j', we want
13109    * to include only the 'i' and the 'j', and not anything in
13110    * between, so exclude non-ASCII, non-alphabetics from it.
13111    * However, if the range was specified with something like
13112    * [\x89-\x91] or [\x89-j], all code points within it should be
13113    * included.  literal_endpoint==2 means both ends of the range used
13114    * a literal character, not \x{foo} */
13115    if (literal_endpoint == 2
13116     && (prevvalue >= 'a' && value <= 'z')
13117      || (prevvalue >= 'A' && value <= 'Z'))
13118    {
13119     _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13120          &this_range);
13121    }
13122    _invlist_union(cp_list, this_range, &cp_list);
13123    literal_endpoint = 0;
13124 #endif
13125   }
13126
13127   range = 0; /* this range (if it was one) is done now */
13128  } /* End of loop through all the text within the brackets */
13129
13130  /* If anything in the class expands to more than one character, we have to
13131  * deal with them by building up a substitute parse string, and recursively
13132  * calling reg() on it, instead of proceeding */
13133  if (multi_char_matches) {
13134   SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13135   I32 cp_count;
13136   STRLEN len;
13137   char *save_end = RExC_end;
13138   char *save_parse = RExC_parse;
13139   bool first_time = TRUE;     /* First multi-char occurrence doesn't get
13140          a "|" */
13141   I32 reg_flags;
13142
13143   assert(! invert);
13144 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
13145   because too confusing */
13146   if (invert) {
13147    sv_catpv(substitute_parse, "(?:");
13148   }
13149 #endif
13150
13151   /* Look at the longest folds first */
13152   for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13153
13154    if (av_exists(multi_char_matches, cp_count)) {
13155     AV** this_array_ptr;
13156     SV* this_sequence;
13157
13158     this_array_ptr = (AV**) av_fetch(multi_char_matches,
13159             cp_count, FALSE);
13160     while ((this_sequence = av_pop(*this_array_ptr)) !=
13161                 &PL_sv_undef)
13162     {
13163      if (! first_time) {
13164       sv_catpv(substitute_parse, "|");
13165      }
13166      first_time = FALSE;
13167
13168      sv_catpv(substitute_parse, SvPVX(this_sequence));
13169     }
13170    }
13171   }
13172
13173   /* If the character class contains anything else besides these
13174   * multi-character folds, have to include it in recursive parsing */
13175   if (element_count) {
13176    sv_catpv(substitute_parse, "|[");
13177    sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13178    sv_catpv(substitute_parse, "]");
13179   }
13180
13181   sv_catpv(substitute_parse, ")");
13182 #if 0
13183   if (invert) {
13184    /* This is a way to get the parse to skip forward a whole named
13185    * sequence instead of matching the 2nd character when it fails the
13186    * first */
13187    sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13188   }
13189 #endif
13190
13191   RExC_parse = SvPV(substitute_parse, len);
13192   RExC_end = RExC_parse + len;
13193   RExC_in_multi_char_class = 1;
13194   RExC_emit = (regnode *)orig_emit;
13195
13196   ret = reg(pRExC_state, 1, &reg_flags, depth+1);
13197
13198   *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13199
13200   RExC_parse = save_parse;
13201   RExC_end = save_end;
13202   RExC_in_multi_char_class = 0;
13203   SvREFCNT_dec_NN(multi_char_matches);
13204   return ret;
13205  }
13206
13207  /* If the character class contains only a single element, it may be
13208  * optimizable into another node type which is smaller and runs faster.
13209  * Check if this is the case for this class */
13210  if (element_count == 1 && ! ret_invlist) {
13211   U8 op = END;
13212   U8 arg = 0;
13213
13214   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
13215            [:digit:] or \p{foo} */
13216
13217    /* All named classes are mapped into POSIXish nodes, with its FLAG
13218    * argument giving which class it is */
13219    switch ((I32)namedclass) {
13220     case ANYOF_UNIPROP:
13221      break;
13222
13223     /* These don't depend on the charset modifiers.  They always
13224     * match under /u rules */
13225     case ANYOF_NHORIZWS:
13226     case ANYOF_HORIZWS:
13227      namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13228      /* FALLTHROUGH */
13229
13230     case ANYOF_NVERTWS:
13231     case ANYOF_VERTWS:
13232      op = POSIXU;
13233      goto join_posix;
13234
13235     /* The actual POSIXish node for all the rest depends on the
13236     * charset modifier.  The ones in the first set depend only on
13237     * ASCII or, if available on this platform, locale */
13238     case ANYOF_ASCII:
13239     case ANYOF_NASCII:
13240 #ifdef HAS_ISASCII
13241      op = (LOC) ? POSIXL : POSIXA;
13242 #else
13243      op = POSIXA;
13244 #endif
13245      goto join_posix;
13246
13247     case ANYOF_NCASED:
13248     case ANYOF_LOWER:
13249     case ANYOF_NLOWER:
13250     case ANYOF_UPPER:
13251     case ANYOF_NUPPER:
13252      /* under /a could be alpha */
13253      if (FOLD) {
13254       if (ASCII_RESTRICTED) {
13255        namedclass = ANYOF_ALPHA + (namedclass % 2);
13256       }
13257       else if (! LOC) {
13258        break;
13259       }
13260      }
13261      /* FALLTHROUGH */
13262
13263     /* The rest have more possibilities depending on the charset.
13264     * We take advantage of the enum ordering of the charset
13265     * modifiers to get the exact node type, */
13266     default:
13267      op = POSIXD + get_regex_charset(RExC_flags);
13268      if (op > POSIXA) { /* /aa is same as /a */
13269       op = POSIXA;
13270      }
13271 #ifndef HAS_ISBLANK
13272      if (op == POSIXL
13273       && (namedclass == ANYOF_BLANK
13274        || namedclass == ANYOF_NBLANK))
13275      {
13276       op = POSIXA;
13277      }
13278 #endif
13279
13280     join_posix:
13281      /* The odd numbered ones are the complements of the
13282      * next-lower even number one */
13283      if (namedclass % 2 == 1) {
13284       invert = ! invert;
13285       namedclass--;
13286      }
13287      arg = namedclass_to_classnum(namedclass);
13288      break;
13289    }
13290   }
13291   else if (value == prevvalue) {
13292
13293    /* Here, the class consists of just a single code point */
13294
13295    if (invert) {
13296     if (! LOC && value == '\n') {
13297      op = REG_ANY; /* Optimize [^\n] */
13298      *flagp |= HASWIDTH|SIMPLE;
13299      RExC_naughty++;
13300     }
13301    }
13302    else if (value < 256 || UTF) {
13303
13304     /* Optimize a single value into an EXACTish node, but not if it
13305     * would require converting the pattern to UTF-8. */
13306     op = compute_EXACTish(pRExC_state);
13307    }
13308   } /* Otherwise is a range */
13309   else if (! LOC) {   /* locale could vary these */
13310    if (prevvalue == '0') {
13311     if (value == '9') {
13312      arg = _CC_DIGIT;
13313      op = POSIXA;
13314     }
13315    }
13316   }
13317
13318   /* Here, we have changed <op> away from its initial value iff we found
13319   * an optimization */
13320   if (op != END) {
13321
13322    /* Throw away this ANYOF regnode, and emit the calculated one,
13323    * which should correspond to the beginning, not current, state of
13324    * the parse */
13325    const char * cur_parse = RExC_parse;
13326    RExC_parse = (char *)orig_parse;
13327    if ( SIZE_ONLY) {
13328     if (! LOC) {
13329
13330      /* To get locale nodes to not use the full ANYOF size would
13331      * require moving the code above that writes the portions
13332      * of it that aren't in other nodes to after this point.
13333      * e.g.  ANYOF_CLASS_SET */
13334      RExC_size = orig_size;
13335     }
13336    }
13337    else {
13338     RExC_emit = (regnode *)orig_emit;
13339     if (PL_regkind[op] == POSIXD) {
13340      if (invert) {
13341       op += NPOSIXD - POSIXD;
13342      }
13343     }
13344    }
13345
13346    ret = reg_node(pRExC_state, op);
13347
13348    if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13349     if (! SIZE_ONLY) {
13350      FLAGS(ret) = arg;
13351     }
13352     *flagp |= HASWIDTH|SIMPLE;
13353    }
13354    else if (PL_regkind[op] == EXACT) {
13355     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13356    }
13357
13358    RExC_parse = (char *) cur_parse;
13359
13360    SvREFCNT_dec(posixes);
13361    SvREFCNT_dec(cp_list);
13362    return ret;
13363   }
13364  }
13365
13366  if (SIZE_ONLY)
13367   return ret;
13368  /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13369
13370  /* If folding, we calculate all characters that could fold to or from the
13371  * ones already on the list */
13372  if (FOLD && cp_list) {
13373   UV start, end; /* End points of code point ranges */
13374
13375   SV* fold_intersection = NULL;
13376
13377   /* If the highest code point is within Latin1, we can use the
13378   * compiled-in Alphas list, and not have to go out to disk.  This
13379   * yields two false positives, the masculine and feminine ordinal
13380   * indicators, which are weeded out below using the
13381   * IS_IN_SOME_FOLD_L1() macro */
13382   if (invlist_highest(cp_list) < 256) {
13383    _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13384               &fold_intersection);
13385   }
13386   else {
13387
13388    /* Here, there are non-Latin1 code points, so we will have to go
13389    * fetch the list of all the characters that participate in folds
13390    */
13391    if (! PL_utf8_foldable) {
13392     SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13393          &PL_sv_undef, 1, 0);
13394     PL_utf8_foldable = _get_swash_invlist(swash);
13395     SvREFCNT_dec_NN(swash);
13396    }
13397
13398    /* This is a hash that for a particular fold gives all characters
13399    * that are involved in it */
13400    if (! PL_utf8_foldclosures) {
13401
13402     /* If we were unable to find any folds, then we likely won't be
13403     * able to find the closures.  So just create an empty list.
13404     * Folding will effectively be restricted to the non-Unicode
13405     * rules hard-coded into Perl.  (This case happens legitimately
13406     * during compilation of Perl itself before the Unicode tables
13407     * are generated) */
13408     if (_invlist_len(PL_utf8_foldable) == 0) {
13409      PL_utf8_foldclosures = newHV();
13410     }
13411     else {
13412      /* If the folds haven't been read in, call a fold function
13413      * to force that */
13414      if (! PL_utf8_tofold) {
13415       U8 dummy[UTF8_MAXBYTES+1];
13416
13417       /* This string is just a short named one above \xff */
13418       to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13419       assert(PL_utf8_tofold); /* Verify that worked */
13420      }
13421      PL_utf8_foldclosures =
13422          _swash_inversion_hash(PL_utf8_tofold);
13423     }
13424    }
13425
13426    /* Only the characters in this class that participate in folds need
13427    * be checked.  Get the intersection of this class and all the
13428    * possible characters that are foldable.  This can quickly narrow
13429    * down a large class */
13430    _invlist_intersection(PL_utf8_foldable, cp_list,
13431         &fold_intersection);
13432   }
13433
13434   /* Now look at the foldable characters in this class individually */
13435   invlist_iterinit(fold_intersection);
13436   while (invlist_iternext(fold_intersection, &start, &end)) {
13437    UV j;
13438
13439    /* Locale folding for Latin1 characters is deferred until runtime */
13440    if (LOC && start < 256) {
13441     start = 256;
13442    }
13443
13444    /* Look at every character in the range */
13445    for (j = start; j <= end; j++) {
13446
13447     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13448     STRLEN foldlen;
13449     SV** listp;
13450
13451     if (j < 256) {
13452
13453      /* We have the latin1 folding rules hard-coded here so that
13454      * an innocent-looking character class, like /[ks]/i won't
13455      * have to go out to disk to find the possible matches.
13456      * XXX It would be better to generate these via regen, in
13457      * case a new version of the Unicode standard adds new
13458      * mappings, though that is not really likely, and may be
13459      * caught by the default: case of the switch below. */
13460
13461      if (IS_IN_SOME_FOLD_L1(j)) {
13462
13463       /* ASCII is always matched; non-ASCII is matched only
13464       * under Unicode rules */
13465       if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13466        cp_list =
13467         add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13468       }
13469       else {
13470        depends_list =
13471        add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13472       }
13473      }
13474
13475      if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13476       && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13477      {
13478       /* Certain Latin1 characters have matches outside
13479       * Latin1.  To get here, <j> is one of those
13480       * characters.   None of these matches is valid for
13481       * ASCII characters under /aa, which is why the 'if'
13482       * just above excludes those.  These matches only
13483       * happen when the target string is utf8.  The code
13484       * below adds the single fold closures for <j> to the
13485       * inversion list. */
13486       switch (j) {
13487        case 'k':
13488        case 'K':
13489         cp_list =
13490          add_cp_to_invlist(cp_list, KELVIN_SIGN);
13491         break;
13492        case 's':
13493        case 'S':
13494         cp_list = add_cp_to_invlist(cp_list,
13495              LATIN_SMALL_LETTER_LONG_S);
13496         break;
13497        case MICRO_SIGN:
13498         cp_list = add_cp_to_invlist(cp_list,
13499              GREEK_CAPITAL_LETTER_MU);
13500         cp_list = add_cp_to_invlist(cp_list,
13501              GREEK_SMALL_LETTER_MU);
13502         break;
13503        case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13504        case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13505         cp_list =
13506          add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13507         break;
13508        case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13509         cp_list = add_cp_to_invlist(cp_list,
13510           LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13511         break;
13512        case LATIN_SMALL_LETTER_SHARP_S:
13513         cp_list = add_cp_to_invlist(cp_list,
13514             LATIN_CAPITAL_LETTER_SHARP_S);
13515         break;
13516        case 'F': case 'f':
13517        case 'I': case 'i':
13518        case 'L': case 'l':
13519        case 'T': case 't':
13520        case 'A': case 'a':
13521        case 'H': case 'h':
13522        case 'J': case 'j':
13523        case 'N': case 'n':
13524        case 'W': case 'w':
13525        case 'Y': case 'y':
13526         /* These all are targets of multi-character
13527         * folds from code points that require UTF8 to
13528         * express, so they can't match unless the
13529         * target string is in UTF-8, so no action here
13530         * is necessary, as regexec.c properly handles
13531         * the general case for UTF-8 matching and
13532         * multi-char folds */
13533         break;
13534        default:
13535         /* Use deprecated warning to increase the
13536         * chances of this being output */
13537         ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13538         break;
13539       }
13540      }
13541      continue;
13542     }
13543
13544     /* Here is an above Latin1 character.  We don't have the rules
13545     * hard-coded for it.  First, get its fold.  This is the simple
13546     * fold, as the multi-character folds have been handled earlier
13547     * and separated out */
13548     _to_uni_fold_flags(j, foldbuf, &foldlen,
13549            ((LOC)
13550            ? FOLD_FLAGS_LOCALE
13551            : (ASCII_FOLD_RESTRICTED)
13552             ? FOLD_FLAGS_NOMIX_ASCII
13553             : 0));
13554
13555     /* Single character fold of above Latin1.  Add everything in
13556     * its fold closure to the list that this node should match.
13557     * The fold closures data structure is a hash with the keys
13558     * being the UTF-8 of every character that is folded to, like
13559     * 'k', and the values each an array of all code points that
13560     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
13561     * Multi-character folds are not included */
13562     if ((listp = hv_fetch(PL_utf8_foldclosures,
13563          (char *) foldbuf, foldlen, FALSE)))
13564     {
13565      AV* list = (AV*) *listp;
13566      IV k;
13567      for (k = 0; k <= av_len(list); k++) {
13568       SV** c_p = av_fetch(list, k, FALSE);
13569       UV c;
13570       if (c_p == NULL) {
13571        Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13572       }
13573       c = SvUV(*c_p);
13574
13575       /* /aa doesn't allow folds between ASCII and non-; /l
13576       * doesn't allow them between above and below 256 */
13577       if ((ASCII_FOLD_RESTRICTED
13578         && (isASCII(c) != isASCII(j)))
13579        || (LOC && ((c < 256) != (j < 256))))
13580       {
13581        continue;
13582       }
13583
13584       /* Folds involving non-ascii Latin1 characters
13585       * under /d are added to a separate list */
13586       if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13587       {
13588        cp_list = add_cp_to_invlist(cp_list, c);
13589       }
13590       else {
13591       depends_list = add_cp_to_invlist(depends_list, c);
13592       }
13593      }
13594     }
13595    }
13596   }
13597   SvREFCNT_dec_NN(fold_intersection);
13598  }
13599
13600  /* And combine the result (if any) with any inversion list from posix
13601  * classes.  The lists are kept separate up to now because we don't want to
13602  * fold the classes (folding of those is automatically handled by the swash
13603  * fetching code) */
13604  if (posixes) {
13605   if (! DEPENDS_SEMANTICS) {
13606    if (cp_list) {
13607     _invlist_union(cp_list, posixes, &cp_list);
13608     SvREFCNT_dec_NN(posixes);
13609    }
13610    else {
13611     cp_list = posixes;
13612    }
13613   }
13614   else {
13615    /* Under /d, we put into a separate list the Latin1 things that
13616    * match only when the target string is utf8 */
13617    SV* nonascii_but_latin1_properties = NULL;
13618    _invlist_intersection(posixes, PL_Latin1,
13619         &nonascii_but_latin1_properties);
13620    _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13621        &nonascii_but_latin1_properties);
13622    _invlist_subtract(posixes, nonascii_but_latin1_properties,
13623        &posixes);
13624    if (cp_list) {
13625     _invlist_union(cp_list, posixes, &cp_list);
13626     SvREFCNT_dec_NN(posixes);
13627    }
13628    else {
13629     cp_list = posixes;
13630    }
13631
13632    if (depends_list) {
13633     _invlist_union(depends_list, nonascii_but_latin1_properties,
13634        &depends_list);
13635     SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13636    }
13637    else {
13638     depends_list = nonascii_but_latin1_properties;
13639    }
13640   }
13641  }
13642
13643  /* And combine the result (if any) with any inversion list from properties.
13644  * The lists are kept separate up to now so that we can distinguish the two
13645  * in regards to matching above-Unicode.  A run-time warning is generated
13646  * if a Unicode property is matched against a non-Unicode code point. But,
13647  * we allow user-defined properties to match anything, without any warning,
13648  * and we also suppress the warning if there is a portion of the character
13649  * class that isn't a Unicode property, and which matches above Unicode, \W
13650  * or [\x{110000}] for example.
13651  * (Note that in this case, unlike the Posix one above, there is no
13652  * <depends_list>, because having a Unicode property forces Unicode
13653  * semantics */
13654  if (properties) {
13655   bool warn_super = ! has_user_defined_property;
13656   if (cp_list) {
13657
13658    /* If it matters to the final outcome, see if a non-property
13659    * component of the class matches above Unicode.  If so, the
13660    * warning gets suppressed.  This is true even if just a single
13661    * such code point is specified, as though not strictly correct if
13662    * another such code point is matched against, the fact that they
13663    * are using above-Unicode code points indicates they should know
13664    * the issues involved */
13665    if (warn_super) {
13666     bool non_prop_matches_above_Unicode =
13667        runtime_posix_matches_above_Unicode
13668        | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13669     if (invert) {
13670      non_prop_matches_above_Unicode =
13671            !  non_prop_matches_above_Unicode;
13672     }
13673     warn_super = ! non_prop_matches_above_Unicode;
13674    }
13675
13676    _invlist_union(properties, cp_list, &cp_list);
13677    SvREFCNT_dec_NN(properties);
13678   }
13679   else {
13680    cp_list = properties;
13681   }
13682
13683   if (warn_super) {
13684    OP(ret) = ANYOF_WARN_SUPER;
13685   }
13686  }
13687
13688  /* Here, we have calculated what code points should be in the character
13689  * class.
13690  *
13691  * Now we can see about various optimizations.  Fold calculation (which we
13692  * did above) needs to take place before inversion.  Otherwise /[^k]/i
13693  * would invert to include K, which under /i would match k, which it
13694  * shouldn't.  Therefore we can't invert folded locale now, as it won't be
13695  * folded until runtime */
13696
13697  /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13698  * at compile time.  Besides not inverting folded locale now, we can't
13699  * invert if there are things such as \w, which aren't known until runtime
13700  * */
13701  if (invert
13702   && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13703   && ! depends_list
13704   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13705  {
13706   _invlist_invert(cp_list);
13707
13708   /* Any swash can't be used as-is, because we've inverted things */
13709   if (swash) {
13710    SvREFCNT_dec_NN(swash);
13711    swash = NULL;
13712   }
13713
13714   /* Clear the invert flag since have just done it here */
13715   invert = FALSE;
13716  }
13717
13718  if (ret_invlist) {
13719   *ret_invlist = cp_list;
13720   SvREFCNT_dec(swash);
13721
13722   /* Discard the generated node */
13723   if (SIZE_ONLY) {
13724    RExC_size = orig_size;
13725   }
13726   else {
13727    RExC_emit = orig_emit;
13728   }
13729   return orig_emit;
13730  }
13731
13732  /* If we didn't do folding, it's because some information isn't available
13733  * until runtime; set the run-time fold flag for these.  (We don't have to
13734  * worry about properties folding, as that is taken care of by the swash
13735  * fetching) */
13736  if (FOLD && LOC)
13737  {
13738  ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13739  }
13740
13741  /* Some character classes are equivalent to other nodes.  Such nodes take
13742  * up less room and generally fewer operations to execute than ANYOF nodes.
13743  * Above, we checked for and optimized into some such equivalents for
13744  * certain common classes that are easy to test.  Getting to this point in
13745  * the code means that the class didn't get optimized there.  Since this
13746  * code is only executed in Pass 2, it is too late to save space--it has
13747  * been allocated in Pass 1, and currently isn't given back.  But turning
13748  * things into an EXACTish node can allow the optimizer to join it to any
13749  * adjacent such nodes.  And if the class is equivalent to things like /./,
13750  * expensive run-time swashes can be avoided.  Now that we have more
13751  * complete information, we can find things necessarily missed by the
13752  * earlier code.  I (khw) am not sure how much to look for here.  It would
13753  * be easy, but perhaps too slow, to check any candidates against all the
13754  * node types they could possibly match using _invlistEQ(). */
13755
13756  if (cp_list
13757   && ! invert
13758   && ! depends_list
13759   && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13760   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13761  {
13762   UV start, end;
13763   U8 op = END;  /* The optimzation node-type */
13764   const char * cur_parse= RExC_parse;
13765
13766   invlist_iterinit(cp_list);
13767   if (! invlist_iternext(cp_list, &start, &end)) {
13768
13769    /* Here, the list is empty.  This happens, for example, when a
13770    * Unicode property is the only thing in the character class, and
13771    * it doesn't match anything.  (perluniprops.pod notes such
13772    * properties) */
13773    op = OPFAIL;
13774    *flagp |= HASWIDTH|SIMPLE;
13775   }
13776   else if (start == end) {    /* The range is a single code point */
13777    if (! invlist_iternext(cp_list, &start, &end)
13778
13779      /* Don't do this optimization if it would require changing
13780      * the pattern to UTF-8 */
13781     && (start < 256 || UTF))
13782    {
13783     /* Here, the list contains a single code point.  Can optimize
13784     * into an EXACT node */
13785
13786     value = start;
13787
13788     if (! FOLD) {
13789      op = EXACT;
13790     }
13791     else if (LOC) {
13792
13793      /* A locale node under folding with one code point can be
13794      * an EXACTFL, as its fold won't be calculated until
13795      * runtime */
13796      op = EXACTFL;
13797     }
13798     else {
13799
13800      /* Here, we are generally folding, but there is only one
13801      * code point to match.  If we have to, we use an EXACT
13802      * node, but it would be better for joining with adjacent
13803      * nodes in the optimization pass if we used the same
13804      * EXACTFish node that any such are likely to be.  We can
13805      * do this iff the code point doesn't participate in any
13806      * folds.  For example, an EXACTF of a colon is the same as
13807      * an EXACT one, since nothing folds to or from a colon. */
13808      if (value < 256) {
13809       if (IS_IN_SOME_FOLD_L1(value)) {
13810        op = EXACT;
13811       }
13812      }
13813      else {
13814       if (! PL_utf8_foldable) {
13815        SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13816             &PL_sv_undef, 1, 0);
13817        PL_utf8_foldable = _get_swash_invlist(swash);
13818        SvREFCNT_dec_NN(swash);
13819       }
13820       if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13821        op = EXACT;
13822       }
13823      }
13824
13825      /* If we haven't found the node type, above, it means we
13826      * can use the prevailing one */
13827      if (op == END) {
13828       op = compute_EXACTish(pRExC_state);
13829      }
13830     }
13831    }
13832   }
13833   else if (start == 0) {
13834    if (end == UV_MAX) {
13835     op = SANY;
13836     *flagp |= HASWIDTH|SIMPLE;
13837     RExC_naughty++;
13838    }
13839    else if (end == '\n' - 1
13840      && invlist_iternext(cp_list, &start, &end)
13841      && start == '\n' + 1 && end == UV_MAX)
13842    {
13843     op = REG_ANY;
13844     *flagp |= HASWIDTH|SIMPLE;
13845     RExC_naughty++;
13846    }
13847   }
13848   invlist_iterfinish(cp_list);
13849
13850   if (op != END) {
13851    RExC_parse = (char *)orig_parse;
13852    RExC_emit = (regnode *)orig_emit;
13853
13854    ret = reg_node(pRExC_state, op);
13855
13856    RExC_parse = (char *)cur_parse;
13857
13858    if (PL_regkind[op] == EXACT) {
13859     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13860    }
13861
13862    SvREFCNT_dec_NN(cp_list);
13863    return ret;
13864   }
13865  }
13866
13867  /* Here, <cp_list> contains all the code points we can determine at
13868  * compile time that match under all conditions.  Go through it, and
13869  * for things that belong in the bitmap, put them there, and delete from
13870  * <cp_list>.  While we are at it, see if everything above 255 is in the
13871  * list, and if so, set a flag to speed up execution */
13872  ANYOF_BITMAP_ZERO(ret);
13873  if (cp_list) {
13874
13875   /* This gets set if we actually need to modify things */
13876   bool change_invlist = FALSE;
13877
13878   UV start, end;
13879
13880   /* Start looking through <cp_list> */
13881   invlist_iterinit(cp_list);
13882   while (invlist_iternext(cp_list, &start, &end)) {
13883    UV high;
13884    int i;
13885
13886    if (end == UV_MAX && start <= 256) {
13887     ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13888    }
13889
13890    /* Quit if are above what we should change */
13891    if (start > 255) {
13892     break;
13893    }
13894
13895    change_invlist = TRUE;
13896
13897    /* Set all the bits in the range, up to the max that we are doing */
13898    high = (end < 255) ? end : 255;
13899    for (i = start; i <= (int) high; i++) {
13900     if (! ANYOF_BITMAP_TEST(ret, i)) {
13901      ANYOF_BITMAP_SET(ret, i);
13902      prevvalue = value;
13903      value = i;
13904     }
13905    }
13906   }
13907   invlist_iterfinish(cp_list);
13908
13909   /* Done with loop; remove any code points that are in the bitmap from
13910   * <cp_list> */
13911   if (change_invlist) {
13912    _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13913   }
13914
13915   /* If have completely emptied it, remove it completely */
13916   if (_invlist_len(cp_list) == 0) {
13917    SvREFCNT_dec_NN(cp_list);
13918    cp_list = NULL;
13919   }
13920  }
13921
13922  if (invert) {
13923   ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13924  }
13925
13926  /* Here, the bitmap has been populated with all the Latin1 code points that
13927  * always match.  Can now add to the overall list those that match only
13928  * when the target string is UTF-8 (<depends_list>). */
13929  if (depends_list) {
13930   if (cp_list) {
13931    _invlist_union(cp_list, depends_list, &cp_list);
13932    SvREFCNT_dec_NN(depends_list);
13933   }
13934   else {
13935    cp_list = depends_list;
13936   }
13937  }
13938
13939  /* If there is a swash and more than one element, we can't use the swash in
13940  * the optimization below. */
13941  if (swash && element_count > 1) {
13942   SvREFCNT_dec_NN(swash);
13943   swash = NULL;
13944  }
13945
13946  if (! cp_list
13947   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13948  {
13949   ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13950  }
13951  else {
13952   /* av[0] stores the character class description in its textual form:
13953   *       used later (regexec.c:Perl_regclass_swash()) to initialize the
13954   *       appropriate swash, and is also useful for dumping the regnode.
13955   * av[1] if NULL, is a placeholder to later contain the swash computed
13956   *       from av[0].  But if no further computation need be done, the
13957   *       swash is stored there now.
13958   * av[2] stores the cp_list inversion list for use in addition or
13959   *       instead of av[0]; used only if av[1] is NULL
13960   * av[3] is set if any component of the class is from a user-defined
13961   *       property; used only if av[1] is NULL */
13962   AV * const av = newAV();
13963   SV *rv;
13964
13965   av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13966       ? SvREFCNT_inc(listsv) : &PL_sv_undef);
13967   if (swash) {
13968    av_store(av, 1, swash);
13969    SvREFCNT_dec_NN(cp_list);
13970   }
13971   else {
13972    av_store(av, 1, NULL);
13973    if (cp_list) {
13974     av_store(av, 2, cp_list);
13975     av_store(av, 3, newSVuv(has_user_defined_property));
13976    }
13977   }
13978
13979   rv = newRV_noinc(MUTABLE_SV(av));
13980   n = add_data(pRExC_state, 1, "s");
13981   RExC_rxi->data->data[n] = (void*)rv;
13982   ARG_SET(ret, n);
13983  }
13984
13985  *flagp |= HASWIDTH|SIMPLE;
13986  return ret;
13987 }
13988 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13989
13990
13991 /* reg_skipcomment()
13992
13993    Absorbs an /x style # comments from the input stream.
13994    Returns true if there is more text remaining in the stream.
13995    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13996    terminates the pattern without including a newline.
13997
13998    Note its the callers responsibility to ensure that we are
13999    actually in /x mode
14000
14001 */
14002
14003 STATIC bool
14004 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14005 {
14006  bool ended = 0;
14007
14008  PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14009
14010  while (RExC_parse < RExC_end)
14011   if (*RExC_parse++ == '\n') {
14012    ended = 1;
14013    break;
14014   }
14015  if (!ended) {
14016   /* we ran off the end of the pattern without ending
14017   the comment, so we have to add an \n when wrapping */
14018   RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
14019   return 0;
14020  } else
14021   return 1;
14022 }
14023
14024 /* nextchar()
14025
14026    Advances the parse position, and optionally absorbs
14027    "whitespace" from the inputstream.
14028
14029    Without /x "whitespace" means (?#...) style comments only,
14030    with /x this means (?#...) and # comments and whitespace proper.
14031
14032    Returns the RExC_parse point from BEFORE the scan occurs.
14033
14034    This is the /x friendly way of saying RExC_parse++.
14035 */
14036
14037 STATIC char*
14038 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14039 {
14040  char* const retval = RExC_parse++;
14041
14042  PERL_ARGS_ASSERT_NEXTCHAR;
14043
14044  for (;;) {
14045   if (RExC_end - RExC_parse >= 3
14046    && *RExC_parse == '('
14047    && RExC_parse[1] == '?'
14048    && RExC_parse[2] == '#')
14049   {
14050    while (*RExC_parse != ')') {
14051     if (RExC_parse == RExC_end)
14052      FAIL("Sequence (?#... not terminated");
14053     RExC_parse++;
14054    }
14055    RExC_parse++;
14056    continue;
14057   }
14058   if (RExC_flags & RXf_PMf_EXTENDED) {
14059    if (isSPACE(*RExC_parse)) {
14060     RExC_parse++;
14061     continue;
14062    }
14063    else if (*RExC_parse == '#') {
14064     if ( reg_skipcomment( pRExC_state ) )
14065      continue;
14066    }
14067   }
14068   return retval;
14069  }
14070 }
14071
14072 /*
14073 - reg_node - emit a node
14074 */
14075 STATIC regnode *   /* Location. */
14076 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
14077 {
14078  dVAR;
14079  regnode *ptr;
14080  regnode * const ret = RExC_emit;
14081  GET_RE_DEBUG_FLAGS_DECL;
14082
14083  PERL_ARGS_ASSERT_REG_NODE;
14084
14085  if (SIZE_ONLY) {
14086   SIZE_ALIGN(RExC_size);
14087   RExC_size += 1;
14088   return(ret);
14089  }
14090  if (RExC_emit >= RExC_emit_bound)
14091   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14092     op, RExC_emit, RExC_emit_bound);
14093
14094  NODE_ALIGN_FILL(ret);
14095  ptr = ret;
14096  FILL_ADVANCE_NODE(ptr, op);
14097  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
14098 #ifdef RE_TRACK_PATTERN_OFFSETS
14099  if (RExC_offsets) {         /* MJD */
14100   MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
14101    "reg_node", __LINE__,
14102    PL_reg_name[op],
14103    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
14104     ? "Overwriting end of array!\n" : "OK",
14105    (UV)(RExC_emit - RExC_emit_start),
14106    (UV)(RExC_parse - RExC_start),
14107    (UV)RExC_offsets[0]));
14108   Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14109  }
14110 #endif
14111  RExC_emit = ptr;
14112  return(ret);
14113 }
14114
14115 /*
14116 - reganode - emit a node with an argument
14117 */
14118 STATIC regnode *   /* Location. */
14119 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14120 {
14121  dVAR;
14122  regnode *ptr;
14123  regnode * const ret = RExC_emit;
14124  GET_RE_DEBUG_FLAGS_DECL;
14125
14126  PERL_ARGS_ASSERT_REGANODE;
14127
14128  if (SIZE_ONLY) {
14129   SIZE_ALIGN(RExC_size);
14130   RExC_size += 2;
14131   /*
14132   We can't do this:
14133
14134   assert(2==regarglen[op]+1);
14135
14136   Anything larger than this has to allocate the extra amount.
14137   If we changed this to be:
14138
14139   RExC_size += (1 + regarglen[op]);
14140
14141   then it wouldn't matter. Its not clear what side effect
14142   might come from that so its not done so far.
14143   -- dmq
14144   */
14145   return(ret);
14146  }
14147  if (RExC_emit >= RExC_emit_bound)
14148   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14149     op, RExC_emit, RExC_emit_bound);
14150
14151  NODE_ALIGN_FILL(ret);
14152  ptr = ret;
14153  FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14154  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
14155 #ifdef RE_TRACK_PATTERN_OFFSETS
14156  if (RExC_offsets) {         /* MJD */
14157   MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14158    "reganode",
14159    __LINE__,
14160    PL_reg_name[op],
14161    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14162    "Overwriting end of array!\n" : "OK",
14163    (UV)(RExC_emit - RExC_emit_start),
14164    (UV)(RExC_parse - RExC_start),
14165    (UV)RExC_offsets[0]));
14166   Set_Cur_Node_Offset;
14167  }
14168 #endif
14169  RExC_emit = ptr;
14170  return(ret);
14171 }
14172
14173 /*
14174 - reguni - emit (if appropriate) a Unicode character
14175 */
14176 STATIC STRLEN
14177 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14178 {
14179  dVAR;
14180
14181  PERL_ARGS_ASSERT_REGUNI;
14182
14183  return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14184 }
14185
14186 /*
14187 - reginsert - insert an operator in front of already-emitted operand
14188 *
14189 * Means relocating the operand.
14190 */
14191 STATIC void
14192 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14193 {
14194  dVAR;
14195  regnode *src;
14196  regnode *dst;
14197  regnode *place;
14198  const int offset = regarglen[(U8)op];
14199  const int size = NODE_STEP_REGNODE + offset;
14200  GET_RE_DEBUG_FLAGS_DECL;
14201
14202  PERL_ARGS_ASSERT_REGINSERT;
14203  PERL_UNUSED_ARG(depth);
14204 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14205  DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14206  if (SIZE_ONLY) {
14207   RExC_size += size;
14208   return;
14209  }
14210
14211  src = RExC_emit;
14212  RExC_emit += size;
14213  dst = RExC_emit;
14214  if (RExC_open_parens) {
14215   int paren;
14216   /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14217   for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14218    if ( RExC_open_parens[paren] >= opnd ) {
14219     /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14220     RExC_open_parens[paren] += size;
14221    } else {
14222     /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14223    }
14224    if ( RExC_close_parens[paren] >= opnd ) {
14225     /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14226     RExC_close_parens[paren] += size;
14227    } else {
14228     /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14229    }
14230   }
14231  }
14232
14233  while (src > opnd) {
14234   StructCopy(--src, --dst, regnode);
14235 #ifdef RE_TRACK_PATTERN_OFFSETS
14236   if (RExC_offsets) {     /* MJD 20010112 */
14237    MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14238     "reg_insert",
14239     __LINE__,
14240     PL_reg_name[op],
14241     (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14242      ? "Overwriting end of array!\n" : "OK",
14243     (UV)(src - RExC_emit_start),
14244     (UV)(dst - RExC_emit_start),
14245     (UV)RExC_offsets[0]));
14246    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14247    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14248   }
14249 #endif
14250  }
14251
14252
14253  place = opnd;  /* Op node, where operand used to be. */
14254 #ifdef RE_TRACK_PATTERN_OFFSETS
14255  if (RExC_offsets) {         /* MJD */
14256   MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14257    "reginsert",
14258    __LINE__,
14259    PL_reg_name[op],
14260    (UV)(place - RExC_emit_start) > RExC_offsets[0]
14261    ? "Overwriting end of array!\n" : "OK",
14262    (UV)(place - RExC_emit_start),
14263    (UV)(RExC_parse - RExC_start),
14264    (UV)RExC_offsets[0]));
14265   Set_Node_Offset(place, RExC_parse);
14266   Set_Node_Length(place, 1);
14267  }
14268 #endif
14269  src = NEXTOPER(place);
14270  FILL_ADVANCE_NODE(place, op);
14271  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
14272  Zero(src, offset, regnode);
14273 }
14274
14275 /*
14276 - regtail - set the next-pointer at the end of a node chain of p to val.
14277 - SEE ALSO: regtail_study
14278 */
14279 /* TODO: All three parms should be const */
14280 STATIC void
14281 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14282 {
14283  dVAR;
14284  regnode *scan;
14285  GET_RE_DEBUG_FLAGS_DECL;
14286
14287  PERL_ARGS_ASSERT_REGTAIL;
14288 #ifndef DEBUGGING
14289  PERL_UNUSED_ARG(depth);
14290 #endif
14291
14292  if (SIZE_ONLY)
14293   return;
14294
14295  /* Find last node. */
14296  scan = p;
14297  for (;;) {
14298   regnode * const temp = regnext(scan);
14299   DEBUG_PARSE_r({
14300    SV * const mysv=sv_newmortal();
14301    DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14302    regprop(RExC_rx, mysv, scan);
14303    PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14304     SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14305      (temp == NULL ? "->" : ""),
14306      (temp == NULL ? PL_reg_name[OP(val)] : "")
14307    );
14308   });
14309   if (temp == NULL)
14310    break;
14311   scan = temp;
14312  }
14313
14314  if (reg_off_by_arg[OP(scan)]) {
14315   ARG_SET(scan, val - scan);
14316  }
14317  else {
14318   NEXT_OFF(scan) = val - scan;
14319  }
14320 }
14321
14322 #ifdef DEBUGGING
14323 /*
14324 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14325 - Look for optimizable sequences at the same time.
14326 - currently only looks for EXACT chains.
14327
14328 This is experimental code. The idea is to use this routine to perform
14329 in place optimizations on branches and groups as they are constructed,
14330 with the long term intention of removing optimization from study_chunk so
14331 that it is purely analytical.
14332
14333 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14334 to control which is which.
14335
14336 */
14337 /* TODO: All four parms should be const */
14338
14339 STATIC U8
14340 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14341 {
14342  dVAR;
14343  regnode *scan;
14344  U8 exact = PSEUDO;
14345 #ifdef EXPERIMENTAL_INPLACESCAN
14346  I32 min = 0;
14347 #endif
14348  GET_RE_DEBUG_FLAGS_DECL;
14349
14350  PERL_ARGS_ASSERT_REGTAIL_STUDY;
14351
14352
14353  if (SIZE_ONLY)
14354   return exact;
14355
14356  /* Find last node. */
14357
14358  scan = p;
14359  for (;;) {
14360   regnode * const temp = regnext(scan);
14361 #ifdef EXPERIMENTAL_INPLACESCAN
14362   if (PL_regkind[OP(scan)] == EXACT) {
14363    bool has_exactf_sharp_s; /* Unexamined in this routine */
14364    if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14365     return EXACT;
14366   }
14367 #endif
14368   if ( exact ) {
14369    switch (OP(scan)) {
14370     case EXACT:
14371     case EXACTF:
14372     case EXACTFA:
14373     case EXACTFU:
14374     case EXACTFU_SS:
14375     case EXACTFU_TRICKYFOLD:
14376     case EXACTFL:
14377       if( exact == PSEUDO )
14378        exact= OP(scan);
14379       else if ( exact != OP(scan) )
14380        exact= 0;
14381     case NOTHING:
14382      break;
14383     default:
14384      exact= 0;
14385    }
14386   }
14387   DEBUG_PARSE_r({
14388    SV * const mysv=sv_newmortal();
14389    DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14390    regprop(RExC_rx, mysv, scan);
14391    PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14392     SvPV_nolen_const(mysv),
14393     REG_NODE_NUM(scan),
14394     PL_reg_name[exact]);
14395   });
14396   if (temp == NULL)
14397    break;
14398   scan = temp;
14399  }
14400  DEBUG_PARSE_r({
14401   SV * const mysv_val=sv_newmortal();
14402   DEBUG_PARSE_MSG("");
14403   regprop(RExC_rx, mysv_val, val);
14404   PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14405      SvPV_nolen_const(mysv_val),
14406      (IV)REG_NODE_NUM(val),
14407      (IV)(val - scan)
14408   );
14409  });
14410  if (reg_off_by_arg[OP(scan)]) {
14411   ARG_SET(scan, val - scan);
14412  }
14413  else {
14414   NEXT_OFF(scan) = val - scan;
14415  }
14416
14417  return exact;
14418 }
14419 #endif
14420
14421 /*
14422  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14423  */
14424 #ifdef DEBUGGING
14425 static void
14426 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14427 {
14428  int bit;
14429  int set=0;
14430  regex_charset cs;
14431
14432  for (bit=0; bit<32; bit++) {
14433   if (flags & (1<<bit)) {
14434    if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
14435     continue;
14436    }
14437    if (!set++ && lead)
14438     PerlIO_printf(Perl_debug_log, "%s",lead);
14439    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14440   }
14441  }
14442  if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14443    if (!set++ && lead) {
14444     PerlIO_printf(Perl_debug_log, "%s",lead);
14445    }
14446    switch (cs) {
14447     case REGEX_UNICODE_CHARSET:
14448      PerlIO_printf(Perl_debug_log, "UNICODE");
14449      break;
14450     case REGEX_LOCALE_CHARSET:
14451      PerlIO_printf(Perl_debug_log, "LOCALE");
14452      break;
14453     case REGEX_ASCII_RESTRICTED_CHARSET:
14454      PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14455      break;
14456     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14457      PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14458      break;
14459     default:
14460      PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14461      break;
14462    }
14463  }
14464  if (lead)  {
14465   if (set)
14466    PerlIO_printf(Perl_debug_log, "\n");
14467   else
14468    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14469  }
14470 }
14471 #endif
14472
14473 void
14474 Perl_regdump(pTHX_ const regexp *r)
14475 {
14476 #ifdef DEBUGGING
14477  dVAR;
14478  SV * const sv = sv_newmortal();
14479  SV *dsv= sv_newmortal();
14480  RXi_GET_DECL(r,ri);
14481  GET_RE_DEBUG_FLAGS_DECL;
14482
14483  PERL_ARGS_ASSERT_REGDUMP;
14484
14485  (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14486
14487  /* Header fields of interest. */
14488  if (r->anchored_substr) {
14489   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14490    RE_SV_DUMPLEN(r->anchored_substr), 30);
14491   PerlIO_printf(Perl_debug_log,
14492      "anchored %s%s at %"IVdf" ",
14493      s, RE_SV_TAIL(r->anchored_substr),
14494      (IV)r->anchored_offset);
14495  } else if (r->anchored_utf8) {
14496   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14497    RE_SV_DUMPLEN(r->anchored_utf8), 30);
14498   PerlIO_printf(Perl_debug_log,
14499      "anchored utf8 %s%s at %"IVdf" ",
14500      s, RE_SV_TAIL(r->anchored_utf8),
14501      (IV)r->anchored_offset);
14502  }
14503  if (r->float_substr) {
14504   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14505    RE_SV_DUMPLEN(r->float_substr), 30);
14506   PerlIO_printf(Perl_debug_log,
14507      "floating %s%s at %"IVdf"..%"UVuf" ",
14508      s, RE_SV_TAIL(r->float_substr),
14509      (IV)r->float_min_offset, (UV)r->float_max_offset);
14510  } else if (r->float_utf8) {
14511   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14512    RE_SV_DUMPLEN(r->float_utf8), 30);
14513   PerlIO_printf(Perl_debug_log,
14514      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14515      s, RE_SV_TAIL(r->float_utf8),
14516      (IV)r->float_min_offset, (UV)r->float_max_offset);
14517  }
14518  if (r->check_substr || r->check_utf8)
14519   PerlIO_printf(Perl_debug_log,
14520      (const char *)
14521      (r->check_substr == r->float_substr
14522      && r->check_utf8 == r->float_utf8
14523      ? "(checking floating" : "(checking anchored"));
14524  if (r->extflags & RXf_NOSCAN)
14525   PerlIO_printf(Perl_debug_log, " noscan");
14526  if (r->extflags & RXf_CHECK_ALL)
14527   PerlIO_printf(Perl_debug_log, " isall");
14528  if (r->check_substr || r->check_utf8)
14529   PerlIO_printf(Perl_debug_log, ") ");
14530
14531  if (ri->regstclass) {
14532   regprop(r, sv, ri->regstclass);
14533   PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14534  }
14535  if (r->extflags & RXf_ANCH) {
14536   PerlIO_printf(Perl_debug_log, "anchored");
14537   if (r->extflags & RXf_ANCH_BOL)
14538    PerlIO_printf(Perl_debug_log, "(BOL)");
14539   if (r->extflags & RXf_ANCH_MBOL)
14540    PerlIO_printf(Perl_debug_log, "(MBOL)");
14541   if (r->extflags & RXf_ANCH_SBOL)
14542    PerlIO_printf(Perl_debug_log, "(SBOL)");
14543   if (r->extflags & RXf_ANCH_GPOS)
14544    PerlIO_printf(Perl_debug_log, "(GPOS)");
14545   PerlIO_putc(Perl_debug_log, ' ');
14546  }
14547  if (r->extflags & RXf_GPOS_SEEN)
14548   PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14549  if (r->intflags & PREGf_SKIP)
14550   PerlIO_printf(Perl_debug_log, "plus ");
14551  if (r->intflags & PREGf_IMPLICIT)
14552   PerlIO_printf(Perl_debug_log, "implicit ");
14553  PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14554  if (r->extflags & RXf_EVAL_SEEN)
14555   PerlIO_printf(Perl_debug_log, "with eval ");
14556  PerlIO_printf(Perl_debug_log, "\n");
14557  DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
14558 #else
14559  PERL_ARGS_ASSERT_REGDUMP;
14560  PERL_UNUSED_CONTEXT;
14561  PERL_UNUSED_ARG(r);
14562 #endif /* DEBUGGING */
14563 }
14564
14565 /*
14566 - regprop - printable representation of opcode
14567 */
14568 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
14569 STMT_START { \
14570   if (do_sep) {                           \
14571    Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
14572    if (flags & ANYOF_INVERT)           \
14573     /*make sure the invert info is in each */ \
14574     sv_catpvs(sv, "^");             \
14575    do_sep = 0;                         \
14576   }                                       \
14577 } STMT_END
14578
14579 void
14580 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14581 {
14582 #ifdef DEBUGGING
14583  dVAR;
14584  int k;
14585
14586  /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14587  static const char * const anyofs[] = {
14588 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14589  || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
14590  || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
14591  || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
14592  || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
14593  || _CC_VERTSPACE != 16
14594   #error Need to adjust order of anyofs[]
14595 #endif
14596   "[\\w]",
14597   "[\\W]",
14598   "[\\d]",
14599   "[\\D]",
14600   "[:alpha:]",
14601   "[:^alpha:]",
14602   "[:lower:]",
14603   "[:^lower:]",
14604   "[:upper:]",
14605   "[:^upper:]",
14606   "[:punct:]",
14607   "[:^punct:]",
14608   "[:print:]",
14609   "[:^print:]",
14610   "[:alnum:]",
14611   "[:^alnum:]",
14612   "[:graph:]",
14613   "[:^graph:]",
14614   "[:cased:]",
14615   "[:^cased:]",
14616   "[\\s]",
14617   "[\\S]",
14618   "[:blank:]",
14619   "[:^blank:]",
14620   "[:xdigit:]",
14621   "[:^xdigit:]",
14622   "[:space:]",
14623   "[:^space:]",
14624   "[:cntrl:]",
14625   "[:^cntrl:]",
14626   "[:ascii:]",
14627   "[:^ascii:]",
14628   "[\\v]",
14629   "[\\V]"
14630  };
14631  RXi_GET_DECL(prog,progi);
14632  GET_RE_DEBUG_FLAGS_DECL;
14633
14634  PERL_ARGS_ASSERT_REGPROP;
14635
14636  sv_setpvs(sv, "");
14637
14638  if (OP(o) > REGNODE_MAX)  /* regnode.type is unsigned */
14639   /* It would be nice to FAIL() here, but this may be called from
14640   regexec.c, and it would be hard to supply pRExC_state. */
14641   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14642  sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14643
14644  k = PL_regkind[OP(o)];
14645
14646  if (k == EXACT) {
14647   sv_catpvs(sv, " ");
14648   /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14649   * is a crude hack but it may be the best for now since
14650   * we have no flag "this EXACTish node was UTF-8"
14651   * --jhi */
14652   pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14653     PERL_PV_ESCAPE_UNI_DETECT |
14654     PERL_PV_ESCAPE_NONASCII   |
14655     PERL_PV_PRETTY_ELLIPSES   |
14656     PERL_PV_PRETTY_LTGT       |
14657     PERL_PV_PRETTY_NOCLEAR
14658     );
14659  } else if (k == TRIE) {
14660   /* print the details of the trie in dumpuntil instead, as
14661   * progi->data isn't available here */
14662   const char op = OP(o);
14663   const U32 n = ARG(o);
14664   const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14665    (reg_ac_data *)progi->data->data[n] :
14666    NULL;
14667   const reg_trie_data * const trie
14668    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14669
14670   Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14671   DEBUG_TRIE_COMPILE_r(
14672    Perl_sv_catpvf(aTHX_ sv,
14673     "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
14674     (UV)trie->startstate,
14675     (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14676     (UV)trie->wordcount,
14677     (UV)trie->minlen,
14678     (UV)trie->maxlen,
14679     (UV)TRIE_CHARCOUNT(trie),
14680     (UV)trie->uniquecharcount
14681    )
14682   );
14683   if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14684    int i;
14685    int rangestart = -1;
14686    U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
14687    sv_catpvs(sv, "[");
14688    for (i = 0; i <= 256; i++) {
14689     if (i < 256 && BITMAP_TEST(bitmap,i)) {
14690      if (rangestart == -1)
14691       rangestart = i;
14692     } else if (rangestart != -1) {
14693      if (i <= rangestart + 3)
14694       for (; rangestart < i; rangestart++)
14695        put_byte(sv, rangestart);
14696      else {
14697       put_byte(sv, rangestart);
14698       sv_catpvs(sv, "-");
14699       put_byte(sv, i - 1);
14700      }
14701      rangestart = -1;
14702     }
14703    }
14704    sv_catpvs(sv, "]");
14705   }
14706
14707  } else if (k == CURLY) {
14708   if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14709    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14710   Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14711  }
14712  else if (k == WHILEM && o->flags)   /* Ordinal/of */
14713   Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14714  else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14715   Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
14716   if ( RXp_PAREN_NAMES(prog) ) {
14717    if ( k != REF || (OP(o) < NREF)) {
14718     AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14719     SV **name= av_fetch(list, ARG(o), 0 );
14720     if (name)
14721      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14722    }
14723    else {
14724     AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14725     SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14726     I32 *nums=(I32*)SvPVX(sv_dat);
14727     SV **name= av_fetch(list, nums[0], 0 );
14728     I32 n;
14729     if (name) {
14730      for ( n=0; n<SvIVX(sv_dat); n++ ) {
14731       Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14732          (n ? "," : ""), (IV)nums[n]);
14733      }
14734      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14735     }
14736    }
14737   }
14738  } else if (k == GOSUB)
14739   Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14740  else if (k == VERB) {
14741   if (!o->flags)
14742    Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14743       SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14744  } else if (k == LOGICAL)
14745   Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
14746  else if (k == ANYOF) {
14747   int i, rangestart = -1;
14748   const U8 flags = ANYOF_FLAGS(o);
14749   int do_sep = 0;
14750
14751
14752   if (flags & ANYOF_LOCALE)
14753    sv_catpvs(sv, "{loc}");
14754   if (flags & ANYOF_LOC_FOLD)
14755    sv_catpvs(sv, "{i}");
14756   Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14757   if (flags & ANYOF_INVERT)
14758    sv_catpvs(sv, "^");
14759
14760   /* output what the standard cp 0-255 bitmap matches */
14761   for (i = 0; i <= 256; i++) {
14762    if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
14763     if (rangestart == -1)
14764      rangestart = i;
14765    } else if (rangestart != -1) {
14766     if (i <= rangestart + 3)
14767      for (; rangestart < i; rangestart++)
14768       put_byte(sv, rangestart);
14769     else {
14770      put_byte(sv, rangestart);
14771      sv_catpvs(sv, "-");
14772      put_byte(sv, i - 1);
14773     }
14774     do_sep = 1;
14775     rangestart = -1;
14776    }
14777   }
14778
14779   EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14780   /* output any special charclass tests (used entirely under use locale) */
14781   if (ANYOF_CLASS_TEST_ANY_SET(o))
14782    for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
14783     if (ANYOF_CLASS_TEST(o,i)) {
14784      sv_catpv(sv, anyofs[i]);
14785      do_sep = 1;
14786     }
14787
14788   EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14789
14790   if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14791    sv_catpvs(sv, "{non-utf8-latin1-all}");
14792   }
14793
14794   /* output information about the unicode matching */
14795   if (flags & ANYOF_UNICODE_ALL)
14796    sv_catpvs(sv, "{unicode_all}");
14797   else if (ANYOF_NONBITMAP(o))
14798    sv_catpvs(sv, "{unicode}");
14799   if (flags & ANYOF_NONBITMAP_NON_UTF8)
14800    sv_catpvs(sv, "{outside bitmap}");
14801
14802   if (ANYOF_NONBITMAP(o)) {
14803    SV *lv; /* Set if there is something outside the bit map */
14804    SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14805    bool byte_output = FALSE;   /* If something in the bitmap has been
14806           output */
14807
14808    if (lv && lv != &PL_sv_undef) {
14809     if (sw) {
14810      U8 s[UTF8_MAXBYTES_CASE+1];
14811
14812      for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
14813       uvchr_to_utf8(s, i);
14814
14815       if (i < 256
14816        && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
14817                things already
14818                output as part
14819                of the bitmap */
14820        && swash_fetch(sw, s, TRUE))
14821       {
14822        if (rangestart == -1)
14823         rangestart = i;
14824       } else if (rangestart != -1) {
14825        byte_output = TRUE;
14826        if (i <= rangestart + 3)
14827         for (; rangestart < i; rangestart++) {
14828          put_byte(sv, rangestart);
14829         }
14830        else {
14831         put_byte(sv, rangestart);
14832         sv_catpvs(sv, "-");
14833         put_byte(sv, i-1);
14834        }
14835        rangestart = -1;
14836       }
14837      }
14838     }
14839
14840     {
14841      char *s = savesvpv(lv);
14842      char * const origs = s;
14843
14844      while (*s && *s != '\n')
14845       s++;
14846
14847      if (*s == '\n') {
14848       const char * const t = ++s;
14849
14850       if (byte_output) {
14851        sv_catpvs(sv, " ");
14852       }
14853
14854       while (*s) {
14855        if (*s == '\n') {
14856
14857         /* Truncate very long output */
14858         if (s - origs > 256) {
14859          Perl_sv_catpvf(aTHX_ sv,
14860             "%.*s...",
14861             (int) (s - origs - 1),
14862             t);
14863          goto out_dump;
14864         }
14865         *s = ' ';
14866        }
14867        else if (*s == '\t') {
14868         *s = '-';
14869        }
14870        s++;
14871       }
14872       if (s[-1] == ' ')
14873        s[-1] = 0;
14874
14875       sv_catpv(sv, t);
14876      }
14877
14878     out_dump:
14879
14880      Safefree(origs);
14881     }
14882     SvREFCNT_dec_NN(lv);
14883    }
14884   }
14885
14886   Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14887  }
14888  else if (k == POSIXD || k == NPOSIXD) {
14889   U8 index = FLAGS(o) * 2;
14890   if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14891    Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14892   }
14893   else {
14894    sv_catpv(sv, anyofs[index]);
14895   }
14896  }
14897  else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14898   Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14899 #else
14900  PERL_UNUSED_CONTEXT;
14901  PERL_UNUSED_ARG(sv);
14902  PERL_UNUSED_ARG(o);
14903  PERL_UNUSED_ARG(prog);
14904 #endif /* DEBUGGING */
14905 }
14906
14907 SV *
14908 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14909 {    /* Assume that RE_INTUIT is set */
14910  dVAR;
14911  struct regexp *const prog = ReANY(r);
14912  GET_RE_DEBUG_FLAGS_DECL;
14913
14914  PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14915  PERL_UNUSED_CONTEXT;
14916
14917  DEBUG_COMPILE_r(
14918   {
14919    const char * const s = SvPV_nolen_const(prog->check_substr
14920      ? prog->check_substr : prog->check_utf8);
14921
14922    if (!PL_colorset) reginitcolors();
14923    PerlIO_printf(Perl_debug_log,
14924      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14925      PL_colors[4],
14926      prog->check_substr ? "" : "utf8 ",
14927      PL_colors[5],PL_colors[0],
14928      s,
14929      PL_colors[1],
14930      (strlen(s) > 60 ? "..." : ""));
14931   } );
14932
14933  return prog->check_substr ? prog->check_substr : prog->check_utf8;
14934 }
14935
14936 /*
14937    pregfree()
14938
14939    handles refcounting and freeing the perl core regexp structure. When
14940    it is necessary to actually free the structure the first thing it
14941    does is call the 'free' method of the regexp_engine associated to
14942    the regexp, allowing the handling of the void *pprivate; member
14943    first. (This routine is not overridable by extensions, which is why
14944    the extensions free is called first.)
14945
14946    See regdupe and regdupe_internal if you change anything here.
14947 */
14948 #ifndef PERL_IN_XSUB_RE
14949 void
14950 Perl_pregfree(pTHX_ REGEXP *r)
14951 {
14952  SvREFCNT_dec(r);
14953 }
14954
14955 void
14956 Perl_pregfree2(pTHX_ REGEXP *rx)
14957 {
14958  dVAR;
14959  struct regexp *const r = ReANY(rx);
14960  GET_RE_DEBUG_FLAGS_DECL;
14961
14962  PERL_ARGS_ASSERT_PREGFREE2;
14963
14964  if (r->mother_re) {
14965   ReREFCNT_dec(r->mother_re);
14966  } else {
14967   CALLREGFREE_PVT(rx); /* free the private data */
14968   SvREFCNT_dec(RXp_PAREN_NAMES(r));
14969   Safefree(r->xpv_len_u.xpvlenu_pv);
14970  }
14971  if (r->substrs) {
14972   SvREFCNT_dec(r->anchored_substr);
14973   SvREFCNT_dec(r->anchored_utf8);
14974   SvREFCNT_dec(r->float_substr);
14975   SvREFCNT_dec(r->float_utf8);
14976   Safefree(r->substrs);
14977  }
14978  RX_MATCH_COPY_FREE(rx);
14979 #ifdef PERL_ANY_COW
14980  SvREFCNT_dec(r->saved_copy);
14981 #endif
14982  Safefree(r->offs);
14983  SvREFCNT_dec(r->qr_anoncv);
14984  rx->sv_u.svu_rx = 0;
14985 }
14986
14987 /*  reg_temp_copy()
14988
14989  This is a hacky workaround to the structural issue of match results
14990  being stored in the regexp structure which is in turn stored in
14991  PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14992  could be PL_curpm in multiple contexts, and could require multiple
14993  result sets being associated with the pattern simultaneously, such
14994  as when doing a recursive match with (??{$qr})
14995
14996  The solution is to make a lightweight copy of the regexp structure
14997  when a qr// is returned from the code executed by (??{$qr}) this
14998  lightweight copy doesn't actually own any of its data except for
14999  the starp/end and the actual regexp structure itself.
15000
15001 */
15002
15003
15004 REGEXP *
15005 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15006 {
15007  struct regexp *ret;
15008  struct regexp *const r = ReANY(rx);
15009  const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15010
15011  PERL_ARGS_ASSERT_REG_TEMP_COPY;
15012
15013  if (!ret_x)
15014   ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
15015  else {
15016   SvOK_off((SV *)ret_x);
15017   if (islv) {
15018    /* For PVLVs, SvANY points to the xpvlv body while sv_u points
15019    to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
15020    made both spots point to the same regexp body.) */
15021    REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
15022    assert(!SvPVX(ret_x));
15023    ret_x->sv_u.svu_rx = temp->sv_any;
15024    temp->sv_any = NULL;
15025    SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
15026    SvREFCNT_dec_NN(temp);
15027    /* SvCUR still resides in the xpvlv struct, so the regexp copy-
15028    ing below will not set it. */
15029    SvCUR_set(ret_x, SvCUR(rx));
15030   }
15031  }
15032  /* This ensures that SvTHINKFIRST(sv) is true, and hence that
15033  sv_force_normal(sv) is called.  */
15034  SvFAKE_on(ret_x);
15035  ret = ReANY(ret_x);
15036
15037  SvFLAGS(ret_x) |= SvUTF8(rx);
15038  /* We share the same string buffer as the original regexp, on which we
15039  hold a reference count, incremented when mother_re is set below.
15040  The string pointer is copied here, being part of the regexp struct.
15041  */
15042  memcpy(&(ret->xpv_cur), &(r->xpv_cur),
15043   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
15044  if (r->offs) {
15045   const I32 npar = r->nparens+1;
15046   Newx(ret->offs, npar, regexp_paren_pair);
15047   Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15048  }
15049  if (r->substrs) {
15050   Newx(ret->substrs, 1, struct reg_substr_data);
15051   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15052
15053   SvREFCNT_inc_void(ret->anchored_substr);
15054   SvREFCNT_inc_void(ret->anchored_utf8);
15055   SvREFCNT_inc_void(ret->float_substr);
15056   SvREFCNT_inc_void(ret->float_utf8);
15057
15058   /* check_substr and check_utf8, if non-NULL, point to either their
15059   anchored or float namesakes, and don't hold a second reference.  */
15060  }
15061  RX_MATCH_COPIED_off(ret_x);
15062 #ifdef PERL_ANY_COW
15063  ret->saved_copy = NULL;
15064 #endif
15065  ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
15066  SvREFCNT_inc_void(ret->qr_anoncv);
15067
15068  return ret_x;
15069 }
15070 #endif
15071
15072 /* regfree_internal()
15073
15074    Free the private data in a regexp. This is overloadable by
15075    extensions. Perl takes care of the regexp structure in pregfree(),
15076    this covers the *pprivate pointer which technically perl doesn't
15077    know about, however of course we have to handle the
15078    regexp_internal structure when no extension is in use.
15079
15080    Note this is called before freeing anything in the regexp
15081    structure.
15082  */
15083
15084 void
15085 Perl_regfree_internal(pTHX_ REGEXP * const rx)
15086 {
15087  dVAR;
15088  struct regexp *const r = ReANY(rx);
15089  RXi_GET_DECL(r,ri);
15090  GET_RE_DEBUG_FLAGS_DECL;
15091
15092  PERL_ARGS_ASSERT_REGFREE_INTERNAL;
15093
15094  DEBUG_COMPILE_r({
15095   if (!PL_colorset)
15096    reginitcolors();
15097   {
15098    SV *dsv= sv_newmortal();
15099    RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
15100     dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
15101    PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
15102     PL_colors[4],PL_colors[5],s);
15103   }
15104  });
15105 #ifdef RE_TRACK_PATTERN_OFFSETS
15106  if (ri->u.offsets)
15107   Safefree(ri->u.offsets);             /* 20010421 MJD */
15108 #endif
15109  if (ri->code_blocks) {
15110   int n;
15111   for (n = 0; n < ri->num_code_blocks; n++)
15112    SvREFCNT_dec(ri->code_blocks[n].src_regex);
15113   Safefree(ri->code_blocks);
15114  }
15115
15116  if (ri->data) {
15117   int n = ri->data->count;
15118
15119   while (--n >= 0) {
15120   /* If you add a ->what type here, update the comment in regcomp.h */
15121    switch (ri->data->what[n]) {
15122    case 'a':
15123    case 'r':
15124    case 's':
15125    case 'S':
15126    case 'u':
15127     SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15128     break;
15129    case 'f':
15130     Safefree(ri->data->data[n]);
15131     break;
15132    case 'l':
15133    case 'L':
15134     break;
15135    case 'T':
15136     { /* Aho Corasick add-on structure for a trie node.
15137      Used in stclass optimization only */
15138      U32 refcount;
15139      reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15140      OP_REFCNT_LOCK;
15141      refcount = --aho->refcount;
15142      OP_REFCNT_UNLOCK;
15143      if ( !refcount ) {
15144       PerlMemShared_free(aho->states);
15145       PerlMemShared_free(aho->fail);
15146       /* do this last!!!! */
15147       PerlMemShared_free(ri->data->data[n]);
15148       PerlMemShared_free(ri->regstclass);
15149      }
15150     }
15151     break;
15152    case 't':
15153     {
15154      /* trie structure. */
15155      U32 refcount;
15156      reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15157      OP_REFCNT_LOCK;
15158      refcount = --trie->refcount;
15159      OP_REFCNT_UNLOCK;
15160      if ( !refcount ) {
15161       PerlMemShared_free(trie->charmap);
15162       PerlMemShared_free(trie->states);
15163       PerlMemShared_free(trie->trans);
15164       if (trie->bitmap)
15165        PerlMemShared_free(trie->bitmap);
15166       if (trie->jump)
15167        PerlMemShared_free(trie->jump);
15168       PerlMemShared_free(trie->wordinfo);
15169       /* do this last!!!! */
15170       PerlMemShared_free(ri->data->data[n]);
15171      }
15172     }
15173     break;
15174    default:
15175     Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15176    }
15177   }
15178   Safefree(ri->data->what);
15179   Safefree(ri->data);
15180  }
15181
15182  Safefree(ri);
15183 }
15184
15185 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15186 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15187 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
15188
15189 /*
15190    re_dup - duplicate a regexp.
15191
15192    This routine is expected to clone a given regexp structure. It is only
15193    compiled under USE_ITHREADS.
15194
15195    After all of the core data stored in struct regexp is duplicated
15196    the regexp_engine.dupe method is used to copy any private data
15197    stored in the *pprivate pointer. This allows extensions to handle
15198    any duplication it needs to do.
15199
15200    See pregfree() and regfree_internal() if you change anything here.
15201 */
15202 #if defined(USE_ITHREADS)
15203 #ifndef PERL_IN_XSUB_RE
15204 void
15205 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15206 {
15207  dVAR;
15208  I32 npar;
15209  const struct regexp *r = ReANY(sstr);
15210  struct regexp *ret = ReANY(dstr);
15211
15212  PERL_ARGS_ASSERT_RE_DUP_GUTS;
15213
15214  npar = r->nparens+1;
15215  Newx(ret->offs, npar, regexp_paren_pair);
15216  Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15217
15218  if (ret->substrs) {
15219   /* Do it this way to avoid reading from *r after the StructCopy().
15220   That way, if any of the sv_dup_inc()s dislodge *r from the L1
15221   cache, it doesn't matter.  */
15222   const bool anchored = r->check_substr
15223    ? r->check_substr == r->anchored_substr
15224    : r->check_utf8 == r->anchored_utf8;
15225   Newx(ret->substrs, 1, struct reg_substr_data);
15226   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15227
15228   ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15229   ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15230   ret->float_substr = sv_dup_inc(ret->float_substr, param);
15231   ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15232
15233   /* check_substr and check_utf8, if non-NULL, point to either their
15234   anchored or float namesakes, and don't hold a second reference.  */
15235
15236   if (ret->check_substr) {
15237    if (anchored) {
15238     assert(r->check_utf8 == r->anchored_utf8);
15239     ret->check_substr = ret->anchored_substr;
15240     ret->check_utf8 = ret->anchored_utf8;
15241    } else {
15242     assert(r->check_substr == r->float_substr);
15243     assert(r->check_utf8 == r->float_utf8);
15244     ret->check_substr = ret->float_substr;
15245     ret->check_utf8 = ret->float_utf8;
15246    }
15247   } else if (ret->check_utf8) {
15248    if (anchored) {
15249     ret->check_utf8 = ret->anchored_utf8;
15250    } else {
15251     ret->check_utf8 = ret->float_utf8;
15252    }
15253   }
15254  }
15255
15256  RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15257  ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15258
15259  if (ret->pprivate)
15260   RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15261
15262  if (RX_MATCH_COPIED(dstr))
15263   ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
15264  else
15265   ret->subbeg = NULL;
15266 #ifdef PERL_ANY_COW
15267  ret->saved_copy = NULL;
15268 #endif
15269
15270  /* Whether mother_re be set or no, we need to copy the string.  We
15271  cannot refrain from copying it when the storage points directly to
15272  our mother regexp, because that's
15273    1: a buffer in a different thread
15274    2: something we no longer hold a reference on
15275    so we need to copy it locally.  */
15276  RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15277  ret->mother_re   = NULL;
15278  ret->gofs = 0;
15279 }
15280 #endif /* PERL_IN_XSUB_RE */
15281
15282 /*
15283    regdupe_internal()
15284
15285    This is the internal complement to regdupe() which is used to copy
15286    the structure pointed to by the *pprivate pointer in the regexp.
15287    This is the core version of the extension overridable cloning hook.
15288    The regexp structure being duplicated will be copied by perl prior
15289    to this and will be provided as the regexp *r argument, however
15290    with the /old/ structures pprivate pointer value. Thus this routine
15291    may override any copying normally done by perl.
15292
15293    It returns a pointer to the new regexp_internal structure.
15294 */
15295
15296 void *
15297 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15298 {
15299  dVAR;
15300  struct regexp *const r = ReANY(rx);
15301  regexp_internal *reti;
15302  int len;
15303  RXi_GET_DECL(r,ri);
15304
15305  PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15306
15307  len = ProgLen(ri);
15308
15309  Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15310  Copy(ri->program, reti->program, len+1, regnode);
15311
15312  reti->num_code_blocks = ri->num_code_blocks;
15313  if (ri->code_blocks) {
15314   int n;
15315   Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15316     struct reg_code_block);
15317   Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15318     struct reg_code_block);
15319   for (n = 0; n < ri->num_code_blocks; n++)
15320    reti->code_blocks[n].src_regex = (REGEXP*)
15321      sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15322  }
15323  else
15324   reti->code_blocks = NULL;
15325
15326  reti->regstclass = NULL;
15327
15328  if (ri->data) {
15329   struct reg_data *d;
15330   const int count = ri->data->count;
15331   int i;
15332
15333   Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15334     char, struct reg_data);
15335   Newx(d->what, count, U8);
15336
15337   d->count = count;
15338   for (i = 0; i < count; i++) {
15339    d->what[i] = ri->data->what[i];
15340    switch (d->what[i]) {
15341     /* see also regcomp.h and regfree_internal() */
15342    case 'a': /* actually an AV, but the dup function is identical.  */
15343    case 'r':
15344    case 's':
15345    case 'S':
15346    case 'u': /* actually an HV, but the dup function is identical.  */
15347     d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15348     break;
15349    case 'f':
15350     /* This is cheating. */
15351     Newx(d->data[i], 1, struct regnode_charclass_class);
15352     StructCopy(ri->data->data[i], d->data[i],
15353        struct regnode_charclass_class);
15354     reti->regstclass = (regnode*)d->data[i];
15355     break;
15356    case 'T':
15357     /* Trie stclasses are readonly and can thus be shared
15358     * without duplication. We free the stclass in pregfree
15359     * when the corresponding reg_ac_data struct is freed.
15360     */
15361     reti->regstclass= ri->regstclass;
15362     /* Fall through */
15363    case 't':
15364     OP_REFCNT_LOCK;
15365     ((reg_trie_data*)ri->data->data[i])->refcount++;
15366     OP_REFCNT_UNLOCK;
15367     /* Fall through */
15368    case 'l':
15369    case 'L':
15370     d->data[i] = ri->data->data[i];
15371     break;
15372    default:
15373     Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15374    }
15375   }
15376
15377   reti->data = d;
15378  }
15379  else
15380   reti->data = NULL;
15381
15382  reti->name_list_idx = ri->name_list_idx;
15383
15384 #ifdef RE_TRACK_PATTERN_OFFSETS
15385  if (ri->u.offsets) {
15386   Newx(reti->u.offsets, 2*len+1, U32);
15387   Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15388  }
15389 #else
15390  SetProgLen(reti,len);
15391 #endif
15392
15393  return (void*)reti;
15394 }
15395
15396 #endif    /* USE_ITHREADS */
15397
15398 #ifndef PERL_IN_XSUB_RE
15399
15400 /*
15401  - regnext - dig the "next" pointer out of a node
15402  */
15403 regnode *
15404 Perl_regnext(pTHX_ regnode *p)
15405 {
15406  dVAR;
15407  I32 offset;
15408
15409  if (!p)
15410   return(NULL);
15411
15412  if (OP(p) > REGNODE_MAX) {  /* regnode.type is unsigned */
15413   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15414  }
15415
15416  offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15417  if (offset == 0)
15418   return(NULL);
15419
15420  return(p+offset);
15421 }
15422 #endif
15423
15424 STATIC void
15425 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
15426 {
15427  va_list args;
15428  STRLEN l1 = strlen(pat1);
15429  STRLEN l2 = strlen(pat2);
15430  char buf[512];
15431  SV *msv;
15432  const char *message;
15433
15434  PERL_ARGS_ASSERT_RE_CROAK2;
15435
15436  if (l1 > 510)
15437   l1 = 510;
15438  if (l1 + l2 > 510)
15439   l2 = 510 - l1;
15440  Copy(pat1, buf, l1 , char);
15441  Copy(pat2, buf + l1, l2 , char);
15442  buf[l1 + l2] = '\n';
15443  buf[l1 + l2 + 1] = '\0';
15444 #ifdef I_STDARG
15445  /* ANSI variant takes additional second argument */
15446  va_start(args, pat2);
15447 #else
15448  va_start(args);
15449 #endif
15450  msv = vmess(buf, &args);
15451  va_end(args);
15452  message = SvPV_const(msv,l1);
15453  if (l1 > 512)
15454   l1 = 512;
15455  Copy(message, buf, l1 , char);
15456  buf[l1-1] = '\0';   /* Overwrite \n */
15457  Perl_croak(aTHX_ "%s", buf);
15458 }
15459
15460 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
15461
15462 #ifndef PERL_IN_XSUB_RE
15463 void
15464 Perl_save_re_context(pTHX)
15465 {
15466  dVAR;
15467
15468  struct re_save_state *state;
15469
15470  SAVEVPTR(PL_curcop);
15471  SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
15472
15473  state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
15474  PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
15475  SSPUSHUV(SAVEt_RE_STATE);
15476
15477  Copy(&PL_reg_state, state, 1, struct re_save_state);
15478
15479  PL_reg_oldsaved = NULL;
15480  PL_reg_oldsavedlen = 0;
15481  PL_reg_oldsavedoffset = 0;
15482  PL_reg_oldsavedcoffset = 0;
15483  PL_reg_maxiter = 0;
15484  PL_reg_leftiter = 0;
15485  PL_reg_poscache = NULL;
15486  PL_reg_poscache_size = 0;
15487 #ifdef PERL_ANY_COW
15488  PL_nrs = NULL;
15489 #endif
15490
15491  /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15492  if (PL_curpm) {
15493   const REGEXP * const rx = PM_GETRE(PL_curpm);
15494   if (rx) {
15495    U32 i;
15496    for (i = 1; i <= RX_NPARENS(rx); i++) {
15497     char digits[TYPE_CHARS(long)];
15498     const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15499     GV *const *const gvp
15500      = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15501
15502     if (gvp) {
15503      GV * const gv = *gvp;
15504      if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15505       save_scalar(gv);
15506     }
15507    }
15508   }
15509  }
15510 }
15511 #endif
15512
15513 #ifdef DEBUGGING
15514
15515 STATIC void
15516 S_put_byte(pTHX_ SV *sv, int c)
15517 {
15518  PERL_ARGS_ASSERT_PUT_BYTE;
15519
15520  /* Our definition of isPRINT() ignores locales, so only bytes that are
15521  not part of UTF-8 are considered printable. I assume that the same
15522  holds for UTF-EBCDIC.
15523  Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15524  which Wikipedia says:
15525
15526  EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15527  ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15528  identical, to the ASCII delete (DEL) or rubout control character. ...
15529  it is typically mapped to hexadecimal code 9F, in order to provide a
15530  unique character mapping in both directions)
15531
15532  So the old condition can be simplified to !isPRINT(c)  */
15533  if (!isPRINT(c)) {
15534   if (c < 256) {
15535    Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
15536   }
15537   else {
15538    Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15539   }
15540  }
15541  else {
15542   const char string = c;
15543   if (c == '-' || c == ']' || c == '\\' || c == '^')
15544    sv_catpvs(sv, "\\");
15545   sv_catpvn(sv, &string, 1);
15546  }
15547 }
15548
15549
15550 #define CLEAR_OPTSTART \
15551  if (optstart) STMT_START { \
15552    DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
15553    optstart=NULL; \
15554  } STMT_END
15555
15556 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
15557
15558 STATIC const regnode *
15559 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15560    const regnode *last, const regnode *plast,
15561    SV* sv, I32 indent, U32 depth)
15562 {
15563  dVAR;
15564  U8 op = PSEUDO; /* Arbitrary non-END op. */
15565  const regnode *next;
15566  const regnode *optstart= NULL;
15567
15568  RXi_GET_DECL(r,ri);
15569  GET_RE_DEBUG_FLAGS_DECL;
15570
15571  PERL_ARGS_ASSERT_DUMPUNTIL;
15572
15573 #ifdef DEBUG_DUMPUNTIL
15574  PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15575   last ? last-start : 0,plast ? plast-start : 0);
15576 #endif
15577
15578  if (plast && plast < last)
15579   last= plast;
15580
15581  while (PL_regkind[op] != END && (!last || node < last)) {
15582   /* While that wasn't END last time... */
15583   NODE_ALIGN(node);
15584   op = OP(node);
15585   if (op == CLOSE || op == WHILEM)
15586    indent--;
15587   next = regnext((regnode *)node);
15588
15589   /* Where, what. */
15590   if (OP(node) == OPTIMIZED) {
15591    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15592     optstart = node;
15593    else
15594     goto after_print;
15595   } else
15596    CLEAR_OPTSTART;
15597
15598   regprop(r, sv, node);
15599   PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15600      (int)(2*indent + 1), "", SvPVX_const(sv));
15601
15602   if (OP(node) != OPTIMIZED) {
15603    if (next == NULL)  /* Next ptr. */
15604     PerlIO_printf(Perl_debug_log, " (0)");
15605    else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15606     PerlIO_printf(Perl_debug_log, " (FAIL)");
15607    else
15608     PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15609    (void)PerlIO_putc(Perl_debug_log, '\n');
15610   }
15611
15612  after_print:
15613   if (PL_regkind[(U8)op] == BRANCHJ) {
15614    assert(next);
15615    {
15616     const regnode *nnode = (OP(next) == LONGJMP
15617          ? regnext((regnode *)next)
15618          : next);
15619     if (last && nnode > last)
15620      nnode = last;
15621     DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15622    }
15623   }
15624   else if (PL_regkind[(U8)op] == BRANCH) {
15625    assert(next);
15626    DUMPUNTIL(NEXTOPER(node), next);
15627   }
15628   else if ( PL_regkind[(U8)op]  == TRIE ) {
15629    const regnode *this_trie = node;
15630    const char op = OP(node);
15631    const U32 n = ARG(node);
15632    const reg_ac_data * const ac = op>=AHOCORASICK ?
15633    (reg_ac_data *)ri->data->data[n] :
15634    NULL;
15635    const reg_trie_data * const trie =
15636     (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
15637 #ifdef DEBUGGING
15638    AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15639 #endif
15640    const regnode *nextbranch= NULL;
15641    I32 word_idx;
15642    sv_setpvs(sv, "");
15643    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15644     SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15645
15646     PerlIO_printf(Perl_debug_log, "%*s%s ",
15647     (int)(2*(indent+3)), "",
15648      elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15649        PL_colors[0], PL_colors[1],
15650        (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15651        PERL_PV_PRETTY_ELLIPSES    |
15652        PERL_PV_PRETTY_LTGT
15653        )
15654        : "???"
15655     );
15656     if (trie->jump) {
15657      U16 dist= trie->jump[word_idx+1];
15658      PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15659         (UV)((dist ? this_trie + dist : next) - start));
15660      if (dist) {
15661       if (!nextbranch)
15662        nextbranch= this_trie + trie->jump[0];
15663       DUMPUNTIL(this_trie + dist, nextbranch);
15664      }
15665      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15666       nextbranch= regnext((regnode *)nextbranch);
15667     } else {
15668      PerlIO_printf(Perl_debug_log, "\n");
15669     }
15670    }
15671    if (last && next > last)
15672     node= last;
15673    else
15674     node= next;
15675   }
15676   else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
15677    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15678      NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15679   }
15680   else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15681    assert(next);
15682    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15683   }
15684   else if ( op == PLUS || op == STAR) {
15685    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15686   }
15687   else if (PL_regkind[(U8)op] == ANYOF) {
15688    /* arglen 1 + class block */
15689    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15690      ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15691    node = NEXTOPER(node);
15692   }
15693   else if (PL_regkind[(U8)op] == EXACT) {
15694    /* Literal string, where present. */
15695    node += NODE_SZ_STR(node) - 1;
15696    node = NEXTOPER(node);
15697   }
15698   else {
15699    node = NEXTOPER(node);
15700    node += regarglen[(U8)op];
15701   }
15702   if (op == CURLYX || op == OPEN)
15703    indent++;
15704  }
15705  CLEAR_OPTSTART;
15706 #ifdef DEBUG_DUMPUNTIL
15707  PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15708 #endif
15709  return node;
15710 }
15711
15712 #endif /* DEBUGGING */
15713
15714 /*
15715  * Local variables:
15716  * c-indentation-style: bsd
15717  * c-basic-offset: 4
15718  * indent-tabs-mode: nil
15719  * End:
15720  *
15721  * ex: set ts=8 sts=4 sw=4 et:
15722  */