5018004/regcomp.c

   1 /*    regcomp.c
   2  */
   3
   4 /*
   5  * 'A fair jaw-cracker dwarf-language must be.'            --Samwise Gamgee
   6  *
   7  *     [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
   8  */
   9
  10 /* This file contains functions for compiling a regular expression.  See
  11  * also regexec.c which funnily enough, contains functions for executing
  12  * a regular expression.
  13  *
  14  * This file is also copied at build time to ext/re/re_comp.c, where
  15  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  16  * This causes the main functions to be compiled under new names and with
  17  * debugging support added, which makes "use re 'debug'" work.
  18  */
  19
  20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  21  * confused with the original package (see point 3 below).  Thanks, Henry!
  22  */
  23
  24 /* Additional note: this code is very heavily munged from Henry's version
  25  * in places.  In some spots I've traded clarity for efficiency, so don't
  26  * blame Henry for some of the lack of readability.
  27  */
  28
  29 /* The names of the functions have been changed from regcomp and
  30  * regexec to pregcomp and pregexec in order to avoid conflicts
  31  * with the POSIX routines of the same names.
  32 */
  33
  34 #ifdef PERL_EXT_RE_BUILD
  35 #include "re_top.h"
  36 #endif
  37
  38 /*
  39  * pregcomp and pregexec -- regsub and regerror are not used in perl
  40  *
  41  * Copyright (c) 1986 by University of Toronto.
  42  * Written by Henry Spencer.  Not derived from licensed software.
  43  *
  44  * Permission is granted to anyone to use this software for any
  45  * purpose on any computer system, and to redistribute it freely,
  46  * subject to the following restrictions:
  47  *
  48  * 1. The author is not responsible for the consequences of use of
  49  *  this software, no matter how awful, even if they arise
  50  *  from defects in it.
  51  *
  52  * 2. The origin of this software must not be misrepresented, either
  53  *  by explicit claim or by omission.
  54  *
  55  * 3. Altered versions must be plainly marked as such, and must not
  56  *  be misrepresented as being the original software.
  57  *
  58  *
  59  ****    Alterations to Henry's code are...
  60  ****
  61  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  62  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  63  ****    by Larry Wall and others
  64  ****
  65  ****    You may distribute under the terms of either the GNU General Public
  66  ****    License or the Artistic License, as specified in the README file.
  67
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGCOMP_C
  75 #include "perl.h"
  76
  77 #ifndef PERL_IN_XSUB_RE
  78 #include "re_defs.h"
  79 #endif
  80
  81 #define REG_COMP_C
  82 #ifdef PERL_IN_XSUB_RE
  83 #  include "re_comp.h"
  84 extern const struct regexp_engine my_reg_engine;
  85 #else
  86 #  include "regcomp.h"
  87 #endif
  88
  89 #include "dquote_static.c"
  90 #include "charclass_invlists.h"
  91 #include "inline_invlist.c"
  92 #include "unicode_constants.h"
  93
  94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
  97
  98 #ifdef op
  99 #undef op
 100 #endif /* op */
 101
 102 #ifdef MSDOS
 103 #  if defined(BUGGY_MSC6)
 104  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
 105 #    pragma optimize("a",off)
 106  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
 107 #    pragma optimize("w",on )
 108 #  endif /* BUGGY_MSC6 */
 109 #endif /* MSDOS */
 110
 111 #ifndef STATIC
 112 #define STATIC static
 113 #endif
 114
 115
 116 typedef struct RExC_state_t {
 117  U32  flags;   /* RXf_* are we folding, multilining? */
 118  U32  pm_flags;  /* PMf_* stuff from the calling PMOP */
 119  char *precomp;  /* uncompiled string. */
 120  REGEXP *rx_sv;   /* The SV that is the regexp. */
 121  regexp *rx;                    /* perl core regexp structure */
 122  regexp_internal *rxi;           /* internal data for regexp object pprivate field */
 123  char *start;   /* Start of input for compile */
 124  char *end;   /* End of input for compile */
 125  char *parse;   /* Input-scan pointer. */
 126  I32  whilem_seen;  /* number of WHILEM in this expr */
 127  regnode *emit_start;  /* Start of emitted-code area */
 128  regnode *emit_bound;  /* First regnode outside of the allocated space */
 129  regnode *emit;   /* Code-emit pointer; &regdummy = don't = compiling */
 130  I32  naughty;  /* How bad is this pattern? */
 131  I32  sawback;  /* Did we see \1, ...? */
 132  U32  seen;
 133  I32  size;   /* Code size. */
 134  I32  npar;   /* Capture buffer count, (OPEN). */
 135  I32  cpar;   /* Capture buffer count, (CLOSE). */
 136  I32  nestroot;  /* root parens we are in - used by accept */
 137  I32  extralen;
 138  I32  seen_zerolen;
 139  regnode **open_parens;  /* pointers to open parens */
 140  regnode **close_parens;  /* pointers to close parens */
 141  regnode *opend;   /* END node in program */
 142  I32  utf8;  /* whether the pattern is utf8 or not */
 143  I32  orig_utf8; /* whether the pattern was originally in utf8 */
 144         /* XXX use this for future optimisation of case
 145         * where pattern must be upgraded to utf8. */
 146  I32  uni_semantics; /* If a d charset modifier should use unicode
 147         rules, even if the pattern is not in
 148         utf8 */
 149  HV  *paren_names;  /* Paren names */
 150
 151  regnode **recurse;  /* Recurse regops */
 152  I32  recurse_count;  /* Number of recurse regops */
 153  I32  in_lookbehind;
 154  I32  contains_locale;
 155  I32  override_recoding;
 156  I32  in_multi_char_class;
 157  struct reg_code_block *code_blocks; /* positions of literal (?{})
 158            within pattern */
 159  int  num_code_blocks; /* size of code_blocks[] */
 160  int  code_index;  /* next code_blocks[] slot */
 161 #if ADD_TO_REGEXEC
 162  char  *starttry;  /* -Dr: where regtry was called. */
 163 #define RExC_starttry (pRExC_state->starttry)
 164 #endif
 165  SV  *runtime_code_qr; /* qr with the runtime code blocks */
 166 #ifdef DEBUGGING
 167  const char  *lastparse;
 168  I32         lastnum;
 169  AV          *paren_name_list;       /* idx -> name */
 170 #define RExC_lastparse (pRExC_state->lastparse)
 171 #define RExC_lastnum (pRExC_state->lastnum)
 172 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 173 #endif
 174 } RExC_state_t;
 175
 176 #define RExC_flags (pRExC_state->flags)
 177 #define RExC_pm_flags (pRExC_state->pm_flags)
 178 #define RExC_precomp (pRExC_state->precomp)
 179 #define RExC_rx_sv (pRExC_state->rx_sv)
 180 #define RExC_rx  (pRExC_state->rx)
 181 #define RExC_rxi (pRExC_state->rxi)
 182 #define RExC_start (pRExC_state->start)
 183 #define RExC_end (pRExC_state->end)
 184 #define RExC_parse (pRExC_state->parse)
 185 #define RExC_whilem_seen (pRExC_state->whilem_seen)
 186 #ifdef RE_TRACK_PATTERN_OFFSETS
 187 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
 188 #endif
 189 #define RExC_emit (pRExC_state->emit)
 190 #define RExC_emit_start (pRExC_state->emit_start)
 191 #define RExC_emit_bound (pRExC_state->emit_bound)
 192 #define RExC_naughty (pRExC_state->naughty)
 193 #define RExC_sawback (pRExC_state->sawback)
 194 #define RExC_seen (pRExC_state->seen)
 195 #define RExC_size (pRExC_state->size)
 196 #define RExC_npar (pRExC_state->npar)
 197 #define RExC_nestroot   (pRExC_state->nestroot)
 198 #define RExC_extralen (pRExC_state->extralen)
 199 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
 200 #define RExC_utf8 (pRExC_state->utf8)
 201 #define RExC_uni_semantics (pRExC_state->uni_semantics)
 202 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
 203 #define RExC_open_parens (pRExC_state->open_parens)
 204 #define RExC_close_parens (pRExC_state->close_parens)
 205 #define RExC_opend (pRExC_state->opend)
 206 #define RExC_paren_names (pRExC_state->paren_names)
 207 #define RExC_recurse (pRExC_state->recurse)
 208 #define RExC_recurse_count (pRExC_state->recurse_count)
 209 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
 210 #define RExC_contains_locale (pRExC_state->contains_locale)
 211 #define RExC_override_recoding (pRExC_state->override_recoding)
 212 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
 213
 214
 215 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
 216 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
 217   ((*s) == '{' && regcurly(s, FALSE)))
 218
 219 #ifdef SPSTART
 220 #undef SPSTART  /* dratted cpp namespace... */
 221 #endif
 222 /*
 223  * Flags to be passed up and down.
 224  */
 225 #define WORST  0 /* Worst case. */
 226 #define HASWIDTH 0x01 /* Known to match non-null strings. */
 227
 228 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
 229  * character.  (There needs to be a case: in the switch statement in regexec.c
 230  * for any node marked SIMPLE.)  Note that this is not the same thing as
 231  * REGNODE_SIMPLE */
 232 #define SIMPLE  0x02
 233 #define SPSTART  0x04 /* Starts with * or + */
 234 #define POSTPONED 0x08    /* (?1),(?&name), (??{...}) or similar */
 235 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
 236 #define RESTART_UTF8    0x20    /* Restart, need to calcuate sizes as UTF-8 */
 237
 238 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
 239
 240 /* whether trie related optimizations are enabled */
 241 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
 242 #define TRIE_STUDY_OPT
 243 #define FULL_TRIE_STUDY
 244 #define TRIE_STCLASS
 245 #endif
 246
 247
 248
 249 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
 250 #define PBITVAL(paren) (1 << ((paren) & 7))
 251 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
 252 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
 253 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
 254
 255 #define REQUIRE_UTF8 STMT_START {                                       \
 256          if (!UTF) {                           \
 257           *flagp = RESTART_UTF8;            \
 258           return NULL;                      \
 259          }                                     \
 260       } STMT_END
 261
 262 /* This converts the named class defined in regcomp.h to its equivalent class
 263  * number defined in handy.h. */
 264 #define namedclass_to_classnum(class)  ((int) ((class) / 2))
 265 #define classnum_to_namedclass(classnum)  ((classnum) * 2)
 266
 267 /* About scan_data_t.
 268
 269   During optimisation we recurse through the regexp program performing
 270   various inplace (keyhole style) optimisations. In addition study_chunk
 271   and scan_commit populate this data structure with information about
 272   what strings MUST appear in the pattern. We look for the longest
 273   string that must appear at a fixed location, and we look for the
 274   longest string that may appear at a floating location. So for instance
 275   in the pattern:
 276
 277  /FOO[xX]A.*B[xX]BAR/
 278
 279   Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
 280   strings (because they follow a .* construct). study_chunk will identify
 281   both FOO and BAR as being the longest fixed and floating strings respectively.
 282
 283   The strings can be composites, for instance
 284
 285  /(f)(o)(o)/
 286
 287   will result in a composite fixed substring 'foo'.
 288
 289   For each string some basic information is maintained:
 290
 291   - offset or min_offset
 292  This is the position the string must appear at, or not before.
 293  It also implicitly (when combined with minlenp) tells us how many
 294  characters must match before the string we are searching for.
 295  Likewise when combined with minlenp and the length of the string it
 296  tells us how many characters must appear after the string we have
 297  found.
 298
 299   - max_offset
 300  Only used for floating strings. This is the rightmost point that
 301  the string can appear at. If set to I32 max it indicates that the
 302  string can occur infinitely far to the right.
 303
 304   - minlenp
 305  A pointer to the minimum number of characters of the pattern that the
 306  string was found inside. This is important as in the case of positive
 307  lookahead or positive lookbehind we can have multiple patterns
 308  involved. Consider
 309
 310  /(?=FOO).*F/
 311
 312  The minimum length of the pattern overall is 3, the minimum length
 313  of the lookahead part is 3, but the minimum length of the part that
 314  will actually match is 1. So 'FOO's minimum length is 3, but the
 315  minimum length for the F is 1. This is important as the minimum length
 316  is used to determine offsets in front of and behind the string being
 317  looked for.  Since strings can be composites this is the length of the
 318  pattern at the time it was committed with a scan_commit. Note that
 319  the length is calculated by study_chunk, so that the minimum lengths
 320  are not known until the full pattern has been compiled, thus the
 321  pointer to the value.
 322
 323   - lookbehind
 324
 325  In the case of lookbehind the string being searched for can be
 326  offset past the start point of the final matching string.
 327  If this value was just blithely removed from the min_offset it would
 328  invalidate some of the calculations for how many chars must match
 329  before or after (as they are derived from min_offset and minlen and
 330  the length of the string being searched for).
 331  When the final pattern is compiled and the data is moved from the
 332  scan_data_t structure into the regexp structure the information
 333  about lookbehind is factored in, with the information that would
 334  have been lost precalculated in the end_shift field for the
 335  associated string.
 336
 337   The fields pos_min and pos_delta are used to store the minimum offset
 338   and the delta to the maximum offset at the current point in the pattern.
 339
 340 */
 341
 342 typedef struct scan_data_t {
 343  /*I32 len_min;      unused */
 344  /*I32 len_delta;    unused */
 345  I32 pos_min;
 346  I32 pos_delta;
 347  SV *last_found;
 348  I32 last_end;     /* min value, <0 unless valid. */
 349  I32 last_start_min;
 350  I32 last_start_max;
 351  SV **longest;     /* Either &l_fixed, or &l_float. */
 352  SV *longest_fixed;      /* longest fixed string found in pattern */
 353  I32 offset_fixed;       /* offset where it starts */
 354  I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
 355  I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
 356  SV *longest_float;      /* longest floating string found in pattern */
 357  I32 offset_float_min;   /* earliest point in string it can appear */
 358  I32 offset_float_max;   /* latest point in string it can appear */
 359  I32 *minlen_float;      /* pointer to the minlen relevant to the string */
 360  I32 lookbehind_float;   /* is the position of the string modified by LB */
 361  I32 flags;
 362  I32 whilem_c;
 363  I32 *last_closep;
 364  struct regnode_charclass_class *start_class;
 365 } scan_data_t;
 366
 367 /*
 368  * Forward declarations for pregcomp()'s friends.
 369  */
 370
 371 static const scan_data_t zero_scan_data =
 372   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
 373
 374 #define SF_BEFORE_EOL  (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
 375 #define SF_BEFORE_SEOL  0x0001
 376 #define SF_BEFORE_MEOL  0x0002
 377 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
 378 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
 379
 380 #ifdef NO_UNARY_PLUS
 381 #  define SF_FIX_SHIFT_EOL (0+2)
 382 #  define SF_FL_SHIFT_EOL  (0+4)
 383 #else
 384 #  define SF_FIX_SHIFT_EOL (+2)
 385 #  define SF_FL_SHIFT_EOL  (+4)
 386 #endif
 387
 388 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
 389 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
 390
 391 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
 392 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
 393 #define SF_IS_INF  0x0040
 394 #define SF_HAS_PAR  0x0080
 395 #define SF_IN_PAR  0x0100
 396 #define SF_HAS_EVAL  0x0200
 397 #define SCF_DO_SUBSTR  0x0400
 398 #define SCF_DO_STCLASS_AND 0x0800
 399 #define SCF_DO_STCLASS_OR 0x1000
 400 #define SCF_DO_STCLASS  (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
 401 #define SCF_WHILEM_VISITED_POS 0x2000
 402
 403 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
 404 #define SCF_SEEN_ACCEPT         0x8000
 405
 406 #define UTF cBOOL(RExC_utf8)
 407
 408 /* The enums for all these are ordered so things work out correctly */
 409 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
 410 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
 411 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
 412 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
 413 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
 414 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
 415 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
 416
 417 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
 418
 419 #define OOB_NAMEDCLASS  -1
 420
 421 /* There is no code point that is out-of-bounds, so this is problematic.  But
 422  * its only current use is to initialize a variable that is always set before
 423  * looked at. */
 424 #define OOB_UNICODE  0xDEADBEEF
 425
 426 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
 427 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
 428
 429
 430 /* length of regex to show in messages that don't mark a position within */
 431 #define RegexLengthToShowInErrorMessages 127
 432
 433 /*
 434  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
 435  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
 436  * op/pragma/warn/regcomp.
 437  */
 438 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
 439 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
 440
 441 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
 442
 443 /*
 444  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
 445  * arg. Show regex, up to a maximum length. If it's too long, chop and add
 446  * "...".
 447  */
 448 #define _FAIL(code) STMT_START {     \
 449  const char *ellipses = "";      \
 450  IV len = RExC_end - RExC_precomp;     \
 451                   \
 452  if (!SIZE_ONLY)       \
 453   SAVEFREESV(RExC_rx_sv);      \
 454  if (len > RegexLengthToShowInErrorMessages) {   \
 455   /* chop 10 shorter than the max, to ensure meaning of "..." */ \
 456   len = RegexLengthToShowInErrorMessages - 10;   \
 457   ellipses = "...";      \
 458  }         \
 459  code;                                                               \
 460 } STMT_END
 461
 462 #define FAIL(msg) _FAIL(       \
 463  Perl_croak(aTHX_ "%s in regex m/%.*s%s/",     \
 464    msg, (int)len, RExC_precomp, ellipses))
 465
 466 #define FAIL2(msg,arg) _FAIL(       \
 467  Perl_croak(aTHX_ msg " in regex m/%.*s%s/",     \
 468    arg, (int)len, RExC_precomp, ellipses))
 469
 470 /*
 471  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
 472  */
 473 #define Simple_vFAIL(m) STMT_START {     \
 474  const IV offset = RExC_parse - RExC_precomp;   \
 475  Perl_croak(aTHX_ "%s" REPORT_LOCATION,    \
 476    m, (int)offset, RExC_precomp, RExC_precomp + offset); \
 477 } STMT_END
 478
 479 /*
 480  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
 481  */
 482 #define vFAIL(m) STMT_START {    \
 483  if (!SIZE_ONLY)     \
 484   SAVEFREESV(RExC_rx_sv);    \
 485  Simple_vFAIL(m);     \
 486 } STMT_END
 487
 488 /*
 489  * Like Simple_vFAIL(), but accepts two arguments.
 490  */
 491 #define Simple_vFAIL2(m,a1) STMT_START {   \
 492  const IV offset = RExC_parse - RExC_precomp;   \
 493  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,   \
 494    (int)offset, RExC_precomp, RExC_precomp + offset); \
 495 } STMT_END
 496
 497 /*
 498  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
 499  */
 500 #define vFAIL2(m,a1) STMT_START {   \
 501  if (!SIZE_ONLY)     \
 502   SAVEFREESV(RExC_rx_sv);    \
 503  Simple_vFAIL2(m, a1);    \
 504 } STMT_END
 505
 506
 507 /*
 508  * Like Simple_vFAIL(), but accepts three arguments.
 509  */
 510 #define Simple_vFAIL3(m, a1, a2) STMT_START {   \
 511  const IV offset = RExC_parse - RExC_precomp;  \
 512  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,  \
 513    (int)offset, RExC_precomp, RExC_precomp + offset); \
 514 } STMT_END
 515
 516 /*
 517  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
 518  */
 519 #define vFAIL3(m,a1,a2) STMT_START {   \
 520  if (!SIZE_ONLY)     \
 521   SAVEFREESV(RExC_rx_sv);    \
 522  Simple_vFAIL3(m, a1, a2);    \
 523 } STMT_END
 524
 525 /*
 526  * Like Simple_vFAIL(), but accepts four arguments.
 527  */
 528 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {  \
 529  const IV offset = RExC_parse - RExC_precomp;  \
 530  S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,  \
 531    (int)offset, RExC_precomp, RExC_precomp + offset); \
 532 } STMT_END
 533
 534 #define vFAIL4(m,a1,a2,a3) STMT_START {   \
 535  if (!SIZE_ONLY)     \
 536   SAVEFREESV(RExC_rx_sv);    \
 537  Simple_vFAIL4(m, a1, a2, a3);   \
 538 } STMT_END
 539
 540 /* m is not necessarily a "literal string", in this macro */
 541 #define reg_warn_non_literal_string(loc, m) STMT_START {                \
 542  const IV offset = loc - RExC_precomp;                               \
 543  Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
 544    m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
 545 } STMT_END
 546
 547 #define ckWARNreg(loc,m) STMT_START {     \
 548  const IV offset = loc - RExC_precomp;    \
 549  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 550    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 551 } STMT_END
 552
 553 #define vWARN_dep(loc, m) STMT_START {            \
 554  const IV offset = loc - RExC_precomp;    \
 555  Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
 556    (int)offset, RExC_precomp, RExC_precomp + offset);         \
 557 } STMT_END
 558
 559 #define ckWARNdep(loc,m) STMT_START {            \
 560  const IV offset = loc - RExC_precomp;    \
 561  Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),                 \
 562    m REPORT_LOCATION,      \
 563    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 564 } STMT_END
 565
 566 #define ckWARNregdep(loc,m) STMT_START {    \
 567  const IV offset = loc - RExC_precomp;    \
 568  Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 569    m REPORT_LOCATION,      \
 570    (int)offset, RExC_precomp, RExC_precomp + offset);  \
 571 } STMT_END
 572
 573 #define ckWARN2regdep(loc,m, a1) STMT_START {    \
 574  const IV offset = loc - RExC_precomp;    \
 575  Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
 576    m REPORT_LOCATION,      \
 577    a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
 578 } STMT_END
 579
 580 #define ckWARN2reg(loc, m, a1) STMT_START {    \
 581  const IV offset = loc - RExC_precomp;    \
 582  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 583    a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
 584 } STMT_END
 585
 586 #define vWARN3(loc, m, a1, a2) STMT_START {    \
 587  const IV offset = loc - RExC_precomp;    \
 588  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 589    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
 590 } STMT_END
 591
 592 #define ckWARN3reg(loc, m, a1, a2) STMT_START {    \
 593  const IV offset = loc - RExC_precomp;    \
 594  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 595    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
 596 } STMT_END
 597
 598 #define vWARN4(loc, m, a1, a2, a3) STMT_START {    \
 599  const IV offset = loc - RExC_precomp;    \
 600  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 601    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 602 } STMT_END
 603
 604 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START {   \
 605  const IV offset = loc - RExC_precomp;    \
 606  Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
 607    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
 608 } STMT_END
 609
 610 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {   \
 611  const IV offset = loc - RExC_precomp;    \
 612  Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,  \
 613    a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
 614 } STMT_END
 615
 616
 617 /* Allow for side effects in s */
 618 #define REGC(c,s) STMT_START {   \
 619  if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
 620 } STMT_END
 621
 622 /* Macros for recording node offsets.   20001227 mjd@plover.com
 623  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
 624  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
 625  * Element 0 holds the number n.
 626  * Position is 1 indexed.
 627  */
 628 #ifndef RE_TRACK_PATTERN_OFFSETS
 629 #define Set_Node_Offset_To_R(node,byte)
 630 #define Set_Node_Offset(node,byte)
 631 #define Set_Cur_Node_Offset
 632 #define Set_Node_Length_To_R(node,len)
 633 #define Set_Node_Length(node,len)
 634 #define Set_Node_Cur_Length(node)
 635 #define Node_Offset(n)
 636 #define Node_Length(n)
 637 #define Set_Node_Offset_Length(node,offset,len)
 638 #define ProgLen(ri) ri->u.proglen
 639 #define SetProgLen(ri,x) ri->u.proglen = x
 640 #else
 641 #define ProgLen(ri) ri->u.offsets[0]
 642 #define SetProgLen(ri,x) ri->u.offsets[0] = x
 643 #define Set_Node_Offset_To_R(node,byte) STMT_START {   \
 644  if (! SIZE_ONLY) {       \
 645   MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",  \
 646      __LINE__, (int)(node), (int)(byte)));  \
 647   if((node) < 0) {      \
 648    Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
 649   } else {       \
 650    RExC_offsets[2*(node)-1] = (byte);    \
 651   }        \
 652  }         \
 653 } STMT_END
 654
 655 #define Set_Node_Offset(node,byte) \
 656  Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
 657 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
 658
 659 #define Set_Node_Length_To_R(node,len) STMT_START {   \
 660  if (! SIZE_ONLY) {       \
 661   MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",  \
 662     __LINE__, (int)(node), (int)(len)));   \
 663   if((node) < 0) {      \
 664    Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
 665   } else {       \
 666    RExC_offsets[2*(node)] = (len);    \
 667   }        \
 668  }         \
 669 } STMT_END
 670
 671 #define Set_Node_Length(node,len) \
 672  Set_Node_Length_To_R((node)-RExC_emit_start, len)
 673 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
 674 #define Set_Node_Cur_Length(node) \
 675  Set_Node_Length(node, RExC_parse - parse_start)
 676
 677 /* Get offsets and lengths */
 678 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 679 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 680
 681 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
 682  Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
 683  Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
 684 } STMT_END
 685 #endif
 686
 687 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
 688 #define EXPERIMENTAL_INPLACESCAN
 689 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
 690
 691 #define DEBUG_STUDYDATA(str,data,depth)                              \
 692 DEBUG_OPTIMISE_MORE_r(if(data){                                      \
 693  PerlIO_printf(Perl_debug_log,                                    \
 694   "%*s" str "Pos:%"IVdf"/%"IVdf                                \
 695   " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s",       \
 696   (int)(depth)*2, "",                                          \
 697   (IV)((data)->pos_min),                                       \
 698   (IV)((data)->pos_delta),                                     \
 699   (UV)((data)->flags),                                         \
 700   (IV)((data)->whilem_c),                                      \
 701   (IV)((data)->last_closep ? *((data)->last_closep) : -1),     \
 702   is_inf ? "INF " : ""                                         \
 703  );                                                               \
 704  if ((data)->last_found)                                          \
 705   PerlIO_printf(Perl_debug_log,                                \
 706    "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
 707    " %sFloat: '%s' @ %"IVdf"/%"IVdf"",                      \
 708    SvPVX_const((data)->last_found),                         \
 709    (IV)((data)->last_end),                                  \
 710    (IV)((data)->last_start_min),                            \
 711    (IV)((data)->last_start_max),                            \
 712    ((data)->longest &&                                      \
 713    (data)->longest==&((data)->longest_fixed)) ? "*" : "",  \
 714    SvPVX_const((data)->longest_fixed),                      \
 715    (IV)((data)->offset_fixed),                              \
 716    ((data)->longest &&                                      \
 717    (data)->longest==&((data)->longest_float)) ? "*" : "",  \
 718    SvPVX_const((data)->longest_float),                      \
 719    (IV)((data)->offset_float_min),                          \
 720    (IV)((data)->offset_float_max)                           \
 721   );                                                           \
 722  PerlIO_printf(Perl_debug_log,"\n");                              \
 723 });
 724
 725 /* Mark that we cannot extend a found fixed substring at this point.
 726    Update the longest found anchored substring and the longest found
 727    floating substrings if needed. */
 728
 729 STATIC void
 730 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
 731 {
 732  const STRLEN l = CHR_SVLEN(data->last_found);
 733  const STRLEN old_l = CHR_SVLEN(*data->longest);
 734  GET_RE_DEBUG_FLAGS_DECL;
 735
 736  PERL_ARGS_ASSERT_SCAN_COMMIT;
 737
 738  if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
 739   SvSetMagicSV(*data->longest, data->last_found);
 740   if (*data->longest == data->longest_fixed) {
 741    data->offset_fixed = l ? data->last_start_min : data->pos_min;
 742    if (data->flags & SF_BEFORE_EOL)
 743     data->flags
 744      |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
 745    else
 746     data->flags &= ~SF_FIX_BEFORE_EOL;
 747    data->minlen_fixed=minlenp;
 748    data->lookbehind_fixed=0;
 749   }
 750   else { /* *data->longest == data->longest_float */
 751    data->offset_float_min = l ? data->last_start_min : data->pos_min;
 752    data->offset_float_max = (l
 753          ? data->last_start_max
 754          : (data->pos_delta == I32_MAX ? I32_MAX : data->pos_min + data->pos_delta));
 755    if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
 756     data->offset_float_max = I32_MAX;
 757    if (data->flags & SF_BEFORE_EOL)
 758     data->flags
 759      |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
 760    else
 761     data->flags &= ~SF_FL_BEFORE_EOL;
 762    data->minlen_float=minlenp;
 763    data->lookbehind_float=0;
 764   }
 765  }
 766  SvCUR_set(data->last_found, 0);
 767  {
 768   SV * const sv = data->last_found;
 769   if (SvUTF8(sv) && SvMAGICAL(sv)) {
 770    MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
 771    if (mg)
 772     mg->mg_len = 0;
 773   }
 774  }
 775  data->last_end = -1;
 776  data->flags &= ~SF_BEFORE_EOL;
 777  DEBUG_STUDYDATA("commit: ",data,0);
 778 }
 779
 780 /* These macros set, clear and test whether the synthetic start class ('ssc',
 781  * given by the parameter) matches an empty string (EOS).  This uses the
 782  * 'next_off' field in the node, to save a bit in the flags field.  The ssc
 783  * stands alone, so there is never a next_off, so this field is otherwise
 784  * unused.  The EOS information is used only for compilation, but theoretically
 785  * it could be passed on to the execution code.  This could be used to store
 786  * more than one bit of information, but only this one is currently used. */
 787 #define SET_SSC_EOS(node)   STMT_START { (node)->next_off = TRUE; } STMT_END
 788 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
 789 #define TEST_SSC_EOS(node)  cBOOL((node)->next_off)
 790
 791 /* Can match anything (initialization) */
 792 STATIC void
 793 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 794 {
 795  PERL_ARGS_ASSERT_CL_ANYTHING;
 796
 797  ANYOF_BITMAP_SETALL(cl);
 798  cl->flags = ANYOF_UNICODE_ALL;
 799  SET_SSC_EOS(cl);
 800
 801  /* If any portion of the regex is to operate under locale rules,
 802  * initialization includes it.  The reason this isn't done for all regexes
 803  * is that the optimizer was written under the assumption that locale was
 804  * all-or-nothing.  Given the complexity and lack of documentation in the
 805  * optimizer, and that there are inadequate test cases for locale, so many
 806  * parts of it may not work properly, it is safest to avoid locale unless
 807  * necessary. */
 808  if (RExC_contains_locale) {
 809   ANYOF_CLASS_SETALL(cl);     /* /l uses class */
 810   cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
 811  }
 812  else {
 813   ANYOF_CLASS_ZERO(cl);     /* Only /l uses class now */
 814  }
 815 }
 816
 817 /* Can match anything (initialization) */
 818 STATIC int
 819 S_cl_is_anything(const struct regnode_charclass_class *cl)
 820 {
 821  int value;
 822
 823  PERL_ARGS_ASSERT_CL_IS_ANYTHING;
 824
 825  for (value = 0; value < ANYOF_MAX; value += 2)
 826   if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
 827    return 1;
 828  if (!(cl->flags & ANYOF_UNICODE_ALL))
 829   return 0;
 830  if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
 831   return 0;
 832  return 1;
 833 }
 834
 835 /* Can match anything (initialization) */
 836 STATIC void
 837 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
 838 {
 839  PERL_ARGS_ASSERT_CL_INIT;
 840
 841  Zero(cl, 1, struct regnode_charclass_class);
 842  cl->type = ANYOF;
 843  cl_anything(pRExC_state, cl);
 844  ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 845 }
 846
 847 /* These two functions currently do the exact same thing */
 848 #define cl_init_zero  S_cl_init
 849
 850 /* 'AND' a given class with another one.  Can create false positives.  'cl'
 851  * should not be inverted.  'and_with->flags & ANYOF_CLASS' should be 0 if
 852  * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
 853 STATIC void
 854 S_cl_and(struct regnode_charclass_class *cl,
 855   const struct regnode_charclass_class *and_with)
 856 {
 857  PERL_ARGS_ASSERT_CL_AND;
 858
 859  assert(PL_regkind[and_with->type] == ANYOF);
 860
 861  /* I (khw) am not sure all these restrictions are necessary XXX */
 862  if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
 863   && !(ANYOF_CLASS_TEST_ANY_SET(cl))
 864   && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 865   && !(and_with->flags & ANYOF_LOC_FOLD)
 866   && !(cl->flags & ANYOF_LOC_FOLD)) {
 867   int i;
 868
 869   if (and_with->flags & ANYOF_INVERT)
 870    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 871     cl->bitmap[i] &= ~and_with->bitmap[i];
 872   else
 873    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
 874     cl->bitmap[i] &= and_with->bitmap[i];
 875  } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
 876
 877  if (and_with->flags & ANYOF_INVERT) {
 878
 879   /* Here, the and'ed node is inverted.  Get the AND of the flags that
 880   * aren't affected by the inversion.  Those that are affected are
 881   * handled individually below */
 882   U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
 883   cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
 884   cl->flags |= affected_flags;
 885
 886   /* We currently don't know how to deal with things that aren't in the
 887   * bitmap, but we know that the intersection is no greater than what
 888   * is already in cl, so let there be false positives that get sorted
 889   * out after the synthetic start class succeeds, and the node is
 890   * matched for real. */
 891
 892   /* The inversion of these two flags indicate that the resulting
 893   * intersection doesn't have them */
 894   if (and_with->flags & ANYOF_UNICODE_ALL) {
 895    cl->flags &= ~ANYOF_UNICODE_ALL;
 896   }
 897   if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
 898    cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
 899   }
 900  }
 901  else {   /* and'd node is not inverted */
 902   U8 outside_bitmap_but_not_utf8; /* Temp variable */
 903
 904   if (! ANYOF_NONBITMAP(and_with)) {
 905
 906    /* Here 'and_with' doesn't match anything outside the bitmap
 907    * (except possibly ANYOF_UNICODE_ALL), which means the
 908    * intersection can't either, except for ANYOF_UNICODE_ALL, in
 909    * which case we don't know what the intersection is, but it's no
 910    * greater than what cl already has, so can just leave it alone,
 911    * with possible false positives */
 912    if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
 913     ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 914     cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
 915    }
 916   }
 917   else if (! ANYOF_NONBITMAP(cl)) {
 918
 919    /* Here, 'and_with' does match something outside the bitmap, and cl
 920    * doesn't have a list of things to match outside the bitmap.  If
 921    * cl can match all code points above 255, the intersection will
 922    * be those above-255 code points that 'and_with' matches.  If cl
 923    * can't match all Unicode code points, it means that it can't
 924    * match anything outside the bitmap (since the 'if' that got us
 925    * into this block tested for that), so we leave the bitmap empty.
 926    */
 927    if (cl->flags & ANYOF_UNICODE_ALL) {
 928     ARG_SET(cl, ARG(and_with));
 929
 930     /* and_with's ARG may match things that don't require UTF8.
 931     * And now cl's will too, in spite of this being an 'and'.  See
 932     * the comments below about the kludge */
 933     cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
 934    }
 935   }
 936   else {
 937    /* Here, both 'and_with' and cl match something outside the
 938    * bitmap.  Currently we do not do the intersection, so just match
 939    * whatever cl had at the beginning.  */
 940   }
 941
 942
 943   /* Take the intersection of the two sets of flags.  However, the
 944   * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'.  This is a
 945   * kludge around the fact that this flag is not treated like the others
 946   * which are initialized in cl_anything().  The way the optimizer works
 947   * is that the synthetic start class (SSC) is initialized to match
 948   * anything, and then the first time a real node is encountered, its
 949   * values are AND'd with the SSC's with the result being the values of
 950   * the real node.  However, there are paths through the optimizer where
 951   * the AND never gets called, so those initialized bits are set
 952   * inappropriately, which is not usually a big deal, as they just cause
 953   * false positives in the SSC, which will just mean a probably
 954   * imperceptible slow down in execution.  However this bit has a
 955   * higher false positive consequence in that it can cause utf8.pm,
 956   * utf8_heavy.pl ... to be loaded when not necessary, which is a much
 957   * bigger slowdown and also causes significant extra memory to be used.
 958   * In order to prevent this, the code now takes a different tack.  The
 959   * bit isn't set unless some part of the regular expression needs it,
 960   * but once set it won't get cleared.  This means that these extra
 961   * modules won't get loaded unless there was some path through the
 962   * pattern that would have required them anyway, and  so any false
 963   * positives that occur by not ANDing them out when they could be
 964   * aren't as severe as they would be if we treated this bit like all
 965   * the others */
 966   outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
 967          & ANYOF_NONBITMAP_NON_UTF8;
 968   cl->flags &= and_with->flags;
 969   cl->flags |= outside_bitmap_but_not_utf8;
 970  }
 971 }
 972
 973 /* 'OR' a given class with another one.  Can create false positives.  'cl'
 974  * should not be inverted.  'or_with->flags & ANYOF_CLASS' should be 0 if
 975  * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
 976 STATIC void
 977 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
 978 {
 979  PERL_ARGS_ASSERT_CL_OR;
 980
 981  if (or_with->flags & ANYOF_INVERT) {
 982
 983   /* Here, the or'd node is to be inverted.  This means we take the
 984   * complement of everything not in the bitmap, but currently we don't
 985   * know what that is, so give up and match anything */
 986   if (ANYOF_NONBITMAP(or_with)) {
 987    cl_anything(pRExC_state, cl);
 988   }
 989   /* We do not use
 990   * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
 991   *   <= (B1 | !B2) | (CL1 | !CL2)
 992   * which is wasteful if CL2 is small, but we ignore CL2:
 993   *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
 994   * XXXX Can we handle case-fold?  Unclear:
 995   *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
 996   *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
 997   */
 998   else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
 999    && !(or_with->flags & ANYOF_LOC_FOLD)
1000    && !(cl->flags & ANYOF_LOC_FOLD) ) {
1001    int i;
1002
1003    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1004     cl->bitmap[i] |= ~or_with->bitmap[i];
1005   } /* XXXX: logic is complicated otherwise */
1006   else {
1007    cl_anything(pRExC_state, cl);
1008   }
1009
1010   /* And, we can just take the union of the flags that aren't affected
1011   * by the inversion */
1012   cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
1013
1014   /* For the remaining flags:
1015    ANYOF_UNICODE_ALL and inverted means to not match anything above
1016      255, which means that the union with cl should just be
1017      what cl has in it, so can ignore this flag
1018    ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1019      is 127-255 to match them, but then invert that, so the
1020      union with cl should just be what cl has in it, so can
1021      ignore this flag
1022   */
1023  } else {    /* 'or_with' is not inverted */
1024   /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1025   if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1026    && (!(or_with->flags & ANYOF_LOC_FOLD)
1027     || (cl->flags & ANYOF_LOC_FOLD)) ) {
1028    int i;
1029
1030    /* OR char bitmap and class bitmap separately */
1031    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1032     cl->bitmap[i] |= or_with->bitmap[i];
1033    if (or_with->flags & ANYOF_CLASS) {
1034     ANYOF_CLASS_OR(or_with, cl);
1035    }
1036   }
1037   else { /* XXXX: logic is complicated, leave it along for a moment. */
1038    cl_anything(pRExC_state, cl);
1039   }
1040
1041   if (ANYOF_NONBITMAP(or_with)) {
1042
1043    /* Use the added node's outside-the-bit-map match if there isn't a
1044    * conflict.  If there is a conflict (both nodes match something
1045    * outside the bitmap, but what they match outside is not the same
1046    * pointer, and hence not easily compared until XXX we extend
1047    * inversion lists this far), give up and allow the start class to
1048    * match everything outside the bitmap.  If that stuff is all above
1049    * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1050    if (! ANYOF_NONBITMAP(cl)) {
1051     ARG_SET(cl, ARG(or_with));
1052    }
1053    else if (ARG(cl) != ARG(or_with)) {
1054
1055     if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1056      cl_anything(pRExC_state, cl);
1057     }
1058     else {
1059      cl->flags |= ANYOF_UNICODE_ALL;
1060     }
1061    }
1062   }
1063
1064   /* Take the union */
1065   cl->flags |= or_with->flags;
1066  }
1067 }
1068
1069 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1070 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
1071 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1072 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1073
1074
1075 #ifdef DEBUGGING
1076 /*
1077    dump_trie(trie,widecharmap,revcharmap)
1078    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1079    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1080
1081    These routines dump out a trie in a somewhat readable format.
1082    The _interim_ variants are used for debugging the interim
1083    tables that are used to generate the final compressed
1084    representation which is what dump_trie expects.
1085
1086    Part of the reason for their existence is to provide a form
1087    of documentation as to how the different representations function.
1088
1089 */
1090
1091 /*
1092   Dumps the final compressed table form of the trie to Perl_debug_log.
1093   Used for debugging make_trie().
1094 */
1095
1096 STATIC void
1097 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1098    AV *revcharmap, U32 depth)
1099 {
1100  U32 state;
1101  SV *sv=sv_newmortal();
1102  int colwidth= widecharmap ? 6 : 4;
1103  U16 word;
1104  GET_RE_DEBUG_FLAGS_DECL;
1105
1106  PERL_ARGS_ASSERT_DUMP_TRIE;
1107
1108  PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1109   (int)depth * 2 + 2,"",
1110   "Match","Base","Ofs" );
1111
1112  for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1113   SV ** const tmp = av_fetch( revcharmap, state, 0);
1114   if ( tmp ) {
1115    PerlIO_printf( Perl_debug_log, "%*s",
1116     colwidth,
1117     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1118        PL_colors[0], PL_colors[1],
1119        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1120        PERL_PV_ESCAPE_FIRSTCHAR
1121     )
1122    );
1123   }
1124  }
1125  PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1126   (int)depth * 2 + 2,"");
1127
1128  for( state = 0 ; state < trie->uniquecharcount ; state++ )
1129   PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1130  PerlIO_printf( Perl_debug_log, "\n");
1131
1132  for( state = 1 ; state < trie->statecount ; state++ ) {
1133   const U32 base = trie->states[ state ].trans.base;
1134
1135   PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1136
1137   if ( trie->states[ state ].wordnum ) {
1138    PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1139   } else {
1140    PerlIO_printf( Perl_debug_log, "%6s", "" );
1141   }
1142
1143   PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1144
1145   if ( base ) {
1146    U32 ofs = 0;
1147
1148    while( ( base + ofs  < trie->uniquecharcount ) ||
1149     ( base + ofs - trie->uniquecharcount < trie->lasttrans
1150      && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1151      ofs++;
1152
1153    PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1154
1155    for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1156     if ( ( base + ofs >= trie->uniquecharcount ) &&
1157      ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1158      trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1159     {
1160     PerlIO_printf( Perl_debug_log, "%*"UVXf,
1161      colwidth,
1162      (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1163     } else {
1164      PerlIO_printf( Perl_debug_log, "%*s",colwidth,"   ." );
1165     }
1166    }
1167
1168    PerlIO_printf( Perl_debug_log, "]");
1169
1170   }
1171   PerlIO_printf( Perl_debug_log, "\n" );
1172  }
1173  PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1174  for (word=1; word <= trie->wordcount; word++) {
1175   PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1176    (int)word, (int)(trie->wordinfo[word].prev),
1177    (int)(trie->wordinfo[word].len));
1178  }
1179  PerlIO_printf(Perl_debug_log, "\n" );
1180 }
1181 /*
1182   Dumps a fully constructed but uncompressed trie in list form.
1183   List tries normally only are used for construction when the number of
1184   possible chars (trie->uniquecharcount) is very high.
1185   Used for debugging make_trie().
1186 */
1187 STATIC void
1188 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1189       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1190       U32 depth)
1191 {
1192  U32 state;
1193  SV *sv=sv_newmortal();
1194  int colwidth= widecharmap ? 6 : 4;
1195  GET_RE_DEBUG_FLAGS_DECL;
1196
1197  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1198
1199  /* print out the table precompression.  */
1200  PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1201   (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1202   "------:-----+-----------------\n" );
1203
1204  for( state=1 ; state < next_alloc ; state ++ ) {
1205   U16 charid;
1206
1207   PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1208    (int)depth * 2 + 2,"", (UV)state  );
1209   if ( ! trie->states[ state ].wordnum ) {
1210    PerlIO_printf( Perl_debug_log, "%5s| ","");
1211   } else {
1212    PerlIO_printf( Perl_debug_log, "W%4x| ",
1213     trie->states[ state ].wordnum
1214    );
1215   }
1216   for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1217    SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1218    if ( tmp ) {
1219     PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1220      colwidth,
1221      pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1222        PL_colors[0], PL_colors[1],
1223        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1224        PERL_PV_ESCAPE_FIRSTCHAR
1225      ) ,
1226      TRIE_LIST_ITEM(state,charid).forid,
1227      (UV)TRIE_LIST_ITEM(state,charid).newstate
1228     );
1229     if (!(charid % 10))
1230      PerlIO_printf(Perl_debug_log, "\n%*s| ",
1231       (int)((depth * 2) + 14), "");
1232    }
1233   }
1234   PerlIO_printf( Perl_debug_log, "\n");
1235  }
1236 }
1237
1238 /*
1239   Dumps a fully constructed but uncompressed trie in table form.
1240   This is the normal DFA style state transition table, with a few
1241   twists to facilitate compression later.
1242   Used for debugging make_trie().
1243 */
1244 STATIC void
1245 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1246       HV *widecharmap, AV *revcharmap, U32 next_alloc,
1247       U32 depth)
1248 {
1249  U32 state;
1250  U16 charid;
1251  SV *sv=sv_newmortal();
1252  int colwidth= widecharmap ? 6 : 4;
1253  GET_RE_DEBUG_FLAGS_DECL;
1254
1255  PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1256
1257  /*
1258  print out the table precompression so that we can do a visual check
1259  that they are identical.
1260  */
1261
1262  PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1263
1264  for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1265   SV ** const tmp = av_fetch( revcharmap, charid, 0);
1266   if ( tmp ) {
1267    PerlIO_printf( Perl_debug_log, "%*s",
1268     colwidth,
1269     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1270        PL_colors[0], PL_colors[1],
1271        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1272        PERL_PV_ESCAPE_FIRSTCHAR
1273     )
1274    );
1275   }
1276  }
1277
1278  PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1279
1280  for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1281   PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1282  }
1283
1284  PerlIO_printf( Perl_debug_log, "\n" );
1285
1286  for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1287
1288   PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1289    (int)depth * 2 + 2,"",
1290    (UV)TRIE_NODENUM( state ) );
1291
1292   for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1293    UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1294    if (v)
1295     PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1296    else
1297     PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1298   }
1299   if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1300    PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1301   } else {
1302    PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1303    trie->states[ TRIE_NODENUM( state ) ].wordnum );
1304   }
1305  }
1306 }
1307
1308 #endif
1309
1310
1311 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1312   startbranch: the first branch in the whole branch sequence
1313   first      : start branch of sequence of branch-exact nodes.
1314    May be the same as startbranch
1315   last       : Thing following the last branch.
1316    May be the same as tail.
1317   tail       : item following the branch sequence
1318   count      : words in the sequence
1319   flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1320   depth      : indent depth
1321
1322 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1323
1324 A trie is an N'ary tree where the branches are determined by digital
1325 decomposition of the key. IE, at the root node you look up the 1st character and
1326 follow that branch repeat until you find the end of the branches. Nodes can be
1327 marked as "accepting" meaning they represent a complete word. Eg:
1328
1329   /he|she|his|hers/
1330
1331 would convert into the following structure. Numbers represent states, letters
1332 following numbers represent valid transitions on the letter from that state, if
1333 the number is in square brackets it represents an accepting state, otherwise it
1334 will be in parenthesis.
1335
1336  +-h->+-e->[3]-+-r->(8)-+-s->[9]
1337  |    |
1338  |   (2)
1339  |    |
1340  (1)   +-i->(6)-+-s->[7]
1341  |
1342  +-s->(3)-+-h->(4)-+-e->[5]
1343
1344  Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1345
1346 This shows that when matching against the string 'hers' we will begin at state 1
1347 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1348 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1349 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1350 single traverse. We store a mapping from accepting to state to which word was
1351 matched, and then when we have multiple possibilities we try to complete the
1352 rest of the regex in the order in which they occured in the alternation.
1353
1354 The only prior NFA like behaviour that would be changed by the TRIE support is
1355 the silent ignoring of duplicate alternations which are of the form:
1356
1357  / (DUPE|DUPE) X? (?{ ... }) Y /x
1358
1359 Thus EVAL blocks following a trie may be called a different number of times with
1360 and without the optimisation. With the optimisations dupes will be silently
1361 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1362 the following demonstrates:
1363
1364  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1365
1366 which prints out 'word' three times, but
1367
1368  'words'=~/(word|word|word)(?{ print $1 })S/
1369
1370 which doesnt print it out at all. This is due to other optimisations kicking in.
1371
1372 Example of what happens on a structural level:
1373
1374 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1375
1376    1: CURLYM[1] {1,32767}(18)
1377    5:   BRANCH(8)
1378    6:     EXACT <ac>(16)
1379    8:   BRANCH(11)
1380    9:     EXACT <ad>(16)
1381   11:   BRANCH(14)
1382   12:     EXACT <ab>(16)
1383   16:   SUCCEED(0)
1384   17:   NOTHING(18)
1385   18: END(0)
1386
1387 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1388 and should turn into:
1389
1390    1: CURLYM[1] {1,32767}(18)
1391    5:   TRIE(16)
1392   [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1393   <ac>
1394   <ad>
1395   <ab>
1396   16:   SUCCEED(0)
1397   17:   NOTHING(18)
1398   18: END(0)
1399
1400 Cases where tail != last would be like /(?foo|bar)baz/:
1401
1402    1: BRANCH(4)
1403    2:   EXACT <foo>(8)
1404    4: BRANCH(7)
1405    5:   EXACT <bar>(8)
1406    7: TAIL(8)
1407    8: EXACT <baz>(10)
1408   10: END(0)
1409
1410 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1411 and would end up looking like:
1412
1413  1: TRIE(8)
1414  [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1415   <foo>
1416   <bar>
1417    7: TAIL(8)
1418    8: EXACT <baz>(10)
1419   10: END(0)
1420
1421  d = uvuni_to_utf8_flags(d, uv, 0);
1422
1423 is the recommended Unicode-aware way of saying
1424
1425  *(d++) = uv;
1426 */
1427
1428 #define TRIE_STORE_REVCHAR(val)                                            \
1429  STMT_START {                                                           \
1430   if (UTF) {          \
1431    SV *zlopp = newSV(7); /* XXX: optimize me */                   \
1432    unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);    \
1433    unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1434    SvCUR_set(zlopp, kapow - flrbbbbb);       \
1435    SvPOK_on(zlopp);         \
1436    SvUTF8_on(zlopp);         \
1437    av_push(revcharmap, zlopp);        \
1438   } else {          \
1439    char ooooff = (char)val;                                           \
1440    av_push(revcharmap, newSVpvn(&ooooff, 1));      \
1441   }           \
1442   } STMT_END
1443
1444 #define TRIE_READ_CHAR STMT_START {                                                     \
1445  wordlen++;                                                                          \
1446  if ( UTF ) {                                                                        \
1447   /* if it is UTF then it is either already folded, or does not need folding */   \
1448   uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
1449  }                                                                                   \
1450  else if (folder == PL_fold_latin1) {                                                \
1451   /* if we use this folder we have to obey unicode rules on latin-1 data */       \
1452   if ( foldlen > 0 ) {                                                            \
1453   uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
1454   foldlen -= len;                                                              \
1455   scan += len;                                                                 \
1456   len = 0;                                                                     \
1457   } else {                                                                        \
1458    len = 1;                                                                    \
1459    uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                     \
1460    skiplen = UNISKIP(uvc);                                                     \
1461    foldlen -= skiplen;                                                         \
1462    scan = foldbuf + skiplen;                                                   \
1463   }                                                                               \
1464  } else {                                                                            \
1465   /* raw data, will be folded later if needed */                                  \
1466   uvc = (U32)*uc;                                                                 \
1467   len = 1;                                                                        \
1468  }                                                                                   \
1469 } STMT_END
1470
1471
1472
1473 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
1474  if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
1475   U32 ging = TRIE_LIST_LEN( state ) *= 2;                 \
1476   Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1477  }                                                           \
1478  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
1479  TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
1480  TRIE_LIST_CUR( state )++;                                   \
1481 } STMT_END
1482
1483 #define TRIE_LIST_NEW(state) STMT_START {                       \
1484  Newxz( trie->states[ state ].trans.list,               \
1485   4, reg_trie_trans_le );                                 \
1486  TRIE_LIST_CUR( state ) = 1;                                \
1487  TRIE_LIST_LEN( state ) = 4;                                \
1488 } STMT_END
1489
1490 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
1491  U16 dupe= trie->states[ state ].wordnum;                    \
1492  regnode * const noper_next = regnext( noper );              \
1493                 \
1494  DEBUG_r({                                                   \
1495   /* store the word for dumping */                        \
1496   SV* tmp;                                                \
1497   if (OP(noper) != NOTHING)                               \
1498    tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1499   else                                                    \
1500    tmp = newSVpvn_utf8( "", 0, UTF );   \
1501   av_push( trie_words, tmp );                             \
1502  });                                                         \
1503                 \
1504  curword++;                                                  \
1505  trie->wordinfo[curword].prev   = 0;                         \
1506  trie->wordinfo[curword].len    = wordlen;                   \
1507  trie->wordinfo[curword].accept = state;                     \
1508                 \
1509  if ( noper_next < tail ) {                                  \
1510   if (!trie->jump)                                        \
1511    trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1512   trie->jump[curword] = (U16)(noper_next - convert);      \
1513   if (!jumper)                                            \
1514    jumper = noper_next;                                \
1515   if (!nextbranch)                                        \
1516    nextbranch= regnext(cur);                           \
1517  }                                                           \
1518                 \
1519  if ( dupe ) {                                               \
1520   /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
1521   /* chain, so that when the bits of chain are later    */\
1522   /* linked together, the dups appear in the chain      */\
1523   trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1524   trie->wordinfo[dupe].prev = curword;                    \
1525  } else {                                                    \
1526   /* we haven't inserted this word yet.                */ \
1527   trie->states[ state ].wordnum = curword;                \
1528  }                                                           \
1529 } STMT_END
1530
1531
1532 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)  \
1533  ( ( base + charid >=  ucharcount     \
1534   && base + charid < ubound     \
1535   && state == trie->trans[ base - ucharcount + charid ].check \
1536   && trie->trans[ base - ucharcount + charid ].next )  \
1537   ? trie->trans[ base - ucharcount + charid ].next  \
1538   : ( state==1 ? special : 0 )     \
1539  )
1540
1541 #define MADE_TRIE       1
1542 #define MADE_JUMP_TRIE  2
1543 #define MADE_EXACT_TRIE 4
1544
1545 STATIC I32
1546 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1547 {
1548  dVAR;
1549  /* first pass, loop through and scan words */
1550  reg_trie_data *trie;
1551  HV *widecharmap = NULL;
1552  AV *revcharmap = newAV();
1553  regnode *cur;
1554  const U32 uniflags = UTF8_ALLOW_DEFAULT;
1555  STRLEN len = 0;
1556  UV uvc = 0;
1557  U16 curword = 0;
1558  U32 next_alloc = 0;
1559  regnode *jumper = NULL;
1560  regnode *nextbranch = NULL;
1561  regnode *convert = NULL;
1562  U32 *prev_states; /* temp array mapping each state to previous one */
1563  /* we just use folder as a flag in utf8 */
1564  const U8 * folder = NULL;
1565
1566 #ifdef DEBUGGING
1567  const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1568  AV *trie_words = NULL;
1569  /* along with revcharmap, this only used during construction but both are
1570  * useful during debugging so we store them in the struct when debugging.
1571  */
1572 #else
1573  const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1574  STRLEN trie_charcount=0;
1575 #endif
1576  SV *re_trie_maxbuff;
1577  GET_RE_DEBUG_FLAGS_DECL;
1578
1579  PERL_ARGS_ASSERT_MAKE_TRIE;
1580 #ifndef DEBUGGING
1581  PERL_UNUSED_ARG(depth);
1582 #endif
1583
1584  switch (flags) {
1585   case EXACT: break;
1586   case EXACTFA:
1587   case EXACTFU_SS:
1588   case EXACTFU_TRICKYFOLD:
1589   case EXACTFU: folder = PL_fold_latin1; break;
1590   case EXACTF:  folder = PL_fold; break;
1591   case EXACTFL: folder = PL_fold_locale; break;
1592   default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1593  }
1594
1595  trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1596  trie->refcount = 1;
1597  trie->startstate = 1;
1598  trie->wordcount = word_count;
1599  RExC_rxi->data->data[ data_slot ] = (void*)trie;
1600  trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1601  if (flags == EXACT)
1602   trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1603  trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1604      trie->wordcount+1, sizeof(reg_trie_wordinfo));
1605
1606  DEBUG_r({
1607   trie_words = newAV();
1608  });
1609
1610  re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1611  if (!SvIOK(re_trie_maxbuff)) {
1612   sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1613  }
1614  DEBUG_TRIE_COMPILE_r({
1615     PerlIO_printf( Perl_debug_log,
1616     "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1617     (int)depth * 2 + 2, "",
1618     REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1619     REG_NODE_NUM(last), REG_NODE_NUM(tail),
1620     (int)depth);
1621  });
1622
1623    /* Find the node we are going to overwrite */
1624  if ( first == startbranch && OP( last ) != BRANCH ) {
1625   /* whole branch chain */
1626   convert = first;
1627  } else {
1628   /* branch sub-chain */
1629   convert = NEXTOPER( first );
1630  }
1631
1632  /*  -- First loop and Setup --
1633
1634  We first traverse the branches and scan each word to determine if it
1635  contains widechars, and how many unique chars there are, this is
1636  important as we have to build a table with at least as many columns as we
1637  have unique chars.
1638
1639  We use an array of integers to represent the character codes 0..255
1640  (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1641  native representation of the character value as the key and IV's for the
1642  coded index.
1643
1644  *TODO* If we keep track of how many times each character is used we can
1645  remap the columns so that the table compression later on is more
1646  efficient in terms of memory by ensuring the most common value is in the
1647  middle and the least common are on the outside.  IMO this would be better
1648  than a most to least common mapping as theres a decent chance the most
1649  common letter will share a node with the least common, meaning the node
1650  will not be compressible. With a middle is most common approach the worst
1651  case is when we have the least common nodes twice.
1652
1653  */
1654
1655  for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1656   regnode *noper = NEXTOPER( cur );
1657   const U8 *uc = (U8*)STRING( noper );
1658   const U8 *e  = uc + STR_LEN( noper );
1659   STRLEN foldlen = 0;
1660   U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1661   STRLEN skiplen = 0;
1662   const U8 *scan = (U8*)NULL;
1663   U32 wordlen      = 0;         /* required init */
1664   STRLEN chars = 0;
1665   bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1666
1667   if (OP(noper) == NOTHING) {
1668    regnode *noper_next= regnext(noper);
1669    if (noper_next != tail && OP(noper_next) == flags) {
1670     noper = noper_next;
1671     uc= (U8*)STRING(noper);
1672     e= uc + STR_LEN(noper);
1673     trie->minlen= STR_LEN(noper);
1674    } else {
1675     trie->minlen= 0;
1676     continue;
1677    }
1678   }
1679
1680   if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1681    TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1682           regardless of encoding */
1683    if (OP( noper ) == EXACTFU_SS) {
1684     /* false positives are ok, so just set this */
1685     TRIE_BITMAP_SET(trie,0xDF);
1686    }
1687   }
1688   for ( ; uc < e ; uc += len ) {
1689    TRIE_CHARCOUNT(trie)++;
1690    TRIE_READ_CHAR;
1691    chars++;
1692    if ( uvc < 256 ) {
1693     if ( folder ) {
1694      U8 folded= folder[ (U8) uvc ];
1695      if ( !trie->charmap[ folded ] ) {
1696       trie->charmap[ folded ]=( ++trie->uniquecharcount );
1697       TRIE_STORE_REVCHAR( folded );
1698      }
1699     }
1700     if ( !trie->charmap[ uvc ] ) {
1701      trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1702      TRIE_STORE_REVCHAR( uvc );
1703     }
1704     if ( set_bit ) {
1705      /* store the codepoint in the bitmap, and its folded
1706      * equivalent. */
1707      TRIE_BITMAP_SET(trie, uvc);
1708
1709      /* store the folded codepoint */
1710      if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1711
1712      if ( !UTF ) {
1713       /* store first byte of utf8 representation of
1714       variant codepoints */
1715       if (! UNI_IS_INVARIANT(uvc)) {
1716        TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1717       }
1718      }
1719      set_bit = 0; /* We've done our bit :-) */
1720     }
1721    } else {
1722     SV** svpp;
1723     if ( !widecharmap )
1724      widecharmap = newHV();
1725
1726     svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1727
1728     if ( !svpp )
1729      Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1730
1731     if ( !SvTRUE( *svpp ) ) {
1732      sv_setiv( *svpp, ++trie->uniquecharcount );
1733      TRIE_STORE_REVCHAR(uvc);
1734     }
1735    }
1736   }
1737   if( cur == first ) {
1738    trie->minlen = chars;
1739    trie->maxlen = chars;
1740   } else if (chars < trie->minlen) {
1741    trie->minlen = chars;
1742   } else if (chars > trie->maxlen) {
1743    trie->maxlen = chars;
1744   }
1745   if (OP( noper ) == EXACTFU_SS) {
1746    /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1747    if (trie->minlen > 1)
1748     trie->minlen= 1;
1749   }
1750   if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1751    /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1752    *        - We assume that any such sequence might match a 2 byte string */
1753    if (trie->minlen > 2 )
1754     trie->minlen= 2;
1755   }
1756
1757  } /* end first pass */
1758  DEBUG_TRIE_COMPILE_r(
1759   PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1760     (int)depth * 2 + 2,"",
1761     ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1762     (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1763     (int)trie->minlen, (int)trie->maxlen )
1764  );
1765
1766  /*
1767   We now know what we are dealing with in terms of unique chars and
1768   string sizes so we can calculate how much memory a naive
1769   representation using a flat table  will take. If it's over a reasonable
1770   limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1771   conservative but potentially much slower representation using an array
1772   of lists.
1773
1774   At the end we convert both representations into the same compressed
1775   form that will be used in regexec.c for matching with. The latter
1776   is a form that cannot be used to construct with but has memory
1777   properties similar to the list form and access properties similar
1778   to the table form making it both suitable for fast searches and
1779   small enough that its feasable to store for the duration of a program.
1780
1781   See the comment in the code where the compressed table is produced
1782   inplace from the flat tabe representation for an explanation of how
1783   the compression works.
1784
1785  */
1786
1787
1788  Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1789  prev_states[1] = 0;
1790
1791  if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1792   /*
1793    Second Pass -- Array Of Lists Representation
1794
1795    Each state will be represented by a list of charid:state records
1796    (reg_trie_trans_le) the first such element holds the CUR and LEN
1797    points of the allocated array. (See defines above).
1798
1799    We build the initial structure using the lists, and then convert
1800    it into the compressed table form which allows faster lookups
1801    (but cant be modified once converted).
1802   */
1803
1804   STRLEN transcount = 1;
1805
1806   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1807    "%*sCompiling trie using list compiler\n",
1808    (int)depth * 2 + 2, ""));
1809
1810   trie->states = (reg_trie_state *)
1811    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1812         sizeof(reg_trie_state) );
1813   TRIE_LIST_NEW(1);
1814   next_alloc = 2;
1815
1816   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1817
1818    regnode *noper   = NEXTOPER( cur );
1819    U8 *uc           = (U8*)STRING( noper );
1820    const U8 *e      = uc + STR_LEN( noper );
1821    U32 state        = 1;         /* required init */
1822    U16 charid       = 0;         /* sanity init */
1823    U8 *scan         = (U8*)NULL; /* sanity init */
1824    STRLEN foldlen   = 0;         /* required init */
1825    U32 wordlen      = 0;         /* required init */
1826    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1827    STRLEN skiplen   = 0;
1828
1829    if (OP(noper) == NOTHING) {
1830     regnode *noper_next= regnext(noper);
1831     if (noper_next != tail && OP(noper_next) == flags) {
1832      noper = noper_next;
1833      uc= (U8*)STRING(noper);
1834      e= uc + STR_LEN(noper);
1835     }
1836    }
1837
1838    if (OP(noper) != NOTHING) {
1839     for ( ; uc < e ; uc += len ) {
1840
1841      TRIE_READ_CHAR;
1842
1843      if ( uvc < 256 ) {
1844       charid = trie->charmap[ uvc ];
1845      } else {
1846       SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1847       if ( !svpp ) {
1848        charid = 0;
1849       } else {
1850        charid=(U16)SvIV( *svpp );
1851       }
1852      }
1853      /* charid is now 0 if we dont know the char read, or nonzero if we do */
1854      if ( charid ) {
1855
1856       U16 check;
1857       U32 newstate = 0;
1858
1859       charid--;
1860       if ( !trie->states[ state ].trans.list ) {
1861        TRIE_LIST_NEW( state );
1862       }
1863       for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1864        if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1865         newstate = TRIE_LIST_ITEM( state, check ).newstate;
1866         break;
1867        }
1868       }
1869       if ( ! newstate ) {
1870        newstate = next_alloc++;
1871        prev_states[newstate] = state;
1872        TRIE_LIST_PUSH( state, charid, newstate );
1873        transcount++;
1874       }
1875       state = newstate;
1876      } else {
1877       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1878      }
1879     }
1880    }
1881    TRIE_HANDLE_WORD(state);
1882
1883   } /* end second pass */
1884
1885   /* next alloc is the NEXT state to be allocated */
1886   trie->statecount = next_alloc;
1887   trie->states = (reg_trie_state *)
1888    PerlMemShared_realloc( trie->states,
1889         next_alloc
1890         * sizeof(reg_trie_state) );
1891
1892   /* and now dump it out before we compress it */
1893   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1894               revcharmap, next_alloc,
1895               depth+1)
1896   );
1897
1898   trie->trans = (reg_trie_trans *)
1899    PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1900   {
1901    U32 state;
1902    U32 tp = 0;
1903    U32 zp = 0;
1904
1905
1906    for( state=1 ; state < next_alloc ; state ++ ) {
1907     U32 base=0;
1908
1909     /*
1910     DEBUG_TRIE_COMPILE_MORE_r(
1911      PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1912     );
1913     */
1914
1915     if (trie->states[state].trans.list) {
1916      U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1917      U16 maxid=minid;
1918      U16 idx;
1919
1920      for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1921       const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1922       if ( forid < minid ) {
1923        minid=forid;
1924       } else if ( forid > maxid ) {
1925        maxid=forid;
1926       }
1927      }
1928      if ( transcount < tp + maxid - minid + 1) {
1929       transcount *= 2;
1930       trie->trans = (reg_trie_trans *)
1931        PerlMemShared_realloc( trie->trans,
1932              transcount
1933              * sizeof(reg_trie_trans) );
1934       Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1935      }
1936      base = trie->uniquecharcount + tp - minid;
1937      if ( maxid == minid ) {
1938       U32 set = 0;
1939       for ( ; zp < tp ; zp++ ) {
1940        if ( ! trie->trans[ zp ].next ) {
1941         base = trie->uniquecharcount + zp - minid;
1942         trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1943         trie->trans[ zp ].check = state;
1944         set = 1;
1945         break;
1946        }
1947       }
1948       if ( !set ) {
1949        trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1950        trie->trans[ tp ].check = state;
1951        tp++;
1952        zp = tp;
1953       }
1954      } else {
1955       for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1956        const U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1957        trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1958        trie->trans[ tid ].check = state;
1959       }
1960       tp += ( maxid - minid + 1 );
1961      }
1962      Safefree(trie->states[ state ].trans.list);
1963     }
1964     /*
1965     DEBUG_TRIE_COMPILE_MORE_r(
1966      PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1967     );
1968     */
1969     trie->states[ state ].trans.base=base;
1970    }
1971    trie->lasttrans = tp + 1;
1972   }
1973  } else {
1974   /*
1975   Second Pass -- Flat Table Representation.
1976
1977   we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1978   We know that we will need Charcount+1 trans at most to store the data
1979   (one row per char at worst case) So we preallocate both structures
1980   assuming worst case.
1981
1982   We then construct the trie using only the .next slots of the entry
1983   structs.
1984
1985   We use the .check field of the first entry of the node temporarily to
1986   make compression both faster and easier by keeping track of how many non
1987   zero fields are in the node.
1988
1989   Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1990   transition.
1991
1992   There are two terms at use here: state as a TRIE_NODEIDX() which is a
1993   number representing the first entry of the node, and state as a
1994   TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1995   TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1996   are 2 entrys per node. eg:
1997
1998    A B       A B
1999   1. 2 4    1. 3 7
2000   2. 0 3    3. 0 5
2001   3. 0 0    5. 0 0
2002   4. 0 0    7. 0 0
2003
2004   The table is internally in the right hand, idx form. However as we also
2005   have to deal with the states array which is indexed by nodenum we have to
2006   use TRIE_NODENUM() to convert.
2007
2008   */
2009   DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2010    "%*sCompiling trie using table compiler\n",
2011    (int)depth * 2 + 2, ""));
2012
2013   trie->trans = (reg_trie_trans *)
2014    PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2015         * trie->uniquecharcount + 1,
2016         sizeof(reg_trie_trans) );
2017   trie->states = (reg_trie_state *)
2018    PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2019         sizeof(reg_trie_state) );
2020   next_alloc = trie->uniquecharcount + 1;
2021
2022
2023   for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2024
2025    regnode *noper   = NEXTOPER( cur );
2026    const U8 *uc     = (U8*)STRING( noper );
2027    const U8 *e      = uc + STR_LEN( noper );
2028
2029    U32 state        = 1;         /* required init */
2030
2031    U16 charid       = 0;         /* sanity init */
2032    U32 accept_state = 0;         /* sanity init */
2033    U8 *scan         = (U8*)NULL; /* sanity init */
2034
2035    STRLEN foldlen   = 0;         /* required init */
2036    U32 wordlen      = 0;         /* required init */
2037    STRLEN skiplen   = 0;
2038    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2039
2040    if (OP(noper) == NOTHING) {
2041     regnode *noper_next= regnext(noper);
2042     if (noper_next != tail && OP(noper_next) == flags) {
2043      noper = noper_next;
2044      uc= (U8*)STRING(noper);
2045      e= uc + STR_LEN(noper);
2046     }
2047    }
2048
2049    if ( OP(noper) != NOTHING ) {
2050     for ( ; uc < e ; uc += len ) {
2051
2052      TRIE_READ_CHAR;
2053
2054      if ( uvc < 256 ) {
2055       charid = trie->charmap[ uvc ];
2056      } else {
2057       SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2058       charid = svpp ? (U16)SvIV(*svpp) : 0;
2059      }
2060      if ( charid ) {
2061       charid--;
2062       if ( !trie->trans[ state + charid ].next ) {
2063        trie->trans[ state + charid ].next = next_alloc;
2064        trie->trans[ state ].check++;
2065        prev_states[TRIE_NODENUM(next_alloc)]
2066          = TRIE_NODENUM(state);
2067        next_alloc += trie->uniquecharcount;
2068       }
2069       state = trie->trans[ state + charid ].next;
2070      } else {
2071       Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2072      }
2073      /* charid is now 0 if we dont know the char read, or nonzero if we do */
2074     }
2075    }
2076    accept_state = TRIE_NODENUM( state );
2077    TRIE_HANDLE_WORD(accept_state);
2078
2079   } /* end second pass */
2080
2081   /* and now dump it out before we compress it */
2082   DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2083               revcharmap,
2084               next_alloc, depth+1));
2085
2086   {
2087   /*
2088   * Inplace compress the table.*
2089
2090   For sparse data sets the table constructed by the trie algorithm will
2091   be mostly 0/FAIL transitions or to put it another way mostly empty.
2092   (Note that leaf nodes will not contain any transitions.)
2093
2094   This algorithm compresses the tables by eliminating most such
2095   transitions, at the cost of a modest bit of extra work during lookup:
2096
2097   - Each states[] entry contains a .base field which indicates the
2098   index in the state[] array wheres its transition data is stored.
2099
2100   - If .base is 0 there are no valid transitions from that node.
2101
2102   - If .base is nonzero then charid is added to it to find an entry in
2103   the trans array.
2104
2105   -If trans[states[state].base+charid].check!=state then the
2106   transition is taken to be a 0/Fail transition. Thus if there are fail
2107   transitions at the front of the node then the .base offset will point
2108   somewhere inside the previous nodes data (or maybe even into a node
2109   even earlier), but the .check field determines if the transition is
2110   valid.
2111
2112   XXX - wrong maybe?
2113   The following process inplace converts the table to the compressed
2114   table: We first do not compress the root node 1,and mark all its
2115   .check pointers as 1 and set its .base pointer as 1 as well. This
2116   allows us to do a DFA construction from the compressed table later,
2117   and ensures that any .base pointers we calculate later are greater
2118   than 0.
2119
2120   - We set 'pos' to indicate the first entry of the second node.
2121
2122   - We then iterate over the columns of the node, finding the first and
2123   last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2124   and set the .check pointers accordingly, and advance pos
2125   appropriately and repreat for the next node. Note that when we copy
2126   the next pointers we have to convert them from the original
2127   NODEIDX form to NODENUM form as the former is not valid post
2128   compression.
2129
2130   - If a node has no transitions used we mark its base as 0 and do not
2131   advance the pos pointer.
2132
2133   - If a node only has one transition we use a second pointer into the
2134   structure to fill in allocated fail transitions from other states.
2135   This pointer is independent of the main pointer and scans forward
2136   looking for null transitions that are allocated to a state. When it
2137   finds one it writes the single transition into the "hole".  If the
2138   pointer doesnt find one the single transition is appended as normal.
2139
2140   - Once compressed we can Renew/realloc the structures to release the
2141   excess space.
2142
2143   See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2144   specifically Fig 3.47 and the associated pseudocode.
2145
2146   demq
2147   */
2148   const U32 laststate = TRIE_NODENUM( next_alloc );
2149   U32 state, charid;
2150   U32 pos = 0, zp=0;
2151   trie->statecount = laststate;
2152
2153   for ( state = 1 ; state < laststate ; state++ ) {
2154    U8 flag = 0;
2155    const U32 stateidx = TRIE_NODEIDX( state );
2156    const U32 o_used = trie->trans[ stateidx ].check;
2157    U32 used = trie->trans[ stateidx ].check;
2158    trie->trans[ stateidx ].check = 0;
2159
2160    for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2161     if ( flag || trie->trans[ stateidx + charid ].next ) {
2162      if ( trie->trans[ stateidx + charid ].next ) {
2163       if (o_used == 1) {
2164        for ( ; zp < pos ; zp++ ) {
2165         if ( ! trie->trans[ zp ].next ) {
2166          break;
2167         }
2168        }
2169        trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2170        trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2171        trie->trans[ zp ].check = state;
2172        if ( ++zp > pos ) pos = zp;
2173        break;
2174       }
2175       used--;
2176      }
2177      if ( !flag ) {
2178       flag = 1;
2179       trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2180      }
2181      trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2182      trie->trans[ pos ].check = state;
2183      pos++;
2184     }
2185    }
2186   }
2187   trie->lasttrans = pos + 1;
2188   trie->states = (reg_trie_state *)
2189    PerlMemShared_realloc( trie->states, laststate
2190         * sizeof(reg_trie_state) );
2191   DEBUG_TRIE_COMPILE_MORE_r(
2192     PerlIO_printf( Perl_debug_log,
2193      "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2194      (int)depth * 2 + 2,"",
2195      (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2196      (IV)next_alloc,
2197      (IV)pos,
2198      ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2199    );
2200
2201   } /* end table compress */
2202  }
2203  DEBUG_TRIE_COMPILE_MORE_r(
2204    PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2205     (int)depth * 2 + 2, "",
2206     (UV)trie->statecount,
2207     (UV)trie->lasttrans)
2208  );
2209  /* resize the trans array to remove unused space */
2210  trie->trans = (reg_trie_trans *)
2211   PerlMemShared_realloc( trie->trans, trie->lasttrans
2212        * sizeof(reg_trie_trans) );
2213
2214  {   /* Modify the program and insert the new TRIE node */
2215   U8 nodetype =(U8)(flags & 0xFF);
2216   char *str=NULL;
2217
2218 #ifdef DEBUGGING
2219   regnode *optimize = NULL;
2220 #ifdef RE_TRACK_PATTERN_OFFSETS
2221
2222   U32 mjd_offset = 0;
2223   U32 mjd_nodelen = 0;
2224 #endif /* RE_TRACK_PATTERN_OFFSETS */
2225 #endif /* DEBUGGING */
2226   /*
2227   This means we convert either the first branch or the first Exact,
2228   depending on whether the thing following (in 'last') is a branch
2229   or not and whther first is the startbranch (ie is it a sub part of
2230   the alternation or is it the whole thing.)
2231   Assuming its a sub part we convert the EXACT otherwise we convert
2232   the whole branch sequence, including the first.
2233   */
2234   /* Find the node we are going to overwrite */
2235   if ( first != startbranch || OP( last ) == BRANCH ) {
2236    /* branch sub-chain */
2237    NEXT_OFF( first ) = (U16)(last - first);
2238 #ifdef RE_TRACK_PATTERN_OFFSETS
2239    DEBUG_r({
2240     mjd_offset= Node_Offset((convert));
2241     mjd_nodelen= Node_Length((convert));
2242    });
2243 #endif
2244    /* whole branch chain */
2245   }
2246 #ifdef RE_TRACK_PATTERN_OFFSETS
2247   else {
2248    DEBUG_r({
2249     const  regnode *nop = NEXTOPER( convert );
2250     mjd_offset= Node_Offset((nop));
2251     mjd_nodelen= Node_Length((nop));
2252    });
2253   }
2254   DEBUG_OPTIMISE_r(
2255    PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2256     (int)depth * 2 + 2, "",
2257     (UV)mjd_offset, (UV)mjd_nodelen)
2258   );
2259 #endif
2260   /* But first we check to see if there is a common prefix we can
2261   split out as an EXACT and put in front of the TRIE node.  */
2262   trie->startstate= 1;
2263   if ( trie->bitmap && !widecharmap && !trie->jump  ) {
2264    U32 state;
2265    for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2266     U32 ofs = 0;
2267     I32 idx = -1;
2268     U32 count = 0;
2269     const U32 base = trie->states[ state ].trans.base;
2270
2271     if ( trie->states[state].wordnum )
2272       count = 1;
2273
2274     for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2275      if ( ( base + ofs >= trie->uniquecharcount ) &&
2276       ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2277       trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2278      {
2279       if ( ++count > 1 ) {
2280        SV **tmp = av_fetch( revcharmap, ofs, 0);
2281        const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2282        if ( state == 1 ) break;
2283        if ( count == 2 ) {
2284         Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2285         DEBUG_OPTIMISE_r(
2286          PerlIO_printf(Perl_debug_log,
2287           "%*sNew Start State=%"UVuf" Class: [",
2288           (int)depth * 2 + 2, "",
2289           (UV)state));
2290         if (idx >= 0) {
2291          SV ** const tmp = av_fetch( revcharmap, idx, 0);
2292          const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2293
2294          TRIE_BITMAP_SET(trie,*ch);
2295          if ( folder )
2296           TRIE_BITMAP_SET(trie, folder[ *ch ]);
2297          DEBUG_OPTIMISE_r(
2298           PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2299          );
2300         }
2301        }
2302        TRIE_BITMAP_SET(trie,*ch);
2303        if ( folder )
2304         TRIE_BITMAP_SET(trie,folder[ *ch ]);
2305        DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2306       }
2307       idx = ofs;
2308      }
2309     }
2310     if ( count == 1 ) {
2311      SV **tmp = av_fetch( revcharmap, idx, 0);
2312      STRLEN len;
2313      char *ch = SvPV( *tmp, len );
2314      DEBUG_OPTIMISE_r({
2315       SV *sv=sv_newmortal();
2316       PerlIO_printf( Perl_debug_log,
2317        "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2318        (int)depth * 2 + 2, "",
2319        (UV)state, (UV)idx,
2320        pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2321         PL_colors[0], PL_colors[1],
2322         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2323         PERL_PV_ESCAPE_FIRSTCHAR
2324        )
2325       );
2326      });
2327      if ( state==1 ) {
2328       OP( convert ) = nodetype;
2329       str=STRING(convert);
2330       STR_LEN(convert)=0;
2331      }
2332      STR_LEN(convert) += len;
2333      while (len--)
2334       *str++ = *ch++;
2335     } else {
2336 #ifdef DEBUGGING
2337      if (state>1)
2338       DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2339 #endif
2340      break;
2341     }
2342    }
2343    trie->prefixlen = (state-1);
2344    if (str) {
2345     regnode *n = convert+NODE_SZ_STR(convert);
2346     NEXT_OFF(convert) = NODE_SZ_STR(convert);
2347     trie->startstate = state;
2348     trie->minlen -= (state - 1);
2349     trie->maxlen -= (state - 1);
2350 #ifdef DEBUGGING
2351    /* At least the UNICOS C compiler choked on this
2352     * being argument to DEBUG_r(), so let's just have
2353     * it right here. */
2354    if (
2355 #ifdef PERL_EXT_RE_BUILD
2356     1
2357 #else
2358     DEBUG_r_TEST
2359 #endif
2360     ) {
2361     regnode *fix = convert;
2362     U32 word = trie->wordcount;
2363     mjd_nodelen++;
2364     Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2365     while( ++fix < n ) {
2366      Set_Node_Offset_Length(fix, 0, 0);
2367     }
2368     while (word--) {
2369      SV ** const tmp = av_fetch( trie_words, word, 0 );
2370      if (tmp) {
2371       if ( STR_LEN(convert) <= SvCUR(*tmp) )
2372        sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2373       else
2374        sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2375      }
2376     }
2377    }
2378 #endif
2379     if (trie->maxlen) {
2380      convert = n;
2381     } else {
2382      NEXT_OFF(convert) = (U16)(tail - convert);
2383      DEBUG_r(optimize= n);
2384     }
2385    }
2386   }
2387   if (!jumper)
2388    jumper = last;
2389   if ( trie->maxlen ) {
2390    NEXT_OFF( convert ) = (U16)(tail - convert);
2391    ARG_SET( convert, data_slot );
2392    /* Store the offset to the first unabsorbed branch in
2393    jump[0], which is otherwise unused by the jump logic.
2394    We use this when dumping a trie and during optimisation. */
2395    if (trie->jump)
2396     trie->jump[0] = (U16)(nextbranch - convert);
2397
2398    /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2399    *   and there is a bitmap
2400    *   and the first "jump target" node we found leaves enough room
2401    * then convert the TRIE node into a TRIEC node, with the bitmap
2402    * embedded inline in the opcode - this is hypothetically faster.
2403    */
2404    if ( !trie->states[trie->startstate].wordnum
2405     && trie->bitmap
2406     && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2407    {
2408     OP( convert ) = TRIEC;
2409     Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2410     PerlMemShared_free(trie->bitmap);
2411     trie->bitmap= NULL;
2412    } else
2413     OP( convert ) = TRIE;
2414
2415    /* store the type in the flags */
2416    convert->flags = nodetype;
2417    DEBUG_r({
2418    optimize = convert
2419      + NODE_STEP_REGNODE
2420      + regarglen[ OP( convert ) ];
2421    });
2422    /* XXX We really should free up the resource in trie now,
2423     as we won't use them - (which resources?) dmq */
2424   }
2425   /* needed for dumping*/
2426   DEBUG_r(if (optimize) {
2427    regnode *opt = convert;
2428
2429    while ( ++opt < optimize) {
2430     Set_Node_Offset_Length(opt,0,0);
2431    }
2432    /*
2433     Try to clean up some of the debris left after the
2434     optimisation.
2435    */
2436    while( optimize < jumper ) {
2437     mjd_nodelen += Node_Length((optimize));
2438     OP( optimize ) = OPTIMIZED;
2439     Set_Node_Offset_Length(optimize,0,0);
2440     optimize++;
2441    }
2442    Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2443   });
2444  } /* end node insert */
2445  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2446
2447  /*  Finish populating the prev field of the wordinfo array.  Walk back
2448  *  from each accept state until we find another accept state, and if
2449  *  so, point the first word's .prev field at the second word. If the
2450  *  second already has a .prev field set, stop now. This will be the
2451  *  case either if we've already processed that word's accept state,
2452  *  or that state had multiple words, and the overspill words were
2453  *  already linked up earlier.
2454  */
2455  {
2456   U16 word;
2457   U32 state;
2458   U16 prev;
2459
2460   for (word=1; word <= trie->wordcount; word++) {
2461    prev = 0;
2462    if (trie->wordinfo[word].prev)
2463     continue;
2464    state = trie->wordinfo[word].accept;
2465    while (state) {
2466     state = prev_states[state];
2467     if (!state)
2468      break;
2469     prev = trie->states[state].wordnum;
2470     if (prev)
2471      break;
2472    }
2473    trie->wordinfo[word].prev = prev;
2474   }
2475   Safefree(prev_states);
2476  }
2477
2478
2479  /* and now dump out the compressed format */
2480  DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2481
2482  RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2483 #ifdef DEBUGGING
2484  RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2485  RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2486 #else
2487  SvREFCNT_dec_NN(revcharmap);
2488 #endif
2489  return trie->jump
2490   ? MADE_JUMP_TRIE
2491   : trie->startstate>1
2492    ? MADE_EXACT_TRIE
2493    : MADE_TRIE;
2494 }
2495
2496 STATIC void
2497 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode *stclass, U32 depth)
2498 {
2499 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2500
2501    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2502    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2503    ISBN 0-201-10088-6
2504
2505    We find the fail state for each state in the trie, this state is the longest proper
2506    suffix of the current state's 'word' that is also a proper prefix of another word in our
2507    trie. State 1 represents the word '' and is thus the default fail state. This allows
2508    the DFA not to have to restart after its tried and failed a word at a given point, it
2509    simply continues as though it had been matching the other word in the first place.
2510    Consider
2511  'abcdgu'=~/abcdefg|cdgu/
2512    When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2513    fail, which would bring us to the state representing 'd' in the second word where we would
2514    try 'g' and succeed, proceeding to match 'cdgu'.
2515  */
2516  /* add a fail transition */
2517  const U32 trie_offset = ARG(source);
2518  reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2519  U32 *q;
2520  const U32 ucharcount = trie->uniquecharcount;
2521  const U32 numstates = trie->statecount;
2522  const U32 ubound = trie->lasttrans + ucharcount;
2523  U32 q_read = 0;
2524  U32 q_write = 0;
2525  U32 charid;
2526  U32 base = trie->states[ 1 ].trans.base;
2527  U32 *fail;
2528  reg_ac_data *aho;
2529  const U32 data_slot = add_data( pRExC_state, 1, "T" );
2530  GET_RE_DEBUG_FLAGS_DECL;
2531
2532  PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2533 #ifndef DEBUGGING
2534  PERL_UNUSED_ARG(depth);
2535 #endif
2536
2537
2538  ARG_SET( stclass, data_slot );
2539  aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2540  RExC_rxi->data->data[ data_slot ] = (void*)aho;
2541  aho->trie=trie_offset;
2542  aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2543  Copy( trie->states, aho->states, numstates, reg_trie_state );
2544  Newxz( q, numstates, U32);
2545  aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2546  aho->refcount = 1;
2547  fail = aho->fail;
2548  /* initialize fail[0..1] to be 1 so that we always have
2549  a valid final fail state */
2550  fail[ 0 ] = fail[ 1 ] = 1;
2551
2552  for ( charid = 0; charid < ucharcount ; charid++ ) {
2553   const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2554   if ( newstate ) {
2555    q[ q_write ] = newstate;
2556    /* set to point at the root */
2557    fail[ q[ q_write++ ] ]=1;
2558   }
2559  }
2560  while ( q_read < q_write) {
2561   const U32 cur = q[ q_read++ % numstates ];
2562   base = trie->states[ cur ].trans.base;
2563
2564   for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2565    const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2566    if (ch_state) {
2567     U32 fail_state = cur;
2568     U32 fail_base;
2569     do {
2570      fail_state = fail[ fail_state ];
2571      fail_base = aho->states[ fail_state ].trans.base;
2572     } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2573
2574     fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2575     fail[ ch_state ] = fail_state;
2576     if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2577     {
2578       aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
2579     }
2580     q[ q_write++ % numstates] = ch_state;
2581    }
2582   }
2583  }
2584  /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2585  when we fail in state 1, this allows us to use the
2586  charclass scan to find a valid start char. This is based on the principle
2587  that theres a good chance the string being searched contains lots of stuff
2588  that cant be a start char.
2589  */
2590  fail[ 0 ] = fail[ 1 ] = 0;
2591  DEBUG_TRIE_COMPILE_r({
2592   PerlIO_printf(Perl_debug_log,
2593      "%*sStclass Failtable (%"UVuf" states): 0",
2594      (int)(depth * 2), "", (UV)numstates
2595   );
2596   for( q_read=1; q_read<numstates; q_read++ ) {
2597    PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2598   }
2599   PerlIO_printf(Perl_debug_log, "\n");
2600  });
2601  Safefree(q);
2602  /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2603 }
2604
2605
2606 /*
2607  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2608  * These need to be revisited when a newer toolchain becomes available.
2609  */
2610 #if defined(__sparc64__) && defined(__GNUC__)
2611 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2612 #       undef  SPARC64_GCC_WORKAROUND
2613 #       define SPARC64_GCC_WORKAROUND 1
2614 #   endif
2615 #endif
2616
2617 #define DEBUG_PEEP(str,scan,depth) \
2618  DEBUG_OPTIMISE_r({if (scan){ \
2619  SV * const mysv=sv_newmortal(); \
2620  regnode *Next = regnext(scan); \
2621  regprop(RExC_rx, mysv, scan); \
2622  PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2623  (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2624  Next ? (REG_NODE_NUM(Next)) : 0 ); \
2625    }});
2626
2627
2628 /* The below joins as many adjacent EXACTish nodes as possible into a single
2629  * one.  The regop may be changed if the node(s) contain certain sequences that
2630  * require special handling.  The joining is only done if:
2631  * 1) there is room in the current conglomerated node to entirely contain the
2632  *    next one.
2633  * 2) they are the exact same node type
2634  *
2635  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2636  * these get optimized out
2637  *
2638  * If a node is to match under /i (folded), the number of characters it matches
2639  * can be different than its character length if it contains a multi-character
2640  * fold.  *min_subtract is set to the total delta of the input nodes.
2641  *
2642  * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2643  * and contains LATIN SMALL LETTER SHARP S
2644  *
2645  * This is as good a place as any to discuss the design of handling these
2646  * multi-character fold sequences.  It's been wrong in Perl for a very long
2647  * time.  There are three code points in Unicode whose multi-character folds
2648  * were long ago discovered to mess things up.  The previous designs for
2649  * dealing with these involved assigning a special node for them.  This
2650  * approach doesn't work, as evidenced by this example:
2651  *      "\xDFs" =~ /s\xDF/ui    # Used to fail before these patches
2652  * Both these fold to "sss", but if the pattern is parsed to create a node that
2653  * would match just the \xDF, it won't be able to handle the case where a
2654  * successful match would have to cross the node's boundary.  The new approach
2655  * that hopefully generally solves the problem generates an EXACTFU_SS node
2656  * that is "sss".
2657  *
2658  * It turns out that there are problems with all multi-character folds, and not
2659  * just these three.  Now the code is general, for all such cases, but the
2660  * three still have some special handling.  The approach taken is:
2661  * 1)   This routine examines each EXACTFish node that could contain multi-
2662  *      character fold sequences.  It returns in *min_subtract how much to
2663  *      subtract from the the actual length of the string to get a real minimum
2664  *      match length; it is 0 if there are no multi-char folds.  This delta is
2665  *      used by the caller to adjust the min length of the match, and the delta
2666  *      between min and max, so that the optimizer doesn't reject these
2667  *      possibilities based on size constraints.
2668  * 2)   Certain of these sequences require special handling by the trie code,
2669  *      so, if found, this code changes the joined node type to special ops:
2670  *      EXACTFU_TRICKYFOLD and EXACTFU_SS.
2671  * 3)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2672  *      is used for an EXACTFU node that contains at least one "ss" sequence in
2673  *      it.  For non-UTF-8 patterns and strings, this is the only case where
2674  *      there is a possible fold length change.  That means that a regular
2675  *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
2676  *      with length changes, and so can be processed faster.  regexec.c takes
2677  *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
2678  *      pre-folded by regcomp.c.  This saves effort in regex matching.
2679  *      However, the pre-folding isn't done for non-UTF8 patterns because the
2680  *      fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2681  *      down by forcing the pattern into UTF8 unless necessary.  Also what
2682  *      EXACTF and EXACTFL nodes fold to isn't known until runtime.  The fold
2683  *      possibilities for the non-UTF8 patterns are quite simple, except for
2684  *      the sharp s.  All the ones that don't involve a UTF-8 target string are
2685  *      members of a fold-pair, and arrays are set up for all of them so that
2686  *      the other member of the pair can be found quickly.  Code elsewhere in
2687  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2688  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
2689  *      described in the next item.
2690  * 4)   A problem remains for the sharp s in EXACTF nodes.  Whether it matches
2691  *      'ss' or not is not knowable at compile time.  It will match iff the
2692  *      target string is in UTF-8, unlike the EXACTFU nodes, where it always
2693  *      matches; and the EXACTFL and EXACTFA nodes where it never does.  Thus
2694  *      it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2695  *      described in item 3).  An assumption that the optimizer part of
2696  *      regexec.c (probably unwittingly) makes is that a character in the
2697  *      pattern corresponds to at most a single character in the target string.
2698  *      (And I do mean character, and not byte here, unlike other parts of the
2699  *      documentation that have never been updated to account for multibyte
2700  *      Unicode.)  This assumption is wrong only in this case, as all other
2701  *      cases are either 1-1 folds when no UTF-8 is involved; or is true by
2702  *      virtue of having this file pre-fold UTF-8 patterns.   I'm
2703  *      reluctant to try to change this assumption, so instead the code punts.
2704  *      This routine examines EXACTF nodes for the sharp s, and returns a
2705  *      boolean indicating whether or not the node is an EXACTF node that
2706  *      contains a sharp s.  When it is true, the caller sets a flag that later
2707  *      causes the optimizer in this file to not set values for the floating
2708  *      and fixed string lengths, and thus avoids the optimizer code in
2709  *      regexec.c that makes the invalid assumption.  Thus, there is no
2710  *      optimization based on string lengths for EXACTF nodes that contain the
2711  *      sharp s.  This only happens for /id rules (which means the pattern
2712  *      isn't in UTF-8).
2713  */
2714
2715 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2716  if (PL_regkind[OP(scan)] == EXACT) \
2717   join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2718
2719 STATIC U32
2720 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2721  /* Merge several consecutive EXACTish nodes into one. */
2722  regnode *n = regnext(scan);
2723  U32 stringok = 1;
2724  regnode *next = scan + NODE_SZ_STR(scan);
2725  U32 merged = 0;
2726  U32 stopnow = 0;
2727 #ifdef DEBUGGING
2728  regnode *stop = scan;
2729  GET_RE_DEBUG_FLAGS_DECL;
2730 #else
2731  PERL_UNUSED_ARG(depth);
2732 #endif
2733
2734  PERL_ARGS_ASSERT_JOIN_EXACT;
2735 #ifndef EXPERIMENTAL_INPLACESCAN
2736  PERL_UNUSED_ARG(flags);
2737  PERL_UNUSED_ARG(val);
2738 #endif
2739  DEBUG_PEEP("join",scan,depth);
2740
2741  /* Look through the subsequent nodes in the chain.  Skip NOTHING, merge
2742  * EXACT ones that are mergeable to the current one. */
2743  while (n
2744   && (PL_regkind[OP(n)] == NOTHING
2745    || (stringok && OP(n) == OP(scan)))
2746   && NEXT_OFF(n)
2747   && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2748  {
2749
2750   if (OP(n) == TAIL || n > next)
2751    stringok = 0;
2752   if (PL_regkind[OP(n)] == NOTHING) {
2753    DEBUG_PEEP("skip:",n,depth);
2754    NEXT_OFF(scan) += NEXT_OFF(n);
2755    next = n + NODE_STEP_REGNODE;
2756 #ifdef DEBUGGING
2757    if (stringok)
2758     stop = n;
2759 #endif
2760    n = regnext(n);
2761   }
2762   else if (stringok) {
2763    const unsigned int oldl = STR_LEN(scan);
2764    regnode * const nnext = regnext(n);
2765
2766    /* XXX I (khw) kind of doubt that this works on platforms where
2767    * U8_MAX is above 255 because of lots of other assumptions */
2768    /* Don't join if the sum can't fit into a single node */
2769    if (oldl + STR_LEN(n) > U8_MAX)
2770     break;
2771
2772    DEBUG_PEEP("merg",n,depth);
2773    merged++;
2774
2775    NEXT_OFF(scan) += NEXT_OFF(n);
2776    STR_LEN(scan) += STR_LEN(n);
2777    next = n + NODE_SZ_STR(n);
2778    /* Now we can overwrite *n : */
2779    Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2780 #ifdef DEBUGGING
2781    stop = next - 1;
2782 #endif
2783    n = nnext;
2784    if (stopnow) break;
2785   }
2786
2787 #ifdef EXPERIMENTAL_INPLACESCAN
2788   if (flags && !NEXT_OFF(n)) {
2789    DEBUG_PEEP("atch", val, depth);
2790    if (reg_off_by_arg[OP(n)]) {
2791     ARG_SET(n, val - n);
2792    }
2793    else {
2794     NEXT_OFF(n) = val - n;
2795    }
2796    stopnow = 1;
2797   }
2798 #endif
2799  }
2800
2801  *min_subtract = 0;
2802  *has_exactf_sharp_s = FALSE;
2803
2804  /* Here, all the adjacent mergeable EXACTish nodes have been merged.  We
2805  * can now analyze for sequences of problematic code points.  (Prior to
2806  * this final joining, sequences could have been split over boundaries, and
2807  * hence missed).  The sequences only happen in folding, hence for any
2808  * non-EXACT EXACTish node */
2809  if (OP(scan) != EXACT) {
2810   const U8 * const s0 = (U8*) STRING(scan);
2811   const U8 * s = s0;
2812   const U8 * const s_end = s0 + STR_LEN(scan);
2813
2814   /* One pass is made over the node's string looking for all the
2815   * possibilities.  to avoid some tests in the loop, there are two main
2816   * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2817   * non-UTF-8 */
2818   if (UTF) {
2819
2820    /* Examine the string for a multi-character fold sequence.  UTF-8
2821    * patterns have all characters pre-folded by the time this code is
2822    * executed */
2823    while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2824          length sequence we are looking for is 2 */
2825    {
2826     int count = 0;
2827     int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2828     if (! len) {    /* Not a multi-char fold: get next char */
2829      s += UTF8SKIP(s);
2830      continue;
2831     }
2832
2833     /* Nodes with 'ss' require special handling, except for EXACTFL
2834     * and EXACTFA for which there is no multi-char fold to this */
2835     if (len == 2 && *s == 's' && *(s+1) == 's'
2836      && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2837     {
2838      count = 2;
2839      OP(scan) = EXACTFU_SS;
2840      s += 2;
2841     }
2842     else if (len == 6   /* len is the same in both ASCII and EBCDIC for these */
2843       && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2844          COMBINING_DIAERESIS_UTF8
2845          COMBINING_ACUTE_ACCENT_UTF8,
2846         6)
2847        || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2848           COMBINING_DIAERESIS_UTF8
2849           COMBINING_ACUTE_ACCENT_UTF8,
2850          6)))
2851     {
2852      count = 3;
2853
2854      /* These two folds require special handling by trie's, so
2855      * change the node type to indicate this.  If EXACTFA and
2856      * EXACTFL were ever to be handled by trie's, this would
2857      * have to be changed.  If this node has already been
2858      * changed to EXACTFU_SS in this loop, leave it as is.  (I
2859      * (khw) think it doesn't matter in regexec.c for UTF
2860      * patterns, but no need to change it */
2861      if (OP(scan) == EXACTFU) {
2862       OP(scan) = EXACTFU_TRICKYFOLD;
2863      }
2864      s += 6;
2865     }
2866     else { /* Here is a generic multi-char fold. */
2867      const U8* multi_end  = s + len;
2868
2869      /* Count how many characters in it.  In the case of /l and
2870      * /aa, no folds which contain ASCII code points are
2871      * allowed, so check for those, and skip if found.  (In
2872      * EXACTFL, no folds are allowed to any Latin1 code point,
2873      * not just ASCII.  But there aren't any of these
2874      * currently, nor ever likely, so don't take the time to
2875      * test for them.  The code that generates the
2876      * is_MULTI_foo() macros croaks should one actually get put
2877      * into Unicode .) */
2878      if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2879       count = utf8_length(s, multi_end);
2880       s = multi_end;
2881      }
2882      else {
2883       while (s < multi_end) {
2884        if (isASCII(*s)) {
2885         s++;
2886         goto next_iteration;
2887        }
2888        else {
2889         s += UTF8SKIP(s);
2890        }
2891        count++;
2892       }
2893      }
2894     }
2895
2896     /* The delta is how long the sequence is minus 1 (1 is how long
2897     * the character that folds to the sequence is) */
2898     *min_subtract += count - 1;
2899    next_iteration: ;
2900    }
2901   }
2902   else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2903
2904    /* Here, the pattern is not UTF-8.  Look for the multi-char folds
2905    * that are all ASCII.  As in the above case, EXACTFL and EXACTFA
2906    * nodes can't have multi-char folds to this range (and there are
2907    * no existing ones in the upper latin1 range).  In the EXACTF
2908    * case we look also for the sharp s, which can be in the final
2909    * position.  Otherwise we can stop looking 1 byte earlier because
2910    * have to find at least two characters for a multi-fold */
2911    const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2912
2913    /* The below is perhaps overboard, but this allows us to save a
2914    * test each time through the loop at the expense of a mask.  This
2915    * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2916    * by a single bit.  On ASCII they are 32 apart; on EBCDIC, they
2917    * are 64.  This uses an exclusive 'or' to find that bit and then
2918    * inverts it to form a mask, with just a single 0, in the bit
2919    * position where 'S' and 's' differ. */
2920    const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2921    const U8 s_masked = 's' & S_or_s_mask;
2922
2923    while (s < upper) {
2924     int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2925     if (! len) {    /* Not a multi-char fold. */
2926      if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2927      {
2928       *has_exactf_sharp_s = TRUE;
2929      }
2930      s++;
2931      continue;
2932     }
2933
2934     if (len == 2
2935      && ((*s & S_or_s_mask) == s_masked)
2936      && ((*(s+1) & S_or_s_mask) == s_masked))
2937     {
2938
2939      /* EXACTF nodes need to know that the minimum length
2940      * changed so that a sharp s in the string can match this
2941      * ss in the pattern, but they remain EXACTF nodes, as they
2942      * won't match this unless the target string is is UTF-8,
2943      * which we don't know until runtime */
2944      if (OP(scan) != EXACTF) {
2945       OP(scan) = EXACTFU_SS;
2946      }
2947     }
2948
2949     *min_subtract += len - 1;
2950     s += len;
2951    }
2952   }
2953  }
2954
2955 #ifdef DEBUGGING
2956  /* Allow dumping but overwriting the collection of skipped
2957  * ops and/or strings with fake optimized ops */
2958  n = scan + NODE_SZ_STR(scan);
2959  while (n <= stop) {
2960   OP(n) = OPTIMIZED;
2961   FLAGS(n) = 0;
2962   NEXT_OFF(n) = 0;
2963   n++;
2964  }
2965 #endif
2966  DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2967  return stopnow;
2968 }
2969
2970 /* REx optimizer.  Converts nodes into quicker variants "in place".
2971    Finds fixed substrings.  */
2972
2973 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2974    to the position after last scanned or to NULL. */
2975
2976 #define INIT_AND_WITHP \
2977  assert(!and_withp); \
2978  Newx(and_withp,1,struct regnode_charclass_class); \
2979  SAVEFREEPV(and_withp)
2980
2981 /* this is a chain of data about sub patterns we are processing that
2982    need to be handled separately/specially in study_chunk. Its so
2983    we can simulate recursion without losing state.  */
2984 struct scan_frame;
2985 typedef struct scan_frame {
2986  regnode *last;  /* last node to process in this frame */
2987  regnode *next;  /* next node to process when last is reached */
2988  struct scan_frame *prev; /*previous frame*/
2989  I32 stop; /* what stopparen do we use */
2990 } scan_frame;
2991
2992
2993 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2994
2995 STATIC I32
2996 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2997       I32 *minlenp, I32 *deltap,
2998       regnode *last,
2999       scan_data_t *data,
3000       I32 stopparen,
3001       U8* recursed,
3002       struct regnode_charclass_class *and_withp,
3003       U32 flags, U32 depth)
3004       /* scanp: Start here (read-write). */
3005       /* deltap: Write maxlen-minlen here. */
3006       /* last: Stop before this one. */
3007       /* data: string data about the pattern */
3008       /* stopparen: treat close N as END */
3009       /* recursed: which subroutines have we recursed into */
3010       /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3011 {
3012  dVAR;
3013  I32 min = 0;    /* There must be at least this number of characters to match */
3014  I32 pars = 0, code;
3015  regnode *scan = *scanp, *next;
3016  I32 delta = 0;
3017  int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3018  int is_inf_internal = 0;  /* The studied chunk is infinite */
3019  I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3020  scan_data_t data_fake;
3021  SV *re_trie_maxbuff = NULL;
3022  regnode *first_non_open = scan;
3023  I32 stopmin = I32_MAX;
3024  scan_frame *frame = NULL;
3025  GET_RE_DEBUG_FLAGS_DECL;
3026
3027  PERL_ARGS_ASSERT_STUDY_CHUNK;
3028
3029 #ifdef DEBUGGING
3030  StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3031 #endif
3032
3033  if ( depth == 0 ) {
3034   while (first_non_open && OP(first_non_open) == OPEN)
3035    first_non_open=regnext(first_non_open);
3036  }
3037
3038
3039   fake_study_recurse:
3040  while ( scan && OP(scan) != END && scan < last ){
3041   UV min_subtract = 0;    /* How mmany chars to subtract from the minimum
3042         node length to get a real minimum (because
3043         the folded version may be shorter) */
3044   bool has_exactf_sharp_s = FALSE;
3045   /* Peephole optimizer: */
3046   DEBUG_STUDYDATA("Peep:", data,depth);
3047   DEBUG_PEEP("Peep",scan,depth);
3048
3049   /* Its not clear to khw or hv why this is done here, and not in the
3050   * clauses that deal with EXACT nodes.  khw's guess is that it's
3051   * because of a previous design */
3052   JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3053
3054   /* Follow the next-chain of the current node and optimize
3055   away all the NOTHINGs from it.  */
3056   if (OP(scan) != CURLYX) {
3057    const int max = (reg_off_by_arg[OP(scan)]
3058      ? I32_MAX
3059      /* I32 may be smaller than U16 on CRAYs! */
3060      : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3061    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3062    int noff;
3063    regnode *n = scan;
3064
3065    /* Skip NOTHING and LONGJMP. */
3066    while ((n = regnext(n))
3067     && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3068      || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3069     && off + noff < max)
3070     off += noff;
3071    if (reg_off_by_arg[OP(scan)])
3072     ARG(scan) = off;
3073    else
3074     NEXT_OFF(scan) = off;
3075   }
3076
3077
3078
3079   /* The principal pseudo-switch.  Cannot be a switch, since we
3080   look into several different things.  */
3081   if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3082     || OP(scan) == IFTHEN) {
3083    next = regnext(scan);
3084    code = OP(scan);
3085    /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3086
3087    if (OP(next) == code || code == IFTHEN) {
3088     /* NOTE - There is similar code to this block below for handling
3089     TRIE nodes on a re-study.  If you change stuff here check there
3090     too. */
3091     I32 max1 = 0, min1 = I32_MAX, num = 0;
3092     struct regnode_charclass_class accum;
3093     regnode * const startbranch=scan;
3094
3095     if (flags & SCF_DO_SUBSTR)
3096      SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3097     if (flags & SCF_DO_STCLASS)
3098      cl_init_zero(pRExC_state, &accum);
3099
3100     while (OP(scan) == code) {
3101      I32 deltanext, minnext, f = 0, fake;
3102      struct regnode_charclass_class this_class;
3103
3104      num++;
3105      data_fake.flags = 0;
3106      if (data) {
3107       data_fake.whilem_c = data->whilem_c;
3108       data_fake.last_closep = data->last_closep;
3109      }
3110      else
3111       data_fake.last_closep = &fake;
3112
3113      data_fake.pos_delta = delta;
3114      next = regnext(scan);
3115      scan = NEXTOPER(scan);
3116      if (code != BRANCH)
3117       scan = NEXTOPER(scan);
3118      if (flags & SCF_DO_STCLASS) {
3119       cl_init(pRExC_state, &this_class);
3120       data_fake.start_class = &this_class;
3121       f = SCF_DO_STCLASS_AND;
3122      }
3123      if (flags & SCF_WHILEM_VISITED_POS)
3124       f |= SCF_WHILEM_VISITED_POS;
3125
3126      /* we suppose the run is continuous, last=next...*/
3127      minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3128           next, &data_fake,
3129           stopparen, recursed, NULL, f,depth+1);
3130      if (min1 > minnext)
3131       min1 = minnext;
3132      if (deltanext == I32_MAX) {
3133       is_inf = is_inf_internal = 1;
3134       max1 = I32_MAX;
3135      } else if (max1 < minnext + deltanext)
3136       max1 = minnext + deltanext;
3137      scan = next;
3138      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3139       pars++;
3140      if (data_fake.flags & SCF_SEEN_ACCEPT) {
3141       if ( stopmin > minnext)
3142        stopmin = min + min1;
3143       flags &= ~SCF_DO_SUBSTR;
3144       if (data)
3145        data->flags |= SCF_SEEN_ACCEPT;
3146      }
3147      if (data) {
3148       if (data_fake.flags & SF_HAS_EVAL)
3149        data->flags |= SF_HAS_EVAL;
3150       data->whilem_c = data_fake.whilem_c;
3151      }
3152      if (flags & SCF_DO_STCLASS)
3153       cl_or(pRExC_state, &accum, &this_class);
3154     }
3155     if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3156      min1 = 0;
3157     if (flags & SCF_DO_SUBSTR) {
3158      data->pos_min += min1;
3159      if (data->pos_delta >= I32_MAX - (max1 - min1))
3160       data->pos_delta = I32_MAX;
3161      else
3162       data->pos_delta += max1 - min1;
3163      if (max1 != min1 || is_inf)
3164       data->longest = &(data->longest_float);
3165     }
3166     min += min1;
3167     if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0)
3168      delta = I32_MAX;
3169     else
3170      delta += max1 - min1;
3171     if (flags & SCF_DO_STCLASS_OR) {
3172      cl_or(pRExC_state, data->start_class, &accum);
3173      if (min1) {
3174       cl_and(data->start_class, and_withp);
3175       flags &= ~SCF_DO_STCLASS;
3176      }
3177     }
3178     else if (flags & SCF_DO_STCLASS_AND) {
3179      if (min1) {
3180       cl_and(data->start_class, &accum);
3181       flags &= ~SCF_DO_STCLASS;
3182      }
3183      else {
3184       /* Switch to OR mode: cache the old value of
3185       * data->start_class */
3186       INIT_AND_WITHP;
3187       StructCopy(data->start_class, and_withp,
3188         struct regnode_charclass_class);
3189       flags &= ~SCF_DO_STCLASS_AND;
3190       StructCopy(&accum, data->start_class,
3191         struct regnode_charclass_class);
3192       flags |= SCF_DO_STCLASS_OR;
3193       SET_SSC_EOS(data->start_class);
3194      }
3195     }
3196
3197     if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3198     /* demq.
3199
3200     Assuming this was/is a branch we are dealing with: 'scan' now
3201     points at the item that follows the branch sequence, whatever
3202     it is. We now start at the beginning of the sequence and look
3203     for subsequences of
3204
3205     BRANCH->EXACT=>x1
3206     BRANCH->EXACT=>x2
3207     tail
3208
3209     which would be constructed from a pattern like /A|LIST|OF|WORDS/
3210
3211     If we can find such a subsequence we need to turn the first
3212     element into a trie and then add the subsequent branch exact
3213     strings to the trie.
3214
3215     We have two cases
3216
3217      1. patterns where the whole set of branches can be converted.
3218
3219      2. patterns where only a subset can be converted.
3220
3221     In case 1 we can replace the whole set with a single regop
3222     for the trie. In case 2 we need to keep the start and end
3223     branches so
3224
3225      'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3226      becomes BRANCH TRIE; BRANCH X;
3227
3228     There is an additional case, that being where there is a
3229     common prefix, which gets split out into an EXACT like node
3230     preceding the TRIE node.
3231
3232     If x(1..n)==tail then we can do a simple trie, if not we make
3233     a "jump" trie, such that when we match the appropriate word
3234     we "jump" to the appropriate tail node. Essentially we turn
3235     a nested if into a case structure of sorts.
3236
3237     */
3238
3239      int made=0;
3240      if (!re_trie_maxbuff) {
3241       re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3242       if (!SvIOK(re_trie_maxbuff))
3243        sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3244      }
3245      if ( SvIV(re_trie_maxbuff)>=0  ) {
3246       regnode *cur;
3247       regnode *first = (regnode *)NULL;
3248       regnode *last = (regnode *)NULL;
3249       regnode *tail = scan;
3250       U8 trietype = 0;
3251       U32 count=0;
3252
3253 #ifdef DEBUGGING
3254       SV * const mysv = sv_newmortal();       /* for dumping */
3255 #endif
3256       /* var tail is used because there may be a TAIL
3257       regop in the way. Ie, the exacts will point to the
3258       thing following the TAIL, but the last branch will
3259       point at the TAIL. So we advance tail. If we
3260       have nested (?:) we may have to move through several
3261       tails.
3262       */
3263
3264       while ( OP( tail ) == TAIL ) {
3265        /* this is the TAIL generated by (?:) */
3266        tail = regnext( tail );
3267       }
3268
3269
3270       DEBUG_TRIE_COMPILE_r({
3271        regprop(RExC_rx, mysv, tail );
3272        PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3273         (int)depth * 2 + 2, "",
3274         "Looking for TRIE'able sequences. Tail node is: ",
3275         SvPV_nolen_const( mysv )
3276        );
3277       });
3278
3279       /*
3280
3281        Step through the branches
3282         cur represents each branch,
3283         noper is the first thing to be matched as part of that branch
3284         noper_next is the regnext() of that node.
3285
3286        We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3287        via a "jump trie" but we also support building with NOJUMPTRIE,
3288        which restricts the trie logic to structures like /FOO|BAR/.
3289
3290        If noper is a trieable nodetype then the branch is a possible optimization
3291        target. If we are building under NOJUMPTRIE then we require that noper_next
3292        is the same as scan (our current position in the regex program).
3293
3294        Once we have two or more consecutive such branches we can create a
3295        trie of the EXACT's contents and stitch it in place into the program.
3296
3297        If the sequence represents all of the branches in the alternation we
3298        replace the entire thing with a single TRIE node.
3299
3300        Otherwise when it is a subsequence we need to stitch it in place and
3301        replace only the relevant branches. This means the first branch has
3302        to remain as it is used by the alternation logic, and its next pointer,
3303        and needs to be repointed at the item on the branch chain following
3304        the last branch we have optimized away.
3305
3306        This could be either a BRANCH, in which case the subsequence is internal,
3307        or it could be the item following the branch sequence in which case the
3308        subsequence is at the end (which does not necessarily mean the first node
3309        is the start of the alternation).
3310
3311        TRIE_TYPE(X) is a define which maps the optype to a trietype.
3312
3313         optype          |  trietype
3314         ----------------+-----------
3315         NOTHING         | NOTHING
3316         EXACT           | EXACT
3317         EXACTFU         | EXACTFU
3318         EXACTFU_SS      | EXACTFU
3319         EXACTFU_TRICKYFOLD | EXACTFU
3320         EXACTFA         | 0
3321
3322
3323       */
3324 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
3325      ( EXACT == (X) )   ? EXACT :        \
3326      ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
3327      0 )
3328
3329       /* dont use tail as the end marker for this traverse */
3330       for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3331        regnode * const noper = NEXTOPER( cur );
3332        U8 noper_type = OP( noper );
3333        U8 noper_trietype = TRIE_TYPE( noper_type );
3334 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3335        regnode * const noper_next = regnext( noper );
3336        U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3337        U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3338 #endif
3339
3340        DEBUG_TRIE_COMPILE_r({
3341         regprop(RExC_rx, mysv, cur);
3342         PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3343         (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3344
3345         regprop(RExC_rx, mysv, noper);
3346         PerlIO_printf( Perl_debug_log, " -> %s",
3347          SvPV_nolen_const(mysv));
3348
3349         if ( noper_next ) {
3350         regprop(RExC_rx, mysv, noper_next );
3351         PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3352          SvPV_nolen_const(mysv));
3353         }
3354         PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3355         REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3356         PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3357         );
3358        });
3359
3360        /* Is noper a trieable nodetype that can be merged with the
3361        * current trie (if there is one)? */
3362        if ( noper_trietype
3363         &&
3364         (
3365           ( noper_trietype == NOTHING)
3366           || ( trietype == NOTHING )
3367           || ( trietype == noper_trietype )
3368         )
3369 #ifdef NOJUMPTRIE
3370         && noper_next == tail
3371 #endif
3372         && count < U16_MAX)
3373        {
3374         /* Handle mergable triable node
3375         * Either we are the first node in a new trieable sequence,
3376         * in which case we do some bookkeeping, otherwise we update
3377         * the end pointer. */
3378         if ( !first ) {
3379          first = cur;
3380          if ( noper_trietype == NOTHING ) {
3381 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3382           regnode * const noper_next = regnext( noper );
3383           U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3384           U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3385 #endif
3386
3387           if ( noper_next_trietype ) {
3388            trietype = noper_next_trietype;
3389           } else if (noper_next_type)  {
3390            /* a NOTHING regop is 1 regop wide. We need at least two
3391            * for a trie so we can't merge this in */
3392            first = NULL;
3393           }
3394          } else {
3395           trietype = noper_trietype;
3396          }
3397         } else {
3398          if ( trietype == NOTHING )
3399           trietype = noper_trietype;
3400          last = cur;
3401         }
3402         if (first)
3403          count++;
3404        } /* end handle mergable triable node */
3405        else {
3406         /* handle unmergable node -
3407         * noper may either be a triable node which can not be tried
3408         * together with the current trie, or a non triable node */
3409         if ( last ) {
3410          /* If last is set and trietype is not NOTHING then we have found
3411          * at least two triable branch sequences in a row of a similar
3412          * trietype so we can turn them into a trie. If/when we
3413          * allow NOTHING to start a trie sequence this condition will be
3414          * required, and it isn't expensive so we leave it in for now. */
3415          if ( trietype && trietype != NOTHING )
3416           make_trie( pRExC_state,
3417             startbranch, first, cur, tail, count,
3418             trietype, depth+1 );
3419          last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3420         }
3421         if ( noper_trietype
3422 #ifdef NOJUMPTRIE
3423          && noper_next == tail
3424 #endif
3425         ){
3426          /* noper is triable, so we can start a new trie sequence */
3427          count = 1;
3428          first = cur;
3429          trietype = noper_trietype;
3430         } else if (first) {
3431          /* if we already saw a first but the current node is not triable then we have
3432          * to reset the first information. */
3433          count = 0;
3434          first = NULL;
3435          trietype = 0;
3436         }
3437        } /* end handle unmergable node */
3438       } /* loop over branches */
3439       DEBUG_TRIE_COMPILE_r({
3440        regprop(RExC_rx, mysv, cur);
3441        PerlIO_printf( Perl_debug_log,
3442        "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3443        "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3444
3445       });
3446       if ( last && trietype ) {
3447        if ( trietype != NOTHING ) {
3448         /* the last branch of the sequence was part of a trie,
3449         * so we have to construct it here outside of the loop
3450         */
3451         made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3452 #ifdef TRIE_STUDY_OPT
3453         if ( ((made == MADE_EXACT_TRIE &&
3454          startbranch == first)
3455          || ( first_non_open == first )) &&
3456          depth==0 ) {
3457          flags |= SCF_TRIE_RESTUDY;
3458          if ( startbranch == first
3459           && scan == tail )
3460          {
3461           RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3462          }
3463         }
3464 #endif
3465        } else {
3466         /* at this point we know whatever we have is a NOTHING sequence/branch
3467         * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3468         */
3469         if ( startbranch == first ) {
3470          regnode *opt;
3471          /* the entire thing is a NOTHING sequence, something like this:
3472          * (?:|) So we can turn it into a plain NOTHING op. */
3473          DEBUG_TRIE_COMPILE_r({
3474           regprop(RExC_rx, mysv, cur);
3475           PerlIO_printf( Perl_debug_log,
3476           "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3477           "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3478
3479          });
3480          OP(startbranch)= NOTHING;
3481          NEXT_OFF(startbranch)= tail - startbranch;
3482          for ( opt= startbranch + 1; opt < tail ; opt++ )
3483           OP(opt)= OPTIMIZED;
3484         }
3485        }
3486       } /* end if ( last) */
3487      } /* TRIE_MAXBUF is non zero */
3488
3489     } /* do trie */
3490
3491    }
3492    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
3493     scan = NEXTOPER(NEXTOPER(scan));
3494    } else   /* single branch is optimized. */
3495     scan = NEXTOPER(scan);
3496    continue;
3497   } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3498    scan_frame *newframe = NULL;
3499    I32 paren;
3500    regnode *start;
3501    regnode *end;
3502
3503    if (OP(scan) != SUSPEND) {
3504    /* set the pointer */
3505     if (OP(scan) == GOSUB) {
3506      paren = ARG(scan);
3507      RExC_recurse[ARG2L(scan)] = scan;
3508      start = RExC_open_parens[paren-1];
3509      end   = RExC_close_parens[paren-1];
3510     } else {
3511      paren = 0;
3512      start = RExC_rxi->program + 1;
3513      end   = RExC_opend;
3514     }
3515     if (!recursed) {
3516      Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3517      SAVEFREEPV(recursed);
3518     }
3519     if (!PAREN_TEST(recursed,paren+1)) {
3520      PAREN_SET(recursed,paren+1);
3521      Newx(newframe,1,scan_frame);
3522     } else {
3523      if (flags & SCF_DO_SUBSTR) {
3524       SCAN_COMMIT(pRExC_state,data,minlenp);
3525       data->longest = &(data->longest_float);
3526      }
3527      is_inf = is_inf_internal = 1;
3528      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3529       cl_anything(pRExC_state, data->start_class);
3530      flags &= ~SCF_DO_STCLASS;
3531     }
3532    } else {
3533     Newx(newframe,1,scan_frame);
3534     paren = stopparen;
3535     start = scan+2;
3536     end = regnext(scan);
3537    }
3538    if (newframe) {
3539     assert(start);
3540     assert(end);
3541     SAVEFREEPV(newframe);
3542     newframe->next = regnext(scan);
3543     newframe->last = last;
3544     newframe->stop = stopparen;
3545     newframe->prev = frame;
3546
3547     frame = newframe;
3548     scan =  start;
3549     stopparen = paren;
3550     last = end;
3551
3552     continue;
3553    }
3554   }
3555   else if (OP(scan) == EXACT) {
3556    I32 l = STR_LEN(scan);
3557    UV uc;
3558    if (UTF) {
3559     const U8 * const s = (U8*)STRING(scan);
3560     uc = utf8_to_uvchr_buf(s, s + l, NULL);
3561     l = utf8_length(s, s + l);
3562    } else {
3563     uc = *((U8*)STRING(scan));
3564    }
3565    min += l;
3566    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3567     /* The code below prefers earlier match for fixed
3568     offset, later match for variable offset.  */
3569     if (data->last_end == -1) { /* Update the start info. */
3570      data->last_start_min = data->pos_min;
3571      data->last_start_max = is_inf
3572       ? I32_MAX : data->pos_min + data->pos_delta;
3573     }
3574     sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3575     if (UTF)
3576      SvUTF8_on(data->last_found);
3577     {
3578      SV * const sv = data->last_found;
3579      MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3580       mg_find(sv, PERL_MAGIC_utf8) : NULL;
3581      if (mg && mg->mg_len >= 0)
3582       mg->mg_len += utf8_length((U8*)STRING(scan),
3583             (U8*)STRING(scan)+STR_LEN(scan));
3584     }
3585     data->last_end = data->pos_min + l;
3586     data->pos_min += l; /* As in the first entry. */
3587     data->flags &= ~SF_BEFORE_EOL;
3588    }
3589    if (flags & SCF_DO_STCLASS_AND) {
3590     /* Check whether it is compatible with what we know already! */
3591     int compat = 1;
3592
3593
3594     /* If compatible, we or it in below.  It is compatible if is
3595     * in the bitmp and either 1) its bit or its fold is set, or 2)
3596     * it's for a locale.  Even if there isn't unicode semantics
3597     * here, at runtime there may be because of matching against a
3598     * utf8 string, so accept a possible false positive for
3599     * latin1-range folds */
3600     if (uc >= 0x100 ||
3601      (!(data->start_class->flags & ANYOF_LOCALE)
3602      && !ANYOF_BITMAP_TEST(data->start_class, uc)
3603      && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3604       || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3605      )
3606     {
3607      compat = 0;
3608     }
3609     ANYOF_CLASS_ZERO(data->start_class);
3610     ANYOF_BITMAP_ZERO(data->start_class);
3611     if (compat)
3612      ANYOF_BITMAP_SET(data->start_class, uc);
3613     else if (uc >= 0x100) {
3614      int i;
3615
3616      /* Some Unicode code points fold to the Latin1 range; as
3617      * XXX temporary code, instead of figuring out if this is
3618      * one, just assume it is and set all the start class bits
3619      * that could be some such above 255 code point's fold
3620      * which will generate fals positives.  As the code
3621      * elsewhere that does compute the fold settles down, it
3622      * can be extracted out and re-used here */
3623      for (i = 0; i < 256; i++){
3624       if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3625        ANYOF_BITMAP_SET(data->start_class, i);
3626       }
3627      }
3628     }
3629     CLEAR_SSC_EOS(data->start_class);
3630     if (uc < 0x100)
3631     data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3632    }
3633    else if (flags & SCF_DO_STCLASS_OR) {
3634     /* false positive possible if the class is case-folded */
3635     if (uc < 0x100)
3636      ANYOF_BITMAP_SET(data->start_class, uc);
3637     else
3638      data->start_class->flags |= ANYOF_UNICODE_ALL;
3639     CLEAR_SSC_EOS(data->start_class);
3640     cl_and(data->start_class, and_withp);
3641    }
3642    flags &= ~SCF_DO_STCLASS;
3643   }
3644   else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3645    I32 l = STR_LEN(scan);
3646    UV uc = *((U8*)STRING(scan));
3647
3648    /* Search for fixed substrings supports EXACT only. */
3649    if (flags & SCF_DO_SUBSTR) {
3650     assert(data);
3651     SCAN_COMMIT(pRExC_state, data, minlenp);
3652    }
3653    if (UTF) {
3654     const U8 * const s = (U8 *)STRING(scan);
3655     uc = utf8_to_uvchr_buf(s, s + l, NULL);
3656     l = utf8_length(s, s + l);
3657    }
3658    if (has_exactf_sharp_s) {
3659     RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3660    }
3661    min += l - min_subtract;
3662    assert (min >= 0);
3663    delta += min_subtract;
3664    if (flags & SCF_DO_SUBSTR) {
3665     data->pos_min += l - min_subtract;
3666     if (data->pos_min < 0) {
3667      data->pos_min = 0;
3668     }
3669     data->pos_delta += min_subtract;
3670     if (min_subtract) {
3671      data->longest = &(data->longest_float);
3672     }
3673    }
3674    if (flags & SCF_DO_STCLASS_AND) {
3675     /* Check whether it is compatible with what we know already! */
3676     int compat = 1;
3677     if (uc >= 0x100 ||
3678     (!(data->start_class->flags & ANYOF_LOCALE)
3679     && !ANYOF_BITMAP_TEST(data->start_class, uc)
3680     && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3681     {
3682      compat = 0;
3683     }
3684     ANYOF_CLASS_ZERO(data->start_class);
3685     ANYOF_BITMAP_ZERO(data->start_class);
3686     if (compat) {
3687      ANYOF_BITMAP_SET(data->start_class, uc);
3688      CLEAR_SSC_EOS(data->start_class);
3689      if (OP(scan) == EXACTFL) {
3690       /* XXX This set is probably no longer necessary, and
3691       * probably wrong as LOCALE now is on in the initial
3692       * state */
3693       data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3694      }
3695      else {
3696
3697       /* Also set the other member of the fold pair.  In case
3698       * that unicode semantics is called for at runtime, use
3699       * the full latin1 fold.  (Can't do this for locale,
3700       * because not known until runtime) */
3701       ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3702
3703       /* All other (EXACTFL handled above) folds except under
3704       * /iaa that include s, S, and sharp_s also may include
3705       * the others */
3706       if (OP(scan) != EXACTFA) {
3707        if (uc == 's' || uc == 'S') {
3708         ANYOF_BITMAP_SET(data->start_class,
3709             LATIN_SMALL_LETTER_SHARP_S);
3710        }
3711        else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3712         ANYOF_BITMAP_SET(data->start_class, 's');
3713         ANYOF_BITMAP_SET(data->start_class, 'S');
3714        }
3715       }
3716      }
3717     }
3718     else if (uc >= 0x100) {
3719      int i;
3720      for (i = 0; i < 256; i++){
3721       if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3722        ANYOF_BITMAP_SET(data->start_class, i);
3723       }
3724      }
3725     }
3726    }
3727    else if (flags & SCF_DO_STCLASS_OR) {
3728     if (data->start_class->flags & ANYOF_LOC_FOLD) {
3729      /* false positive possible if the class is case-folded.
3730      Assume that the locale settings are the same... */
3731      if (uc < 0x100) {
3732       ANYOF_BITMAP_SET(data->start_class, uc);
3733       if (OP(scan) != EXACTFL) {
3734
3735        /* And set the other member of the fold pair, but
3736        * can't do that in locale because not known until
3737        * run-time */
3738        ANYOF_BITMAP_SET(data->start_class,
3739            PL_fold_latin1[uc]);
3740
3741        /* All folds except under /iaa that include s, S,
3742        * and sharp_s also may include the others */
3743        if (OP(scan) != EXACTFA) {
3744         if (uc == 's' || uc == 'S') {
3745          ANYOF_BITMAP_SET(data->start_class,
3746             LATIN_SMALL_LETTER_SHARP_S);
3747         }
3748         else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3749          ANYOF_BITMAP_SET(data->start_class, 's');
3750          ANYOF_BITMAP_SET(data->start_class, 'S');
3751         }
3752        }
3753       }
3754      }
3755      CLEAR_SSC_EOS(data->start_class);
3756     }
3757     cl_and(data->start_class, and_withp);
3758    }
3759    flags &= ~SCF_DO_STCLASS;
3760   }
3761   else if (REGNODE_VARIES(OP(scan))) {
3762    I32 mincount, maxcount, minnext, deltanext, fl = 0;
3763    I32 f = flags, pos_before = 0;
3764    regnode * const oscan = scan;
3765    struct regnode_charclass_class this_class;
3766    struct regnode_charclass_class *oclass = NULL;
3767    I32 next_is_eval = 0;
3768
3769    switch (PL_regkind[OP(scan)]) {
3770    case WHILEM:  /* End of (?:...)* . */
3771     scan = NEXTOPER(scan);
3772     goto finish;
3773    case PLUS:
3774     if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3775      next = NEXTOPER(scan);
3776      if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3777       mincount = 1;
3778       maxcount = REG_INFTY;
3779       next = regnext(scan);
3780       scan = NEXTOPER(scan);
3781       goto do_curly;
3782      }
3783     }
3784     if (flags & SCF_DO_SUBSTR)
3785      data->pos_min++;
3786     min++;
3787     /* Fall through. */
3788    case STAR:
3789     if (flags & SCF_DO_STCLASS) {
3790      mincount = 0;
3791      maxcount = REG_INFTY;
3792      next = regnext(scan);
3793      scan = NEXTOPER(scan);
3794      goto do_curly;
3795     }
3796     is_inf = is_inf_internal = 1;
3797     scan = regnext(scan);
3798     if (flags & SCF_DO_SUBSTR) {
3799      SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3800      data->longest = &(data->longest_float);
3801     }
3802     goto optimize_curly_tail;
3803    case CURLY:
3804     if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3805      && (scan->flags == stopparen))
3806     {
3807      mincount = 1;
3808      maxcount = 1;
3809     } else {
3810      mincount = ARG1(scan);
3811      maxcount = ARG2(scan);
3812     }
3813     next = regnext(scan);
3814     if (OP(scan) == CURLYX) {
3815      I32 lp = (data ? *(data->last_closep) : 0);
3816      scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3817     }
3818     scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3819     next_is_eval = (OP(scan) == EVAL);
3820    do_curly:
3821     if (flags & SCF_DO_SUBSTR) {
3822      if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3823      pos_before = data->pos_min;
3824     }
3825     if (data) {
3826      fl = data->flags;
3827      data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3828      if (is_inf)
3829       data->flags |= SF_IS_INF;
3830     }
3831     if (flags & SCF_DO_STCLASS) {
3832      cl_init(pRExC_state, &this_class);
3833      oclass = data->start_class;
3834      data->start_class = &this_class;
3835      f |= SCF_DO_STCLASS_AND;
3836      f &= ~SCF_DO_STCLASS_OR;
3837     }
3838     /* Exclude from super-linear cache processing any {n,m}
3839     regops for which the combination of input pos and regex
3840     pos is not enough information to determine if a match
3841     will be possible.
3842
3843     For example, in the regex /foo(bar\s*){4,8}baz/ with the
3844     regex pos at the \s*, the prospects for a match depend not
3845     only on the input position but also on how many (bar\s*)
3846     repeats into the {4,8} we are. */
3847    if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3848      f &= ~SCF_WHILEM_VISITED_POS;
3849
3850     /* This will finish on WHILEM, setting scan, or on NULL: */
3851     minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3852          last, data, stopparen, recursed, NULL,
3853          (mincount == 0
3854           ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3855
3856     if (flags & SCF_DO_STCLASS)
3857      data->start_class = oclass;
3858     if (mincount == 0 || minnext == 0) {
3859      if (flags & SCF_DO_STCLASS_OR) {
3860       cl_or(pRExC_state, data->start_class, &this_class);
3861      }
3862      else if (flags & SCF_DO_STCLASS_AND) {
3863       /* Switch to OR mode: cache the old value of
3864       * data->start_class */
3865       INIT_AND_WITHP;
3866       StructCopy(data->start_class, and_withp,
3867         struct regnode_charclass_class);
3868       flags &= ~SCF_DO_STCLASS_AND;
3869       StructCopy(&this_class, data->start_class,
3870         struct regnode_charclass_class);
3871       flags |= SCF_DO_STCLASS_OR;
3872       SET_SSC_EOS(data->start_class);
3873      }
3874     } else {  /* Non-zero len */
3875      if (flags & SCF_DO_STCLASS_OR) {
3876       cl_or(pRExC_state, data->start_class, &this_class);
3877       cl_and(data->start_class, and_withp);
3878      }
3879      else if (flags & SCF_DO_STCLASS_AND)
3880       cl_and(data->start_class, &this_class);
3881      flags &= ~SCF_DO_STCLASS;
3882     }
3883     if (!scan)   /* It was not CURLYX, but CURLY. */
3884      scan = next;
3885     if ( /* ? quantifier ok, except for (?{ ... }) */
3886      (next_is_eval || !(mincount == 0 && maxcount == 1))
3887      && (minnext == 0) && (deltanext == 0)
3888      && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3889      && maxcount <= REG_INFTY/3) /* Complement check for big count */
3890     {
3891      /* Fatal warnings may leak the regexp without this: */
3892      SAVEFREESV(RExC_rx_sv);
3893      ckWARNreg(RExC_parse,
3894        "Quantifier unexpected on zero-length expression");
3895      (void)ReREFCNT_inc(RExC_rx_sv);
3896     }
3897
3898     min += minnext * mincount;
3899     is_inf_internal |= deltanext == I32_MAX
3900          || (maxcount == REG_INFTY && minnext + deltanext > 0);
3901     is_inf |= is_inf_internal;
3902     if (is_inf)
3903      delta = I32_MAX;
3904     else
3905      delta += (minnext + deltanext) * maxcount - minnext * mincount;
3906
3907     /* Try powerful optimization CURLYX => CURLYN. */
3908     if (  OP(oscan) == CURLYX && data
3909      && data->flags & SF_IN_PAR
3910      && !(data->flags & SF_HAS_EVAL)
3911      && !deltanext && minnext == 1 ) {
3912      /* Try to optimize to CURLYN.  */
3913      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3914      regnode * const nxt1 = nxt;
3915 #ifdef DEBUGGING
3916      regnode *nxt2;
3917 #endif
3918
3919      /* Skip open. */
3920      nxt = regnext(nxt);
3921      if (!REGNODE_SIMPLE(OP(nxt))
3922       && !(PL_regkind[OP(nxt)] == EXACT
3923        && STR_LEN(nxt) == 1))
3924       goto nogo;
3925 #ifdef DEBUGGING
3926      nxt2 = nxt;
3927 #endif
3928      nxt = regnext(nxt);
3929      if (OP(nxt) != CLOSE)
3930       goto nogo;
3931      if (RExC_open_parens) {
3932       RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3933       RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3934      }
3935      /* Now we know that nxt2 is the only contents: */
3936      oscan->flags = (U8)ARG(nxt);
3937      OP(oscan) = CURLYN;
3938      OP(nxt1) = NOTHING; /* was OPEN. */
3939
3940 #ifdef DEBUGGING
3941      OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3942      NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3943      NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3944      OP(nxt) = OPTIMIZED; /* was CLOSE. */
3945      OP(nxt + 1) = OPTIMIZED; /* was count. */
3946      NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3947 #endif
3948     }
3949    nogo:
3950
3951     /* Try optimization CURLYX => CURLYM. */
3952     if (  OP(oscan) == CURLYX && data
3953      && !(data->flags & SF_HAS_PAR)
3954      && !(data->flags & SF_HAS_EVAL)
3955      && !deltanext /* atom is fixed width */
3956      && minnext != 0 /* CURLYM can't handle zero width */
3957      && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3958     ) {
3959      /* XXXX How to optimize if data == 0? */
3960      /* Optimize to a simpler form.  */
3961      regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3962      regnode *nxt2;
3963
3964      OP(oscan) = CURLYM;
3965      while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3966        && (OP(nxt2) != WHILEM))
3967       nxt = nxt2;
3968      OP(nxt2)  = SUCCEED; /* Whas WHILEM */
3969      /* Need to optimize away parenths. */
3970      if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3971       /* Set the parenth number.  */
3972       regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3973
3974       oscan->flags = (U8)ARG(nxt);
3975       if (RExC_open_parens) {
3976        RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3977        RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3978       }
3979       OP(nxt1) = OPTIMIZED; /* was OPEN. */
3980       OP(nxt) = OPTIMIZED; /* was CLOSE. */
3981
3982 #ifdef DEBUGGING
3983       OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3984       OP(nxt + 1) = OPTIMIZED; /* was count. */
3985       NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3986       NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3987 #endif
3988 #if 0
3989       while ( nxt1 && (OP(nxt1) != WHILEM)) {
3990        regnode *nnxt = regnext(nxt1);
3991        if (nnxt == nxt) {
3992         if (reg_off_by_arg[OP(nxt1)])
3993          ARG_SET(nxt1, nxt2 - nxt1);
3994         else if (nxt2 - nxt1 < U16_MAX)
3995          NEXT_OFF(nxt1) = nxt2 - nxt1;
3996         else
3997          OP(nxt) = NOTHING; /* Cannot beautify */
3998        }
3999        nxt1 = nnxt;
4000       }
4001 #endif
4002       /* Optimize again: */
4003       study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4004          NULL, stopparen, recursed, NULL, 0,depth+1);
4005      }
4006      else
4007       oscan->flags = 0;
4008     }
4009     else if ((OP(oscan) == CURLYX)
4010       && (flags & SCF_WHILEM_VISITED_POS)
4011       /* See the comment on a similar expression above.
4012        However, this time it's not a subexpression
4013        we care about, but the expression itself. */
4014       && (maxcount == REG_INFTY)
4015       && data && ++data->whilem_c < 16) {
4016      /* This stays as CURLYX, we can put the count/of pair. */
4017      /* Find WHILEM (as in regexec.c) */
4018      regnode *nxt = oscan + NEXT_OFF(oscan);
4019
4020      if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4021       nxt += ARG(nxt);
4022      PREVOPER(nxt)->flags = (U8)(data->whilem_c
4023       | (RExC_whilem_seen << 4)); /* On WHILEM */
4024     }
4025     if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4026      pars++;
4027     if (flags & SCF_DO_SUBSTR) {
4028      SV *last_str = NULL;
4029      int counted = mincount != 0;
4030
4031      if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4032 #if defined(SPARC64_GCC_WORKAROUND)
4033       I32 b = 0;
4034       STRLEN l = 0;
4035       const char *s = NULL;
4036       I32 old = 0;
4037
4038       if (pos_before >= data->last_start_min)
4039        b = pos_before;
4040       else
4041        b = data->last_start_min;
4042
4043       l = 0;
4044       s = SvPV_const(data->last_found, l);
4045       old = b - data->last_start_min;
4046
4047 #else
4048       I32 b = pos_before >= data->last_start_min
4049        ? pos_before : data->last_start_min;
4050       STRLEN l;
4051       const char * const s = SvPV_const(data->last_found, l);
4052       I32 old = b - data->last_start_min;
4053 #endif
4054
4055       if (UTF)
4056        old = utf8_hop((U8*)s, old) - (U8*)s;
4057       l -= old;
4058       /* Get the added string: */
4059       last_str = newSVpvn_utf8(s  + old, l, UTF);
4060       if (deltanext == 0 && pos_before == b) {
4061        /* What was added is a constant string */
4062        if (mincount > 1) {
4063         SvGROW(last_str, (mincount * l) + 1);
4064         repeatcpy(SvPVX(last_str) + l,
4065           SvPVX_const(last_str), l, mincount - 1);
4066         SvCUR_set(last_str, SvCUR(last_str) * mincount);
4067         /* Add additional parts. */
4068         SvCUR_set(data->last_found,
4069           SvCUR(data->last_found) - l);
4070         sv_catsv(data->last_found, last_str);
4071         {
4072          SV * sv = data->last_found;
4073          MAGIC *mg =
4074           SvUTF8(sv) && SvMAGICAL(sv) ?
4075           mg_find(sv, PERL_MAGIC_utf8) : NULL;
4076          if (mg && mg->mg_len >= 0)
4077           mg->mg_len += CHR_SVLEN(last_str) - l;
4078         }
4079         data->last_end += l * (mincount - 1);
4080        }
4081       } else {
4082        /* start offset must point into the last copy */
4083        data->last_start_min += minnext * (mincount - 1);
4084        data->last_start_max += is_inf ? I32_MAX
4085         : (maxcount - 1) * (minnext + data->pos_delta);
4086       }
4087      }
4088      /* It is counted once already... */
4089      data->pos_min += minnext * (mincount - counted);
4090 #if 0
4091 PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n",
4092  counted, deltanext, I32_MAX, minnext, maxcount, mincount);
4093 if (deltanext != I32_MAX)
4094 PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta);
4095 #endif
4096      if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta)
4097       data->pos_delta = I32_MAX;
4098      else
4099       data->pos_delta += - counted * deltanext +
4100       (minnext + deltanext) * maxcount - minnext * mincount;
4101      if (mincount != maxcount) {
4102       /* Cannot extend fixed substrings found inside
4103        the group.  */
4104       SCAN_COMMIT(pRExC_state,data,minlenp);
4105       if (mincount && last_str) {
4106        SV * const sv = data->last_found;
4107        MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4108         mg_find(sv, PERL_MAGIC_utf8) : NULL;
4109
4110        if (mg)
4111         mg->mg_len = -1;
4112        sv_setsv(sv, last_str);
4113        data->last_end = data->pos_min;
4114        data->last_start_min =
4115         data->pos_min - CHR_SVLEN(last_str);
4116        data->last_start_max = is_inf
4117         ? I32_MAX
4118         : data->pos_min + data->pos_delta
4119         - CHR_SVLEN(last_str);
4120       }
4121       data->longest = &(data->longest_float);
4122      }
4123      SvREFCNT_dec(last_str);
4124     }
4125     if (data && (fl & SF_HAS_EVAL))
4126      data->flags |= SF_HAS_EVAL;
4127    optimize_curly_tail:
4128     if (OP(oscan) != CURLYX) {
4129      while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4130       && NEXT_OFF(next))
4131       NEXT_OFF(oscan) += NEXT_OFF(next);
4132     }
4133     continue;
4134    default:   /* REF, and CLUMP only? */
4135     if (flags & SCF_DO_SUBSTR) {
4136      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4137      data->longest = &(data->longest_float);
4138     }
4139     is_inf = is_inf_internal = 1;
4140     if (flags & SCF_DO_STCLASS_OR)
4141      cl_anything(pRExC_state, data->start_class);
4142     flags &= ~SCF_DO_STCLASS;
4143     break;
4144    }
4145   }
4146   else if (OP(scan) == LNBREAK) {
4147    if (flags & SCF_DO_STCLASS) {
4148     int value = 0;
4149     CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4150      if (flags & SCF_DO_STCLASS_AND) {
4151      for (value = 0; value < 256; value++)
4152       if (!is_VERTWS_cp(value))
4153        ANYOF_BITMAP_CLEAR(data->start_class, value);
4154     }
4155     else {
4156      for (value = 0; value < 256; value++)
4157       if (is_VERTWS_cp(value))
4158        ANYOF_BITMAP_SET(data->start_class, value);
4159     }
4160     if (flags & SCF_DO_STCLASS_OR)
4161      cl_and(data->start_class, and_withp);
4162     flags &= ~SCF_DO_STCLASS;
4163    }
4164    min++;
4165    delta++;    /* Because of the 2 char string cr-lf */
4166    if (flags & SCF_DO_SUBSTR) {
4167      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4168      data->pos_min += 1;
4169     data->pos_delta += 1;
4170     data->longest = &(data->longest_float);
4171     }
4172   }
4173   else if (REGNODE_SIMPLE(OP(scan))) {
4174    int value = 0;
4175
4176    if (flags & SCF_DO_SUBSTR) {
4177     SCAN_COMMIT(pRExC_state,data,minlenp);
4178     data->pos_min++;
4179    }
4180    min++;
4181    if (flags & SCF_DO_STCLASS) {
4182     int loop_max = 256;
4183     CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4184
4185     /* Some of the logic below assumes that switching
4186     locale on will only add false positives. */
4187     switch (PL_regkind[OP(scan)]) {
4188      U8 classnum;
4189
4190     case SANY:
4191     default:
4192 #ifdef DEBUGGING
4193     Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4194 #endif
4195     do_default:
4196      if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4197       cl_anything(pRExC_state, data->start_class);
4198      break;
4199     case REG_ANY:
4200      if (OP(scan) == SANY)
4201       goto do_default;
4202      if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4203       value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4204         || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4205       cl_anything(pRExC_state, data->start_class);
4206      }
4207      if (flags & SCF_DO_STCLASS_AND || !value)
4208       ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4209      break;
4210     case ANYOF:
4211      if (flags & SCF_DO_STCLASS_AND)
4212       cl_and(data->start_class,
4213        (struct regnode_charclass_class*)scan);
4214      else
4215       cl_or(pRExC_state, data->start_class,
4216        (struct regnode_charclass_class*)scan);
4217      break;
4218     case POSIXA:
4219      loop_max = 128;
4220      /* FALL THROUGH */
4221     case POSIXL:
4222     case POSIXD:
4223     case POSIXU:
4224      classnum = FLAGS(scan);
4225      if (flags & SCF_DO_STCLASS_AND) {
4226       if (!(data->start_class->flags & ANYOF_LOCALE)) {
4227        ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4228        for (value = 0; value < loop_max; value++) {
4229         if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4230          ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4231         }
4232        }
4233       }
4234      }
4235      else {
4236       if (data->start_class->flags & ANYOF_LOCALE) {
4237        ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4238       }
4239       else {
4240
4241       /* Even if under locale, set the bits for non-locale
4242       * in case it isn't a true locale-node.  This will
4243       * create false positives if it truly is locale */
4244       for (value = 0; value < loop_max; value++) {
4245        if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4246         ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4247        }
4248       }
4249       }
4250      }
4251      break;
4252     case NPOSIXA:
4253      loop_max = 128;
4254      /* FALL THROUGH */
4255     case NPOSIXL:
4256     case NPOSIXU:
4257     case NPOSIXD:
4258      classnum = FLAGS(scan);
4259      if (flags & SCF_DO_STCLASS_AND) {
4260       if (!(data->start_class->flags & ANYOF_LOCALE)) {
4261        ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4262        for (value = 0; value < loop_max; value++) {
4263         if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4264          ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4265         }
4266        }
4267       }
4268      }
4269      else {
4270       if (data->start_class->flags & ANYOF_LOCALE) {
4271        ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4272       }
4273       else {
4274
4275       /* Even if under locale, set the bits for non-locale in
4276       * case it isn't a true locale-node.  This will create
4277       * false positives if it truly is locale */
4278       for (value = 0; value < loop_max; value++) {
4279        if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4280         ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4281        }
4282       }
4283       if (PL_regkind[OP(scan)] == NPOSIXD) {
4284        data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4285       }
4286       }
4287      }
4288      break;
4289     }
4290     if (flags & SCF_DO_STCLASS_OR)
4291      cl_and(data->start_class, and_withp);
4292     flags &= ~SCF_DO_STCLASS;
4293    }
4294   }
4295   else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4296    data->flags |= (OP(scan) == MEOL
4297        ? SF_BEFORE_MEOL
4298        : SF_BEFORE_SEOL);
4299    SCAN_COMMIT(pRExC_state, data, minlenp);
4300
4301   }
4302   else if (  PL_regkind[OP(scan)] == BRANCHJ
4303     /* Lookbehind, or need to calculate parens/evals/stclass: */
4304     && (scan->flags || data || (flags & SCF_DO_STCLASS))
4305     && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4306    if ( OP(scan) == UNLESSM &&
4307     scan->flags == 0 &&
4308     OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4309     OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4310    ) {
4311     regnode *opt;
4312     regnode *upto= regnext(scan);
4313     DEBUG_PARSE_r({
4314      SV * const mysv_val=sv_newmortal();
4315      DEBUG_STUDYDATA("OPFAIL",data,depth);
4316
4317      /*DEBUG_PARSE_MSG("opfail");*/
4318      regprop(RExC_rx, mysv_val, upto);
4319      PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4320         SvPV_nolen_const(mysv_val),
4321         (IV)REG_NODE_NUM(upto),
4322         (IV)(upto - scan)
4323      );
4324     });
4325     OP(scan) = OPFAIL;
4326     NEXT_OFF(scan) = upto - scan;
4327     for (opt= scan + 1; opt < upto ; opt++)
4328      OP(opt) = OPTIMIZED;
4329     scan= upto;
4330     continue;
4331    }
4332    if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4333     || OP(scan) == UNLESSM )
4334    {
4335     /* Negative Lookahead/lookbehind
4336     In this case we can't do fixed string optimisation.
4337     */
4338
4339     I32 deltanext, minnext, fake = 0;
4340     regnode *nscan;
4341     struct regnode_charclass_class intrnl;
4342     int f = 0;
4343
4344     data_fake.flags = 0;
4345     if (data) {
4346      data_fake.whilem_c = data->whilem_c;
4347      data_fake.last_closep = data->last_closep;
4348     }
4349     else
4350      data_fake.last_closep = &fake;
4351     data_fake.pos_delta = delta;
4352     if ( flags & SCF_DO_STCLASS && !scan->flags
4353      && OP(scan) == IFMATCH ) { /* Lookahead */
4354      cl_init(pRExC_state, &intrnl);
4355      data_fake.start_class = &intrnl;
4356      f |= SCF_DO_STCLASS_AND;
4357     }
4358     if (flags & SCF_WHILEM_VISITED_POS)
4359      f |= SCF_WHILEM_VISITED_POS;
4360     next = regnext(scan);
4361     nscan = NEXTOPER(NEXTOPER(scan));
4362     minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4363      last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4364     if (scan->flags) {
4365      if (deltanext) {
4366       FAIL("Variable length lookbehind not implemented");
4367      }
4368      else if (minnext > (I32)U8_MAX) {
4369       FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4370      }
4371      scan->flags = (U8)minnext;
4372     }
4373     if (data) {
4374      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4375       pars++;
4376      if (data_fake.flags & SF_HAS_EVAL)
4377       data->flags |= SF_HAS_EVAL;
4378      data->whilem_c = data_fake.whilem_c;
4379     }
4380     if (f & SCF_DO_STCLASS_AND) {
4381      if (flags & SCF_DO_STCLASS_OR) {
4382       /* OR before, AND after: ideally we would recurse with
4383       * data_fake to get the AND applied by study of the
4384       * remainder of the pattern, and then derecurse;
4385       * *** HACK *** for now just treat as "no information".
4386       * See [perl #56690].
4387       */
4388       cl_init(pRExC_state, data->start_class);
4389      }  else {
4390       /* AND before and after: combine and continue */
4391       const int was = TEST_SSC_EOS(data->start_class);
4392
4393       cl_and(data->start_class, &intrnl);
4394       if (was)
4395        SET_SSC_EOS(data->start_class);
4396      }
4397     }
4398    }
4399 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4400    else {
4401     /* Positive Lookahead/lookbehind
4402     In this case we can do fixed string optimisation,
4403     but we must be careful about it. Note in the case of
4404     lookbehind the positions will be offset by the minimum
4405     length of the pattern, something we won't know about
4406     until after the recurse.
4407     */
4408     I32 deltanext, fake = 0;
4409     regnode *nscan;
4410     struct regnode_charclass_class intrnl;
4411     int f = 0;
4412     /* We use SAVEFREEPV so that when the full compile
4413      is finished perl will clean up the allocated
4414      minlens when it's all done. This way we don't
4415      have to worry about freeing them when we know
4416      they wont be used, which would be a pain.
4417     */
4418     I32 *minnextp;
4419     Newx( minnextp, 1, I32 );
4420     SAVEFREEPV(minnextp);
4421
4422     if (data) {
4423      StructCopy(data, &data_fake, scan_data_t);
4424      if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4425       f |= SCF_DO_SUBSTR;
4426       if (scan->flags)
4427        SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4428       data_fake.last_found=newSVsv(data->last_found);
4429      }
4430     }
4431     else
4432      data_fake.last_closep = &fake;
4433     data_fake.flags = 0;
4434     data_fake.pos_delta = delta;
4435     if (is_inf)
4436      data_fake.flags |= SF_IS_INF;
4437     if ( flags & SCF_DO_STCLASS && !scan->flags
4438      && OP(scan) == IFMATCH ) { /* Lookahead */
4439      cl_init(pRExC_state, &intrnl);
4440      data_fake.start_class = &intrnl;
4441      f |= SCF_DO_STCLASS_AND;
4442     }
4443     if (flags & SCF_WHILEM_VISITED_POS)
4444      f |= SCF_WHILEM_VISITED_POS;
4445     next = regnext(scan);
4446     nscan = NEXTOPER(NEXTOPER(scan));
4447
4448     *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4449      last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4450     if (scan->flags) {
4451      if (deltanext) {
4452       FAIL("Variable length lookbehind not implemented");
4453      }
4454      else if (*minnextp > (I32)U8_MAX) {
4455       FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4456      }
4457      scan->flags = (U8)*minnextp;
4458     }
4459
4460     *minnextp += min;
4461
4462     if (f & SCF_DO_STCLASS_AND) {
4463      const int was = TEST_SSC_EOS(data.start_class);
4464
4465      cl_and(data->start_class, &intrnl);
4466      if (was)
4467       SET_SSC_EOS(data->start_class);
4468     }
4469     if (data) {
4470      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4471       pars++;
4472      if (data_fake.flags & SF_HAS_EVAL)
4473       data->flags |= SF_HAS_EVAL;
4474      data->whilem_c = data_fake.whilem_c;
4475      if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4476       if (RExC_rx->minlen<*minnextp)
4477        RExC_rx->minlen=*minnextp;
4478       SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4479       SvREFCNT_dec_NN(data_fake.last_found);
4480
4481       if ( data_fake.minlen_fixed != minlenp )
4482       {
4483        data->offset_fixed= data_fake.offset_fixed;
4484        data->minlen_fixed= data_fake.minlen_fixed;
4485        data->lookbehind_fixed+= scan->flags;
4486       }
4487       if ( data_fake.minlen_float != minlenp )
4488       {
4489        data->minlen_float= data_fake.minlen_float;
4490        data->offset_float_min=data_fake.offset_float_min;
4491        data->offset_float_max=data_fake.offset_float_max;
4492        data->lookbehind_float+= scan->flags;
4493       }
4494      }
4495     }
4496    }
4497 #endif
4498   }
4499   else if (OP(scan) == OPEN) {
4500    if (stopparen != (I32)ARG(scan))
4501     pars++;
4502   }
4503   else if (OP(scan) == CLOSE) {
4504    if (stopparen == (I32)ARG(scan)) {
4505     break;
4506    }
4507    if ((I32)ARG(scan) == is_par) {
4508     next = regnext(scan);
4509
4510     if ( next && (OP(next) != WHILEM) && next < last)
4511      is_par = 0;  /* Disable optimization */
4512    }
4513    if (data)
4514     *(data->last_closep) = ARG(scan);
4515   }
4516   else if (OP(scan) == EVAL) {
4517     if (data)
4518      data->flags |= SF_HAS_EVAL;
4519   }
4520   else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4521    if (flags & SCF_DO_SUBSTR) {
4522     SCAN_COMMIT(pRExC_state,data,minlenp);
4523     flags &= ~SCF_DO_SUBSTR;
4524    }
4525    if (data && OP(scan)==ACCEPT) {
4526     data->flags |= SCF_SEEN_ACCEPT;
4527     if (stopmin > min)
4528      stopmin = min;
4529    }
4530   }
4531   else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4532   {
4533     if (flags & SCF_DO_SUBSTR) {
4534      SCAN_COMMIT(pRExC_state,data,minlenp);
4535      data->longest = &(data->longest_float);
4536     }
4537     is_inf = is_inf_internal = 1;
4538     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4539      cl_anything(pRExC_state, data->start_class);
4540     flags &= ~SCF_DO_STCLASS;
4541   }
4542   else if (OP(scan) == GPOS) {
4543    if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4544     !(delta || is_inf || (data && data->pos_delta)))
4545    {
4546     if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4547      RExC_rx->extflags |= RXf_ANCH_GPOS;
4548     if (RExC_rx->gofs < (U32)min)
4549      RExC_rx->gofs = min;
4550    } else {
4551     RExC_rx->extflags |= RXf_GPOS_FLOAT;
4552     RExC_rx->gofs = 0;
4553    }
4554   }
4555 #ifdef TRIE_STUDY_OPT
4556 #ifdef FULL_TRIE_STUDY
4557   else if (PL_regkind[OP(scan)] == TRIE) {
4558    /* NOTE - There is similar code to this block above for handling
4559    BRANCH nodes on the initial study.  If you change stuff here
4560    check there too. */
4561    regnode *trie_node= scan;
4562    regnode *tail= regnext(scan);
4563    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4564    I32 max1 = 0, min1 = I32_MAX;
4565    struct regnode_charclass_class accum;
4566
4567    if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4568     SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4569    if (flags & SCF_DO_STCLASS)
4570     cl_init_zero(pRExC_state, &accum);
4571
4572    if (!trie->jump) {
4573     min1= trie->minlen;
4574     max1= trie->maxlen;
4575    } else {
4576     const regnode *nextbranch= NULL;
4577     U32 word;
4578
4579     for ( word=1 ; word <= trie->wordcount ; word++)
4580     {
4581      I32 deltanext=0, minnext=0, f = 0, fake;
4582      struct regnode_charclass_class this_class;
4583
4584      data_fake.flags = 0;
4585      if (data) {
4586       data_fake.whilem_c = data->whilem_c;
4587       data_fake.last_closep = data->last_closep;
4588      }
4589      else
4590       data_fake.last_closep = &fake;
4591      data_fake.pos_delta = delta;
4592      if (flags & SCF_DO_STCLASS) {
4593       cl_init(pRExC_state, &this_class);
4594       data_fake.start_class = &this_class;
4595       f = SCF_DO_STCLASS_AND;
4596      }
4597      if (flags & SCF_WHILEM_VISITED_POS)
4598       f |= SCF_WHILEM_VISITED_POS;
4599
4600      if (trie->jump[word]) {
4601       if (!nextbranch)
4602        nextbranch = trie_node + trie->jump[0];
4603       scan= trie_node + trie->jump[word];
4604       /* We go from the jump point to the branch that follows
4605       it. Note this means we need the vestigal unused branches
4606       even though they arent otherwise used.
4607       */
4608       minnext = study_chunk(pRExC_state, &scan, minlenp,
4609        &deltanext, (regnode *)nextbranch, &data_fake,
4610        stopparen, recursed, NULL, f,depth+1);
4611      }
4612      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4613       nextbranch= regnext((regnode*)nextbranch);
4614
4615      if (min1 > (I32)(minnext + trie->minlen))
4616       min1 = minnext + trie->minlen;
4617      if (deltanext == I32_MAX) {
4618       is_inf = is_inf_internal = 1;
4619       max1 = I32_MAX;
4620      } else if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4621       max1 = minnext + deltanext + trie->maxlen;
4622
4623      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4624       pars++;
4625      if (data_fake.flags & SCF_SEEN_ACCEPT) {
4626       if ( stopmin > min + min1)
4627        stopmin = min + min1;
4628       flags &= ~SCF_DO_SUBSTR;
4629       if (data)
4630        data->flags |= SCF_SEEN_ACCEPT;
4631      }
4632      if (data) {
4633       if (data_fake.flags & SF_HAS_EVAL)
4634        data->flags |= SF_HAS_EVAL;
4635       data->whilem_c = data_fake.whilem_c;
4636      }
4637      if (flags & SCF_DO_STCLASS)
4638       cl_or(pRExC_state, &accum, &this_class);
4639     }
4640    }
4641    if (flags & SCF_DO_SUBSTR) {
4642     data->pos_min += min1;
4643     data->pos_delta += max1 - min1;
4644     if (max1 != min1 || is_inf)
4645      data->longest = &(data->longest_float);
4646    }
4647    min += min1;
4648    delta += max1 - min1;
4649    if (flags & SCF_DO_STCLASS_OR) {
4650     cl_or(pRExC_state, data->start_class, &accum);
4651     if (min1) {
4652      cl_and(data->start_class, and_withp);
4653      flags &= ~SCF_DO_STCLASS;
4654     }
4655    }
4656    else if (flags & SCF_DO_STCLASS_AND) {
4657     if (min1) {
4658      cl_and(data->start_class, &accum);
4659      flags &= ~SCF_DO_STCLASS;
4660     }
4661     else {
4662      /* Switch to OR mode: cache the old value of
4663      * data->start_class */
4664      INIT_AND_WITHP;
4665      StructCopy(data->start_class, and_withp,
4666        struct regnode_charclass_class);
4667      flags &= ~SCF_DO_STCLASS_AND;
4668      StructCopy(&accum, data->start_class,
4669        struct regnode_charclass_class);
4670      flags |= SCF_DO_STCLASS_OR;
4671      SET_SSC_EOS(data->start_class);
4672     }
4673    }
4674    scan= tail;
4675    continue;
4676   }
4677 #else
4678   else if (PL_regkind[OP(scan)] == TRIE) {
4679    reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4680    U8*bang=NULL;
4681
4682    min += trie->minlen;
4683    delta += (trie->maxlen - trie->minlen);
4684    flags &= ~SCF_DO_STCLASS; /* xxx */
4685    if (flags & SCF_DO_SUBSTR) {
4686      SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4687      data->pos_min += trie->minlen;
4688      data->pos_delta += (trie->maxlen - trie->minlen);
4689     if (trie->maxlen != trie->minlen)
4690      data->longest = &(data->longest_float);
4691     }
4692     if (trie->jump) /* no more substrings -- for now /grr*/
4693      flags &= ~SCF_DO_SUBSTR;
4694   }
4695 #endif /* old or new */
4696 #endif /* TRIE_STUDY_OPT */
4697
4698   /* Else: zero-length, ignore. */
4699   scan = regnext(scan);
4700  }
4701  if (frame) {
4702   last = frame->last;
4703   scan = frame->next;
4704   stopparen = frame->stop;
4705   frame = frame->prev;
4706   goto fake_study_recurse;
4707  }
4708
4709   finish:
4710  assert(!frame);
4711  DEBUG_STUDYDATA("pre-fin:",data,depth);
4712
4713  *scanp = scan;
4714  *deltap = is_inf_internal ? I32_MAX : delta;
4715  if (flags & SCF_DO_SUBSTR && is_inf)
4716   data->pos_delta = I32_MAX - data->pos_min;
4717  if (is_par > (I32)U8_MAX)
4718   is_par = 0;
4719  if (is_par && pars==1 && data) {
4720   data->flags |= SF_IN_PAR;
4721   data->flags &= ~SF_HAS_PAR;
4722  }
4723  else if (pars && data) {
4724   data->flags |= SF_HAS_PAR;
4725   data->flags &= ~SF_IN_PAR;
4726  }
4727  if (flags & SCF_DO_STCLASS_OR)
4728   cl_and(data->start_class, and_withp);
4729  if (flags & SCF_TRIE_RESTUDY)
4730   data->flags |=  SCF_TRIE_RESTUDY;
4731
4732  DEBUG_STUDYDATA("post-fin:",data,depth);
4733
4734  return min < stopmin ? min : stopmin;
4735 }
4736
4737 STATIC U32
4738 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4739 {
4740  U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4741
4742  PERL_ARGS_ASSERT_ADD_DATA;
4743
4744  Renewc(RExC_rxi->data,
4745   sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4746   char, struct reg_data);
4747  if(count)
4748   Renew(RExC_rxi->data->what, count + n, U8);
4749  else
4750   Newx(RExC_rxi->data->what, n, U8);
4751  RExC_rxi->data->count = count + n;
4752  Copy(s, RExC_rxi->data->what + count, n, U8);
4753  return count;
4754 }
4755
4756 /*XXX: todo make this not included in a non debugging perl */
4757 #ifndef PERL_IN_XSUB_RE
4758 void
4759 Perl_reginitcolors(pTHX)
4760 {
4761  dVAR;
4762  const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4763  if (s) {
4764   char *t = savepv(s);
4765   int i = 0;
4766   PL_colors[0] = t;
4767   while (++i < 6) {
4768    t = strchr(t, '\t');
4769    if (t) {
4770     *t = '\0';
4771     PL_colors[i] = ++t;
4772    }
4773    else
4774     PL_colors[i] = t = (char *)"";
4775   }
4776  } else {
4777   int i = 0;
4778   while (i < 6)
4779    PL_colors[i++] = (char *)"";
4780  }
4781  PL_colorset = 1;
4782 }
4783 #endif
4784
4785
4786 #ifdef TRIE_STUDY_OPT
4787 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething)            \
4788  STMT_START {                                            \
4789   if (                                                \
4790    (data.flags & SCF_TRIE_RESTUDY)               \
4791    && ! restudied++                              \
4792   ) {                                                 \
4793    dOsomething;                                    \
4794    goto reStudy;                                   \
4795   }                                                   \
4796  } STMT_END
4797 #else
4798 #define CHECK_RESTUDY_GOTO_butfirst
4799 #endif
4800
4801 /*
4802  * pregcomp - compile a regular expression into internal code
4803  *
4804  * Decides which engine's compiler to call based on the hint currently in
4805  * scope
4806  */
4807
4808 #ifndef PERL_IN_XSUB_RE
4809
4810 /* return the currently in-scope regex engine (or the default if none)  */
4811
4812 regexp_engine const *
4813 Perl_current_re_engine(pTHX)
4814 {
4815  dVAR;
4816
4817  if (IN_PERL_COMPILETIME) {
4818   HV * const table = GvHV(PL_hintgv);
4819   SV **ptr;
4820
4821   if (!table)
4822    return &reh_regexp_engine;
4823   ptr = hv_fetchs(table, "regcomp", FALSE);
4824   if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4825    return &reh_regexp_engine;
4826   return INT2PTR(regexp_engine*,SvIV(*ptr));
4827  }
4828  else {
4829   SV *ptr;
4830   if (!PL_curcop->cop_hints_hash)
4831    return &reh_regexp_engine;
4832   ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4833   if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4834    return &reh_regexp_engine;
4835   return INT2PTR(regexp_engine*,SvIV(ptr));
4836  }
4837 }
4838
4839
4840 REGEXP *
4841 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4842 {
4843  dVAR;
4844  regexp_engine const *eng = current_re_engine();
4845  GET_RE_DEBUG_FLAGS_DECL;
4846
4847  PERL_ARGS_ASSERT_PREGCOMP;
4848
4849  /* Dispatch a request to compile a regexp to correct regexp engine. */
4850  DEBUG_COMPILE_r({
4851   PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4852       PTR2UV(eng));
4853  });
4854  return CALLREGCOMP_ENG(eng, pattern, flags);
4855 }
4856 #endif
4857
4858 /* public(ish) entry point for the perl core's own regex compiling code.
4859  * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4860  * pattern rather than a list of OPs, and uses the internal engine rather
4861  * than the current one */
4862
4863 REGEXP *
4864 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4865 {
4866  SV *pat = pattern; /* defeat constness! */
4867  PERL_ARGS_ASSERT_RE_COMPILE;
4868  return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4869 #ifdef PERL_IN_XSUB_RE
4870         &my_reg_engine,
4871 #else
4872         &reh_regexp_engine,
4873 #endif
4874         NULL, NULL, rx_flags, 0);
4875 }
4876
4877
4878 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
4879  * blocks, recalculate the indices. Update pat_p and plen_p in-place to
4880  * point to the realloced string and length.
4881  *
4882  * This is essentially a copy of Perl_bytes_to_utf8() with the code index
4883  * stuff added */
4884
4885 static void
4886 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
4887      char **pat_p, STRLEN *plen_p, int num_code_blocks)
4888 {
4889  U8 *const src = (U8*)*pat_p;
4890  U8 *dst;
4891  int n=0;
4892  STRLEN s = 0, d = 0;
4893  bool do_end = 0;
4894  GET_RE_DEBUG_FLAGS_DECL;
4895
4896  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4897   "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4898
4899  Newx(dst, *plen_p * 2 + 1, U8);
4900
4901  while (s < *plen_p) {
4902   const UV uv = NATIVE_TO_ASCII(src[s]);
4903   if (UNI_IS_INVARIANT(uv))
4904    dst[d]   = (U8)UTF_TO_NATIVE(uv);
4905   else {
4906    dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
4907    dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
4908   }
4909   if (n < num_code_blocks) {
4910    if (!do_end && pRExC_state->code_blocks[n].start == s) {
4911     pRExC_state->code_blocks[n].start = d;
4912     assert(dst[d] == '(');
4913     do_end = 1;
4914    }
4915    else if (do_end && pRExC_state->code_blocks[n].end == s) {
4916     pRExC_state->code_blocks[n].end = d;
4917     assert(dst[d] == ')');
4918     do_end = 0;
4919     n++;
4920    }
4921   }
4922   s++;
4923   d++;
4924  }
4925  dst[d] = '\0';
4926  *plen_p = d;
4927  *pat_p = (char*) dst;
4928  SAVEFREEPV(*pat_p);
4929  RExC_orig_utf8 = RExC_utf8 = 1;
4930 }
4931
4932
4933
4934 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
4935  * while recording any code block indices, and handling overloading,
4936  * nested qr// objects etc.  If pat is null, it will allocate a new
4937  * string, or just return the first arg, if there's only one.
4938  *
4939  * Returns the malloced/updated pat.
4940  * patternp and pat_count is the array of SVs to be concatted;
4941  * oplist is the optional list of ops that generated the SVs;
4942  * recompile_p is a pointer to a boolean that will be set if
4943  *   the regex will need to be recompiled.
4944  * delim, if non-null is an SV that will be inserted between each element
4945  */
4946
4947 static SV*
4948 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
4949     SV *pat, SV ** const patternp, int pat_count,
4950     OP *oplist, bool *recompile_p, SV *delim)
4951 {
4952  SV **svp;
4953  int n = 0;
4954  bool use_delim = FALSE;
4955  bool alloced = FALSE;
4956
4957  /* if we know we have at least two args, create an empty string,
4958  * then concatenate args to that. For no args, return an empty string */
4959  if (!pat && pat_count != 1) {
4960   pat = newSVpvn("", 0);
4961   SAVEFREESV(pat);
4962   alloced = TRUE;
4963  }
4964
4965  for (svp = patternp; svp < patternp + pat_count; svp++) {
4966   SV *sv;
4967   SV *rx  = NULL;
4968   STRLEN orig_patlen = 0;
4969   bool code = 0;
4970   SV *msv = use_delim ? delim : *svp;
4971
4972   /* if we've got a delimiter, we go round the loop twice for each
4973   * svp slot (except the last), using the delimiter the second
4974   * time round */
4975   if (use_delim) {
4976    svp--;
4977    use_delim = FALSE;
4978   }
4979   else if (delim)
4980    use_delim = TRUE;
4981
4982   if (SvTYPE(msv) == SVt_PVAV) {
4983    /* we've encountered an interpolated array within
4984    * the pattern, e.g. /...@a..../. Expand the list of elements,
4985    * then recursively append elements.
4986    * The code in this block is based on S_pushav() */
4987
4988    AV *const av = (AV*)msv;
4989    const I32 maxarg = AvFILL(av) + 1;
4990    SV **array;
4991
4992    if (oplist) {
4993     assert(oplist->op_type == OP_PADAV
4994      || oplist->op_type == OP_RV2AV);
4995     oplist = oplist->op_sibling;;
4996    }
4997
4998    if (SvRMAGICAL(av)) {
4999     U32 i;
5000
5001     Newx(array, maxarg, SV*);
5002     SAVEFREEPV(array);
5003     for (i=0; i < (U32)maxarg; i++) {
5004      SV ** const svp = av_fetch(av, i, FALSE);
5005      array[i] = svp ? *svp : &PL_sv_undef;
5006     }
5007    }
5008    else
5009     array = AvARRAY(av);
5010
5011    pat = S_concat_pat(aTHX_ pRExC_state, pat,
5012         array, maxarg, NULL, recompile_p,
5013         /* $" */
5014         GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5015
5016    continue;
5017   }
5018
5019
5020   /* we make the assumption here that each op in the list of
5021   * op_siblings maps to one SV pushed onto the stack,
5022   * except for code blocks, with have both an OP_NULL and
5023   * and OP_CONST.
5024   * This allows us to match up the list of SVs against the
5025   * list of OPs to find the next code block.
5026   *
5027   * Note that       PUSHMARK PADSV PADSV ..
5028   * is optimised to
5029   *                 PADRANGE PADSV  PADSV  ..
5030   * so the alignment still works. */
5031
5032   if (oplist) {
5033    if (oplist->op_type == OP_NULL
5034     && (oplist->op_flags & OPf_SPECIAL))
5035    {
5036     assert(n < pRExC_state->num_code_blocks);
5037     pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5038     pRExC_state->code_blocks[n].block = oplist;
5039     pRExC_state->code_blocks[n].src_regex = NULL;
5040     n++;
5041     code = 1;
5042     oplist = oplist->op_sibling; /* skip CONST */
5043     assert(oplist);
5044    }
5045    oplist = oplist->op_sibling;;
5046   }
5047
5048   /* apply magic and QR overloading to arg */
5049
5050   SvGETMAGIC(msv);
5051   if (SvROK(msv) && SvAMAGIC(msv)) {
5052    SV *sv = AMG_CALLunary(msv, regexp_amg);
5053    if (sv) {
5054     if (SvROK(sv))
5055      sv = SvRV(sv);
5056     if (SvTYPE(sv) != SVt_REGEXP)
5057      Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5058     msv = sv;
5059    }
5060   }
5061
5062   /* try concatenation overload ... */
5063   if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5064     (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5065   {
5066    sv_setsv(pat, sv);
5067    /* overloading involved: all bets are off over literal
5068    * code. Pretend we haven't seen it */
5069    pRExC_state->num_code_blocks -= n;
5070    n = 0;
5071   }
5072   else  {
5073    /* ... or failing that, try "" overload */
5074    while (SvAMAGIC(msv)
5075      && (sv = AMG_CALLunary(msv, string_amg))
5076      && sv != msv
5077      &&  !(   SvROK(msv)
5078       && SvROK(sv)
5079       && SvRV(msv) == SvRV(sv))
5080    ) {
5081     msv = sv;
5082     SvGETMAGIC(msv);
5083    }
5084    if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5085     msv = SvRV(msv);
5086
5087    if (pat) {
5088     /* this is a partially unrolled
5089     *     sv_catsv_nomg(pat, msv);
5090     * that allows us to adjust code block indices if
5091     * needed */
5092     STRLEN dlen;
5093     char *dst = SvPV_force_nomg(pat, dlen);
5094     orig_patlen = dlen;
5095     if (SvUTF8(msv) && !SvUTF8(pat)) {
5096      S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5097      sv_setpvn(pat, dst, dlen);
5098      SvUTF8_on(pat);
5099     }
5100     sv_catsv_nomg(pat, msv);
5101     rx = msv;
5102    }
5103    else
5104     pat = msv;
5105
5106    if (code)
5107     pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5108   }
5109
5110   /* extract any code blocks within any embedded qr//'s */
5111   if (rx && SvTYPE(rx) == SVt_REGEXP
5112    && RX_ENGINE((REGEXP*)rx)->op_comp)
5113   {
5114
5115    RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5116    if (ri->num_code_blocks) {
5117     int i;
5118     /* the presence of an embedded qr// with code means
5119     * we should always recompile: the text of the
5120     * qr// may not have changed, but it may be a
5121     * different closure than last time */
5122     *recompile_p = 1;
5123     Renew(pRExC_state->code_blocks,
5124      pRExC_state->num_code_blocks + ri->num_code_blocks,
5125      struct reg_code_block);
5126     pRExC_state->num_code_blocks += ri->num_code_blocks;
5127
5128     for (i=0; i < ri->num_code_blocks; i++) {
5129      struct reg_code_block *src, *dst;
5130      STRLEN offset =  orig_patlen
5131       + ReANY((REGEXP *)rx)->pre_prefix;
5132      assert(n < pRExC_state->num_code_blocks);
5133      src = &ri->code_blocks[i];
5134      dst = &pRExC_state->code_blocks[n];
5135      dst->start     = src->start + offset;
5136      dst->end     = src->end   + offset;
5137      dst->block     = src->block;
5138      dst->src_regex  = (REGEXP*) SvREFCNT_inc( (SV*)
5139            src->src_regex
5140             ? src->src_regex
5141             : (REGEXP*)rx);
5142      n++;
5143     }
5144    }
5145   }
5146  }
5147  /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5148  if (alloced)
5149   SvSETMAGIC(pat);
5150
5151  return pat;
5152 }
5153
5154
5155
5156 /* see if there are any run-time code blocks in the pattern.
5157  * False positives are allowed */
5158
5159 static bool
5160 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5161      char *pat, STRLEN plen)
5162 {
5163  int n = 0;
5164  STRLEN s;
5165
5166  for (s = 0; s < plen; s++) {
5167   if (n < pRExC_state->num_code_blocks
5168    && s == pRExC_state->code_blocks[n].start)
5169   {
5170    s = pRExC_state->code_blocks[n].end;
5171    n++;
5172    continue;
5173   }
5174   /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5175   * positives here */
5176   if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5177    (pat[s+2] == '{'
5178     || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5179   )
5180    return 1;
5181  }
5182  return 0;
5183 }
5184
5185 /* Handle run-time code blocks. We will already have compiled any direct
5186  * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5187  * copy of it, but with any literal code blocks blanked out and
5188  * appropriate chars escaped; then feed it into
5189  *
5190  *    eval "qr'modified_pattern'"
5191  *
5192  * For example,
5193  *
5194  *       a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5195  *
5196  * becomes
5197  *
5198  *    qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5199  *
5200  * After eval_sv()-ing that, grab any new code blocks from the returned qr
5201  * and merge them with any code blocks of the original regexp.
5202  *
5203  * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5204  * instead, just save the qr and return FALSE; this tells our caller that
5205  * the original pattern needs upgrading to utf8.
5206  */
5207
5208 static bool
5209 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5210  char *pat, STRLEN plen)
5211 {
5212  SV *qr;
5213
5214  GET_RE_DEBUG_FLAGS_DECL;
5215
5216  if (pRExC_state->runtime_code_qr) {
5217   /* this is the second time we've been called; this should
5218   * only happen if the main pattern got upgraded to utf8
5219   * during compilation; re-use the qr we compiled first time
5220   * round (which should be utf8 too)
5221   */
5222   qr = pRExC_state->runtime_code_qr;
5223   pRExC_state->runtime_code_qr = NULL;
5224   assert(RExC_utf8 && SvUTF8(qr));
5225  }
5226  else {
5227   int n = 0;
5228   STRLEN s;
5229   char *p, *newpat;
5230   int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5231   SV *sv, *qr_ref;
5232   dSP;
5233
5234   /* determine how many extra chars we need for ' and \ escaping */
5235   for (s = 0; s < plen; s++) {
5236    if (pat[s] == '\'' || pat[s] == '\\')
5237     newlen++;
5238   }
5239
5240   Newx(newpat, newlen, char);
5241   p = newpat;
5242   *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5243
5244   for (s = 0; s < plen; s++) {
5245    if (n < pRExC_state->num_code_blocks
5246     && s == pRExC_state->code_blocks[n].start)
5247    {
5248     /* blank out literal code block */
5249     assert(pat[s] == '(');
5250     while (s <= pRExC_state->code_blocks[n].end) {
5251      *p++ = '_';
5252      s++;
5253     }
5254     s--;
5255     n++;
5256     continue;
5257    }
5258    if (pat[s] == '\'' || pat[s] == '\\')
5259     *p++ = '\\';
5260    *p++ = pat[s];
5261   }
5262   *p++ = '\'';
5263   if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5264    *p++ = 'x';
5265   *p++ = '\0';
5266   DEBUG_COMPILE_r({
5267    PerlIO_printf(Perl_debug_log,
5268     "%sre-parsing pattern for runtime code:%s %s\n",
5269     PL_colors[4],PL_colors[5],newpat);
5270   });
5271
5272   sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5273   Safefree(newpat);
5274
5275   ENTER;
5276   SAVETMPS;
5277   save_re_context();
5278   PUSHSTACKi(PERLSI_REQUIRE);
5279   /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5280   * parsing qr''; normally only q'' does this. It also alters
5281   * hints handling */
5282   eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5283   SvREFCNT_dec_NN(sv);
5284   SPAGAIN;
5285   qr_ref = POPs;
5286   PUTBACK;
5287   {
5288    SV * const errsv = ERRSV;
5289    if (SvTRUE_NN(errsv))
5290    {
5291     Safefree(pRExC_state->code_blocks);
5292     /* use croak_sv ? */
5293     Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
5294    }
5295   }
5296   assert(SvROK(qr_ref));
5297   qr = SvRV(qr_ref);
5298   assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5299   /* the leaving below frees the tmp qr_ref.
5300   * Give qr a life of its own */
5301   SvREFCNT_inc(qr);
5302   POPSTACK;
5303   FREETMPS;
5304   LEAVE;
5305
5306  }
5307
5308  if (!RExC_utf8 && SvUTF8(qr)) {
5309   /* first time through; the pattern got upgraded; save the
5310   * qr for the next time through */
5311   assert(!pRExC_state->runtime_code_qr);
5312   pRExC_state->runtime_code_qr = qr;
5313   return 0;
5314  }
5315
5316
5317  /* extract any code blocks within the returned qr//  */
5318
5319
5320  /* merge the main (r1) and run-time (r2) code blocks into one */
5321  {
5322   RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5323   struct reg_code_block *new_block, *dst;
5324   RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5325   int i1 = 0, i2 = 0;
5326
5327   if (!r2->num_code_blocks) /* we guessed wrong */
5328   {
5329    SvREFCNT_dec_NN(qr);
5330    return 1;
5331   }
5332
5333   Newx(new_block,
5334    r1->num_code_blocks + r2->num_code_blocks,
5335    struct reg_code_block);
5336   dst = new_block;
5337
5338   while (    i1 < r1->num_code_blocks
5339     || i2 < r2->num_code_blocks)
5340   {
5341    struct reg_code_block *src;
5342    bool is_qr = 0;
5343
5344    if (i1 == r1->num_code_blocks) {
5345     src = &r2->code_blocks[i2++];
5346     is_qr = 1;
5347    }
5348    else if (i2 == r2->num_code_blocks)
5349     src = &r1->code_blocks[i1++];
5350    else if (  r1->code_blocks[i1].start
5351      < r2->code_blocks[i2].start)
5352    {
5353     src = &r1->code_blocks[i1++];
5354     assert(src->end < r2->code_blocks[i2].start);
5355    }
5356    else {
5357     assert(  r1->code_blocks[i1].start
5358      > r2->code_blocks[i2].start);
5359     src = &r2->code_blocks[i2++];
5360     is_qr = 1;
5361     assert(src->end < r1->code_blocks[i1].start);
5362    }
5363
5364    assert(pat[src->start] == '(');
5365    assert(pat[src->end]   == ')');
5366    dst->start     = src->start;
5367    dst->end     = src->end;
5368    dst->block     = src->block;
5369    dst->src_regex  = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5370          : src->src_regex;
5371    dst++;
5372   }
5373   r1->num_code_blocks += r2->num_code_blocks;
5374   Safefree(r1->code_blocks);
5375   r1->code_blocks = new_block;
5376  }
5377
5378  SvREFCNT_dec_NN(qr);
5379  return 1;
5380 }
5381
5382
5383 STATIC bool
5384 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5385 {
5386  /* This is the common code for setting up the floating and fixed length
5387  * string data extracted from Perl_re_op_compile() below.  Returns a boolean
5388  * as to whether succeeded or not */
5389
5390  I32 t,ml;
5391
5392  if (! (longest_length
5393   || (eol /* Can't have SEOL and MULTI */
5394    && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5395   )
5396    /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5397   || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5398  {
5399   return FALSE;
5400  }
5401
5402  /* copy the information about the longest from the reg_scan_data
5403   over to the program. */
5404  if (SvUTF8(sv_longest)) {
5405   *rx_utf8 = sv_longest;
5406   *rx_substr = NULL;
5407  } else {
5408   *rx_substr = sv_longest;
5409   *rx_utf8 = NULL;
5410  }
5411  /* end_shift is how many chars that must be matched that
5412   follow this item. We calculate it ahead of time as once the
5413   lookbehind offset is added in we lose the ability to correctly
5414   calculate it.*/
5415  ml = minlen ? *(minlen) : (I32)longest_length;
5416  *rx_end_shift = ml - offset
5417   - longest_length + (SvTAIL(sv_longest) != 0)
5418   + lookbehind;
5419
5420  t = (eol/* Can't have SEOL and MULTI */
5421   && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5422  fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5423
5424  return TRUE;
5425 }
5426
5427 /*
5428  * Perl_re_op_compile - the perl internal RE engine's function to compile a
5429  * regular expression into internal code.
5430  * The pattern may be passed either as:
5431  *    a list of SVs (patternp plus pat_count)
5432  *    a list of OPs (expr)
5433  * If both are passed, the SV list is used, but the OP list indicates
5434  * which SVs are actually pre-compiled code blocks
5435  *
5436  * The SVs in the list have magic and qr overloading applied to them (and
5437  * the list may be modified in-place with replacement SVs in the latter
5438  * case).
5439  *
5440  * If the pattern hasn't changed from old_re, then old_re will be
5441  * returned.
5442  *
5443  * eng is the current engine. If that engine has an op_comp method, then
5444  * handle directly (i.e. we assume that op_comp was us); otherwise, just
5445  * do the initial concatenation of arguments and pass on to the external
5446  * engine.
5447  *
5448  * If is_bare_re is not null, set it to a boolean indicating whether the
5449  * arg list reduced (after overloading) to a single bare regex which has
5450  * been returned (i.e. /$qr/).
5451  *
5452  * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5453  *
5454  * pm_flags contains the PMf_* flags, typically based on those from the
5455  * pm_flags field of the related PMOP. Currently we're only interested in
5456  * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5457  *
5458  * We can't allocate space until we know how big the compiled form will be,
5459  * but we can't compile it (and thus know how big it is) until we've got a
5460  * place to put the code.  So we cheat:  we compile it twice, once with code
5461  * generation turned off and size counting turned on, and once "for real".
5462  * This also means that we don't allocate space until we are sure that the
5463  * thing really will compile successfully, and we never have to move the
5464  * code and thus invalidate pointers into it.  (Note that it has to be in
5465  * one piece because free() must be able to free it all.) [NB: not true in perl]
5466  *
5467  * Beware that the optimization-preparation code in here knows about some
5468  * of the structure of the compiled regexp.  [I'll say.]
5469  */
5470
5471 REGEXP *
5472 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5473      OP *expr, const regexp_engine* eng, REGEXP *old_re,
5474      bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5475 {
5476  dVAR;
5477  REGEXP *rx;
5478  struct regexp *r;
5479  regexp_internal *ri;
5480  STRLEN plen;
5481  char *exp;
5482  regnode *scan;
5483  I32 flags;
5484  I32 minlen = 0;
5485  U32 rx_flags;
5486  SV *pat;
5487  SV *code_blocksv = NULL;
5488  SV** new_patternp = patternp;
5489
5490  /* these are all flags - maybe they should be turned
5491  * into a single int with different bit masks */
5492  I32 sawlookahead = 0;
5493  I32 sawplus = 0;
5494  I32 sawopen = 0;
5495  regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5496  bool recompile = 0;
5497  bool runtime_code = 0;
5498  scan_data_t data;
5499  RExC_state_t RExC_state;
5500  RExC_state_t * const pRExC_state = &RExC_state;
5501 #ifdef TRIE_STUDY_OPT
5502  int restudied = 0;
5503  RExC_state_t copyRExC_state;
5504 #endif
5505  GET_RE_DEBUG_FLAGS_DECL;
5506
5507  PERL_ARGS_ASSERT_RE_OP_COMPILE;
5508
5509  DEBUG_r(if (!PL_colorset) reginitcolors());
5510
5511 #ifndef PERL_IN_XSUB_RE
5512  /* Initialize these here instead of as-needed, as is quick and avoids
5513  * having to test them each time otherwise */
5514  if (! PL_AboveLatin1) {
5515   PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5516   PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5517   PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5518
5519   PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5520         = _new_invlist_C_array(L1PosixAlnum_invlist);
5521   PL_Posix_ptrs[_CC_ALPHANUMERIC]
5522         = _new_invlist_C_array(PosixAlnum_invlist);
5523
5524   PL_L1Posix_ptrs[_CC_ALPHA]
5525         = _new_invlist_C_array(L1PosixAlpha_invlist);
5526   PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5527
5528   PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5529   PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5530
5531   /* Cased is the same as Alpha in the ASCII range */
5532   PL_L1Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(L1Cased_invlist);
5533   PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PosixAlpha_invlist);
5534
5535   PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5536   PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5537
5538   PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5539   PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5540
5541   PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5542   PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5543
5544   PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5545   PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5546
5547   PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5548   PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5549
5550   PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5551   PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5552
5553   PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5554   PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5555   PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5556   PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5557
5558   PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5559   PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5560
5561   PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5562
5563   PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5564   PL_L1Posix_ptrs[_CC_WORDCHAR]
5565         = _new_invlist_C_array(L1PosixWord_invlist);
5566
5567   PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5568   PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5569
5570   PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5571  }
5572 #endif
5573
5574  pRExC_state->code_blocks = NULL;
5575  pRExC_state->num_code_blocks = 0;
5576
5577  if (is_bare_re)
5578   *is_bare_re = FALSE;
5579
5580  if (expr && (expr->op_type == OP_LIST ||
5581     (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5582   /* allocate code_blocks if needed */
5583   OP *o;
5584   int ncode = 0;
5585
5586   for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
5587    if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5588     ncode++; /* count of DO blocks */
5589   if (ncode) {
5590    pRExC_state->num_code_blocks = ncode;
5591    Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5592   }
5593  }
5594
5595  if (!pat_count) {
5596   /* compile-time pattern with just OP_CONSTs and DO blocks */
5597
5598   int n;
5599   OP *o;
5600
5601   /* find how many CONSTs there are */
5602   assert(expr);
5603   n = 0;
5604   if (expr->op_type == OP_CONST)
5605    n = 1;
5606   else
5607    for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5608     if (o->op_type == OP_CONST)
5609      n++;
5610    }
5611
5612   /* fake up an SV array */
5613
5614   assert(!new_patternp);
5615   Newx(new_patternp, n, SV*);
5616   SAVEFREEPV(new_patternp);
5617   pat_count = n;
5618
5619   n = 0;
5620   if (expr->op_type == OP_CONST)
5621    new_patternp[n] = cSVOPx_sv(expr);
5622   else
5623    for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5624     if (o->op_type == OP_CONST)
5625      new_patternp[n++] = cSVOPo_sv;
5626    }
5627
5628  }
5629
5630  DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5631   "Assembling pattern from %d elements%s\n", pat_count,
5632    orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5633
5634  /* set expr to the first arg op */
5635
5636  if (pRExC_state->num_code_blocks
5637   && expr->op_type != OP_CONST)
5638  {
5639    expr = cLISTOPx(expr)->op_first;
5640    assert(   expr->op_type == OP_PUSHMARK
5641     || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
5642     || expr->op_type == OP_PADRANGE);
5643    expr = expr->op_sibling;
5644  }
5645
5646  pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
5647       expr, &recompile, NULL);
5648
5649  /* handle bare (possibly after overloading) regex: foo =~ $re */
5650  {
5651   SV *re = pat;
5652   if (SvROK(re))
5653    re = SvRV(re);
5654   if (SvTYPE(re) == SVt_REGEXP) {
5655    if (is_bare_re)
5656     *is_bare_re = TRUE;
5657    SvREFCNT_inc(re);
5658    Safefree(pRExC_state->code_blocks);
5659    DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5660     "Precompiled pattern%s\n",
5661      orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5662
5663    return (REGEXP*)re;
5664   }
5665  }
5666
5667  exp = SvPV_nomg(pat, plen);
5668
5669  if (!eng->op_comp) {
5670   if ((SvUTF8(pat) && IN_BYTES)
5671     || SvGMAGICAL(pat) || SvAMAGIC(pat))
5672   {
5673    /* make a temporary copy; either to convert to bytes,
5674    * or to avoid repeating get-magic / overloaded stringify */
5675    pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5676           (IN_BYTES ? 0 : SvUTF8(pat)));
5677   }
5678   Safefree(pRExC_state->code_blocks);
5679   return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5680  }
5681
5682  /* ignore the utf8ness if the pattern is 0 length */
5683  RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5684  RExC_uni_semantics = 0;
5685  RExC_contains_locale = 0;
5686  pRExC_state->runtime_code_qr = NULL;
5687
5688  DEBUG_COMPILE_r({
5689    SV *dsv= sv_newmortal();
5690    RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
5691    PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5692       PL_colors[4],PL_colors[5],s);
5693   });
5694
5695   redo_first_pass:
5696  /* we jump here if we upgrade the pattern to utf8 and have to
5697  * recompile */
5698
5699  if ((pm_flags & PMf_USE_RE_EVAL)
5700     /* this second condition covers the non-regex literal case,
5701     * i.e.  $foo =~ '(?{})'. */
5702     || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
5703  )
5704   runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
5705
5706  /* return old regex if pattern hasn't changed */
5707  /* XXX: note in the below we have to check the flags as well as the pattern.
5708  *
5709  * Things get a touch tricky as we have to compare the utf8 flag independently
5710  * from the compile flags.
5711  */
5712
5713  if (   old_re
5714   && !recompile
5715   && !!RX_UTF8(old_re) == !!RExC_utf8
5716   && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
5717   && RX_PRECOMP(old_re)
5718   && RX_PRELEN(old_re) == plen
5719   && memEQ(RX_PRECOMP(old_re), exp, plen)
5720   && !runtime_code /* with runtime code, always recompile */ )
5721  {
5722   Safefree(pRExC_state->code_blocks);
5723   return old_re;
5724  }
5725
5726  rx_flags = orig_rx_flags;
5727
5728  if (initial_charset == REGEX_LOCALE_CHARSET) {
5729   RExC_contains_locale = 1;
5730  }
5731  else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5732
5733   /* Set to use unicode semantics if the pattern is in utf8 and has the
5734   * 'depends' charset specified, as it means unicode when utf8  */
5735   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5736  }
5737
5738  RExC_precomp = exp;
5739  RExC_flags = rx_flags;
5740  RExC_pm_flags = pm_flags;
5741
5742  if (runtime_code) {
5743   if (TAINTING_get && TAINT_get)
5744    Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5745
5746   if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5747    /* whoops, we have a non-utf8 pattern, whilst run-time code
5748    * got compiled as utf8. Try again with a utf8 pattern */
5749    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5750          pRExC_state->num_code_blocks);
5751    goto redo_first_pass;
5752   }
5753  }
5754  assert(!pRExC_state->runtime_code_qr);
5755
5756  RExC_sawback = 0;
5757
5758  RExC_seen = 0;
5759  RExC_in_lookbehind = 0;
5760  RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5761  RExC_extralen = 0;
5762  RExC_override_recoding = 0;
5763  RExC_in_multi_char_class = 0;
5764
5765  /* First pass: determine size, legality. */
5766  RExC_parse = exp;
5767  RExC_start = exp;
5768  RExC_end = exp + plen;
5769  RExC_naughty = 0;
5770  RExC_npar = 1;
5771  RExC_nestroot = 0;
5772  RExC_size = 0L;
5773  RExC_emit = &PL_regdummy;
5774  RExC_whilem_seen = 0;
5775  RExC_open_parens = NULL;
5776  RExC_close_parens = NULL;
5777  RExC_opend = NULL;
5778  RExC_paren_names = NULL;
5779 #ifdef DEBUGGING
5780  RExC_paren_name_list = NULL;
5781 #endif
5782  RExC_recurse = NULL;
5783  RExC_recurse_count = 0;
5784  pRExC_state->code_index = 0;
5785
5786 #if 0 /* REGC() is (currently) a NOP at the first pass.
5787  * Clever compilers notice this and complain. --jhi */
5788  REGC((U8)REG_MAGIC, (char*)RExC_emit);
5789 #endif
5790  DEBUG_PARSE_r(
5791   PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5792   RExC_lastnum=0;
5793   RExC_lastparse=NULL;
5794  );
5795  /* reg may croak on us, not giving us a chance to free
5796  pRExC_state->code_blocks.  We cannot SAVEFREEPV it now, as we may
5797  need it to survive as long as the regexp (qr/(?{})/).
5798  We must check that code_blocksv is not already set, because we may
5799  have jumped back to restart the sizing pass. */
5800  if (pRExC_state->code_blocks && !code_blocksv) {
5801   code_blocksv = newSV_type(SVt_PV);
5802   SAVEFREESV(code_blocksv);
5803   SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5804   SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5805  }
5806  if (reg(pRExC_state, 0, &flags,1) == NULL) {
5807   /* It's possible to write a regexp in ascii that represents Unicode
5808   codepoints outside of the byte range, such as via \x{100}. If we
5809   detect such a sequence we have to convert the entire pattern to utf8
5810   and then recompile, as our sizing calculation will have been based
5811   on 1 byte == 1 character, but we will need to use utf8 to encode
5812   at least some part of the pattern, and therefore must convert the whole
5813   thing.
5814   -- dmq */
5815   if (flags & RESTART_UTF8) {
5816    S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5817          pRExC_state->num_code_blocks);
5818    goto redo_first_pass;
5819   }
5820   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#X", flags);
5821  }
5822  if (code_blocksv)
5823   SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5824
5825  DEBUG_PARSE_r({
5826   PerlIO_printf(Perl_debug_log,
5827    "Required size %"IVdf" nodes\n"
5828    "Starting second pass (creation)\n",
5829    (IV)RExC_size);
5830   RExC_lastnum=0;
5831   RExC_lastparse=NULL;
5832  });
5833
5834  /* The first pass could have found things that force Unicode semantics */
5835  if ((RExC_utf8 || RExC_uni_semantics)
5836   && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5837  {
5838   set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5839  }
5840
5841  /* Small enough for pointer-storage convention?
5842  If extralen==0, this means that we will not need long jumps. */
5843  if (RExC_size >= 0x10000L && RExC_extralen)
5844   RExC_size += RExC_extralen;
5845  else
5846   RExC_extralen = 0;
5847  if (RExC_whilem_seen > 15)
5848   RExC_whilem_seen = 15;
5849
5850  /* Allocate space and zero-initialize. Note, the two step process
5851  of zeroing when in debug mode, thus anything assigned has to
5852  happen after that */
5853  rx = (REGEXP*) newSV_type(SVt_REGEXP);
5854  r = ReANY(rx);
5855  Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5856   char, regexp_internal);
5857  if ( r == NULL || ri == NULL )
5858   FAIL("Regexp out of space");
5859 #ifdef DEBUGGING
5860  /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5861  Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5862 #else
5863  /* bulk initialize base fields with 0. */
5864  Zero(ri, sizeof(regexp_internal), char);
5865 #endif
5866
5867  /* non-zero initialization begins here */
5868  RXi_SET( r, ri );
5869  r->engine= eng;
5870  r->extflags = rx_flags;
5871  RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
5872
5873  if (pm_flags & PMf_IS_QR) {
5874   ri->code_blocks = pRExC_state->code_blocks;
5875   ri->num_code_blocks = pRExC_state->num_code_blocks;
5876  }
5877  else
5878  {
5879   int n;
5880   for (n = 0; n < pRExC_state->num_code_blocks; n++)
5881    if (pRExC_state->code_blocks[n].src_regex)
5882     SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5883   SAVEFREEPV(pRExC_state->code_blocks);
5884  }
5885
5886  {
5887   bool has_p     = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5888   bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5889
5890   /* The caret is output if there are any defaults: if not all the STD
5891   * flags are set, or if no character set specifier is needed */
5892   bool has_default =
5893      (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5894      || ! has_charset);
5895   bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5896   U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5897        >> RXf_PMf_STD_PMMOD_SHIFT);
5898   const char *fptr = STD_PAT_MODS;        /*"msix"*/
5899   char *p;
5900   /* Allocate for the worst case, which is all the std flags are turned
5901   * on.  If more precision is desired, we could do a population count of
5902   * the flags set.  This could be done with a small lookup table, or by
5903   * shifting, masking and adding, or even, when available, assembly
5904   * language for a machine-language population count.
5905   * We never output a minus, as all those are defaults, so are
5906   * covered by the caret */
5907   const STRLEN wraplen = plen + has_p + has_runon
5908    + has_default       /* If needs a caret */
5909
5910     /* If needs a character set specifier */
5911    + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5912    + (sizeof(STD_PAT_MODS) - 1)
5913    + (sizeof("(?:)") - 1);
5914
5915   Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5916   r->xpv_len_u.xpvlenu_pv = p;
5917   if (RExC_utf8)
5918    SvFLAGS(rx) |= SVf_UTF8;
5919   *p++='('; *p++='?';
5920
5921   /* If a default, cover it using the caret */
5922   if (has_default) {
5923    *p++= DEFAULT_PAT_MOD;
5924   }
5925   if (has_charset) {
5926    STRLEN len;
5927    const char* const name = get_regex_charset_name(r->extflags, &len);
5928    Copy(name, p, len, char);
5929    p += len;
5930   }
5931   if (has_p)
5932    *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5933   {
5934    char ch;
5935    while((ch = *fptr++)) {
5936     if(reganch & 1)
5937      *p++ = ch;
5938     reganch >>= 1;
5939    }
5940   }
5941
5942   *p++ = ':';
5943   Copy(RExC_precomp, p, plen, char);
5944   assert ((RX_WRAPPED(rx) - p) < 16);
5945   r->pre_prefix = p - RX_WRAPPED(rx);
5946   p += plen;
5947   if (has_runon)
5948    *p++ = '\n';
5949   *p++ = ')';
5950   *p = 0;
5951   SvCUR_set(rx, p - RX_WRAPPED(rx));
5952  }
5953
5954  r->intflags = 0;
5955  r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5956
5957  if (RExC_seen & REG_SEEN_RECURSE) {
5958   Newxz(RExC_open_parens, RExC_npar,regnode *);
5959   SAVEFREEPV(RExC_open_parens);
5960   Newxz(RExC_close_parens,RExC_npar,regnode *);
5961   SAVEFREEPV(RExC_close_parens);
5962  }
5963
5964  /* Useful during FAIL. */
5965 #ifdef RE_TRACK_PATTERN_OFFSETS
5966  Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5967  DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5968       "%s %"UVuf" bytes for offset annotations.\n",
5969       ri->u.offsets ? "Got" : "Couldn't get",
5970       (UV)((2*RExC_size+1) * sizeof(U32))));
5971 #endif
5972  SetProgLen(ri,RExC_size);
5973  RExC_rx_sv = rx;
5974  RExC_rx = r;
5975  RExC_rxi = ri;
5976  REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
5977
5978  /* Second pass: emit code. */
5979  RExC_flags = rx_flags; /* don't let top level (?i) bleed */
5980  RExC_pm_flags = pm_flags;
5981  RExC_parse = exp;
5982  RExC_end = exp + plen;
5983  RExC_naughty = 0;
5984  RExC_npar = 1;
5985  RExC_emit_start = ri->program;
5986  RExC_emit = ri->program;
5987  RExC_emit_bound = ri->program + RExC_size + 1;
5988  pRExC_state->code_index = 0;
5989
5990  REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5991  if (reg(pRExC_state, 0, &flags,1) == NULL) {
5992   ReREFCNT_dec(rx);
5993   Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#X", flags);
5994  }
5995  /* XXXX To minimize changes to RE engine we always allocate
5996  3-units-long substrs field. */
5997  Newx(r->substrs, 1, struct reg_substr_data);
5998  if (RExC_recurse_count) {
5999   Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6000   SAVEFREEPV(RExC_recurse);
6001  }
6002
6003 reStudy:
6004  r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
6005  Zero(r->substrs, 1, struct reg_substr_data);
6006
6007 #ifdef TRIE_STUDY_OPT
6008  if (!restudied) {
6009   StructCopy(&zero_scan_data, &data, scan_data_t);
6010   copyRExC_state = RExC_state;
6011  } else {
6012   U32 seen=RExC_seen;
6013   DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6014
6015   RExC_state = copyRExC_state;
6016   if (seen & REG_TOP_LEVEL_BRANCHES)
6017    RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6018   else
6019    RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6020   StructCopy(&zero_scan_data, &data, scan_data_t);
6021  }
6022 #else
6023  StructCopy(&zero_scan_data, &data, scan_data_t);
6024 #endif
6025
6026  /* Dig out information for optimizations. */
6027  r->extflags = RExC_flags; /* was pm_op */
6028  /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6029
6030  if (UTF)
6031   SvUTF8_on(rx); /* Unicode in it? */
6032  ri->regstclass = NULL;
6033  if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6034   r->intflags |= PREGf_NAUGHTY;
6035  scan = ri->program + 1;  /* First BRANCH. */
6036
6037  /* testing for BRANCH here tells us whether there is "must appear"
6038  data in the pattern. If there is then we can use it for optimisations */
6039  if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
6040   I32 fake;
6041   STRLEN longest_float_length, longest_fixed_length;
6042   struct regnode_charclass_class ch_class; /* pointed to by data */
6043   int stclass_flag;
6044   I32 last_close = 0; /* pointed to by data */
6045   regnode *first= scan;
6046   regnode *first_next= regnext(first);
6047   /*
6048   * Skip introductions and multiplicators >= 1
6049   * so that we can extract the 'meat' of the pattern that must
6050   * match in the large if() sequence following.
6051   * NOTE that EXACT is NOT covered here, as it is normally
6052   * picked up by the optimiser separately.
6053   *
6054   * This is unfortunate as the optimiser isnt handling lookahead
6055   * properly currently.
6056   *
6057   */
6058   while ((OP(first) == OPEN && (sawopen = 1)) ||
6059    /* An OR of *one* alternative - should not happen now. */
6060    (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6061    /* for now we can't handle lookbehind IFMATCH*/
6062    (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6063    (OP(first) == PLUS) ||
6064    (OP(first) == MINMOD) ||
6065    /* An {n,m} with n>0 */
6066    (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6067    (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6068   {
6069     /*
6070     * the only op that could be a regnode is PLUS, all the rest
6071     * will be regnode_1 or regnode_2.
6072     *
6073     */
6074     if (OP(first) == PLUS)
6075      sawplus = 1;
6076     else
6077      first += regarglen[OP(first)];
6078
6079     first = NEXTOPER(first);
6080     first_next= regnext(first);
6081   }
6082
6083   /* Starting-point info. */
6084  again:
6085   DEBUG_PEEP("first:",first,0);
6086   /* Ignore EXACT as we deal with it later. */
6087   if (PL_regkind[OP(first)] == EXACT) {
6088    if (OP(first) == EXACT)
6089     NOOP; /* Empty, get anchored substr later. */
6090    else
6091     ri->regstclass = first;
6092   }
6093 #ifdef TRIE_STCLASS
6094   else if (PL_regkind[OP(first)] == TRIE &&
6095     ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6096   {
6097    regnode *trie_op;
6098    /* this can happen only on restudy */
6099    if ( OP(first) == TRIE ) {
6100     struct regnode_1 *trieop = (struct regnode_1 *)
6101      PerlMemShared_calloc(1, sizeof(struct regnode_1));
6102     StructCopy(first,trieop,struct regnode_1);
6103     trie_op=(regnode *)trieop;
6104    } else {
6105     struct regnode_charclass *trieop = (struct regnode_charclass *)
6106      PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6107     StructCopy(first,trieop,struct regnode_charclass);
6108     trie_op=(regnode *)trieop;
6109    }
6110    OP(trie_op)+=2;
6111    make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6112    ri->regstclass = trie_op;
6113   }
6114 #endif
6115   else if (REGNODE_SIMPLE(OP(first)))
6116    ri->regstclass = first;
6117   else if (PL_regkind[OP(first)] == BOUND ||
6118     PL_regkind[OP(first)] == NBOUND)
6119    ri->regstclass = first;
6120   else if (PL_regkind[OP(first)] == BOL) {
6121    r->extflags |= (OP(first) == MBOL
6122       ? RXf_ANCH_MBOL
6123       : (OP(first) == SBOL
6124        ? RXf_ANCH_SBOL
6125        : RXf_ANCH_BOL));
6126    first = NEXTOPER(first);
6127    goto again;
6128   }
6129   else if (OP(first) == GPOS) {
6130    r->extflags |= RXf_ANCH_GPOS;
6131    first = NEXTOPER(first);
6132    goto again;
6133   }
6134   else if ((!sawopen || !RExC_sawback) &&
6135    (OP(first) == STAR &&
6136    PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6137    !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6138   {
6139    /* turn .* into ^.* with an implied $*=1 */
6140    const int type =
6141     (OP(NEXTOPER(first)) == REG_ANY)
6142      ? RXf_ANCH_MBOL
6143      : RXf_ANCH_SBOL;
6144    r->extflags |= type;
6145    r->intflags |= PREGf_IMPLICIT;
6146    first = NEXTOPER(first);
6147    goto again;
6148   }
6149   if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6150    && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6151    /* x+ must match at the 1st pos of run of x's */
6152    r->intflags |= PREGf_SKIP;
6153
6154   /* Scan is after the zeroth branch, first is atomic matcher. */
6155 #ifdef TRIE_STUDY_OPT
6156   DEBUG_PARSE_r(
6157    if (!restudied)
6158     PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6159        (IV)(first - scan + 1))
6160   );
6161 #else
6162   DEBUG_PARSE_r(
6163    PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6164     (IV)(first - scan + 1))
6165   );
6166 #endif
6167
6168
6169   /*
6170   * If there's something expensive in the r.e., find the
6171   * longest literal string that must appear and make it the
6172   * regmust.  Resolve ties in favor of later strings, since
6173   * the regstart check works with the beginning of the r.e.
6174   * and avoiding duplication strengthens checking.  Not a
6175   * strong reason, but sufficient in the absence of others.
6176   * [Now we resolve ties in favor of the earlier string if
6177   * it happens that c_offset_min has been invalidated, since the
6178   * earlier string may buy us something the later one won't.]
6179   */
6180
6181   data.longest_fixed = newSVpvs("");
6182   data.longest_float = newSVpvs("");
6183   data.last_found = newSVpvs("");
6184   data.longest = &(data.longest_fixed);
6185   ENTER_with_name("study_chunk");
6186   SAVEFREESV(data.longest_fixed);
6187   SAVEFREESV(data.longest_float);
6188   SAVEFREESV(data.last_found);
6189   first = scan;
6190   if (!ri->regstclass) {
6191    cl_init(pRExC_state, &ch_class);
6192    data.start_class = &ch_class;
6193    stclass_flag = SCF_DO_STCLASS_AND;
6194   } else    /* XXXX Check for BOUND? */
6195    stclass_flag = 0;
6196   data.last_closep = &last_close;
6197
6198   minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6199    &data, -1, NULL, NULL,
6200    SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6201
6202
6203   CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6204
6205
6206   if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6207    && data.last_start_min == 0 && data.last_end > 0
6208    && !RExC_seen_zerolen
6209    && !(RExC_seen & REG_SEEN_VERBARG)
6210    && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6211    r->extflags |= RXf_CHECK_ALL;
6212   scan_commit(pRExC_state, &data,&minlen,0);
6213
6214   longest_float_length = CHR_SVLEN(data.longest_float);
6215
6216   if (! ((SvCUR(data.longest_fixed)  /* ok to leave SvCUR */
6217     && data.offset_fixed == data.offset_float_min
6218     && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6219    && S_setup_longest (aTHX_ pRExC_state,
6220          data.longest_float,
6221          &(r->float_utf8),
6222          &(r->float_substr),
6223          &(r->float_end_shift),
6224          data.lookbehind_float,
6225          data.offset_float_min,
6226          data.minlen_float,
6227          longest_float_length,
6228          cBOOL(data.flags & SF_FL_BEFORE_EOL),
6229          cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6230   {
6231    r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6232    r->float_max_offset = data.offset_float_max;
6233    if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6234     r->float_max_offset -= data.lookbehind_float;
6235    SvREFCNT_inc_simple_void_NN(data.longest_float);
6236   }
6237   else {
6238    r->float_substr = r->float_utf8 = NULL;
6239    longest_float_length = 0;
6240   }
6241
6242   longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6243
6244   if (S_setup_longest (aTHX_ pRExC_state,
6245         data.longest_fixed,
6246         &(r->anchored_utf8),
6247         &(r->anchored_substr),
6248         &(r->anchored_end_shift),
6249         data.lookbehind_fixed,
6250         data.offset_fixed,
6251         data.minlen_fixed,
6252         longest_fixed_length,
6253         cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6254         cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6255   {
6256    r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6257    SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6258   }
6259   else {
6260    r->anchored_substr = r->anchored_utf8 = NULL;
6261    longest_fixed_length = 0;
6262   }
6263   LEAVE_with_name("study_chunk");
6264
6265   if (ri->regstclass
6266    && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6267    ri->regstclass = NULL;
6268
6269   if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6270    && stclass_flag
6271    && ! TEST_SSC_EOS(data.start_class)
6272    && !cl_is_anything(data.start_class))
6273   {
6274    const U32 n = add_data(pRExC_state, 1, "f");
6275    OP(data.start_class) = ANYOF_SYNTHETIC;
6276
6277    Newx(RExC_rxi->data->data[n], 1,
6278     struct regnode_charclass_class);
6279    StructCopy(data.start_class,
6280      (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6281      struct regnode_charclass_class);
6282    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6283    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6284    DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6285      regprop(r, sv, (regnode*)data.start_class);
6286      PerlIO_printf(Perl_debug_log,
6287          "synthetic stclass \"%s\".\n",
6288          SvPVX_const(sv));});
6289   }
6290
6291   /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6292   if (longest_fixed_length > longest_float_length) {
6293    r->check_end_shift = r->anchored_end_shift;
6294    r->check_substr = r->anchored_substr;
6295    r->check_utf8 = r->anchored_utf8;
6296    r->check_offset_min = r->check_offset_max = r->anchored_offset;
6297    if (r->extflags & RXf_ANCH_SINGLE)
6298     r->extflags |= RXf_NOSCAN;
6299   }
6300   else {
6301    r->check_end_shift = r->float_end_shift;
6302    r->check_substr = r->float_substr;
6303    r->check_utf8 = r->float_utf8;
6304    r->check_offset_min = r->float_min_offset;
6305    r->check_offset_max = r->float_max_offset;
6306   }
6307   /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6308   This should be changed ASAP!  */
6309   if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6310    r->extflags |= RXf_USE_INTUIT;
6311    if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6312     r->extflags |= RXf_INTUIT_TAIL;
6313   }
6314   /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6315   if ( (STRLEN)minlen < longest_float_length )
6316    minlen= longest_float_length;
6317   if ( (STRLEN)minlen < longest_fixed_length )
6318    minlen= longest_fixed_length;
6319   */
6320  }
6321  else {
6322   /* Several toplevels. Best we can is to set minlen. */
6323   I32 fake;
6324   struct regnode_charclass_class ch_class;
6325   I32 last_close = 0;
6326
6327   DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6328
6329   scan = ri->program + 1;
6330   cl_init(pRExC_state, &ch_class);
6331   data.start_class = &ch_class;
6332   data.last_closep = &last_close;
6333
6334
6335   minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6336    &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6337
6338   CHECK_RESTUDY_GOTO_butfirst(NOOP);
6339
6340   r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6341     = r->float_substr = r->float_utf8 = NULL;
6342
6343   if (! TEST_SSC_EOS(data.start_class)
6344    && !cl_is_anything(data.start_class))
6345   {
6346    const U32 n = add_data(pRExC_state, 1, "f");
6347    OP(data.start_class) = ANYOF_SYNTHETIC;
6348
6349    Newx(RExC_rxi->data->data[n], 1,
6350     struct regnode_charclass_class);
6351    StructCopy(data.start_class,
6352      (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6353      struct regnode_charclass_class);
6354    ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6355    r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6356    DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6357      regprop(r, sv, (regnode*)data.start_class);
6358      PerlIO_printf(Perl_debug_log,
6359          "synthetic stclass \"%s\".\n",
6360          SvPVX_const(sv));});
6361   }
6362  }
6363
6364  /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6365  the "real" pattern. */
6366  DEBUG_OPTIMISE_r({
6367   PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6368      (IV)minlen, (IV)r->minlen);
6369  });
6370  r->minlenret = minlen;
6371  if (r->minlen < minlen)
6372   r->minlen = minlen;
6373
6374  if (RExC_seen & REG_SEEN_GPOS)
6375   r->extflags |= RXf_GPOS_SEEN;
6376  if (RExC_seen & REG_SEEN_LOOKBEHIND)
6377   r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
6378  if (pRExC_state->num_code_blocks)
6379   r->extflags |= RXf_EVAL_SEEN;
6380  if (RExC_seen & REG_SEEN_CANY)
6381   r->extflags |= RXf_CANY_SEEN;
6382  if (RExC_seen & REG_SEEN_VERBARG)
6383  {
6384   r->intflags |= PREGf_VERBARG_SEEN;
6385   r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
6386  }
6387  if (RExC_seen & REG_SEEN_CUTGROUP)
6388   r->intflags |= PREGf_CUTGROUP_SEEN;
6389  if (pm_flags & PMf_USE_RE_EVAL)
6390   r->intflags |= PREGf_USE_RE_EVAL;
6391  if (RExC_paren_names)
6392   RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6393  else
6394   RXp_PAREN_NAMES(r) = NULL;
6395
6396  {
6397   regnode *first = ri->program + 1;
6398   U8 fop = OP(first);
6399   regnode *next = NEXTOPER(first);
6400   U8 nop = OP(next);
6401
6402   if (PL_regkind[fop] == NOTHING && nop == END)
6403    r->extflags |= RXf_NULL;
6404   else if (PL_regkind[fop] == BOL && nop == END)
6405    r->extflags |= RXf_START_ONLY;
6406   else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
6407    r->extflags |= RXf_WHITE;
6408   else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
6409    r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6410
6411  }
6412 #ifdef DEBUGGING
6413  if (RExC_paren_names) {
6414   ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6415   ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6416  } else
6417 #endif
6418   ri->name_list_idx = 0;
6419
6420  if (RExC_recurse_count) {
6421   for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6422    const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6423    ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6424   }
6425  }
6426  Newxz(r->offs, RExC_npar, regexp_paren_pair);
6427  /* assume we don't need to swap parens around before we match */
6428
6429  DEBUG_DUMP_r({
6430   PerlIO_printf(Perl_debug_log,"Final program:\n");
6431   regdump(r);
6432  });
6433 #ifdef RE_TRACK_PATTERN_OFFSETS
6434  DEBUG_OFFSETS_r(if (ri->u.offsets) {
6435   const U32 len = ri->u.offsets[0];
6436   U32 i;
6437   GET_RE_DEBUG_FLAGS_DECL;
6438   PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6439   for (i = 1; i <= len; i++) {
6440    if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6441     PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6442     (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6443    }
6444   PerlIO_printf(Perl_debug_log, "\n");
6445  });
6446 #endif
6447
6448 #ifdef USE_ITHREADS
6449  /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6450  * by setting the regexp SV to readonly-only instead. If the
6451  * pattern's been recompiled, the USEDness should remain. */
6452  if (old_re && SvREADONLY(old_re))
6453   SvREADONLY_on(rx);
6454 #endif
6455  return rx;
6456 }
6457
6458
6459 SV*
6460 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6461      const U32 flags)
6462 {
6463  PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6464
6465  PERL_UNUSED_ARG(value);
6466
6467  if (flags & RXapif_FETCH) {
6468   return reg_named_buff_fetch(rx, key, flags);
6469  } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6470   Perl_croak_no_modify();
6471   return NULL;
6472  } else if (flags & RXapif_EXISTS) {
6473   return reg_named_buff_exists(rx, key, flags)
6474    ? &PL_sv_yes
6475    : &PL_sv_no;
6476  } else if (flags & RXapif_REGNAMES) {
6477   return reg_named_buff_all(rx, flags);
6478  } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6479   return reg_named_buff_scalar(rx, flags);
6480  } else {
6481   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6482   return NULL;
6483  }
6484 }
6485
6486 SV*
6487 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6488       const U32 flags)
6489 {
6490  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6491  PERL_UNUSED_ARG(lastkey);
6492
6493  if (flags & RXapif_FIRSTKEY)
6494   return reg_named_buff_firstkey(rx, flags);
6495  else if (flags & RXapif_NEXTKEY)
6496   return reg_named_buff_nextkey(rx, flags);
6497  else {
6498   Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6499   return NULL;
6500  }
6501 }
6502
6503 SV*
6504 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6505       const U32 flags)
6506 {
6507  AV *retarray = NULL;
6508  SV *ret;
6509  struct regexp *const rx = ReANY(r);
6510
6511  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6512
6513  if (flags & RXapif_ALL)
6514   retarray=newAV();
6515
6516  if (rx && RXp_PAREN_NAMES(rx)) {
6517   HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6518   if (he_str) {
6519    IV i;
6520    SV* sv_dat=HeVAL(he_str);
6521    I32 *nums=(I32*)SvPVX(sv_dat);
6522    for ( i=0; i<SvIVX(sv_dat); i++ ) {
6523     if ((I32)(rx->nparens) >= nums[i]
6524      && rx->offs[nums[i]].start != -1
6525      && rx->offs[nums[i]].end != -1)
6526     {
6527      ret = newSVpvs("");
6528      CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6529      if (!retarray)
6530       return ret;
6531     } else {
6532      if (retarray)
6533       ret = newSVsv(&PL_sv_undef);
6534     }
6535     if (retarray)
6536      av_push(retarray, ret);
6537    }
6538    if (retarray)
6539     return newRV_noinc(MUTABLE_SV(retarray));
6540   }
6541  }
6542  return NULL;
6543 }
6544
6545 bool
6546 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6547       const U32 flags)
6548 {
6549  struct regexp *const rx = ReANY(r);
6550
6551  PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6552
6553  if (rx && RXp_PAREN_NAMES(rx)) {
6554   if (flags & RXapif_ALL) {
6555    return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6556   } else {
6557    SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6558    if (sv) {
6559     SvREFCNT_dec_NN(sv);
6560     return TRUE;
6561    } else {
6562     return FALSE;
6563    }
6564   }
6565  } else {
6566   return FALSE;
6567  }
6568 }
6569
6570 SV*
6571 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6572 {
6573  struct regexp *const rx = ReANY(r);
6574
6575  PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6576
6577  if ( rx && RXp_PAREN_NAMES(rx) ) {
6578   (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6579
6580   return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6581  } else {
6582   return FALSE;
6583  }
6584 }
6585
6586 SV*
6587 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6588 {
6589  struct regexp *const rx = ReANY(r);
6590  GET_RE_DEBUG_FLAGS_DECL;
6591
6592  PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6593
6594  if (rx && RXp_PAREN_NAMES(rx)) {
6595   HV *hv = RXp_PAREN_NAMES(rx);
6596   HE *temphe;
6597   while ( (temphe = hv_iternext_flags(hv,0)) ) {
6598    IV i;
6599    IV parno = 0;
6600    SV* sv_dat = HeVAL(temphe);
6601    I32 *nums = (I32*)SvPVX(sv_dat);
6602    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6603     if ((I32)(rx->lastparen) >= nums[i] &&
6604      rx->offs[nums[i]].start != -1 &&
6605      rx->offs[nums[i]].end != -1)
6606     {
6607      parno = nums[i];
6608      break;
6609     }
6610    }
6611    if (parno || flags & RXapif_ALL) {
6612     return newSVhek(HeKEY_hek(temphe));
6613    }
6614   }
6615  }
6616  return NULL;
6617 }
6618
6619 SV*
6620 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6621 {
6622  SV *ret;
6623  AV *av;
6624  I32 length;
6625  struct regexp *const rx = ReANY(r);
6626
6627  PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6628
6629  if (rx && RXp_PAREN_NAMES(rx)) {
6630   if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6631    return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6632   } else if (flags & RXapif_ONE) {
6633    ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6634    av = MUTABLE_AV(SvRV(ret));
6635    length = av_len(av);
6636    SvREFCNT_dec_NN(ret);
6637    return newSViv(length + 1);
6638   } else {
6639    Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6640    return NULL;
6641   }
6642  }
6643  return &PL_sv_undef;
6644 }
6645
6646 SV*
6647 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6648 {
6649  struct regexp *const rx = ReANY(r);
6650  AV *av = newAV();
6651
6652  PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6653
6654  if (rx && RXp_PAREN_NAMES(rx)) {
6655   HV *hv= RXp_PAREN_NAMES(rx);
6656   HE *temphe;
6657   (void)hv_iterinit(hv);
6658   while ( (temphe = hv_iternext_flags(hv,0)) ) {
6659    IV i;
6660    IV parno = 0;
6661    SV* sv_dat = HeVAL(temphe);
6662    I32 *nums = (I32*)SvPVX(sv_dat);
6663    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6664     if ((I32)(rx->lastparen) >= nums[i] &&
6665      rx->offs[nums[i]].start != -1 &&
6666      rx->offs[nums[i]].end != -1)
6667     {
6668      parno = nums[i];
6669      break;
6670     }
6671    }
6672    if (parno || flags & RXapif_ALL) {
6673     av_push(av, newSVhek(HeKEY_hek(temphe)));
6674    }
6675   }
6676  }
6677
6678  return newRV_noinc(MUTABLE_SV(av));
6679 }
6680
6681 void
6682 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6683        SV * const sv)
6684 {
6685  struct regexp *const rx = ReANY(r);
6686  char *s = NULL;
6687  I32 i = 0;
6688  I32 s1, t1;
6689  I32 n = paren;
6690
6691  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6692
6693  if (      n == RX_BUFF_IDX_CARET_PREMATCH
6694   || n == RX_BUFF_IDX_CARET_FULLMATCH
6695   || n == RX_BUFF_IDX_CARET_POSTMATCH
6696  )
6697  {
6698   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
6699   if (!keepcopy) {
6700    /* on something like
6701    *    $r = qr/.../;
6702    *    /$qr/p;
6703    * the KEEPCOPY is set on the PMOP rather than the regex */
6704    if (PL_curpm && r == PM_GETRE(PL_curpm))
6705     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
6706   }
6707   if (!keepcopy)
6708    goto ret_undef;
6709  }
6710
6711  if (!rx->subbeg)
6712   goto ret_undef;
6713
6714  if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6715   /* no need to distinguish between them any more */
6716   n = RX_BUFF_IDX_FULLMATCH;
6717
6718  if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6719   && rx->offs[0].start != -1)
6720  {
6721   /* $`, ${^PREMATCH} */
6722   i = rx->offs[0].start;
6723   s = rx->subbeg;
6724  }
6725  else
6726  if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6727   && rx->offs[0].end != -1)
6728  {
6729   /* $', ${^POSTMATCH} */
6730   s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6731   i = rx->sublen + rx->suboffset - rx->offs[0].end;
6732  }
6733  else
6734  if ( 0 <= n && n <= (I32)rx->nparens &&
6735   (s1 = rx->offs[n].start) != -1 &&
6736   (t1 = rx->offs[n].end) != -1)
6737  {
6738   /* $&, ${^MATCH},  $1 ... */
6739   i = t1 - s1;
6740   s = rx->subbeg + s1 - rx->suboffset;
6741  } else {
6742   goto ret_undef;
6743  }
6744
6745  assert(s >= rx->subbeg);
6746  assert(rx->sublen >= (s - rx->subbeg) + i );
6747  if (i >= 0) {
6748 #if NO_TAINT_SUPPORT
6749   sv_setpvn(sv, s, i);
6750 #else
6751   const int oldtainted = TAINT_get;
6752   TAINT_NOT;
6753   sv_setpvn(sv, s, i);
6754   TAINT_set(oldtainted);
6755 #endif
6756   if ( (rx->extflags & RXf_CANY_SEEN)
6757    ? (RXp_MATCH_UTF8(rx)
6758       && (!i || is_utf8_string((U8*)s, i)))
6759    : (RXp_MATCH_UTF8(rx)) )
6760   {
6761    SvUTF8_on(sv);
6762   }
6763   else
6764    SvUTF8_off(sv);
6765   if (TAINTING_get) {
6766    if (RXp_MATCH_TAINTED(rx)) {
6767     if (SvTYPE(sv) >= SVt_PVMG) {
6768      MAGIC* const mg = SvMAGIC(sv);
6769      MAGIC* mgt;
6770      TAINT;
6771      SvMAGIC_set(sv, mg->mg_moremagic);
6772      SvTAINT(sv);
6773      if ((mgt = SvMAGIC(sv))) {
6774       mg->mg_moremagic = mgt;
6775       SvMAGIC_set(sv, mg);
6776      }
6777     } else {
6778      TAINT;
6779      SvTAINT(sv);
6780     }
6781    } else
6782     SvTAINTED_off(sv);
6783   }
6784  } else {
6785  ret_undef:
6786   sv_setsv(sv,&PL_sv_undef);
6787   return;
6788  }
6789 }
6790
6791 void
6792 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6793               SV const * const value)
6794 {
6795  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6796
6797  PERL_UNUSED_ARG(rx);
6798  PERL_UNUSED_ARG(paren);
6799  PERL_UNUSED_ARG(value);
6800
6801  if (!PL_localizing)
6802   Perl_croak_no_modify();
6803 }
6804
6805 I32
6806 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6807        const I32 paren)
6808 {
6809  struct regexp *const rx = ReANY(r);
6810  I32 i;
6811  I32 s1, t1;
6812
6813  PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6814
6815  if (   paren == RX_BUFF_IDX_CARET_PREMATCH
6816   || paren == RX_BUFF_IDX_CARET_FULLMATCH
6817   || paren == RX_BUFF_IDX_CARET_POSTMATCH
6818  )
6819  {
6820   bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
6821   if (!keepcopy) {
6822    /* on something like
6823    *    $r = qr/.../;
6824    *    /$qr/p;
6825    * the KEEPCOPY is set on the PMOP rather than the regex */
6826    if (PL_curpm && r == PM_GETRE(PL_curpm))
6827     keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
6828   }
6829   if (!keepcopy)
6830    goto warn_undef;
6831  }
6832
6833  /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6834  switch (paren) {
6835  case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6836  case RX_BUFF_IDX_PREMATCH:       /* $` */
6837   if (rx->offs[0].start != -1) {
6838       i = rx->offs[0].start;
6839       if (i > 0) {
6840         s1 = 0;
6841         t1 = i;
6842         goto getlen;
6843       }
6844    }
6845   return 0;
6846
6847  case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6848  case RX_BUFF_IDX_POSTMATCH:       /* $' */
6849    if (rx->offs[0].end != -1) {
6850       i = rx->sublen - rx->offs[0].end;
6851       if (i > 0) {
6852         s1 = rx->offs[0].end;
6853         t1 = rx->sublen;
6854         goto getlen;
6855       }
6856    }
6857   return 0;
6858
6859  default: /* $& / ${^MATCH}, $1, $2, ... */
6860    if (paren <= (I32)rx->nparens &&
6861    (s1 = rx->offs[paren].start) != -1 &&
6862    (t1 = rx->offs[paren].end) != -1)
6863    {
6864    i = t1 - s1;
6865    goto getlen;
6866   } else {
6867   warn_undef:
6868    if (ckWARN(WARN_UNINITIALIZED))
6869     report_uninit((const SV *)sv);
6870    return 0;
6871   }
6872  }
6873   getlen:
6874  if (i > 0 && RXp_MATCH_UTF8(rx)) {
6875   const char * const s = rx->subbeg - rx->suboffset + s1;
6876   const U8 *ep;
6877   STRLEN el;
6878
6879   i = t1 - s1;
6880   if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6881       i = el;
6882  }
6883  return i;
6884 }
6885
6886 SV*
6887 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6888 {
6889  PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6890   PERL_UNUSED_ARG(rx);
6891   if (0)
6892    return NULL;
6893   else
6894    return newSVpvs("Regexp");
6895 }
6896
6897 /* Scans the name of a named buffer from the pattern.
6898  * If flags is REG_RSN_RETURN_NULL returns null.
6899  * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6900  * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6901  * to the parsed name as looked up in the RExC_paren_names hash.
6902  * If there is an error throws a vFAIL().. type exception.
6903  */
6904
6905 #define REG_RSN_RETURN_NULL    0
6906 #define REG_RSN_RETURN_NAME    1
6907 #define REG_RSN_RETURN_DATA    2
6908
6909 STATIC SV*
6910 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6911 {
6912  char *name_start = RExC_parse;
6913
6914  PERL_ARGS_ASSERT_REG_SCAN_NAME;
6915
6916  if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6917   /* skip IDFIRST by using do...while */
6918   if (UTF)
6919    do {
6920     RExC_parse += UTF8SKIP(RExC_parse);
6921    } while (isWORDCHAR_utf8((U8*)RExC_parse));
6922   else
6923    do {
6924     RExC_parse++;
6925    } while (isWORDCHAR(*RExC_parse));
6926  } else {
6927   RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6928   vFAIL("Group name must start with a non-digit word character");
6929  }
6930  if ( flags ) {
6931   SV* sv_name
6932    = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6933        SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6934   if ( flags == REG_RSN_RETURN_NAME)
6935    return sv_name;
6936   else if (flags==REG_RSN_RETURN_DATA) {
6937    HE *he_str = NULL;
6938    SV *sv_dat = NULL;
6939    if ( ! sv_name )      /* should not happen*/
6940     Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6941    if (RExC_paren_names)
6942     he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6943    if ( he_str )
6944     sv_dat = HeVAL(he_str);
6945    if ( ! sv_dat )
6946     vFAIL("Reference to nonexistent named group");
6947    return sv_dat;
6948   }
6949   else {
6950    Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6951      (unsigned long) flags);
6952   }
6953   assert(0); /* NOT REACHED */
6954  }
6955  return NULL;
6956 }
6957
6958 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
6959  int rem=(int)(RExC_end - RExC_parse);                       \
6960  int cut;                                                    \
6961  int num;                                                    \
6962  int iscut=0;                                                \
6963  if (rem>10) {                                               \
6964   rem=10;                                                 \
6965   iscut=1;                                                \
6966  }                                                           \
6967  cut=10-rem;                                                 \
6968  if (RExC_lastparse!=RExC_parse)                             \
6969   PerlIO_printf(Perl_debug_log," >%.*s%-*s",              \
6970    rem, RExC_parse,                                    \
6971    cut + 4,                                            \
6972    iscut ? "..." : "<"                                 \
6973   );                                                      \
6974  else                                                        \
6975   PerlIO_printf(Perl_debug_log,"%16s","");                \
6976                 \
6977  if (SIZE_ONLY)                                              \
6978  num = RExC_size + 1;                                     \
6979  else                                                        \
6980  num=REG_NODE_NUM(RExC_emit);                             \
6981  if (RExC_lastnum!=num)                                      \
6982  PerlIO_printf(Perl_debug_log,"|%4d",num);                \
6983  else                                                        \
6984  PerlIO_printf(Perl_debug_log,"|%4s","");                 \
6985  PerlIO_printf(Perl_debug_log,"|%*s%-4s",                    \
6986   (int)((depth*2)), "",                                   \
6987   (funcname)                                              \
6988  );                                                          \
6989  RExC_lastnum=num;                                           \
6990  RExC_lastparse=RExC_parse;                                  \
6991 })
6992
6993
6994
6995 #define DEBUG_PARSE(funcname)     DEBUG_PARSE_r({           \
6996  DEBUG_PARSE_MSG((funcname));                            \
6997  PerlIO_printf(Perl_debug_log,"%4s","\n");               \
6998 })
6999 #define DEBUG_PARSE_FMT(funcname,fmt,args)     DEBUG_PARSE_r({           \
7000  DEBUG_PARSE_MSG((funcname));                            \
7001  PerlIO_printf(Perl_debug_log,fmt "\n",args);               \
7002 })
7003
7004 /* This section of code defines the inversion list object and its methods.  The
7005  * interfaces are highly subject to change, so as much as possible is static to
7006  * this file.  An inversion list is here implemented as a malloc'd C UV array
7007  * with some added info that is placed as UVs at the beginning in a header
7008  * portion.  An inversion list for Unicode is an array of code points, sorted
7009  * by ordinal number.  The zeroth element is the first code point in the list.
7010  * The 1th element is the first element beyond that not in the list.  In other
7011  * words, the first range is
7012  *  invlist[0]..(invlist[1]-1)
7013  * The other ranges follow.  Thus every element whose index is divisible by two
7014  * marks the beginning of a range that is in the list, and every element not
7015  * divisible by two marks the beginning of a range not in the list.  A single
7016  * element inversion list that contains the single code point N generally
7017  * consists of two elements
7018  *  invlist[0] == N
7019  *  invlist[1] == N+1
7020  * (The exception is when N is the highest representable value on the
7021  * machine, in which case the list containing just it would be a single
7022  * element, itself.  By extension, if the last range in the list extends to
7023  * infinity, then the first element of that range will be in the inversion list
7024  * at a position that is divisible by two, and is the final element in the
7025  * list.)
7026  * Taking the complement (inverting) an inversion list is quite simple, if the
7027  * first element is 0, remove it; otherwise add a 0 element at the beginning.
7028  * This implementation reserves an element at the beginning of each inversion
7029  * list to contain 0 when the list contains 0, and contains 1 otherwise.  The
7030  * actual beginning of the list is either that element if 0, or the next one if
7031  * 1.
7032  *
7033  * More about inversion lists can be found in "Unicode Demystified"
7034  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7035  * More will be coming when functionality is added later.
7036  *
7037  * The inversion list data structure is currently implemented as an SV pointing
7038  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
7039  * array of UV whose memory management is automatically handled by the existing
7040  * facilities for SV's.
7041  *
7042  * Some of the methods should always be private to the implementation, and some
7043  * should eventually be made public */
7044
7045 /* The header definitions are in F<inline_invlist.c> */
7046 #define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV))
7047 #define FROM_INTERNAL_SIZE(x) (((x)/ sizeof(UV)) - HEADER_LENGTH)
7048
7049 #define INVLIST_INITIAL_LEN 10
7050
7051 PERL_STATIC_INLINE UV*
7052 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7053 {
7054  /* Returns a pointer to the first element in the inversion list's array.
7055  * This is called upon initialization of an inversion list.  Where the
7056  * array begins depends on whether the list has the code point U+0000
7057  * in it or not.  The other parameter tells it whether the code that
7058  * follows this call is about to put a 0 in the inversion list or not.
7059  * The first element is either the element with 0, if 0, or the next one,
7060  * if 1 */
7061
7062  UV* zero = get_invlist_zero_addr(invlist);
7063
7064  PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7065
7066  /* Must be empty */
7067  assert(! *_get_invlist_len_addr(invlist));
7068
7069  /* 1^1 = 0; 1^0 = 1 */
7070  *zero = 1 ^ will_have_0;
7071  return zero + *zero;
7072 }
7073
7074 PERL_STATIC_INLINE UV*
7075 S_invlist_array(pTHX_ SV* const invlist)
7076 {
7077  /* Returns the pointer to the inversion list's array.  Every time the
7078  * length changes, this needs to be called in case malloc or realloc moved
7079  * it */
7080
7081  PERL_ARGS_ASSERT_INVLIST_ARRAY;
7082
7083  /* Must not be empty.  If these fail, you probably didn't check for <len>
7084  * being non-zero before trying to get the array */
7085  assert(*_get_invlist_len_addr(invlist));
7086  assert(*get_invlist_zero_addr(invlist) == 0
7087   || *get_invlist_zero_addr(invlist) == 1);
7088
7089  /* The array begins either at the element reserved for zero if the
7090  * list contains 0 (that element will be set to 0), or otherwise the next
7091  * element (in which case the reserved element will be set to 1). */
7092  return (UV *) (get_invlist_zero_addr(invlist)
7093     + *get_invlist_zero_addr(invlist));
7094 }
7095
7096 PERL_STATIC_INLINE void
7097 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7098 {
7099  /* Sets the current number of elements stored in the inversion list */
7100
7101  PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7102
7103  *_get_invlist_len_addr(invlist) = len;
7104
7105  assert(len <= SvLEN(invlist));
7106
7107  SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7108  /* If the list contains U+0000, that element is part of the header,
7109  * and should not be counted as part of the array.  It will contain
7110  * 0 in that case, and 1 otherwise.  So we could flop 0=>1, 1=>0 and
7111  * subtract:
7112  * SvCUR_set(invlist,
7113  *    TO_INTERNAL_SIZE(len
7114  *       - (*get_invlist_zero_addr(inv_list) ^ 1)));
7115  * But, this is only valid if len is not 0.  The consequences of not doing
7116  * this is that the memory allocation code may think that 1 more UV is
7117  * being used than actually is, and so might do an unnecessary grow.  That
7118  * seems worth not bothering to make this the precise amount.
7119  *
7120  * Note that when inverting, SvCUR shouldn't change */
7121 }
7122
7123 PERL_STATIC_INLINE IV*
7124 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7125 {
7126  /* Return the address of the UV that is reserved to hold the cached index
7127  * */
7128
7129  PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7130
7131  return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7132 }
7133
7134 PERL_STATIC_INLINE IV
7135 S_invlist_previous_index(pTHX_ SV* const invlist)
7136 {
7137  /* Returns cached index of previous search */
7138
7139  PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7140
7141  return *get_invlist_previous_index_addr(invlist);
7142 }
7143
7144 PERL_STATIC_INLINE void
7145 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7146 {
7147  /* Caches <index> for later retrieval */
7148
7149  PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7150
7151  assert(index == 0 || index < (int) _invlist_len(invlist));
7152
7153  *get_invlist_previous_index_addr(invlist) = index;
7154 }
7155
7156 PERL_STATIC_INLINE UV
7157 S_invlist_max(pTHX_ SV* const invlist)
7158 {
7159  /* Returns the maximum number of elements storable in the inversion list's
7160  * array, without having to realloc() */
7161
7162  PERL_ARGS_ASSERT_INVLIST_MAX;
7163
7164  return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
7165   ? _invlist_len(invlist)
7166   : FROM_INTERNAL_SIZE(SvLEN(invlist));
7167 }
7168
7169 PERL_STATIC_INLINE UV*
7170 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7171 {
7172  /* Return the address of the UV that is reserved to hold 0 if the inversion
7173  * list contains 0.  This has to be the last element of the heading, as the
7174  * list proper starts with either it if 0, or the next element if not.
7175  * (But we force it to contain either 0 or 1) */
7176
7177  PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7178
7179  return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7180 }
7181
7182 #ifndef PERL_IN_XSUB_RE
7183 SV*
7184 Perl__new_invlist(pTHX_ IV initial_size)
7185 {
7186
7187  /* Return a pointer to a newly constructed inversion list, with enough
7188  * space to store 'initial_size' elements.  If that number is negative, a
7189  * system default is used instead */
7190
7191  SV* new_list;
7192
7193  if (initial_size < 0) {
7194   initial_size = INVLIST_INITIAL_LEN;
7195  }
7196
7197  /* Allocate the initial space */
7198  new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7199  invlist_set_len(new_list, 0);
7200
7201  /* Force iterinit() to be used to get iteration to work */
7202  *get_invlist_iter_addr(new_list) = UV_MAX;
7203
7204  /* This should force a segfault if a method doesn't initialize this
7205  * properly */
7206  *get_invlist_zero_addr(new_list) = UV_MAX;
7207
7208  *get_invlist_previous_index_addr(new_list) = 0;
7209  *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7210 #if HEADER_LENGTH != 5
7211 #   error Need to regenerate INVLIST_VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7212 #endif
7213
7214  return new_list;
7215 }
7216 #endif
7217
7218 STATIC SV*
7219 S__new_invlist_C_array(pTHX_ UV* list)
7220 {
7221  /* Return a pointer to a newly constructed inversion list, initialized to
7222  * point to <list>, which has to be in the exact correct inversion list
7223  * form, including internal fields.  Thus this is a dangerous routine that
7224  * should not be used in the wrong hands */
7225
7226  SV* invlist = newSV_type(SVt_PV);
7227
7228  PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7229
7230  SvPV_set(invlist, (char *) list);
7231  SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
7232        shouldn't touch it */
7233  SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7234
7235  if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7236   Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7237  }
7238
7239  /* Initialize the iteration pointer.
7240  * XXX This could be done at compile time in charclass_invlists.h, but I
7241  * (khw) am not confident that the suffixes for specifying the C constant
7242  * UV_MAX are portable, e.g.  'ull' on a 32 bit machine that is configured
7243  * to use 64 bits; might need a Configure probe */
7244  invlist_iterfinish(invlist);
7245
7246  return invlist;
7247 }
7248
7249 STATIC void
7250 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7251 {
7252  /* Grow the maximum size of an inversion list */
7253
7254  PERL_ARGS_ASSERT_INVLIST_EXTEND;
7255
7256  SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7257 }
7258
7259 PERL_STATIC_INLINE void
7260 S_invlist_trim(pTHX_ SV* const invlist)
7261 {
7262  PERL_ARGS_ASSERT_INVLIST_TRIM;
7263
7264  /* Change the length of the inversion list to how many entries it currently
7265  * has */
7266
7267  SvPV_shrink_to_cur((SV *) invlist);
7268 }
7269
7270 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7271
7272 STATIC void
7273 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7274 {
7275    /* Subject to change or removal.  Append the range from 'start' to 'end' at
7276  * the end of the inversion list.  The range must be above any existing
7277  * ones. */
7278
7279  UV* array;
7280  UV max = invlist_max(invlist);
7281  UV len = _invlist_len(invlist);
7282
7283  PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7284
7285  if (len == 0) { /* Empty lists must be initialized */
7286   array = _invlist_array_init(invlist, start == 0);
7287  }
7288  else {
7289   /* Here, the existing list is non-empty. The current max entry in the
7290   * list is generally the first value not in the set, except when the
7291   * set extends to the end of permissible values, in which case it is
7292   * the first entry in that final set, and so this call is an attempt to
7293   * append out-of-order */
7294
7295   UV final_element = len - 1;
7296   array = invlist_array(invlist);
7297   if (array[final_element] > start
7298    || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7299   {
7300    Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7301      array[final_element], start,
7302      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7303   }
7304
7305   /* Here, it is a legal append.  If the new range begins with the first
7306   * value not in the set, it is extending the set, so the new first
7307   * value not in the set is one greater than the newly extended range.
7308   * */
7309   if (array[final_element] == start) {
7310    if (end != UV_MAX) {
7311     array[final_element] = end + 1;
7312    }
7313    else {
7314     /* But if the end is the maximum representable on the machine,
7315     * just let the range that this would extend to have no end */
7316     invlist_set_len(invlist, len - 1);
7317    }
7318    return;
7319   }
7320  }
7321
7322  /* Here the new range doesn't extend any existing set.  Add it */
7323
7324  len += 2; /* Includes an element each for the start and end of range */
7325
7326  /* If overflows the existing space, extend, which may cause the array to be
7327  * moved */
7328  if (max < len) {
7329   invlist_extend(invlist, len);
7330   invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7331           failure in invlist_array() */
7332   array = invlist_array(invlist);
7333  }
7334  else {
7335   invlist_set_len(invlist, len);
7336  }
7337
7338  /* The next item on the list starts the range, the one after that is
7339  * one past the new range.  */
7340  array[len - 2] = start;
7341  if (end != UV_MAX) {
7342   array[len - 1] = end + 1;
7343  }
7344  else {
7345   /* But if the end is the maximum representable on the machine, just let
7346   * the range have no end */
7347   invlist_set_len(invlist, len - 1);
7348  }
7349 }
7350
7351 #ifndef PERL_IN_XSUB_RE
7352
7353 IV
7354 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7355 {
7356  /* Searches the inversion list for the entry that contains the input code
7357  * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
7358  * return value is the index into the list's array of the range that
7359  * contains <cp> */
7360
7361  IV low = 0;
7362  IV mid;
7363  IV high = _invlist_len(invlist);
7364  const IV highest_element = high - 1;
7365  const UV* array;
7366
7367  PERL_ARGS_ASSERT__INVLIST_SEARCH;
7368
7369  /* If list is empty, return failure. */
7370  if (high == 0) {
7371   return -1;
7372  }
7373
7374  /* (We can't get the array unless we know the list is non-empty) */
7375  array = invlist_array(invlist);
7376
7377  mid = invlist_previous_index(invlist);
7378  assert(mid >=0 && mid <= highest_element);
7379
7380  /* <mid> contains the cache of the result of the previous call to this
7381  * function (0 the first time).  See if this call is for the same result,
7382  * or if it is for mid-1.  This is under the theory that calls to this
7383  * function will often be for related code points that are near each other.
7384  * And benchmarks show that caching gives better results.  We also test
7385  * here if the code point is within the bounds of the list.  These tests
7386  * replace others that would have had to be made anyway to make sure that
7387  * the array bounds were not exceeded, and these give us extra information
7388  * at the same time */
7389  if (cp >= array[mid]) {
7390   if (cp >= array[highest_element]) {
7391    return highest_element;
7392   }
7393
7394   /* Here, array[mid] <= cp < array[highest_element].  This means that
7395   * the final element is not the answer, so can exclude it; it also
7396   * means that <mid> is not the final element, so can refer to 'mid + 1'
7397   * safely */
7398   if (cp < array[mid + 1]) {
7399    return mid;
7400   }
7401   high--;
7402   low = mid + 1;
7403  }
7404  else { /* cp < aray[mid] */
7405   if (cp < array[0]) { /* Fail if outside the array */
7406    return -1;
7407   }
7408   high = mid;
7409   if (cp >= array[mid - 1]) {
7410    goto found_entry;
7411   }
7412  }
7413
7414  /* Binary search.  What we are looking for is <i> such that
7415  * array[i] <= cp < array[i+1]
7416  * The loop below converges on the i+1.  Note that there may not be an
7417  * (i+1)th element in the array, and things work nonetheless */
7418  while (low < high) {
7419   mid = (low + high) / 2;
7420   assert(mid <= highest_element);
7421   if (array[mid] <= cp) { /* cp >= array[mid] */
7422    low = mid + 1;
7423
7424    /* We could do this extra test to exit the loop early.
7425    if (cp < array[low]) {
7426     return mid;
7427    }
7428    */
7429   }
7430   else { /* cp < array[mid] */
7431    high = mid;
7432   }
7433  }
7434
7435   found_entry:
7436  high--;
7437  invlist_set_previous_index(invlist, high);
7438  return high;
7439 }
7440
7441 void
7442 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7443 {
7444  /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7445  * but is used when the swash has an inversion list.  This makes this much
7446  * faster, as it uses a binary search instead of a linear one.  This is
7447  * intimately tied to that function, and perhaps should be in utf8.c,
7448  * except it is intimately tied to inversion lists as well.  It assumes
7449  * that <swatch> is all 0's on input */
7450
7451  UV current = start;
7452  const IV len = _invlist_len(invlist);
7453  IV i;
7454  const UV * array;
7455
7456  PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7457
7458  if (len == 0) { /* Empty inversion list */
7459   return;
7460  }
7461
7462  array = invlist_array(invlist);
7463
7464  /* Find which element it is */
7465  i = _invlist_search(invlist, start);
7466
7467  /* We populate from <start> to <end> */
7468  while (current < end) {
7469   UV upper;
7470
7471   /* The inversion list gives the results for every possible code point
7472   * after the first one in the list.  Only those ranges whose index is
7473   * even are ones that the inversion list matches.  For the odd ones,
7474   * and if the initial code point is not in the list, we have to skip
7475   * forward to the next element */
7476   if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7477    i++;
7478    if (i >= len) { /* Finished if beyond the end of the array */
7479     return;
7480    }
7481    current = array[i];
7482    if (current >= end) {   /* Finished if beyond the end of what we
7483          are populating */
7484     if (LIKELY(end < UV_MAX)) {
7485      return;
7486     }
7487
7488     /* We get here when the upper bound is the maximum
7489     * representable on the machine, and we are looking for just
7490     * that code point.  Have to special case it */
7491     i = len;
7492     goto join_end_of_list;
7493    }
7494   }
7495   assert(current >= start);
7496
7497   /* The current range ends one below the next one, except don't go past
7498   * <end> */
7499   i++;
7500   upper = (i < len && array[i] < end) ? array[i] : end;
7501
7502   /* Here we are in a range that matches.  Populate a bit in the 3-bit U8
7503   * for each code point in it */
7504   for (; current < upper; current++) {
7505    const STRLEN offset = (STRLEN)(current - start);
7506    swatch[offset >> 3] |= 1 << (offset & 7);
7507   }
7508
7509  join_end_of_list:
7510
7511   /* Quit if at the end of the list */
7512   if (i >= len) {
7513
7514    /* But first, have to deal with the highest possible code point on
7515    * the platform.  The previous code assumes that <end> is one
7516    * beyond where we want to populate, but that is impossible at the
7517    * platform's infinity, so have to handle it specially */
7518    if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7519    {
7520     const STRLEN offset = (STRLEN)(end - start);
7521     swatch[offset >> 3] |= 1 << (offset & 7);
7522    }
7523    return;
7524   }
7525
7526   /* Advance to the next range, which will be for code points not in the
7527   * inversion list */
7528   current = array[i];
7529  }
7530
7531  return;
7532 }
7533
7534 void
7535 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7536 {
7537  /* Take the union of two inversion lists and point <output> to it.  *output
7538  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7539  * the reference count to that list will be decremented.  The first list,
7540  * <a>, may be NULL, in which case a copy of the second list is returned.
7541  * If <complement_b> is TRUE, the union is taken of the complement
7542  * (inversion) of <b> instead of b itself.
7543  *
7544  * The basis for this comes from "Unicode Demystified" Chapter 13 by
7545  * Richard Gillam, published by Addison-Wesley, and explained at some
7546  * length there.  The preface says to incorporate its examples into your
7547  * code at your own risk.
7548  *
7549  * The algorithm is like a merge sort.
7550  *
7551  * XXX A potential performance improvement is to keep track as we go along
7552  * if only one of the inputs contributes to the result, meaning the other
7553  * is a subset of that one.  In that case, we can skip the final copy and
7554  * return the larger of the input lists, but then outside code might need
7555  * to keep track of whether to free the input list or not */
7556
7557  UV* array_a;    /* a's array */
7558  UV* array_b;
7559  UV len_a;     /* length of a's array */
7560  UV len_b;
7561
7562  SV* u;   /* the resulting union */
7563  UV* array_u;
7564  UV len_u;
7565
7566  UV i_a = 0;      /* current index into a's array */
7567  UV i_b = 0;
7568  UV i_u = 0;
7569
7570  /* running count, as explained in the algorithm source book; items are
7571  * stopped accumulating and are output when the count changes to/from 0.
7572  * The count is incremented when we start a range that's in the set, and
7573  * decremented when we start a range that's not in the set.  So its range
7574  * is 0 to 2.  Only when the count is zero is something not in the set.
7575  */
7576  UV count = 0;
7577
7578  PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7579  assert(a != b);
7580
7581  /* If either one is empty, the union is the other one */
7582  if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7583   if (*output == a) {
7584    if (a != NULL) {
7585     SvREFCNT_dec_NN(a);
7586    }
7587   }
7588   if (*output != b) {
7589    *output = invlist_clone(b);
7590    if (complement_b) {
7591     _invlist_invert(*output);
7592    }
7593   } /* else *output already = b; */
7594   return;
7595  }
7596  else if ((len_b = _invlist_len(b)) == 0) {
7597   if (*output == b) {
7598    SvREFCNT_dec_NN(b);
7599   }
7600
7601   /* The complement of an empty list is a list that has everything in it,
7602   * so the union with <a> includes everything too */
7603   if (complement_b) {
7604    if (a == *output) {
7605     SvREFCNT_dec_NN(a);
7606    }
7607    *output = _new_invlist(1);
7608    _append_range_to_invlist(*output, 0, UV_MAX);
7609   }
7610   else if (*output != a) {
7611    *output = invlist_clone(a);
7612   }
7613   /* else *output already = a; */
7614   return;
7615  }
7616
7617  /* Here both lists exist and are non-empty */
7618  array_a = invlist_array(a);
7619  array_b = invlist_array(b);
7620
7621  /* If are to take the union of 'a' with the complement of b, set it
7622  * up so are looking at b's complement. */
7623  if (complement_b) {
7624
7625   /* To complement, we invert: if the first element is 0, remove it.  To
7626   * do this, we just pretend the array starts one later, and clear the
7627   * flag as we don't have to do anything else later */
7628   if (array_b[0] == 0) {
7629    array_b++;
7630    len_b--;
7631    complement_b = FALSE;
7632   }
7633   else {
7634
7635    /* But if the first element is not zero, we unshift a 0 before the
7636    * array.  The data structure reserves a space for that 0 (which
7637    * should be a '1' right now), so physical shifting is unneeded,
7638    * but temporarily change that element to 0.  Before exiting the
7639    * routine, we must restore the element to '1' */
7640    array_b--;
7641    len_b++;
7642    array_b[0] = 0;
7643   }
7644  }
7645
7646  /* Size the union for the worst case: that the sets are completely
7647  * disjoint */
7648  u = _new_invlist(len_a + len_b);
7649
7650  /* Will contain U+0000 if either component does */
7651  array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7652          || (len_b > 0 && array_b[0] == 0));
7653
7654  /* Go through each list item by item, stopping when exhausted one of
7655  * them */
7656  while (i_a < len_a && i_b < len_b) {
7657   UV cp;     /* The element to potentially add to the union's array */
7658   bool cp_in_set;   /* is it in the the input list's set or not */
7659
7660   /* We need to take one or the other of the two inputs for the union.
7661   * Since we are merging two sorted lists, we take the smaller of the
7662   * next items.  In case of a tie, we take the one that is in its set
7663   * first.  If we took one not in the set first, it would decrement the
7664   * count, possibly to 0 which would cause it to be output as ending the
7665   * range, and the next time through we would take the same number, and
7666   * output it again as beginning the next range.  By doing it the
7667   * opposite way, there is no possibility that the count will be
7668   * momentarily decremented to 0, and thus the two adjoining ranges will
7669   * be seamlessly merged.  (In a tie and both are in the set or both not
7670   * in the set, it doesn't matter which we take first.) */
7671   if (array_a[i_a] < array_b[i_b]
7672    || (array_a[i_a] == array_b[i_b]
7673     && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7674   {
7675    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7676    cp= array_a[i_a++];
7677   }
7678   else {
7679    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7680    cp = array_b[i_b++];
7681   }
7682
7683   /* Here, have chosen which of the two inputs to look at.  Only output
7684   * if the running count changes to/from 0, which marks the
7685   * beginning/end of a range in that's in the set */
7686   if (cp_in_set) {
7687    if (count == 0) {
7688     array_u[i_u++] = cp;
7689    }
7690    count++;
7691   }
7692   else {
7693    count--;
7694    if (count == 0) {
7695     array_u[i_u++] = cp;
7696    }
7697   }
7698  }
7699
7700  /* Here, we are finished going through at least one of the lists, which
7701  * means there is something remaining in at most one.  We check if the list
7702  * that hasn't been exhausted is positioned such that we are in the middle
7703  * of a range in its set or not.  (i_a and i_b point to the element beyond
7704  * the one we care about.) If in the set, we decrement 'count'; if 0, there
7705  * is potentially more to output.
7706  * There are four cases:
7707  * 1) Both weren't in their sets, count is 0, and remains 0.  What's left
7708  *    in the union is entirely from the non-exhausted set.
7709  * 2) Both were in their sets, count is 2.  Nothing further should
7710  *    be output, as everything that remains will be in the exhausted
7711  *    list's set, hence in the union; decrementing to 1 but not 0 insures
7712  *    that
7713  * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7714  *    Nothing further should be output because the union includes
7715  *    everything from the exhausted set.  Not decrementing ensures that.
7716  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7717  *    decrementing to 0 insures that we look at the remainder of the
7718  *    non-exhausted set */
7719  if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7720   || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7721  {
7722   count--;
7723  }
7724
7725  /* The final length is what we've output so far, plus what else is about to
7726  * be output.  (If 'count' is non-zero, then the input list we exhausted
7727  * has everything remaining up to the machine's limit in its set, and hence
7728  * in the union, so there will be no further output. */
7729  len_u = i_u;
7730  if (count == 0) {
7731   /* At most one of the subexpressions will be non-zero */
7732   len_u += (len_a - i_a) + (len_b - i_b);
7733  }
7734
7735  /* Set result to final length, which can change the pointer to array_u, so
7736  * re-find it */
7737  if (len_u != _invlist_len(u)) {
7738   invlist_set_len(u, len_u);
7739   invlist_trim(u);
7740   array_u = invlist_array(u);
7741  }
7742
7743  /* When 'count' is 0, the list that was exhausted (if one was shorter than
7744  * the other) ended with everything above it not in its set.  That means
7745  * that the remaining part of the union is precisely the same as the
7746  * non-exhausted list, so can just copy it unchanged.  (If both list were
7747  * exhausted at the same time, then the operations below will be both 0.)
7748  */
7749  if (count == 0) {
7750   IV copy_count; /* At most one will have a non-zero copy count */
7751   if ((copy_count = len_a - i_a) > 0) {
7752    Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7753   }
7754   else if ((copy_count = len_b - i_b) > 0) {
7755    Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7756   }
7757  }
7758
7759  /* If we've changed b, restore it */
7760  if (complement_b) {
7761   array_b[0] = 1;
7762  }
7763
7764  /*  We may be removing a reference to one of the inputs */
7765  if (a == *output || b == *output) {
7766   assert(! invlist_is_iterating(*output));
7767   SvREFCNT_dec_NN(*output);
7768  }
7769
7770  *output = u;
7771  return;
7772 }
7773
7774 void
7775 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7776 {
7777  /* Take the intersection of two inversion lists and point <i> to it.  *i
7778  * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7779  * the reference count to that list will be decremented.
7780  * If <complement_b> is TRUE, the result will be the intersection of <a>
7781  * and the complement (or inversion) of <b> instead of <b> directly.
7782  *
7783  * The basis for this comes from "Unicode Demystified" Chapter 13 by
7784  * Richard Gillam, published by Addison-Wesley, and explained at some
7785  * length there.  The preface says to incorporate its examples into your
7786  * code at your own risk.  In fact, it had bugs
7787  *
7788  * The algorithm is like a merge sort, and is essentially the same as the
7789  * union above
7790  */
7791
7792  UV* array_a;  /* a's array */
7793  UV* array_b;
7794  UV len_a; /* length of a's array */
7795  UV len_b;
7796
7797  SV* r;       /* the resulting intersection */
7798  UV* array_r;
7799  UV len_r;
7800
7801  UV i_a = 0;      /* current index into a's array */
7802  UV i_b = 0;
7803  UV i_r = 0;
7804
7805  /* running count, as explained in the algorithm source book; items are
7806  * stopped accumulating and are output when the count changes to/from 2.
7807  * The count is incremented when we start a range that's in the set, and
7808  * decremented when we start a range that's not in the set.  So its range
7809  * is 0 to 2.  Only when the count is 2 is something in the intersection.
7810  */
7811  UV count = 0;
7812
7813  PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7814  assert(a != b);
7815
7816  /* Special case if either one is empty */
7817  len_a = _invlist_len(a);
7818  if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7819
7820   if (len_a != 0 && complement_b) {
7821
7822    /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7823    * be empty.  Here, also we are using 'b's complement, which hence
7824    * must be every possible code point.  Thus the intersection is
7825    * simply 'a'. */
7826    if (*i != a) {
7827     *i = invlist_clone(a);
7828
7829     if (*i == b) {
7830      SvREFCNT_dec_NN(b);
7831     }
7832    }
7833    /* else *i is already 'a' */
7834    return;
7835   }
7836
7837   /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
7838   * intersection must be empty */
7839   if (*i == a) {
7840    SvREFCNT_dec_NN(a);
7841   }
7842   else if (*i == b) {
7843    SvREFCNT_dec_NN(b);
7844   }
7845   *i = _new_invlist(0);
7846   return;
7847  }
7848
7849  /* Here both lists exist and are non-empty */
7850  array_a = invlist_array(a);
7851  array_b = invlist_array(b);
7852
7853  /* If are to take the intersection of 'a' with the complement of b, set it
7854  * up so are looking at b's complement. */
7855  if (complement_b) {
7856
7857   /* To complement, we invert: if the first element is 0, remove it.  To
7858   * do this, we just pretend the array starts one later, and clear the
7859   * flag as we don't have to do anything else later */
7860   if (array_b[0] == 0) {
7861    array_b++;
7862    len_b--;
7863    complement_b = FALSE;
7864   }
7865   else {
7866
7867    /* But if the first element is not zero, we unshift a 0 before the
7868    * array.  The data structure reserves a space for that 0 (which
7869    * should be a '1' right now), so physical shifting is unneeded,
7870    * but temporarily change that element to 0.  Before exiting the
7871    * routine, we must restore the element to '1' */
7872    array_b--;
7873    len_b++;
7874    array_b[0] = 0;
7875   }
7876  }
7877
7878  /* Size the intersection for the worst case: that the intersection ends up
7879  * fragmenting everything to be completely disjoint */
7880  r= _new_invlist(len_a + len_b);
7881
7882  /* Will contain U+0000 iff both components do */
7883  array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7884          && len_b > 0 && array_b[0] == 0);
7885
7886  /* Go through each list item by item, stopping when exhausted one of
7887  * them */
7888  while (i_a < len_a && i_b < len_b) {
7889   UV cp;     /* The element to potentially add to the intersection's
7890      array */
7891   bool cp_in_set; /* Is it in the input list's set or not */
7892
7893   /* We need to take one or the other of the two inputs for the
7894   * intersection.  Since we are merging two sorted lists, we take the
7895   * smaller of the next items.  In case of a tie, we take the one that
7896   * is not in its set first (a difference from the union algorithm).  If
7897   * we took one in the set first, it would increment the count, possibly
7898   * to 2 which would cause it to be output as starting a range in the
7899   * intersection, and the next time through we would take that same
7900   * number, and output it again as ending the set.  By doing it the
7901   * opposite of this, there is no possibility that the count will be
7902   * momentarily incremented to 2.  (In a tie and both are in the set or
7903   * both not in the set, it doesn't matter which we take first.) */
7904   if (array_a[i_a] < array_b[i_b]
7905    || (array_a[i_a] == array_b[i_b]
7906     && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7907   {
7908    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7909    cp= array_a[i_a++];
7910   }
7911   else {
7912    cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7913    cp= array_b[i_b++];
7914   }
7915
7916   /* Here, have chosen which of the two inputs to look at.  Only output
7917   * if the running count changes to/from 2, which marks the
7918   * beginning/end of a range that's in the intersection */
7919   if (cp_in_set) {
7920    count++;
7921    if (count == 2) {
7922     array_r[i_r++] = cp;
7923    }
7924   }
7925   else {
7926    if (count == 2) {
7927     array_r[i_r++] = cp;
7928    }
7929    count--;
7930   }
7931  }
7932
7933  /* Here, we are finished going through at least one of the lists, which
7934  * means there is something remaining in at most one.  We check if the list
7935  * that has been exhausted is positioned such that we are in the middle
7936  * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
7937  * the ones we care about.)  There are four cases:
7938  * 1) Both weren't in their sets, count is 0, and remains 0.  There's
7939  *    nothing left in the intersection.
7940  * 2) Both were in their sets, count is 2 and perhaps is incremented to
7941  *    above 2.  What should be output is exactly that which is in the
7942  *    non-exhausted set, as everything it has is also in the intersection
7943  *    set, and everything it doesn't have can't be in the intersection
7944  * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7945  *    gets incremented to 2.  Like the previous case, the intersection is
7946  *    everything that remains in the non-exhausted set.
7947  * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7948  *    remains 1.  And the intersection has nothing more. */
7949  if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7950   || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7951  {
7952   count++;
7953  }
7954
7955  /* The final length is what we've output so far plus what else is in the
7956  * intersection.  At most one of the subexpressions below will be non-zero */
7957  len_r = i_r;
7958  if (count >= 2) {
7959   len_r += (len_a - i_a) + (len_b - i_b);
7960  }
7961
7962  /* Set result to final length, which can change the pointer to array_r, so
7963  * re-find it */
7964  if (len_r != _invlist_len(r)) {
7965   invlist_set_len(r, len_r);
7966   invlist_trim(r);
7967   array_r = invlist_array(r);
7968  }
7969
7970  /* Finish outputting any remaining */
7971  if (count >= 2) { /* At most one will have a non-zero copy count */
7972   IV copy_count;
7973   if ((copy_count = len_a - i_a) > 0) {
7974    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7975   }
7976   else if ((copy_count = len_b - i_b) > 0) {
7977    Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7978   }
7979  }
7980
7981  /* If we've changed b, restore it */
7982  if (complement_b) {
7983   array_b[0] = 1;
7984  }
7985
7986  /*  We may be removing a reference to one of the inputs */
7987  if (a == *i || b == *i) {
7988   assert(! invlist_is_iterating(*i));
7989   SvREFCNT_dec_NN(*i);
7990  }
7991
7992  *i = r;
7993  return;
7994 }
7995
7996 SV*
7997 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7998 {
7999  /* Add the range from 'start' to 'end' inclusive to the inversion list's
8000  * set.  A pointer to the inversion list is returned.  This may actually be
8001  * a new list, in which case the passed in one has been destroyed.  The
8002  * passed in inversion list can be NULL, in which case a new one is created
8003  * with just the one range in it */
8004
8005  SV* range_invlist;
8006  UV len;
8007
8008  if (invlist == NULL) {
8009   invlist = _new_invlist(2);
8010   len = 0;
8011  }
8012  else {
8013   len = _invlist_len(invlist);
8014  }
8015
8016  /* If comes after the final entry actually in the list, can just append it
8017  * to the end, */
8018  if (len == 0
8019   || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8020    && start >= invlist_array(invlist)[len - 1]))
8021  {
8022   _append_range_to_invlist(invlist, start, end);
8023   return invlist;
8024  }
8025
8026  /* Here, can't just append things, create and return a new inversion list
8027  * which is the union of this range and the existing inversion list */
8028  range_invlist = _new_invlist(2);
8029  _append_range_to_invlist(range_invlist, start, end);
8030
8031  _invlist_union(invlist, range_invlist, &invlist);
8032
8033  /* The temporary can be freed */
8034  SvREFCNT_dec_NN(range_invlist);
8035
8036  return invlist;
8037 }
8038
8039 #endif
8040
8041 PERL_STATIC_INLINE SV*
8042 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8043  return _add_range_to_invlist(invlist, cp, cp);
8044 }
8045
8046 #ifndef PERL_IN_XSUB_RE
8047 void
8048 Perl__invlist_invert(pTHX_ SV* const invlist)
8049 {
8050  /* Complement the input inversion list.  This adds a 0 if the list didn't
8051  * have a zero; removes it otherwise.  As described above, the data
8052  * structure is set up so that this is very efficient */
8053
8054  UV* len_pos = _get_invlist_len_addr(invlist);
8055
8056  PERL_ARGS_ASSERT__INVLIST_INVERT;
8057
8058  assert(! invlist_is_iterating(invlist));
8059
8060  /* The inverse of matching nothing is matching everything */
8061  if (*len_pos == 0) {
8062   _append_range_to_invlist(invlist, 0, UV_MAX);
8063   return;
8064  }
8065
8066  /* The exclusive or complents 0 to 1; and 1 to 0.  If the result is 1, the
8067  * zero element was a 0, so it is being removed, so the length decrements
8068  * by 1; and vice-versa.  SvCUR is unaffected */
8069  if (*get_invlist_zero_addr(invlist) ^= 1) {
8070   (*len_pos)--;
8071  }
8072  else {
8073   (*len_pos)++;
8074  }
8075 }
8076
8077 void
8078 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8079 {
8080  /* Complement the input inversion list (which must be a Unicode property,
8081  * all of which don't match above the Unicode maximum code point.)  And
8082  * Perl has chosen to not have the inversion match above that either.  This
8083  * adds a 0x110000 if the list didn't end with it, and removes it if it did
8084  */
8085
8086  UV len;
8087  UV* array;
8088
8089  PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8090
8091  _invlist_invert(invlist);
8092
8093  len = _invlist_len(invlist);
8094
8095  if (len != 0) { /* If empty do nothing */
8096   array = invlist_array(invlist);
8097   if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8098    /* Add 0x110000.  First, grow if necessary */
8099    len++;
8100    if (invlist_max(invlist) < len) {
8101     invlist_extend(invlist, len);
8102     array = invlist_array(invlist);
8103    }
8104    invlist_set_len(invlist, len);
8105    array[len - 1] = PERL_UNICODE_MAX + 1;
8106   }
8107   else {  /* Remove the 0x110000 */
8108    invlist_set_len(invlist, len - 1);
8109   }
8110  }
8111
8112  return;
8113 }
8114 #endif
8115
8116 PERL_STATIC_INLINE SV*
8117 S_invlist_clone(pTHX_ SV* const invlist)
8118 {
8119
8120  /* Return a new inversion list that is a copy of the input one, which is
8121  * unchanged */
8122
8123  /* Need to allocate extra space to accommodate Perl's addition of a
8124  * trailing NUL to SvPV's, since it thinks they are always strings */
8125  SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8126  STRLEN length = SvCUR(invlist);
8127
8128  PERL_ARGS_ASSERT_INVLIST_CLONE;
8129
8130  SvCUR_set(new_invlist, length); /* This isn't done automatically */
8131  Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8132
8133  return new_invlist;
8134 }
8135
8136 PERL_STATIC_INLINE UV*
8137 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8138 {
8139  /* Return the address of the UV that contains the current iteration
8140  * position */
8141
8142  PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8143
8144  return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8145 }
8146
8147 PERL_STATIC_INLINE UV*
8148 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8149 {
8150  /* Return the address of the UV that contains the version id. */
8151
8152  PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8153
8154  return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8155 }
8156
8157 PERL_STATIC_INLINE void
8158 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8159 {
8160  PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8161
8162  *get_invlist_iter_addr(invlist) = 0;
8163 }
8164
8165 PERL_STATIC_INLINE void
8166 S_invlist_iterfinish(pTHX_ SV* invlist)
8167 {
8168  /* Terminate iterator for invlist.  This is to catch development errors.
8169  * Any iteration that is interrupted before completed should call this
8170  * function.  Functions that add code points anywhere else but to the end
8171  * of an inversion list assert that they are not in the middle of an
8172  * iteration.  If they were, the addition would make the iteration
8173  * problematical: if the iteration hadn't reached the place where things
8174  * were being added, it would be ok */
8175
8176  PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8177
8178  *get_invlist_iter_addr(invlist) = UV_MAX;
8179 }
8180
8181 STATIC bool
8182 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8183 {
8184  /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8185  * This call sets in <*start> and <*end>, the next range in <invlist>.
8186  * Returns <TRUE> if successful and the next call will return the next
8187  * range; <FALSE> if was already at the end of the list.  If the latter,
8188  * <*start> and <*end> are unchanged, and the next call to this function
8189  * will start over at the beginning of the list */
8190
8191  UV* pos = get_invlist_iter_addr(invlist);
8192  UV len = _invlist_len(invlist);
8193  UV *array;
8194
8195  PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8196
8197  if (*pos >= len) {
8198   *pos = UV_MAX; /* Force iterinit() to be required next time */
8199   return FALSE;
8200  }
8201
8202  array = invlist_array(invlist);
8203
8204  *start = array[(*pos)++];
8205
8206  if (*pos >= len) {
8207   *end = UV_MAX;
8208  }
8209  else {
8210   *end = array[(*pos)++] - 1;
8211  }
8212
8213  return TRUE;
8214 }
8215
8216 PERL_STATIC_INLINE bool
8217 S_invlist_is_iterating(pTHX_ SV* const invlist)
8218 {
8219  PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8220
8221  return *(get_invlist_iter_addr(invlist)) < UV_MAX;
8222 }
8223
8224 PERL_STATIC_INLINE UV
8225 S_invlist_highest(pTHX_ SV* const invlist)
8226 {
8227  /* Returns the highest code point that matches an inversion list.  This API
8228  * has an ambiguity, as it returns 0 under either the highest is actually
8229  * 0, or if the list is empty.  If this distinction matters to you, check
8230  * for emptiness before calling this function */
8231
8232  UV len = _invlist_len(invlist);
8233  UV *array;
8234
8235  PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8236
8237  if (len == 0) {
8238   return 0;
8239  }
8240
8241  array = invlist_array(invlist);
8242
8243  /* The last element in the array in the inversion list always starts a
8244  * range that goes to infinity.  That range may be for code points that are
8245  * matched in the inversion list, or it may be for ones that aren't
8246  * matched.  In the latter case, the highest code point in the set is one
8247  * less than the beginning of this range; otherwise it is the final element
8248  * of this range: infinity */
8249  return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8250   ? UV_MAX
8251   : array[len - 1] - 1;
8252 }
8253
8254 #ifndef PERL_IN_XSUB_RE
8255 SV *
8256 Perl__invlist_contents(pTHX_ SV* const invlist)
8257 {
8258  /* Get the contents of an inversion list into a string SV so that they can
8259  * be printed out.  It uses the format traditionally done for debug tracing
8260  */
8261
8262  UV start, end;
8263  SV* output = newSVpvs("\n");
8264
8265  PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8266
8267  assert(! invlist_is_iterating(invlist));
8268
8269  invlist_iterinit(invlist);
8270  while (invlist_iternext(invlist, &start, &end)) {
8271   if (end == UV_MAX) {
8272    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8273   }
8274   else if (end != start) {
8275    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8276      start,       end);
8277   }
8278   else {
8279    Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8280   }
8281  }
8282
8283  return output;
8284 }
8285 #endif
8286
8287 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8288 void
8289 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8290 {
8291  /* Dumps out the ranges in an inversion list.  The string 'header'
8292  * if present is output on a line before the first range */
8293
8294  UV start, end;
8295
8296  PERL_ARGS_ASSERT__INVLIST_DUMP;
8297
8298  if (header && strlen(header)) {
8299   PerlIO_printf(Perl_debug_log, "%s\n", header);
8300  }
8301  if (invlist_is_iterating(invlist)) {
8302   PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
8303   return;
8304  }
8305
8306  invlist_iterinit(invlist);
8307  while (invlist_iternext(invlist, &start, &end)) {
8308   if (end == UV_MAX) {
8309    PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8310   }
8311   else if (end != start) {
8312    PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8313             start,         end);
8314   }
8315   else {
8316    PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8317   }
8318  }
8319 }
8320 #endif
8321
8322 #if 0
8323 bool
8324 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8325 {
8326  /* Return a boolean as to if the two passed in inversion lists are
8327  * identical.  The final argument, if TRUE, says to take the complement of
8328  * the second inversion list before doing the comparison */
8329
8330  UV* array_a = invlist_array(a);
8331  UV* array_b = invlist_array(b);
8332  UV len_a = _invlist_len(a);
8333  UV len_b = _invlist_len(b);
8334
8335  UV i = 0;      /* current index into the arrays */
8336  bool retval = TRUE;     /* Assume are identical until proven otherwise */
8337
8338  PERL_ARGS_ASSERT__INVLISTEQ;
8339
8340  /* If are to compare 'a' with the complement of b, set it
8341  * up so are looking at b's complement. */
8342  if (complement_b) {
8343
8344   /* The complement of nothing is everything, so <a> would have to have
8345   * just one element, starting at zero (ending at infinity) */
8346   if (len_b == 0) {
8347    return (len_a == 1 && array_a[0] == 0);
8348   }
8349   else if (array_b[0] == 0) {
8350
8351    /* Otherwise, to complement, we invert.  Here, the first element is
8352    * 0, just remove it.  To do this, we just pretend the array starts
8353    * one later, and clear the flag as we don't have to do anything
8354    * else later */
8355
8356    array_b++;
8357    len_b--;
8358    complement_b = FALSE;
8359   }
8360   else {
8361
8362    /* But if the first element is not zero, we unshift a 0 before the
8363    * array.  The data structure reserves a space for that 0 (which
8364    * should be a '1' right now), so physical shifting is unneeded,
8365    * but temporarily change that element to 0.  Before exiting the
8366    * routine, we must restore the element to '1' */
8367    array_b--;
8368    len_b++;
8369    array_b[0] = 0;
8370   }
8371  }
8372
8373  /* Make sure that the lengths are the same, as well as the final element
8374  * before looping through the remainder.  (Thus we test the length, final,
8375  * and first elements right off the bat) */
8376  if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8377   retval = FALSE;
8378  }
8379  else for (i = 0; i < len_a - 1; i++) {
8380   if (array_a[i] != array_b[i]) {
8381    retval = FALSE;
8382    break;
8383   }
8384  }
8385
8386  if (complement_b) {
8387   array_b[0] = 1;
8388  }
8389  return retval;
8390 }
8391 #endif
8392
8393 #undef HEADER_LENGTH
8394 #undef INVLIST_INITIAL_LENGTH
8395 #undef TO_INTERNAL_SIZE
8396 #undef FROM_INTERNAL_SIZE
8397 #undef INVLIST_LEN_OFFSET
8398 #undef INVLIST_ZERO_OFFSET
8399 #undef INVLIST_ITER_OFFSET
8400 #undef INVLIST_VERSION_ID
8401 #undef INVLIST_PREVIOUS_INDEX_OFFSET
8402
8403 /* End of inversion list object */
8404
8405 STATIC void
8406 S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
8407 {
8408  /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8409  * constructs, and updates RExC_flags with them.  On input, RExC_parse
8410  * should point to the first flag; it is updated on output to point to the
8411  * final ')' or ':'.  There needs to be at least one flag, or this will
8412  * abort */
8413
8414  /* for (?g), (?gc), and (?o) warnings; warning
8415  about (?c) will warn about (?g) -- japhy    */
8416
8417 #define WASTED_O  0x01
8418 #define WASTED_G  0x02
8419 #define WASTED_C  0x04
8420 #define WASTED_GC (0x02|0x04)
8421  I32 wastedflags = 0x00;
8422  U32 posflags = 0, negflags = 0;
8423  U32 *flagsp = &posflags;
8424  char has_charset_modifier = '\0';
8425  regex_charset cs;
8426  bool has_use_defaults = FALSE;
8427  const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8428
8429  PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8430
8431  /* '^' as an initial flag sets certain defaults */
8432  if (UCHARAT(RExC_parse) == '^') {
8433   RExC_parse++;
8434   has_use_defaults = TRUE;
8435   STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8436   set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8437           ? REGEX_UNICODE_CHARSET
8438           : REGEX_DEPENDS_CHARSET);
8439  }
8440
8441  cs = get_regex_charset(RExC_flags);
8442  if (cs == REGEX_DEPENDS_CHARSET
8443   && (RExC_utf8 || RExC_uni_semantics))
8444  {
8445   cs = REGEX_UNICODE_CHARSET;
8446  }
8447
8448  while (*RExC_parse) {
8449   /* && strchr("iogcmsx", *RExC_parse) */
8450   /* (?g), (?gc) and (?o) are useless here
8451   and must be globally applied -- japhy */
8452   switch (*RExC_parse) {
8453
8454    /* Code for the imsx flags */
8455    CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8456
8457    case LOCALE_PAT_MOD:
8458     if (has_charset_modifier) {
8459      goto excess_modifier;
8460     }
8461     else if (flagsp == &negflags) {
8462      goto neg_modifier;
8463     }
8464     cs = REGEX_LOCALE_CHARSET;
8465     has_charset_modifier = LOCALE_PAT_MOD;
8466     RExC_contains_locale = 1;
8467     break;
8468    case UNICODE_PAT_MOD:
8469     if (has_charset_modifier) {
8470      goto excess_modifier;
8471     }
8472     else if (flagsp == &negflags) {
8473      goto neg_modifier;
8474     }
8475     cs = REGEX_UNICODE_CHARSET;
8476     has_charset_modifier = UNICODE_PAT_MOD;
8477     break;
8478    case ASCII_RESTRICT_PAT_MOD:
8479     if (flagsp == &negflags) {
8480      goto neg_modifier;
8481     }
8482     if (has_charset_modifier) {
8483      if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8484       goto excess_modifier;
8485      }
8486      /* Doubled modifier implies more restricted */
8487      cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8488     }
8489     else {
8490      cs = REGEX_ASCII_RESTRICTED_CHARSET;
8491     }
8492     has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8493     break;
8494    case DEPENDS_PAT_MOD:
8495     if (has_use_defaults) {
8496      goto fail_modifiers;
8497     }
8498     else if (flagsp == &negflags) {
8499      goto neg_modifier;
8500     }
8501     else if (has_charset_modifier) {
8502      goto excess_modifier;
8503     }
8504
8505     /* The dual charset means unicode semantics if the
8506     * pattern (or target, not known until runtime) are
8507     * utf8, or something in the pattern indicates unicode
8508     * semantics */
8509     cs = (RExC_utf8 || RExC_uni_semantics)
8510      ? REGEX_UNICODE_CHARSET
8511      : REGEX_DEPENDS_CHARSET;
8512     has_charset_modifier = DEPENDS_PAT_MOD;
8513     break;
8514    excess_modifier:
8515     RExC_parse++;
8516     if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8517      vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8518     }
8519     else if (has_charset_modifier == *(RExC_parse - 1)) {
8520      vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8521     }
8522     else {
8523      vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8524     }
8525     /*NOTREACHED*/
8526    neg_modifier:
8527     RExC_parse++;
8528     vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8529     /*NOTREACHED*/
8530    case ONCE_PAT_MOD: /* 'o' */
8531    case GLOBAL_PAT_MOD: /* 'g' */
8532     if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8533      const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8534      if (! (wastedflags & wflagbit) ) {
8535       wastedflags |= wflagbit;
8536       vWARN5(
8537        RExC_parse + 1,
8538        "Useless (%s%c) - %suse /%c modifier",
8539        flagsp == &negflags ? "?-" : "?",
8540        *RExC_parse,
8541        flagsp == &negflags ? "don't " : "",
8542        *RExC_parse
8543       );
8544      }
8545     }
8546     break;
8547
8548    case CONTINUE_PAT_MOD: /* 'c' */
8549     if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8550      if (! (wastedflags & WASTED_C) ) {
8551       wastedflags |= WASTED_GC;
8552       vWARN3(
8553        RExC_parse + 1,
8554        "Useless (%sc) - %suse /gc modifier",
8555        flagsp == &negflags ? "?-" : "?",
8556        flagsp == &negflags ? "don't " : ""
8557       );
8558      }
8559     }
8560     break;
8561    case KEEPCOPY_PAT_MOD: /* 'p' */
8562     if (flagsp == &negflags) {
8563      if (SIZE_ONLY)
8564       ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8565     } else {
8566      *flagsp |= RXf_PMf_KEEPCOPY;
8567     }
8568     break;
8569    case '-':
8570     /* A flag is a default iff it is following a minus, so
8571     * if there is a minus, it means will be trying to
8572     * re-specify a default which is an error */
8573     if (has_use_defaults || flagsp == &negflags) {
8574      goto fail_modifiers;
8575     }
8576     flagsp = &negflags;
8577     wastedflags = 0;  /* reset so (?g-c) warns twice */
8578     break;
8579    case ':':
8580    case ')':
8581     RExC_flags |= posflags;
8582     RExC_flags &= ~negflags;
8583     set_regex_charset(&RExC_flags, cs);
8584     return;
8585     /*NOTREACHED*/
8586    default:
8587    fail_modifiers:
8588     RExC_parse++;
8589     vFAIL3("Sequence (%.*s...) not recognized",
8590      RExC_parse-seqstart, seqstart);
8591     /*NOTREACHED*/
8592   }
8593
8594   ++RExC_parse;
8595  }
8596 }
8597
8598 /*
8599  - reg - regular expression, i.e. main body or parenthesized thing
8600  *
8601  * Caller must absorb opening parenthesis.
8602  *
8603  * Combining parenthesis handling with the base level of regular expression
8604  * is a trifle forced, but the need to tie the tails of the branches to what
8605  * follows makes it hard to avoid.
8606  */
8607 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8608 #ifdef DEBUGGING
8609 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8610 #else
8611 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8612 #endif
8613
8614 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
8615    flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
8616    needs to be restarted.
8617    Otherwise would only return NULL if regbranch() returns NULL, which
8618    cannot happen.  */
8619 STATIC regnode *
8620 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8621  /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
8622  * 2 is like 1, but indicates that nextchar() has been called to advance
8623  * RExC_parse beyond the '('.  Things like '(?' are indivisible tokens, and
8624  * this flag alerts us to the need to check for that */
8625 {
8626  dVAR;
8627  regnode *ret;  /* Will be the head of the group. */
8628  regnode *br;
8629  regnode *lastbr;
8630  regnode *ender = NULL;
8631  I32 parno = 0;
8632  I32 flags;
8633  U32 oregflags = RExC_flags;
8634  bool have_branch = 0;
8635  bool is_open = 0;
8636  I32 freeze_paren = 0;
8637  I32 after_freeze = 0;
8638
8639  char * parse_start = RExC_parse; /* MJD */
8640  char * const oregcomp_parse = RExC_parse;
8641
8642  GET_RE_DEBUG_FLAGS_DECL;
8643
8644  PERL_ARGS_ASSERT_REG;
8645  DEBUG_PARSE("reg ");
8646
8647  *flagp = 0;    /* Tentatively. */
8648
8649
8650  /* Make an OPEN node, if parenthesized. */
8651  if (paren) {
8652
8653   /* Under /x, space and comments can be gobbled up between the '(' and
8654   * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
8655   * intervening space, as the sequence is a token, and a token should be
8656   * indivisible */
8657   bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
8658
8659   if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8660    char *start_verb = RExC_parse;
8661    STRLEN verb_len = 0;
8662    char *start_arg = NULL;
8663    unsigned char op = 0;
8664    int argok = 1;
8665    int internal_argval = 0; /* internal_argval is only useful if !argok */
8666
8667    if (has_intervening_patws && SIZE_ONLY) {
8668     ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
8669    }
8670    while ( *RExC_parse && *RExC_parse != ')' ) {
8671     if ( *RExC_parse == ':' ) {
8672      start_arg = RExC_parse + 1;
8673      break;
8674     }
8675     RExC_parse++;
8676    }
8677    ++start_verb;
8678    verb_len = RExC_parse - start_verb;
8679    if ( start_arg ) {
8680     RExC_parse++;
8681     while ( *RExC_parse && *RExC_parse != ')' )
8682      RExC_parse++;
8683     if ( *RExC_parse != ')' )
8684      vFAIL("Unterminated verb pattern argument");
8685     if ( RExC_parse == start_arg )
8686      start_arg = NULL;
8687    } else {
8688     if ( *RExC_parse != ')' )
8689      vFAIL("Unterminated verb pattern");
8690    }
8691
8692    switch ( *start_verb ) {
8693    case 'A':  /* (*ACCEPT) */
8694     if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8695      op = ACCEPT;
8696      internal_argval = RExC_nestroot;
8697     }
8698     break;
8699    case 'C':  /* (*COMMIT) */
8700     if ( memEQs(start_verb,verb_len,"COMMIT") )
8701      op = COMMIT;
8702     break;
8703    case 'F':  /* (*FAIL) */
8704     if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8705      op = OPFAIL;
8706      argok = 0;
8707     }
8708     break;
8709    case ':':  /* (*:NAME) */
8710    case 'M':  /* (*MARK:NAME) */
8711     if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8712      op = MARKPOINT;
8713      argok = -1;
8714     }
8715     break;
8716    case 'P':  /* (*PRUNE) */
8717     if ( memEQs(start_verb,verb_len,"PRUNE") )
8718      op = PRUNE;
8719     break;
8720    case 'S':   /* (*SKIP) */
8721     if ( memEQs(start_verb,verb_len,"SKIP") )
8722      op = SKIP;
8723     break;
8724    case 'T':  /* (*THEN) */
8725     /* [19:06] <TimToady> :: is then */
8726     if ( memEQs(start_verb,verb_len,"THEN") ) {
8727      op = CUTGROUP;
8728      RExC_seen |= REG_SEEN_CUTGROUP;
8729     }
8730     break;
8731    }
8732    if ( ! op ) {
8733     RExC_parse++;
8734     vFAIL3("Unknown verb pattern '%.*s'",
8735      verb_len, start_verb);
8736    }
8737    if ( argok ) {
8738     if ( start_arg && internal_argval ) {
8739      vFAIL3("Verb pattern '%.*s' may not have an argument",
8740       verb_len, start_verb);
8741     } else if ( argok < 0 && !start_arg ) {
8742      vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8743       verb_len, start_verb);
8744     } else {
8745      ret = reganode(pRExC_state, op, internal_argval);
8746      if ( ! internal_argval && ! SIZE_ONLY ) {
8747       if (start_arg) {
8748        SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8749        ARG(ret) = add_data( pRExC_state, 1, "S" );
8750        RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8751        ret->flags = 0;
8752       } else {
8753        ret->flags = 1;
8754       }
8755      }
8756     }
8757     if (!internal_argval)
8758      RExC_seen |= REG_SEEN_VERBARG;
8759    } else if ( start_arg ) {
8760     vFAIL3("Verb pattern '%.*s' may not have an argument",
8761       verb_len, start_verb);
8762    } else {
8763     ret = reg_node(pRExC_state, op);
8764    }
8765    nextchar(pRExC_state);
8766    return ret;
8767   } else
8768   if (*RExC_parse == '?') { /* (?...) */
8769    bool is_logical = 0;
8770    const char * const seqstart = RExC_parse;
8771    if (has_intervening_patws && SIZE_ONLY) {
8772     ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
8773    }
8774
8775    RExC_parse++;
8776    paren = *RExC_parse++;
8777    ret = NULL;   /* For look-ahead/behind. */
8778    switch (paren) {
8779
8780    case 'P': /* (?P...) variants for those used to PCRE/Python */
8781     paren = *RExC_parse++;
8782     if ( paren == '<')         /* (?P<...>) named capture */
8783      goto named_capture;
8784     else if (paren == '>') {   /* (?P>name) named recursion */
8785      goto named_recursion;
8786     }
8787     else if (paren == '=') {   /* (?P=...)  named backref */
8788      /* this pretty much dupes the code for \k<NAME> in regatom(), if
8789      you change this make sure you change that */
8790      char* name_start = RExC_parse;
8791      U32 num = 0;
8792      SV *sv_dat = reg_scan_name(pRExC_state,
8793       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8794      if (RExC_parse == name_start || *RExC_parse != ')')
8795       vFAIL2("Sequence %.3s... not terminated",parse_start);
8796
8797      if (!SIZE_ONLY) {
8798       num = add_data( pRExC_state, 1, "S" );
8799       RExC_rxi->data->data[num]=(void*)sv_dat;
8800       SvREFCNT_inc_simple_void(sv_dat);
8801      }
8802      RExC_sawback = 1;
8803      ret = reganode(pRExC_state,
8804         ((! FOLD)
8805          ? NREF
8806          : (ASCII_FOLD_RESTRICTED)
8807          ? NREFFA
8808          : (AT_LEAST_UNI_SEMANTICS)
8809           ? NREFFU
8810           : (LOC)
8811           ? NREFFL
8812           : NREFF),
8813          num);
8814      *flagp |= HASWIDTH;
8815
8816      Set_Node_Offset(ret, parse_start+1);
8817      Set_Node_Cur_Length(ret); /* MJD */
8818
8819      nextchar(pRExC_state);
8820      return ret;
8821     }
8822     RExC_parse++;
8823     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8824     /*NOTREACHED*/
8825    case '<':           /* (?<...) */
8826     if (*RExC_parse == '!')
8827      paren = ',';
8828     else if (*RExC_parse != '=')
8829    named_capture:
8830     {               /* (?<...>) */
8831      char *name_start;
8832      SV *svname;
8833      paren= '>';
8834    case '\'':          /* (?'...') */
8835       name_start= RExC_parse;
8836       svname = reg_scan_name(pRExC_state,
8837        SIZE_ONLY ?  /* reverse test from the others */
8838        REG_RSN_RETURN_NAME :
8839        REG_RSN_RETURN_NULL);
8840      if (RExC_parse == name_start) {
8841       RExC_parse++;
8842       vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8843       /*NOTREACHED*/
8844      }
8845      if (*RExC_parse != paren)
8846       vFAIL2("Sequence (?%c... not terminated",
8847        paren=='>' ? '<' : paren);
8848      if (SIZE_ONLY) {
8849       HE *he_str;
8850       SV *sv_dat = NULL;
8851       if (!svname) /* shouldn't happen */
8852        Perl_croak(aTHX_
8853         "panic: reg_scan_name returned NULL");
8854       if (!RExC_paren_names) {
8855        RExC_paren_names= newHV();
8856        sv_2mortal(MUTABLE_SV(RExC_paren_names));
8857 #ifdef DEBUGGING
8858        RExC_paren_name_list= newAV();
8859        sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8860 #endif
8861       }
8862       he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8863       if ( he_str )
8864        sv_dat = HeVAL(he_str);
8865       if ( ! sv_dat ) {
8866        /* croak baby croak */
8867        Perl_croak(aTHX_
8868         "panic: paren_name hash element allocation failed");
8869       } else if ( SvPOK(sv_dat) ) {
8870        /* (?|...) can mean we have dupes so scan to check
8871        its already been stored. Maybe a flag indicating
8872        we are inside such a construct would be useful,
8873        but the arrays are likely to be quite small, so
8874        for now we punt -- dmq */
8875        IV count = SvIV(sv_dat);
8876        I32 *pv = (I32*)SvPVX(sv_dat);
8877        IV i;
8878        for ( i = 0 ; i < count ; i++ ) {
8879         if ( pv[i] == RExC_npar ) {
8880          count = 0;
8881          break;
8882         }
8883        }
8884        if ( count ) {
8885         pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8886         SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8887         pv[count] = RExC_npar;
8888         SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8889        }
8890       } else {
8891        (void)SvUPGRADE(sv_dat,SVt_PVNV);
8892        sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8893        SvIOK_on(sv_dat);
8894        SvIV_set(sv_dat, 1);
8895       }
8896 #ifdef DEBUGGING
8897       /* Yes this does cause a memory leak in debugging Perls */
8898       if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8899        SvREFCNT_dec_NN(svname);
8900 #endif
8901
8902       /*sv_dump(sv_dat);*/
8903      }
8904      nextchar(pRExC_state);
8905      paren = 1;
8906      goto capturing_parens;
8907     }
8908     RExC_seen |= REG_SEEN_LOOKBEHIND;
8909     RExC_in_lookbehind++;
8910     RExC_parse++;
8911    case '=':           /* (?=...) */
8912     RExC_seen_zerolen++;
8913     break;
8914    case '!':           /* (?!...) */
8915     RExC_seen_zerolen++;
8916     if (*RExC_parse == ')') {
8917      ret=reg_node(pRExC_state, OPFAIL);
8918      nextchar(pRExC_state);
8919      return ret;
8920     }
8921     break;
8922    case '|':           /* (?|...) */
8923     /* branch reset, behave like a (?:...) except that
8924     buffers in alternations share the same numbers */
8925     paren = ':';
8926     after_freeze = freeze_paren = RExC_npar;
8927     break;
8928    case ':':           /* (?:...) */
8929    case '>':           /* (?>...) */
8930     break;
8931    case '$':           /* (?$...) */
8932    case '@':           /* (?@...) */
8933     vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8934     break;
8935    case '#':           /* (?#...) */
8936     /* XXX As soon as we disallow separating the '?' and '*' (by
8937     * spaces or (?#...) comment), it is believed that this case
8938     * will be unreachable and can be removed.  See
8939     * [perl #117327] */
8940     while (*RExC_parse && *RExC_parse != ')')
8941      RExC_parse++;
8942     if (*RExC_parse != ')')
8943      FAIL("Sequence (?#... not terminated");
8944     nextchar(pRExC_state);
8945     *flagp = TRYAGAIN;
8946     return NULL;
8947    case '0' :           /* (?0) */
8948    case 'R' :           /* (?R) */
8949     if (*RExC_parse != ')')
8950      FAIL("Sequence (?R) not terminated");
8951     ret = reg_node(pRExC_state, GOSTART);
8952     *flagp |= POSTPONED;
8953     nextchar(pRExC_state);
8954     return ret;
8955     /*notreached*/
8956    { /* named and numeric backreferences */
8957     I32 num;
8958    case '&':            /* (?&NAME) */
8959     parse_start = RExC_parse - 1;
8960    named_recursion:
8961     {
8962       SV *sv_dat = reg_scan_name(pRExC_state,
8963        SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8964       num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8965     }
8966     goto gen_recurse_regop;
8967     assert(0); /* NOT REACHED */
8968    case '+':
8969     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8970      RExC_parse++;
8971      vFAIL("Illegal pattern");
8972     }
8973     goto parse_recursion;
8974     /* NOT REACHED*/
8975    case '-': /* (?-1) */
8976     if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8977      RExC_parse--; /* rewind to let it be handled later */
8978      goto parse_flags;
8979     }
8980     /*FALLTHROUGH */
8981    case '1': case '2': case '3': case '4': /* (?1) */
8982    case '5': case '6': case '7': case '8': case '9':
8983     RExC_parse--;
8984    parse_recursion:
8985     num = atoi(RExC_parse);
8986     parse_start = RExC_parse - 1; /* MJD */
8987     if (*RExC_parse == '-')
8988      RExC_parse++;
8989     while (isDIGIT(*RExC_parse))
8990       RExC_parse++;
8991     if (*RExC_parse!=')')
8992      vFAIL("Expecting close bracket");
8993
8994    gen_recurse_regop:
8995     if ( paren == '-' ) {
8996      /*
8997      Diagram of capture buffer numbering.
8998      Top line is the normal capture buffer numbers
8999      Bottom line is the negative indexing as from
9000      the X (the (?-2))
9001
9002      +   1 2    3 4 5 X          6 7
9003      /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9004      -   5 4    3 2 1 X          x x
9005
9006      */
9007      num = RExC_npar + num;
9008      if (num < 1)  {
9009       RExC_parse++;
9010       vFAIL("Reference to nonexistent group");
9011      }
9012     } else if ( paren == '+' ) {
9013      num = RExC_npar + num - 1;
9014     }
9015
9016     ret = reganode(pRExC_state, GOSUB, num);
9017     if (!SIZE_ONLY) {
9018      if (num > (I32)RExC_rx->nparens) {
9019       RExC_parse++;
9020       vFAIL("Reference to nonexistent group");
9021      }
9022      ARG2L_SET( ret, RExC_recurse_count++);
9023      RExC_emit++;
9024      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9025       "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
9026     } else {
9027      RExC_size++;
9028      }
9029      RExC_seen |= REG_SEEN_RECURSE;
9030     Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9031     Set_Node_Offset(ret, parse_start); /* MJD */
9032
9033     *flagp |= POSTPONED;
9034     nextchar(pRExC_state);
9035     return ret;
9036    } /* named and numeric backreferences */
9037    assert(0); /* NOT REACHED */
9038
9039    case '?':           /* (??...) */
9040     is_logical = 1;
9041     if (*RExC_parse != '{') {
9042      RExC_parse++;
9043      vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9044      /*NOTREACHED*/
9045     }
9046     *flagp |= POSTPONED;
9047     paren = *RExC_parse++;
9048     /* FALL THROUGH */
9049    case '{':           /* (?{...}) */
9050    {
9051     U32 n = 0;
9052     struct reg_code_block *cb;
9053
9054     RExC_seen_zerolen++;
9055
9056     if (   !pRExC_state->num_code_blocks
9057      || pRExC_state->code_index >= pRExC_state->num_code_blocks
9058      || pRExC_state->code_blocks[pRExC_state->code_index].start
9059       != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9060        - RExC_start)
9061     ) {
9062      if (RExC_pm_flags & PMf_USE_RE_EVAL)
9063       FAIL("panic: Sequence (?{...}): no code block found\n");
9064      FAIL("Eval-group not allowed at runtime, use re 'eval'");
9065     }
9066     /* this is a pre-compiled code block (?{...}) */
9067     cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9068     RExC_parse = RExC_start + cb->end;
9069     if (!SIZE_ONLY) {
9070      OP *o = cb->block;
9071      if (cb->src_regex) {
9072       n = add_data(pRExC_state, 2, "rl");
9073       RExC_rxi->data->data[n] =
9074        (void*)SvREFCNT_inc((SV*)cb->src_regex);
9075       RExC_rxi->data->data[n+1] = (void*)o;
9076      }
9077      else {
9078       n = add_data(pRExC_state, 1,
9079        (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
9080       RExC_rxi->data->data[n] = (void*)o;
9081      }
9082     }
9083     pRExC_state->code_index++;
9084     nextchar(pRExC_state);
9085
9086     if (is_logical) {
9087      regnode *eval;
9088      ret = reg_node(pRExC_state, LOGICAL);
9089      eval = reganode(pRExC_state, EVAL, n);
9090      if (!SIZE_ONLY) {
9091       ret->flags = 2;
9092       /* for later propagation into (??{}) return value */
9093       eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9094      }
9095      REGTAIL(pRExC_state, ret, eval);
9096      /* deal with the length of this later - MJD */
9097      return ret;
9098     }
9099     ret = reganode(pRExC_state, EVAL, n);
9100     Set_Node_Length(ret, RExC_parse - parse_start + 1);
9101     Set_Node_Offset(ret, parse_start);
9102     return ret;
9103    }
9104    case '(':           /* (?(?{...})...) and (?(?=...)...) */
9105    {
9106     int is_define= 0;
9107     if (RExC_parse[0] == '?') {        /* (?(?...)) */
9108      if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9109       || RExC_parse[1] == '<'
9110       || RExC_parse[1] == '{') { /* Lookahead or eval. */
9111       I32 flag;
9112       regnode *tail;
9113
9114       ret = reg_node(pRExC_state, LOGICAL);
9115       if (!SIZE_ONLY)
9116        ret->flags = 1;
9117
9118       tail = reg(pRExC_state, 1, &flag, depth+1);
9119       if (flag & RESTART_UTF8) {
9120        *flagp = RESTART_UTF8;
9121        return NULL;
9122       }
9123       REGTAIL(pRExC_state, ret, tail);
9124       goto insert_if;
9125      }
9126     }
9127     else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
9128       || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9129     {
9130      char ch = RExC_parse[0] == '<' ? '>' : '\'';
9131      char *name_start= RExC_parse++;
9132      U32 num = 0;
9133      SV *sv_dat=reg_scan_name(pRExC_state,
9134       SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9135      if (RExC_parse == name_start || *RExC_parse != ch)
9136       vFAIL2("Sequence (?(%c... not terminated",
9137        (ch == '>' ? '<' : ch));
9138      RExC_parse++;
9139      if (!SIZE_ONLY) {
9140       num = add_data( pRExC_state, 1, "S" );
9141       RExC_rxi->data->data[num]=(void*)sv_dat;
9142       SvREFCNT_inc_simple_void(sv_dat);
9143      }
9144      ret = reganode(pRExC_state,NGROUPP,num);
9145      goto insert_if_check_paren;
9146     }
9147     else if (RExC_parse[0] == 'D' &&
9148       RExC_parse[1] == 'E' &&
9149       RExC_parse[2] == 'F' &&
9150       RExC_parse[3] == 'I' &&
9151       RExC_parse[4] == 'N' &&
9152       RExC_parse[5] == 'E')
9153     {
9154      ret = reganode(pRExC_state,DEFINEP,0);
9155      RExC_parse +=6 ;
9156      is_define = 1;
9157      goto insert_if_check_paren;
9158     }
9159     else if (RExC_parse[0] == 'R') {
9160      RExC_parse++;
9161      parno = 0;
9162      if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9163       parno = atoi(RExC_parse++);
9164       while (isDIGIT(*RExC_parse))
9165        RExC_parse++;
9166      } else if (RExC_parse[0] == '&') {
9167       SV *sv_dat;
9168       RExC_parse++;
9169       sv_dat = reg_scan_name(pRExC_state,
9170         SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9171        parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9172      }
9173      ret = reganode(pRExC_state,INSUBP,parno);
9174      goto insert_if_check_paren;
9175     }
9176     else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9177      /* (?(1)...) */
9178      char c;
9179      parno = atoi(RExC_parse++);
9180
9181      while (isDIGIT(*RExC_parse))
9182       RExC_parse++;
9183      ret = reganode(pRExC_state, GROUPP, parno);
9184
9185     insert_if_check_paren:
9186      if ((c = *nextchar(pRExC_state)) != ')')
9187       vFAIL("Switch condition not recognized");
9188     insert_if:
9189      REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9190      br = regbranch(pRExC_state, &flags, 1,depth+1);
9191      if (br == NULL) {
9192       if (flags & RESTART_UTF8) {
9193        *flagp = RESTART_UTF8;
9194        return NULL;
9195       }
9196       FAIL2("panic: regbranch returned NULL, flags=%#X",
9197        flags);
9198      } else
9199       REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9200      c = *nextchar(pRExC_state);
9201      if (flags&HASWIDTH)
9202       *flagp |= HASWIDTH;
9203      if (c == '|') {
9204       if (is_define)
9205        vFAIL("(?(DEFINE)....) does not allow branches");
9206       lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9207       if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9208        if (flags & RESTART_UTF8) {
9209         *flagp = RESTART_UTF8;
9210         return NULL;
9211        }
9212        FAIL2("panic: regbranch returned NULL, flags=%#X",
9213         flags);
9214       }
9215       REGTAIL(pRExC_state, ret, lastbr);
9216       if (flags&HASWIDTH)
9217        *flagp |= HASWIDTH;
9218       c = *nextchar(pRExC_state);
9219      }
9220      else
9221       lastbr = NULL;
9222      if (c != ')')
9223       vFAIL("Switch (?(condition)... contains too many branches");
9224      ender = reg_node(pRExC_state, TAIL);
9225      REGTAIL(pRExC_state, br, ender);
9226      if (lastbr) {
9227       REGTAIL(pRExC_state, lastbr, ender);
9228       REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9229      }
9230      else
9231       REGTAIL(pRExC_state, ret, ender);
9232      RExC_size++; /* XXX WHY do we need this?!!
9233          For large programs it seems to be required
9234          but I can't figure out why. -- dmq*/
9235      return ret;
9236     }
9237     else {
9238      vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
9239     }
9240    }
9241    case '[':           /* (?[ ... ]) */
9242     return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9243           oregcomp_parse);
9244    case 0:
9245     RExC_parse--; /* for vFAIL to print correctly */
9246     vFAIL("Sequence (? incomplete");
9247     break;
9248    default: /* e.g., (?i) */
9249     --RExC_parse;
9250    parse_flags:
9251     parse_lparen_question_flags(pRExC_state);
9252     if (UCHARAT(RExC_parse) != ':') {
9253      nextchar(pRExC_state);
9254      *flagp = TRYAGAIN;
9255      return NULL;
9256     }
9257     paren = ':';
9258     nextchar(pRExC_state);
9259     ret = NULL;
9260     goto parse_rest;
9261    } /* end switch */
9262   }
9263   else {                  /* (...) */
9264   capturing_parens:
9265    parno = RExC_npar;
9266    RExC_npar++;
9267
9268    ret = reganode(pRExC_state, OPEN, parno);
9269    if (!SIZE_ONLY ){
9270     if (!RExC_nestroot)
9271      RExC_nestroot = parno;
9272     if (RExC_seen & REG_SEEN_RECURSE
9273      && !RExC_open_parens[parno-1])
9274     {
9275      DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9276       "Setting open paren #%"IVdf" to %d\n",
9277       (IV)parno, REG_NODE_NUM(ret)));
9278      RExC_open_parens[parno-1]= ret;
9279     }
9280    }
9281    Set_Node_Length(ret, 1); /* MJD */
9282    Set_Node_Offset(ret, RExC_parse); /* MJD */
9283    is_open = 1;
9284   }
9285  }
9286  else                        /* ! paren */
9287   ret = NULL;
9288
9289    parse_rest:
9290  /* Pick up the branches, linking them together. */
9291  parse_start = RExC_parse;   /* MJD */
9292  br = regbranch(pRExC_state, &flags, 1,depth+1);
9293
9294  /*     branch_len = (paren != 0); */
9295
9296  if (br == NULL) {
9297   if (flags & RESTART_UTF8) {
9298    *flagp = RESTART_UTF8;
9299    return NULL;
9300   }
9301   FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
9302  }
9303  if (*RExC_parse == '|') {
9304   if (!SIZE_ONLY && RExC_extralen) {
9305    reginsert(pRExC_state, BRANCHJ, br, depth+1);
9306   }
9307   else {                  /* MJD */
9308    reginsert(pRExC_state, BRANCH, br, depth+1);
9309    Set_Node_Length(br, paren != 0);
9310    Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9311   }
9312   have_branch = 1;
9313   if (SIZE_ONLY)
9314    RExC_extralen += 1;  /* For BRANCHJ-BRANCH. */
9315  }
9316  else if (paren == ':') {
9317   *flagp |= flags&SIMPLE;
9318  }
9319  if (is_open) {    /* Starts with OPEN. */
9320   REGTAIL(pRExC_state, ret, br);          /* OPEN -> first. */
9321  }
9322  else if (paren != '?')  /* Not Conditional */
9323   ret = br;
9324  *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9325  lastbr = br;
9326  while (*RExC_parse == '|') {
9327   if (!SIZE_ONLY && RExC_extralen) {
9328    ender = reganode(pRExC_state, LONGJMP,0);
9329    REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9330   }
9331   if (SIZE_ONLY)
9332    RExC_extralen += 2;  /* Account for LONGJMP. */
9333   nextchar(pRExC_state);
9334   if (freeze_paren) {
9335    if (RExC_npar > after_freeze)
9336     after_freeze = RExC_npar;
9337    RExC_npar = freeze_paren;
9338   }
9339   br = regbranch(pRExC_state, &flags, 0, depth+1);
9340
9341   if (br == NULL) {
9342    if (flags & RESTART_UTF8) {
9343     *flagp = RESTART_UTF8;
9344     return NULL;
9345    }
9346    FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
9347   }
9348   REGTAIL(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
9349   lastbr = br;
9350   *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9351  }
9352
9353  if (have_branch || paren != ':') {
9354   /* Make a closing node, and hook it on the end. */
9355   switch (paren) {
9356   case ':':
9357    ender = reg_node(pRExC_state, TAIL);
9358    break;
9359   case 1: case 2:
9360    ender = reganode(pRExC_state, CLOSE, parno);
9361    if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9362     DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9363       "Setting close paren #%"IVdf" to %d\n",
9364       (IV)parno, REG_NODE_NUM(ender)));
9365     RExC_close_parens[parno-1]= ender;
9366     if (RExC_nestroot == parno)
9367      RExC_nestroot = 0;
9368    }
9369    Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9370    Set_Node_Length(ender,1); /* MJD */
9371    break;
9372   case '<':
9373   case ',':
9374   case '=':
9375   case '!':
9376    *flagp &= ~HASWIDTH;
9377    /* FALL THROUGH */
9378   case '>':
9379    ender = reg_node(pRExC_state, SUCCEED);
9380    break;
9381   case 0:
9382    ender = reg_node(pRExC_state, END);
9383    if (!SIZE_ONLY) {
9384     assert(!RExC_opend); /* there can only be one! */
9385     RExC_opend = ender;
9386    }
9387    break;
9388   }
9389   DEBUG_PARSE_r(if (!SIZE_ONLY) {
9390    SV * const mysv_val1=sv_newmortal();
9391    SV * const mysv_val2=sv_newmortal();
9392    DEBUG_PARSE_MSG("lsbr");
9393    regprop(RExC_rx, mysv_val1, lastbr);
9394    regprop(RExC_rx, mysv_val2, ender);
9395    PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9396       SvPV_nolen_const(mysv_val1),
9397       (IV)REG_NODE_NUM(lastbr),
9398       SvPV_nolen_const(mysv_val2),
9399       (IV)REG_NODE_NUM(ender),
9400       (IV)(ender - lastbr)
9401    );
9402   });
9403   REGTAIL(pRExC_state, lastbr, ender);
9404
9405   if (have_branch && !SIZE_ONLY) {
9406    char is_nothing= 1;
9407    if (depth==1)
9408     RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9409
9410    /* Hook the tails of the branches to the closing node. */
9411    for (br = ret; br; br = regnext(br)) {
9412     const U8 op = PL_regkind[OP(br)];
9413     if (op == BRANCH) {
9414      REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9415      if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9416       is_nothing= 0;
9417     }
9418     else if (op == BRANCHJ) {
9419      REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9420      /* for now we always disable this optimisation * /
9421      if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9422      */
9423       is_nothing= 0;
9424     }
9425    }
9426    if (is_nothing) {
9427     br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9428     DEBUG_PARSE_r(if (!SIZE_ONLY) {
9429      SV * const mysv_val1=sv_newmortal();
9430      SV * const mysv_val2=sv_newmortal();
9431      DEBUG_PARSE_MSG("NADA");
9432      regprop(RExC_rx, mysv_val1, ret);
9433      regprop(RExC_rx, mysv_val2, ender);
9434      PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9435         SvPV_nolen_const(mysv_val1),
9436         (IV)REG_NODE_NUM(ret),
9437         SvPV_nolen_const(mysv_val2),
9438         (IV)REG_NODE_NUM(ender),
9439         (IV)(ender - ret)
9440      );
9441     });
9442     OP(br)= NOTHING;
9443     if (OP(ender) == TAIL) {
9444      NEXT_OFF(br)= 0;
9445      RExC_emit= br + 1;
9446     } else {
9447      regnode *opt;
9448      for ( opt= br + 1; opt < ender ; opt++ )
9449       OP(opt)= OPTIMIZED;
9450      NEXT_OFF(br)= ender - br;
9451     }
9452    }
9453   }
9454  }
9455
9456  {
9457   const char *p;
9458   static const char parens[] = "=!<,>";
9459
9460   if (paren && (p = strchr(parens, paren))) {
9461    U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9462    int flag = (p - parens) > 1;
9463
9464    if (paren == '>')
9465     node = SUSPEND, flag = 0;
9466    reginsert(pRExC_state, node,ret, depth+1);
9467    Set_Node_Cur_Length(ret);
9468    Set_Node_Offset(ret, parse_start + 1);
9469    ret->flags = flag;
9470    REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9471   }
9472  }
9473
9474  /* Check for proper termination. */
9475  if (paren) {
9476   /* restore original flags, but keep (?p) */
9477   RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
9478   if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9479    RExC_parse = oregcomp_parse;
9480    vFAIL("Unmatched (");
9481   }
9482  }
9483  else if (!paren && RExC_parse < RExC_end) {
9484   if (*RExC_parse == ')') {
9485    RExC_parse++;
9486    vFAIL("Unmatched )");
9487   }
9488   else
9489    FAIL("Junk on end of regexp"); /* "Can't happen". */
9490   assert(0); /* NOTREACHED */
9491  }
9492
9493  if (RExC_in_lookbehind) {
9494   RExC_in_lookbehind--;
9495  }
9496  if (after_freeze > RExC_npar)
9497   RExC_npar = after_freeze;
9498  return(ret);
9499 }
9500
9501 /*
9502  - regbranch - one alternative of an | operator
9503  *
9504  * Implements the concatenation operator.
9505  *
9506  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9507  * restarted.
9508  */
9509 STATIC regnode *
9510 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9511 {
9512  dVAR;
9513  regnode *ret;
9514  regnode *chain = NULL;
9515  regnode *latest;
9516  I32 flags = 0, c = 0;
9517  GET_RE_DEBUG_FLAGS_DECL;
9518
9519  PERL_ARGS_ASSERT_REGBRANCH;
9520
9521  DEBUG_PARSE("brnc");
9522
9523  if (first)
9524   ret = NULL;
9525  else {
9526   if (!SIZE_ONLY && RExC_extralen)
9527    ret = reganode(pRExC_state, BRANCHJ,0);
9528   else {
9529    ret = reg_node(pRExC_state, BRANCH);
9530    Set_Node_Length(ret, 1);
9531   }
9532  }
9533
9534  if (!first && SIZE_ONLY)
9535   RExC_extralen += 1;   /* BRANCHJ */
9536
9537  *flagp = WORST;   /* Tentatively. */
9538
9539  RExC_parse--;
9540  nextchar(pRExC_state);
9541  while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9542   flags &= ~TRYAGAIN;
9543   latest = regpiece(pRExC_state, &flags,depth+1);
9544   if (latest == NULL) {
9545    if (flags & TRYAGAIN)
9546     continue;
9547    if (flags & RESTART_UTF8) {
9548     *flagp = RESTART_UTF8;
9549     return NULL;
9550    }
9551    FAIL2("panic: regpiece returned NULL, flags=%#X", flags);
9552   }
9553   else if (ret == NULL)
9554    ret = latest;
9555   *flagp |= flags&(HASWIDTH|POSTPONED);
9556   if (chain == NULL)  /* First piece. */
9557    *flagp |= flags&SPSTART;
9558   else {
9559    RExC_naughty++;
9560    REGTAIL(pRExC_state, chain, latest);
9561   }
9562   chain = latest;
9563   c++;
9564  }
9565  if (chain == NULL) { /* Loop ran zero times. */
9566   chain = reg_node(pRExC_state, NOTHING);
9567   if (ret == NULL)
9568    ret = chain;
9569  }
9570  if (c == 1) {
9571   *flagp |= flags&SIMPLE;
9572  }
9573
9574  return ret;
9575 }
9576
9577 /*
9578  - regpiece - something followed by possible [*+?]
9579  *
9580  * Note that the branching code sequences used for ? and the general cases
9581  * of * and + are somewhat optimized:  they use the same NOTHING node as
9582  * both the endmarker for their branch list and the body of the last branch.
9583  * It might seem that this node could be dispensed with entirely, but the
9584  * endmarker role is not redundant.
9585  *
9586  * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
9587  * TRYAGAIN.
9588  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9589  * restarted.
9590  */
9591 STATIC regnode *
9592 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9593 {
9594  dVAR;
9595  regnode *ret;
9596  char op;
9597  char *next;
9598  I32 flags;
9599  const char * const origparse = RExC_parse;
9600  I32 min;
9601  I32 max = REG_INFTY;
9602 #ifdef RE_TRACK_PATTERN_OFFSETS
9603  char *parse_start;
9604 #endif
9605  const char *maxpos = NULL;
9606
9607  /* Save the original in case we change the emitted regop to a FAIL. */
9608  regnode * const orig_emit = RExC_emit;
9609
9610  GET_RE_DEBUG_FLAGS_DECL;
9611
9612  PERL_ARGS_ASSERT_REGPIECE;
9613
9614  DEBUG_PARSE("piec");
9615
9616  ret = regatom(pRExC_state, &flags,depth+1);
9617  if (ret == NULL) {
9618   if (flags & (TRYAGAIN|RESTART_UTF8))
9619    *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
9620   else
9621    FAIL2("panic: regatom returned NULL, flags=%#X", flags);
9622   return(NULL);
9623  }
9624
9625  op = *RExC_parse;
9626
9627  if (op == '{' && regcurly(RExC_parse, FALSE)) {
9628   maxpos = NULL;
9629 #ifdef RE_TRACK_PATTERN_OFFSETS
9630   parse_start = RExC_parse; /* MJD */
9631 #endif
9632   next = RExC_parse + 1;
9633   while (isDIGIT(*next) || *next == ',') {
9634    if (*next == ',') {
9635     if (maxpos)
9636      break;
9637     else
9638      maxpos = next;
9639    }
9640    next++;
9641   }
9642   if (*next == '}') {  /* got one */
9643    if (!maxpos)
9644     maxpos = next;
9645    RExC_parse++;
9646    min = atoi(RExC_parse);
9647    if (*maxpos == ',')
9648     maxpos++;
9649    else
9650     maxpos = RExC_parse;
9651    max = atoi(maxpos);
9652    if (!max && *maxpos != '0')
9653     max = REG_INFTY;  /* meaning "infinity" */
9654    else if (max >= REG_INFTY)
9655     vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9656    RExC_parse = next;
9657    nextchar(pRExC_state);
9658    if (max < min) {    /* If can't match, warn and optimize to fail
9659         unconditionally */
9660     if (SIZE_ONLY) {
9661      ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9662
9663      /* We can't back off the size because we have to reserve
9664      * enough space for all the things we are about to throw
9665      * away, but we can shrink it by the ammount we are about
9666      * to re-use here */
9667      RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9668     }
9669     else {
9670      RExC_emit = orig_emit;
9671     }
9672     ret = reg_node(pRExC_state, OPFAIL);
9673     return ret;
9674    }
9675
9676   do_curly:
9677    if ((flags&SIMPLE)) {
9678     RExC_naughty += 2 + RExC_naughty / 2;
9679     reginsert(pRExC_state, CURLY, ret, depth+1);
9680     Set_Node_Offset(ret, parse_start+1); /* MJD */
9681     Set_Node_Cur_Length(ret);
9682    }
9683    else {
9684     regnode * const w = reg_node(pRExC_state, WHILEM);
9685
9686     w->flags = 0;
9687     REGTAIL(pRExC_state, ret, w);
9688     if (!SIZE_ONLY && RExC_extralen) {
9689      reginsert(pRExC_state, LONGJMP,ret, depth+1);
9690      reginsert(pRExC_state, NOTHING,ret, depth+1);
9691      NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9692     }
9693     reginsert(pRExC_state, CURLYX,ret, depth+1);
9694         /* MJD hk */
9695     Set_Node_Offset(ret, parse_start+1);
9696     Set_Node_Length(ret,
9697         op == '{' ? (RExC_parse - parse_start) : 1);
9698
9699     if (!SIZE_ONLY && RExC_extralen)
9700      NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9701     REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9702     if (SIZE_ONLY)
9703      RExC_whilem_seen++, RExC_extralen += 3;
9704     RExC_naughty += 4 + RExC_naughty; /* compound interest */
9705    }
9706    ret->flags = 0;
9707
9708    if (min > 0)
9709     *flagp = WORST;
9710    if (max > 0)
9711     *flagp |= HASWIDTH;
9712    if (!SIZE_ONLY) {
9713     ARG1_SET(ret, (U16)min);
9714     ARG2_SET(ret, (U16)max);
9715    }
9716
9717    goto nest_check;
9718   }
9719  }
9720
9721  if (!ISMULT1(op)) {
9722   *flagp = flags;
9723   return(ret);
9724  }
9725
9726 #if 0    /* Now runtime fix should be reliable. */
9727
9728  /* if this is reinstated, don't forget to put this back into perldiag:
9729
9730    =item Regexp *+ operand could be empty at {#} in regex m/%s/
9731
9732   (F) The part of the regexp subject to either the * or + quantifier
9733   could match an empty string. The {#} shows in the regular
9734   expression about where the problem was discovered.
9735
9736  */
9737
9738  if (!(flags&HASWIDTH) && op != '?')
9739  vFAIL("Regexp *+ operand could be empty");
9740 #endif
9741
9742 #ifdef RE_TRACK_PATTERN_OFFSETS
9743  parse_start = RExC_parse;
9744 #endif
9745  nextchar(pRExC_state);
9746
9747  *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9748
9749  if (op == '*' && (flags&SIMPLE)) {
9750   reginsert(pRExC_state, STAR, ret, depth+1);
9751   ret->flags = 0;
9752   RExC_naughty += 4;
9753  }
9754  else if (op == '*') {
9755   min = 0;
9756   goto do_curly;
9757  }
9758  else if (op == '+' && (flags&SIMPLE)) {
9759   reginsert(pRExC_state, PLUS, ret, depth+1);
9760   ret->flags = 0;
9761   RExC_naughty += 3;
9762  }
9763  else if (op == '+') {
9764   min = 1;
9765   goto do_curly;
9766  }
9767  else if (op == '?') {
9768   min = 0; max = 1;
9769   goto do_curly;
9770  }
9771   nest_check:
9772  if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9773   SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9774   ckWARN3reg(RExC_parse,
9775     "%.*s matches null string many times",
9776     (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9777     origparse);
9778   (void)ReREFCNT_inc(RExC_rx_sv);
9779  }
9780
9781  if (RExC_parse < RExC_end && *RExC_parse == '?') {
9782   nextchar(pRExC_state);
9783   reginsert(pRExC_state, MINMOD, ret, depth+1);
9784   REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9785  }
9786 #ifndef REG_ALLOW_MINMOD_SUSPEND
9787  else
9788 #endif
9789  if (RExC_parse < RExC_end && *RExC_parse == '+') {
9790   regnode *ender;
9791   nextchar(pRExC_state);
9792   ender = reg_node(pRExC_state, SUCCEED);
9793   REGTAIL(pRExC_state, ret, ender);
9794   reginsert(pRExC_state, SUSPEND, ret, depth+1);
9795   ret->flags = 0;
9796   ender = reg_node(pRExC_state, TAIL);
9797   REGTAIL(pRExC_state, ret, ender);
9798   /*ret= ender;*/
9799  }
9800
9801  if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9802   RExC_parse++;
9803   vFAIL("Nested quantifiers");
9804  }
9805
9806  return(ret);
9807 }
9808
9809 STATIC bool
9810 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9811   const bool strict   /* Apply stricter parsing rules? */
9812  )
9813 {
9814
9815  /* This is expected to be called by a parser routine that has recognized '\N'
9816    and needs to handle the rest. RExC_parse is expected to point at the first
9817    char following the N at the time of the call.  On successful return,
9818    RExC_parse has been updated to point to just after the sequence identified
9819    by this routine, and <*flagp> has been updated.
9820
9821    The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9822    character class.
9823
9824    \N may begin either a named sequence, or if outside a character class, mean
9825    to match a non-newline.  For non single-quoted regexes, the tokenizer has
9826    attempted to decide which, and in the case of a named sequence, converted it
9827    into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9828    where c1... are the characters in the sequence.  For single-quoted regexes,
9829    the tokenizer passes the \N sequence through unchanged; this code will not
9830    attempt to determine this nor expand those, instead raising a syntax error.
9831    The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9832    or there is no '}', it signals that this \N occurrence means to match a
9833    non-newline.
9834
9835    Only the \N{U+...} form should occur in a character class, for the same
9836    reason that '.' inside a character class means to just match a period: it
9837    just doesn't make sense.
9838
9839    The function raises an error (via vFAIL), and doesn't return for various
9840    syntax errors.  Otherwise it returns TRUE and sets <node_p> or <valuep> on
9841    success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
9842    RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
9843    only possible if node_p is non-NULL.
9844
9845
9846    If <valuep> is non-null, it means the caller can accept an input sequence
9847    consisting of a just a single code point; <*valuep> is set to that value
9848    if the input is such.
9849
9850    If <node_p> is non-null it signifies that the caller can accept any other
9851    legal sequence (i.e., one that isn't just a single code point).  <*node_p>
9852    is set as follows:
9853  1) \N means not-a-NL: points to a newly created REG_ANY node;
9854  2) \N{}:              points to a new NOTHING node;
9855  3) otherwise:         points to a new EXACT node containing the resolved
9856       string.
9857    Note that FALSE is returned for single code point sequences if <valuep> is
9858    null.
9859  */
9860
9861  char * endbrace;    /* '}' following the name */
9862  char* p;
9863  char *endchar; /* Points to '.' or '}' ending cur char in the input
9864       stream */
9865  bool has_multiple_chars; /* true if the input stream contains a sequence of
9866         more than one character */
9867
9868  GET_RE_DEBUG_FLAGS_DECL;
9869
9870  PERL_ARGS_ASSERT_GROK_BSLASH_N;
9871
9872  GET_RE_DEBUG_FLAGS;
9873
9874  assert(cBOOL(node_p) ^ cBOOL(valuep));  /* Exactly one should be set */
9875
9876  /* The [^\n] meaning of \N ignores spaces and comments under the /x
9877  * modifier.  The other meaning does not */
9878  p = (RExC_flags & RXf_PMf_EXTENDED)
9879   ? regwhite( pRExC_state, RExC_parse )
9880   : RExC_parse;
9881
9882  /* Disambiguate between \N meaning a named character versus \N meaning
9883  * [^\n].  The former is assumed when it can't be the latter. */
9884  if (*p != '{' || regcurly(p, FALSE)) {
9885   RExC_parse = p;
9886   if (! node_p) {
9887    /* no bare \N in a charclass */
9888    if (in_char_class) {
9889     vFAIL("\\N in a character class must be a named character: \\N{...}");
9890    }
9891    return FALSE;
9892   }
9893   nextchar(pRExC_state);
9894   *node_p = reg_node(pRExC_state, REG_ANY);
9895   *flagp |= HASWIDTH|SIMPLE;
9896   RExC_naughty++;
9897   RExC_parse--;
9898   Set_Node_Length(*node_p, 1); /* MJD */
9899   return TRUE;
9900  }
9901
9902  /* Here, we have decided it should be a named character or sequence */
9903
9904  /* The test above made sure that the next real character is a '{', but
9905  * under the /x modifier, it could be separated by space (or a comment and
9906  * \n) and this is not allowed (for consistency with \x{...} and the
9907  * tokenizer handling of \N{NAME}). */
9908  if (*RExC_parse != '{') {
9909   vFAIL("Missing braces on \\N{}");
9910  }
9911
9912  RExC_parse++; /* Skip past the '{' */
9913
9914  if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9915   || ! (endbrace == RExC_parse  /* nothing between the {} */
9916    || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9917     && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9918  {
9919   if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9920   vFAIL("\\N{NAME} must be resolved by the lexer");
9921  }
9922
9923  if (endbrace == RExC_parse) {   /* empty: \N{} */
9924   bool ret = TRUE;
9925   if (node_p) {
9926    *node_p = reg_node(pRExC_state,NOTHING);
9927   }
9928   else if (in_char_class) {
9929    if (SIZE_ONLY && in_char_class) {
9930     if (strict) {
9931      RExC_parse++;   /* Position after the "}" */
9932      vFAIL("Zero length \\N{}");
9933     }
9934     else {
9935      ckWARNreg(RExC_parse,
9936        "Ignoring zero length \\N{} in character class");
9937     }
9938    }
9939    ret = FALSE;
9940   }
9941   else {
9942    return FALSE;
9943   }
9944   nextchar(pRExC_state);
9945   return ret;
9946  }
9947
9948  RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9949  RExC_parse += 2; /* Skip past the 'U+' */
9950
9951  endchar = RExC_parse + strcspn(RExC_parse, ".}");
9952
9953  /* Code points are separated by dots.  If none, there is only one code
9954  * point, and is terminated by the brace */
9955  has_multiple_chars = (endchar < endbrace);
9956
9957  if (valuep && (! has_multiple_chars || in_char_class)) {
9958   /* We only pay attention to the first char of
9959   multichar strings being returned in char classes. I kinda wonder
9960   if this makes sense as it does change the behaviour
9961   from earlier versions, OTOH that behaviour was broken
9962   as well. XXX Solution is to recharacterize as
9963   [rest-of-class]|multi1|multi2... */
9964
9965   STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9966   I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9967    | PERL_SCAN_DISALLOW_PREFIX
9968    | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9969
9970   *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9971
9972   /* The tokenizer should have guaranteed validity, but it's possible to
9973   * bypass it by using single quoting, so check */
9974   if (length_of_hex == 0
9975    || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9976   {
9977    RExC_parse += length_of_hex; /* Includes all the valid */
9978    RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9979        ? UTF8SKIP(RExC_parse)
9980        : 1;
9981    /* Guard against malformed utf8 */
9982    if (RExC_parse >= endchar) {
9983     RExC_parse = endchar;
9984    }
9985    vFAIL("Invalid hexadecimal number in \\N{U+...}");
9986   }
9987
9988   if (in_char_class && has_multiple_chars) {
9989    if (strict) {
9990     RExC_parse = endbrace;
9991     vFAIL("\\N{} in character class restricted to one character");
9992    }
9993    else {
9994     ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9995    }
9996   }
9997
9998   RExC_parse = endbrace + 1;
9999  }
10000  else if (! node_p || ! has_multiple_chars) {
10001
10002   /* Here, the input is legal, but not according to the caller's
10003   * options.  We fail without advancing the parse, so that the
10004   * caller can try again */
10005   RExC_parse = p;
10006   return FALSE;
10007  }
10008  else {
10009
10010   /* What is done here is to convert this to a sub-pattern of the form
10011   * (?:\x{char1}\x{char2}...)
10012   * and then call reg recursively.  That way, it retains its atomicness,
10013   * while not having to worry about special handling that some code
10014   * points may have.  toke.c has converted the original Unicode values
10015   * to native, so that we can just pass on the hex values unchanged.  We
10016   * do have to set a flag to keep recoding from happening in the
10017   * recursion */
10018
10019   SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10020   STRLEN len;
10021   char *orig_end = RExC_end;
10022   I32 flags;
10023
10024   while (RExC_parse < endbrace) {
10025
10026    /* Convert to notation the rest of the code understands */
10027    sv_catpv(substitute_parse, "\\x{");
10028    sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10029    sv_catpv(substitute_parse, "}");
10030
10031    /* Point to the beginning of the next character in the sequence. */
10032    RExC_parse = endchar + 1;
10033    endchar = RExC_parse + strcspn(RExC_parse, ".}");
10034   }
10035   sv_catpv(substitute_parse, ")");
10036
10037   RExC_parse = SvPV(substitute_parse, len);
10038
10039   /* Don't allow empty number */
10040   if (len < 8) {
10041    vFAIL("Invalid hexadecimal number in \\N{U+...}");
10042   }
10043   RExC_end = RExC_parse + len;
10044
10045   /* The values are Unicode, and therefore not subject to recoding */
10046   RExC_override_recoding = 1;
10047
10048   if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10049    if (flags & RESTART_UTF8) {
10050     *flagp = RESTART_UTF8;
10051     return FALSE;
10052    }
10053    FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#X",
10054     flags);
10055   }
10056   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10057
10058   RExC_parse = endbrace;
10059   RExC_end = orig_end;
10060   RExC_override_recoding = 0;
10061
10062   nextchar(pRExC_state);
10063  }
10064
10065  return TRUE;
10066 }
10067
10068
10069 /*
10070  * reg_recode
10071  *
10072  * It returns the code point in utf8 for the value in *encp.
10073  *    value: a code value in the source encoding
10074  *    encp:  a pointer to an Encode object
10075  *
10076  * If the result from Encode is not a single character,
10077  * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10078  */
10079 STATIC UV
10080 S_reg_recode(pTHX_ const char value, SV **encp)
10081 {
10082  STRLEN numlen = 1;
10083  SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10084  const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10085  const STRLEN newlen = SvCUR(sv);
10086  UV uv = UNICODE_REPLACEMENT;
10087
10088  PERL_ARGS_ASSERT_REG_RECODE;
10089
10090  if (newlen)
10091   uv = SvUTF8(sv)
10092    ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10093    : *(U8*)s;
10094
10095  if (!newlen || numlen != newlen) {
10096   uv = UNICODE_REPLACEMENT;
10097   *encp = NULL;
10098  }
10099  return uv;
10100 }
10101
10102 PERL_STATIC_INLINE U8
10103 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10104 {
10105  U8 op;
10106
10107  PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10108
10109  if (! FOLD) {
10110   return EXACT;
10111  }
10112
10113  op = get_regex_charset(RExC_flags);
10114  if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10115   op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10116     been, so there is no hole */
10117  }
10118
10119  return op + EXACTF;
10120 }
10121
10122 PERL_STATIC_INLINE void
10123 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10124 {
10125  /* This knows the details about sizing an EXACTish node, setting flags for
10126  * it (by setting <*flagp>, and potentially populating it with a single
10127  * character.
10128  *
10129  * If <len> (the length in bytes) is non-zero, this function assumes that
10130  * the node has already been populated, and just does the sizing.  In this
10131  * case <code_point> should be the final code point that has already been
10132  * placed into the node.  This value will be ignored except that under some
10133  * circumstances <*flagp> is set based on it.
10134  *
10135  * If <len> is zero, the function assumes that the node is to contain only
10136  * the single character given by <code_point> and calculates what <len>
10137  * should be.  In pass 1, it sizes the node appropriately.  In pass 2, it
10138  * additionally will populate the node's STRING with <code_point>, if <len>
10139  * is 0.  In both cases <*flagp> is appropriately set
10140  *
10141  * It knows that under FOLD, the Latin Sharp S and UTF characters above
10142  * 255, must be folded (the former only when the rules indicate it can
10143  * match 'ss') */
10144
10145  bool len_passed_in = cBOOL(len != 0);
10146  U8 character[UTF8_MAXBYTES_CASE+1];
10147
10148  PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10149
10150  if (! len_passed_in) {
10151   if (UTF) {
10152    if (FOLD && (! LOC || code_point > 255)) {
10153     _to_uni_fold_flags(NATIVE_TO_UNI(code_point),
10154         character,
10155         &len,
10156         FOLD_FLAGS_FULL | ((LOC)
10157              ? FOLD_FLAGS_LOCALE
10158              : (ASCII_FOLD_RESTRICTED)
10159              ? FOLD_FLAGS_NOMIX_ASCII
10160              : 0));
10161    }
10162    else {
10163     uvchr_to_utf8( character, code_point);
10164     len = UTF8SKIP(character);
10165    }
10166   }
10167   else if (! FOLD
10168     || code_point != LATIN_SMALL_LETTER_SHARP_S
10169     || ASCII_FOLD_RESTRICTED
10170     || ! AT_LEAST_UNI_SEMANTICS)
10171   {
10172    *character = (U8) code_point;
10173    len = 1;
10174   }
10175   else {
10176    *character = 's';
10177    *(character + 1) = 's';
10178    len = 2;
10179   }
10180  }
10181
10182  if (SIZE_ONLY) {
10183   RExC_size += STR_SZ(len);
10184  }
10185  else {
10186   RExC_emit += STR_SZ(len);
10187   STR_LEN(node) = len;
10188   if (! len_passed_in) {
10189    Copy((char *) character, STRING(node), len, char);
10190   }
10191  }
10192
10193  *flagp |= HASWIDTH;
10194
10195  /* A single character node is SIMPLE, except for the special-cased SHARP S
10196  * under /di. */
10197  if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10198   && (code_point != LATIN_SMALL_LETTER_SHARP_S
10199    || ! FOLD || ! DEPENDS_SEMANTICS))
10200  {
10201   *flagp |= SIMPLE;
10202  }
10203 }
10204
10205 /*
10206  - regatom - the lowest level
10207
10208    Try to identify anything special at the start of the pattern. If there
10209    is, then handle it as required. This may involve generating a single regop,
10210    such as for an assertion; or it may involve recursing, such as to
10211    handle a () structure.
10212
10213    If the string doesn't start with something special then we gobble up
10214    as much literal text as we can.
10215
10216    Once we have been able to handle whatever type of thing started the
10217    sequence, we return.
10218
10219    Note: we have to be careful with escapes, as they can be both literal
10220    and special, and in the case of \10 and friends, context determines which.
10221
10222    A summary of the code structure is:
10223
10224    switch (first_byte) {
10225   cases for each special:
10226    handle this special;
10227    break;
10228   case '\\':
10229    switch (2nd byte) {
10230     cases for each unambiguous special:
10231      handle this special;
10232      break;
10233     cases for each ambigous special/literal:
10234      disambiguate;
10235      if (special)  handle here
10236      else goto defchar;
10237     default: // unambiguously literal:
10238      goto defchar;
10239    }
10240   default:  // is a literal char
10241    // FALL THROUGH
10242   defchar:
10243    create EXACTish node for literal;
10244    while (more input and node isn't full) {
10245     switch (input_byte) {
10246     cases for each special;
10247      make sure parse pointer is set so that the next call to
10248       regatom will see this special first
10249      goto loopdone; // EXACTish node terminated by prev. char
10250     default:
10251      append char to EXACTISH node;
10252     }
10253     get next input byte;
10254    }
10255   loopdone:
10256    }
10257    return the generated node;
10258
10259    Specifically there are two separate switches for handling
10260    escape sequences, with the one for handling literal escapes requiring
10261    a dummy entry for all of the special escapes that are actually handled
10262    by the other.
10263
10264    Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10265    TRYAGAIN.
10266    Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10267    restarted.
10268    Otherwise does not return NULL.
10269 */
10270
10271 STATIC regnode *
10272 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10273 {
10274  dVAR;
10275  regnode *ret = NULL;
10276  I32 flags = 0;
10277  char *parse_start = RExC_parse;
10278  U8 op;
10279  int invert = 0;
10280
10281  GET_RE_DEBUG_FLAGS_DECL;
10282
10283  *flagp = WORST;  /* Tentatively. */
10284
10285  DEBUG_PARSE("atom");
10286
10287  PERL_ARGS_ASSERT_REGATOM;
10288
10289 tryagain:
10290  switch ((U8)*RExC_parse) {
10291  case '^':
10292   RExC_seen_zerolen++;
10293   nextchar(pRExC_state);
10294   if (RExC_flags & RXf_PMf_MULTILINE)
10295    ret = reg_node(pRExC_state, MBOL);
10296   else if (RExC_flags & RXf_PMf_SINGLELINE)
10297    ret = reg_node(pRExC_state, SBOL);
10298   else
10299    ret = reg_node(pRExC_state, BOL);
10300   Set_Node_Length(ret, 1); /* MJD */
10301   break;
10302  case '$':
10303   nextchar(pRExC_state);
10304   if (*RExC_parse)
10305    RExC_seen_zerolen++;
10306   if (RExC_flags & RXf_PMf_MULTILINE)
10307    ret = reg_node(pRExC_state, MEOL);
10308   else if (RExC_flags & RXf_PMf_SINGLELINE)
10309    ret = reg_node(pRExC_state, SEOL);
10310   else
10311    ret = reg_node(pRExC_state, EOL);
10312   Set_Node_Length(ret, 1); /* MJD */
10313   break;
10314  case '.':
10315   nextchar(pRExC_state);
10316   if (RExC_flags & RXf_PMf_SINGLELINE)
10317    ret = reg_node(pRExC_state, SANY);
10318   else
10319    ret = reg_node(pRExC_state, REG_ANY);
10320   *flagp |= HASWIDTH|SIMPLE;
10321   RExC_naughty++;
10322   Set_Node_Length(ret, 1); /* MJD */
10323   break;
10324  case '[':
10325  {
10326   char * const oregcomp_parse = ++RExC_parse;
10327   ret = regclass(pRExC_state, flagp,depth+1,
10328      FALSE, /* means parse the whole char class */
10329      TRUE, /* allow multi-char folds */
10330      FALSE, /* don't silence non-portable warnings. */
10331      NULL);
10332   if (*RExC_parse != ']') {
10333    RExC_parse = oregcomp_parse;
10334    vFAIL("Unmatched [");
10335   }
10336   if (ret == NULL) {
10337    if (*flagp & RESTART_UTF8)
10338     return NULL;
10339    FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
10340     *flagp);
10341   }
10342   nextchar(pRExC_state);
10343   Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10344   break;
10345  }
10346  case '(':
10347   nextchar(pRExC_state);
10348   ret = reg(pRExC_state, 2, &flags,depth+1);
10349   if (ret == NULL) {
10350     if (flags & TRYAGAIN) {
10351      if (RExC_parse == RExC_end) {
10352       /* Make parent create an empty node if needed. */
10353       *flagp |= TRYAGAIN;
10354       return(NULL);
10355      }
10356      goto tryagain;
10357     }
10358     if (flags & RESTART_UTF8) {
10359      *flagp = RESTART_UTF8;
10360      return NULL;
10361     }
10362     FAIL2("panic: reg returned NULL to regatom, flags=%#X", flags);
10363   }
10364   *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10365   break;
10366  case '|':
10367  case ')':
10368   if (flags & TRYAGAIN) {
10369    *flagp |= TRYAGAIN;
10370    return NULL;
10371   }
10372   vFAIL("Internal urp");
10373         /* Supposed to be caught earlier. */
10374   break;
10375  case '{':
10376   if (!regcurly(RExC_parse, FALSE)) {
10377    RExC_parse++;
10378    goto defchar;
10379   }
10380   /* FALL THROUGH */
10381  case '?':
10382  case '+':
10383  case '*':
10384   RExC_parse++;
10385   vFAIL("Quantifier follows nothing");
10386   break;
10387  case '\\':
10388   /* Special Escapes
10389
10390   This switch handles escape sequences that resolve to some kind
10391   of special regop and not to literal text. Escape sequnces that
10392   resolve to literal text are handled below in the switch marked
10393   "Literal Escapes".
10394
10395   Every entry in this switch *must* have a corresponding entry
10396   in the literal escape switch. However, the opposite is not
10397   required, as the default for this switch is to jump to the
10398   literal text handling code.
10399   */
10400   switch ((U8)*++RExC_parse) {
10401    U8 arg;
10402   /* Special Escapes */
10403   case 'A':
10404    RExC_seen_zerolen++;
10405    ret = reg_node(pRExC_state, SBOL);
10406    *flagp |= SIMPLE;
10407    goto finish_meta_pat;
10408   case 'G':
10409    ret = reg_node(pRExC_state, GPOS);
10410    RExC_seen |= REG_SEEN_GPOS;
10411    *flagp |= SIMPLE;
10412    goto finish_meta_pat;
10413   case 'K':
10414    RExC_seen_zerolen++;
10415    ret = reg_node(pRExC_state, KEEPS);
10416    *flagp |= SIMPLE;
10417    /* XXX:dmq : disabling in-place substitution seems to
10418    * be necessary here to avoid cases of memory corruption, as
10419    * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10420    */
10421    RExC_seen |= REG_SEEN_LOOKBEHIND;
10422    goto finish_meta_pat;
10423   case 'Z':
10424    ret = reg_node(pRExC_state, SEOL);
10425    *flagp |= SIMPLE;
10426    RExC_seen_zerolen++;  /* Do not optimize RE away */
10427    goto finish_meta_pat;
10428   case 'z':
10429    ret = reg_node(pRExC_state, EOS);
10430    *flagp |= SIMPLE;
10431    RExC_seen_zerolen++;  /* Do not optimize RE away */
10432    goto finish_meta_pat;
10433   case 'C':
10434    ret = reg_node(pRExC_state, CANY);
10435    RExC_seen |= REG_SEEN_CANY;
10436    *flagp |= HASWIDTH|SIMPLE;
10437    goto finish_meta_pat;
10438   case 'X':
10439    ret = reg_node(pRExC_state, CLUMP);
10440    *flagp |= HASWIDTH;
10441    goto finish_meta_pat;
10442
10443   case 'W':
10444    invert = 1;
10445    /* FALLTHROUGH */
10446   case 'w':
10447    arg = ANYOF_WORDCHAR;
10448    goto join_posix;
10449
10450   case 'b':
10451    RExC_seen_zerolen++;
10452    RExC_seen |= REG_SEEN_LOOKBEHIND;
10453    op = BOUND + get_regex_charset(RExC_flags);
10454    if (op > BOUNDA) {  /* /aa is same as /a */
10455     op = BOUNDA;
10456    }
10457    ret = reg_node(pRExC_state, op);
10458    FLAGS(ret) = get_regex_charset(RExC_flags);
10459    *flagp |= SIMPLE;
10460    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10461     ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10462    }
10463    goto finish_meta_pat;
10464   case 'B':
10465    RExC_seen_zerolen++;
10466    RExC_seen |= REG_SEEN_LOOKBEHIND;
10467    op = NBOUND + get_regex_charset(RExC_flags);
10468    if (op > NBOUNDA) { /* /aa is same as /a */
10469     op = NBOUNDA;
10470    }
10471    ret = reg_node(pRExC_state, op);
10472    FLAGS(ret) = get_regex_charset(RExC_flags);
10473    *flagp |= SIMPLE;
10474    if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10475     ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10476    }
10477    goto finish_meta_pat;
10478
10479   case 'D':
10480    invert = 1;
10481    /* FALLTHROUGH */
10482   case 'd':
10483    arg = ANYOF_DIGIT;
10484    goto join_posix;
10485
10486   case 'R':
10487    ret = reg_node(pRExC_state, LNBREAK);
10488    *flagp |= HASWIDTH|SIMPLE;
10489    goto finish_meta_pat;
10490
10491   case 'H':
10492    invert = 1;
10493    /* FALLTHROUGH */
10494   case 'h':
10495    arg = ANYOF_BLANK;
10496    op = POSIXU;
10497    goto join_posix_op_known;
10498
10499   case 'V':
10500    invert = 1;
10501    /* FALLTHROUGH */
10502   case 'v':
10503    arg = ANYOF_VERTWS;
10504    op = POSIXU;
10505    goto join_posix_op_known;
10506
10507   case 'S':
10508    invert = 1;
10509    /* FALLTHROUGH */
10510   case 's':
10511    arg = ANYOF_SPACE;
10512
10513   join_posix:
10514
10515    op = POSIXD + get_regex_charset(RExC_flags);
10516    if (op > POSIXA) {  /* /aa is same as /a */
10517     op = POSIXA;
10518    }
10519
10520   join_posix_op_known:
10521
10522    if (invert) {
10523     op += NPOSIXD - POSIXD;
10524    }
10525
10526    ret = reg_node(pRExC_state, op);
10527    if (! SIZE_ONLY) {
10528     FLAGS(ret) = namedclass_to_classnum(arg);
10529    }
10530
10531    *flagp |= HASWIDTH|SIMPLE;
10532    /* FALL THROUGH */
10533
10534   finish_meta_pat:
10535    nextchar(pRExC_state);
10536    Set_Node_Length(ret, 2); /* MJD */
10537    break;
10538   case 'p':
10539   case 'P':
10540    {
10541 #ifdef DEBUGGING
10542     char* parse_start = RExC_parse - 2;
10543 #endif
10544
10545     RExC_parse--;
10546
10547     ret = regclass(pRExC_state, flagp,depth+1,
10548        TRUE, /* means just parse this element */
10549        FALSE, /* don't allow multi-char folds */
10550        FALSE, /* don't silence non-portable warnings.
10551           It would be a bug if these returned
10552           non-portables */
10553        NULL);
10554     /* regclass() can only return RESTART_UTF8 if multi-char folds
10555     are allowed.  */
10556     if (!ret)
10557      FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
10558       *flagp);
10559
10560     RExC_parse--;
10561
10562     Set_Node_Offset(ret, parse_start + 2);
10563     Set_Node_Cur_Length(ret);
10564     nextchar(pRExC_state);
10565    }
10566    break;
10567   case 'N':
10568    /* Handle \N and \N{NAME} with multiple code points here and not
10569    * below because it can be multicharacter. join_exact() will join
10570    * them up later on.  Also this makes sure that things like
10571    * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10572    * The options to the grok function call causes it to fail if the
10573    * sequence is just a single code point.  We then go treat it as
10574    * just another character in the current EXACT node, and hence it
10575    * gets uniform treatment with all the other characters.  The
10576    * special treatment for quantifiers is not needed for such single
10577    * character sequences */
10578    ++RExC_parse;
10579    if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10580         FALSE /* not strict */ )) {
10581     if (*flagp & RESTART_UTF8)
10582      return NULL;
10583     RExC_parse--;
10584     goto defchar;
10585    }
10586    break;
10587   case 'k':    /* Handle \k<NAME> and \k'NAME' */
10588   parse_named_seq:
10589   {
10590    char ch= RExC_parse[1];
10591    if (ch != '<' && ch != '\'' && ch != '{') {
10592     RExC_parse++;
10593     vFAIL2("Sequence %.2s... not terminated",parse_start);
10594    } else {
10595     /* this pretty much dupes the code for (?P=...) in reg(), if
10596     you change this make sure you change that */
10597     char* name_start = (RExC_parse += 2);
10598     U32 num = 0;
10599     SV *sv_dat = reg_scan_name(pRExC_state,
10600      SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10601     ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10602     if (RExC_parse == name_start || *RExC_parse != ch)
10603      vFAIL2("Sequence %.3s... not terminated",parse_start);
10604
10605     if (!SIZE_ONLY) {
10606      num = add_data( pRExC_state, 1, "S" );
10607      RExC_rxi->data->data[num]=(void*)sv_dat;
10608      SvREFCNT_inc_simple_void(sv_dat);
10609     }
10610
10611     RExC_sawback = 1;
10612     ret = reganode(pRExC_state,
10613        ((! FOLD)
10614         ? NREF
10615         : (ASCII_FOLD_RESTRICTED)
10616         ? NREFFA
10617         : (AT_LEAST_UNI_SEMANTICS)
10618          ? NREFFU
10619          : (LOC)
10620          ? NREFFL
10621          : NREFF),
10622         num);
10623     *flagp |= HASWIDTH;
10624
10625     /* override incorrect value set in reganode MJD */
10626     Set_Node_Offset(ret, parse_start+1);
10627     Set_Node_Cur_Length(ret); /* MJD */
10628     nextchar(pRExC_state);
10629
10630    }
10631    break;
10632   }
10633   case 'g':
10634   case '1': case '2': case '3': case '4':
10635   case '5': case '6': case '7': case '8': case '9':
10636    {
10637     I32 num;
10638     bool isg = *RExC_parse == 'g';
10639     bool isrel = 0;
10640     bool hasbrace = 0;
10641     if (isg) {
10642      RExC_parse++;
10643      if (*RExC_parse == '{') {
10644       RExC_parse++;
10645       hasbrace = 1;
10646      }
10647      if (*RExC_parse == '-') {
10648       RExC_parse++;
10649       isrel = 1;
10650      }
10651      if (hasbrace && !isDIGIT(*RExC_parse)) {
10652       if (isrel) RExC_parse--;
10653       RExC_parse -= 2;
10654       goto parse_named_seq;
10655     }   }
10656     num = atoi(RExC_parse);
10657     if (isg && num == 0)
10658      vFAIL("Reference to invalid group 0");
10659     if (isrel) {
10660      num = RExC_npar - num;
10661      if (num < 1)
10662       vFAIL("Reference to nonexistent or unclosed group");
10663     }
10664     if (!isg && num > 9 && num >= RExC_npar)
10665      /* Probably a character specified in octal, e.g. \35 */
10666      goto defchar;
10667     else {
10668      char * const parse_start = RExC_parse - 1; /* MJD */
10669      while (isDIGIT(*RExC_parse))
10670       RExC_parse++;
10671      if (parse_start == RExC_parse - 1)
10672       vFAIL("Unterminated \\g... pattern");
10673      if (hasbrace) {
10674       if (*RExC_parse != '}')
10675        vFAIL("Unterminated \\g{...} pattern");
10676       RExC_parse++;
10677      }
10678      if (!SIZE_ONLY) {
10679       if (num > (I32)RExC_rx->nparens)
10680        vFAIL("Reference to nonexistent group");
10681      }
10682      RExC_sawback = 1;
10683      ret = reganode(pRExC_state,
10684         ((! FOLD)
10685          ? REF
10686          : (ASCII_FOLD_RESTRICTED)
10687          ? REFFA
10688          : (AT_LEAST_UNI_SEMANTICS)
10689           ? REFFU
10690           : (LOC)
10691           ? REFFL
10692           : REFF),
10693          num);
10694      *flagp |= HASWIDTH;
10695
10696      /* override incorrect value set in reganode MJD */
10697      Set_Node_Offset(ret, parse_start+1);
10698      Set_Node_Cur_Length(ret); /* MJD */
10699      RExC_parse--;
10700      nextchar(pRExC_state);
10701     }
10702    }
10703    break;
10704   case '\0':
10705    if (RExC_parse >= RExC_end)
10706     FAIL("Trailing \\");
10707    /* FALL THROUGH */
10708   default:
10709    /* Do not generate "unrecognized" warnings here, we fall
10710    back into the quick-grab loop below */
10711    parse_start--;
10712    goto defchar;
10713   }
10714   break;
10715
10716  case '#':
10717   if (RExC_flags & RXf_PMf_EXTENDED) {
10718    if ( reg_skipcomment( pRExC_state ) )
10719     goto tryagain;
10720   }
10721   /* FALL THROUGH */
10722
10723  default:
10724
10725    parse_start = RExC_parse - 1;
10726
10727    RExC_parse++;
10728
10729   defchar: {
10730    STRLEN len = 0;
10731    UV ender;
10732    char *p;
10733    char *s;
10734 #define MAX_NODE_STRING_SIZE 127
10735    char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10736    char *s0;
10737    U8 upper_parse = MAX_NODE_STRING_SIZE;
10738    STRLEN foldlen;
10739    U8 node_type;
10740    bool next_is_quantifier;
10741    char * oldp = NULL;
10742
10743    /* If a folding node contains only code points that don't
10744    * participate in folds, it can be changed into an EXACT node,
10745    * which allows the optimizer more things to look for */
10746    bool maybe_exact;
10747
10748    ender = 0;
10749    node_type = compute_EXACTish(pRExC_state);
10750    ret = reg_node(pRExC_state, node_type);
10751
10752    /* In pass1, folded, we use a temporary buffer instead of the
10753    * actual node, as the node doesn't exist yet */
10754    s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10755
10756    s0 = s;
10757
10758   reparse:
10759
10760    /* We do the EXACTFish to EXACT node only if folding, and not if in
10761    * locale, as whether a character folds or not isn't known until
10762    * runtime */
10763    maybe_exact = FOLD && ! LOC;
10764
10765    /* XXX The node can hold up to 255 bytes, yet this only goes to
10766    * 127.  I (khw) do not know why.  Keeping it somewhat less than
10767    * 255 allows us to not have to worry about overflow due to
10768    * converting to utf8 and fold expansion, but that value is
10769    * 255-UTF8_MAXBYTES_CASE.  join_exact() may join adjacent nodes
10770    * split up by this limit into a single one using the real max of
10771    * 255.  Even at 127, this breaks under rare circumstances.  If
10772    * folding, we do not want to split a node at a character that is a
10773    * non-final in a multi-char fold, as an input string could just
10774    * happen to want to match across the node boundary.  The join
10775    * would solve that problem if the join actually happens.  But a
10776    * series of more than two nodes in a row each of 127 would cause
10777    * the first join to succeed to get to 254, but then there wouldn't
10778    * be room for the next one, which could at be one of those split
10779    * multi-char folds.  I don't know of any fool-proof solution.  One
10780    * could back off to end with only a code point that isn't such a
10781    * non-final, but it is possible for there not to be any in the
10782    * entire node. */
10783    for (p = RExC_parse - 1;
10784     len < upper_parse && p < RExC_end;
10785     len++)
10786    {
10787     oldp = p;
10788
10789     if (RExC_flags & RXf_PMf_EXTENDED)
10790      p = regwhite( pRExC_state, p );
10791     switch ((U8)*p) {
10792     case '^':
10793     case '$':
10794     case '.':
10795     case '[':
10796     case '(':
10797     case ')':
10798     case '|':
10799      goto loopdone;
10800     case '\\':
10801      /* Literal Escapes Switch
10802
10803      This switch is meant to handle escape sequences that
10804      resolve to a literal character.
10805
10806      Every escape sequence that represents something
10807      else, like an assertion or a char class, is handled
10808      in the switch marked 'Special Escapes' above in this
10809      routine, but also has an entry here as anything that
10810      isn't explicitly mentioned here will be treated as
10811      an unescaped equivalent literal.
10812      */
10813
10814      switch ((U8)*++p) {
10815      /* These are all the special escapes. */
10816      case 'A':             /* Start assertion */
10817      case 'b': case 'B':   /* Word-boundary assertion*/
10818      case 'C':             /* Single char !DANGEROUS! */
10819      case 'd': case 'D':   /* digit class */
10820      case 'g': case 'G':   /* generic-backref, pos assertion */
10821      case 'h': case 'H':   /* HORIZWS */
10822      case 'k': case 'K':   /* named backref, keep marker */
10823      case 'p': case 'P':   /* Unicode property */
10824        case 'R':   /* LNBREAK */
10825      case 's': case 'S':   /* space class */
10826      case 'v': case 'V':   /* VERTWS */
10827      case 'w': case 'W':   /* word class */
10828      case 'X':             /* eXtended Unicode "combining character sequence" */
10829      case 'z': case 'Z':   /* End of line/string assertion */
10830       --p;
10831       goto loopdone;
10832
10833      /* Anything after here is an escape that resolves to a
10834      literal. (Except digits, which may or may not)
10835      */
10836      case 'n':
10837       ender = '\n';
10838       p++;
10839       break;
10840      case 'N': /* Handle a single-code point named character. */
10841       /* The options cause it to fail if a multiple code
10842       * point sequence.  Handle those in the switch() above
10843       * */
10844       RExC_parse = p + 1;
10845       if (! grok_bslash_N(pRExC_state, NULL, &ender,
10846            flagp, depth, FALSE,
10847            FALSE /* not strict */ ))
10848       {
10849        if (*flagp & RESTART_UTF8)
10850         FAIL("panic: grok_bslash_N set RESTART_UTF8");
10851        RExC_parse = p = oldp;
10852        goto loopdone;
10853       }
10854       p = RExC_parse;
10855       if (ender > 0xff) {
10856        REQUIRE_UTF8;
10857       }
10858       break;
10859      case 'r':
10860       ender = '\r';
10861       p++;
10862       break;
10863      case 't':
10864       ender = '\t';
10865       p++;
10866       break;
10867      case 'f':
10868       ender = '\f';
10869       p++;
10870       break;
10871      case 'e':
10872       ender = ASCII_TO_NATIVE('\033');
10873       p++;
10874       break;
10875      case 'a':
10876       ender = ASCII_TO_NATIVE('\007');
10877       p++;
10878       break;
10879      case 'o':
10880       {
10881        UV result;
10882        const char* error_msg;
10883
10884        bool valid = grok_bslash_o(&p,
10885              &result,
10886              &error_msg,
10887              TRUE, /* out warnings */
10888              FALSE, /* not strict */
10889              TRUE, /* Output warnings
10890                 for non-
10891                 portables */
10892              UTF);
10893        if (! valid) {
10894         RExC_parse = p; /* going to die anyway; point
10895             to exact spot of failure */
10896         vFAIL(error_msg);
10897        }
10898        ender = result;
10899        if (PL_encoding && ender < 0x100) {
10900         goto recode_encoding;
10901        }
10902        if (ender > 0xff) {
10903         REQUIRE_UTF8;
10904        }
10905        break;
10906       }
10907      case 'x':
10908       {
10909        UV result = UV_MAX; /* initialize to erroneous
10910             value */
10911        const char* error_msg;
10912
10913        bool valid = grok_bslash_x(&p,
10914              &result,
10915              &error_msg,
10916              TRUE, /* out warnings */
10917              FALSE, /* not strict */
10918              TRUE, /* Output warnings
10919                 for non-
10920                 portables */
10921              UTF);
10922        if (! valid) {
10923         RExC_parse = p; /* going to die anyway; point
10924             to exact spot of failure */
10925         vFAIL(error_msg);
10926        }
10927        ender = result;
10928
10929        if (PL_encoding && ender < 0x100) {
10930         goto recode_encoding;
10931        }
10932        if (ender > 0xff) {
10933         REQUIRE_UTF8;
10934        }
10935        break;
10936       }
10937      case 'c':
10938       p++;
10939       ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10940       break;
10941      case '0': case '1': case '2': case '3':case '4':
10942      case '5': case '6': case '7':
10943       if (*p == '0' ||
10944        (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10945       {
10946        I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10947        STRLEN numlen = 3;
10948        ender = grok_oct(p, &numlen, &flags, NULL);
10949        if (ender > 0xff) {
10950         REQUIRE_UTF8;
10951        }
10952        p += numlen;
10953        if (SIZE_ONLY   /* like \08, \178 */
10954         && numlen < 3
10955         && p < RExC_end
10956         && isDIGIT(*p) && ckWARN(WARN_REGEXP))
10957        {
10958         reg_warn_non_literal_string(
10959           p + 1,
10960           form_short_octal_warning(p, numlen));
10961        }
10962       }
10963       else {  /* Not to be treated as an octal constant, go
10964         find backref */
10965        --p;
10966        goto loopdone;
10967       }
10968       if (PL_encoding && ender < 0x100)
10969        goto recode_encoding;
10970       break;
10971      case '8': case '9': /* These are illegal unless backrefs */
10972       if (atoi(p) <= RExC_npar) {
10973        --p;   /* backup to backslash; handle as backref */
10974        goto loopdone;
10975       }
10976       goto unrecognized;
10977      recode_encoding:
10978       if (! RExC_override_recoding) {
10979        SV* enc = PL_encoding;
10980        ender = reg_recode((const char)(U8)ender, &enc);
10981        if (!enc && SIZE_ONLY)
10982         ckWARNreg(p, "Invalid escape in the specified encoding");
10983        REQUIRE_UTF8;
10984       }
10985       break;
10986      case '\0':
10987       if (p >= RExC_end)
10988        FAIL("Trailing \\");
10989       /* FALL THROUGH */
10990      default:
10991      unrecognized:
10992       if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
10993        /* Include any { following the alpha to emphasize
10994        * that it could be part of an escape at some point
10995        * in the future */
10996        int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
10997        ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
10998       }
10999       goto normal_default;
11000      } /* End of switch on '\' */
11001      break;
11002     default:    /* A literal character */
11003
11004      if (! SIZE_ONLY
11005       && RExC_flags & RXf_PMf_EXTENDED
11006       && ckWARN(WARN_DEPRECATED)
11007       && is_PATWS_non_low(p, UTF))
11008      {
11009       vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11010         "Escape literal pattern white space under /x");
11011      }
11012
11013     normal_default:
11014      if (UTF8_IS_START(*p) && UTF) {
11015       STRLEN numlen;
11016       ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11017            &numlen, UTF8_ALLOW_DEFAULT);
11018       p += numlen;
11019      }
11020      else
11021       ender = (U8) *p++;
11022      break;
11023     } /* End of switch on the literal */
11024
11025     /* Here, have looked at the literal character and <ender>
11026     * contains its ordinal, <p> points to the character after it
11027     */
11028
11029     if ( RExC_flags & RXf_PMf_EXTENDED)
11030      p = regwhite( pRExC_state, p );
11031
11032     /* If the next thing is a quantifier, it applies to this
11033     * character only, which means that this character has to be in
11034     * its own node and can't just be appended to the string in an
11035     * existing node, so if there are already other characters in
11036     * the node, close the node with just them, and set up to do
11037     * this character again next time through, when it will be the
11038     * only thing in its new node */
11039     if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11040     {
11041      p = oldp;
11042      goto loopdone;
11043     }
11044
11045     if (FOLD) {
11046      if (UTF
11047        /* See comments for join_exact() as to why we fold
11048        * this non-UTF at compile time */
11049       || (node_type == EXACTFU
11050        && ender == LATIN_SMALL_LETTER_SHARP_S))
11051      {
11052
11053
11054       /* Prime the casefolded buffer.  Locale rules, which
11055       * apply only to code points < 256, aren't known until
11056       * execution, so for them, just output the original
11057       * character using utf8.  If we start to fold non-UTF
11058       * patterns, be sure to update join_exact() */
11059       if (LOC && ender < 256) {
11060        if (UNI_IS_INVARIANT(ender)) {
11061         *s = (U8) ender;
11062         foldlen = 1;
11063        } else {
11064         *s = UTF8_TWO_BYTE_HI(ender);
11065         *(s + 1) = UTF8_TWO_BYTE_LO(ender);
11066         foldlen = 2;
11067        }
11068       }
11069       else {
11070        UV folded = _to_uni_fold_flags(
11071           ender,
11072           (U8 *) s,
11073           &foldlen,
11074           FOLD_FLAGS_FULL
11075           | ((LOC) ?  FOLD_FLAGS_LOCALE
11076              : (ASCII_FOLD_RESTRICTED)
11077              ? FOLD_FLAGS_NOMIX_ASCII
11078              : 0)
11079            );
11080
11081        /* If this node only contains non-folding code
11082        * points so far, see if this new one is also
11083        * non-folding */
11084        if (maybe_exact) {
11085         if (folded != ender) {
11086          maybe_exact = FALSE;
11087         }
11088         else {
11089          /* Here the fold is the original; we have
11090          * to check further to see if anything
11091          * folds to it */
11092          if (! PL_utf8_foldable) {
11093           SV* swash = swash_init("utf8",
11094               "_Perl_Any_Folds",
11095               &PL_sv_undef, 1, 0);
11096           PL_utf8_foldable =
11097              _get_swash_invlist(swash);
11098           SvREFCNT_dec_NN(swash);
11099          }
11100          if (_invlist_contains_cp(PL_utf8_foldable,
11101                ender))
11102          {
11103           maybe_exact = FALSE;
11104          }
11105         }
11106        }
11107        ender = folded;
11108       }
11109       s += foldlen;
11110
11111       /* The loop increments <len> each time, as all but this
11112       * path (and the one just below for UTF) through it add
11113       * a single byte to the EXACTish node.  But this one
11114       * has changed len to be the correct final value, so
11115       * subtract one to cancel out the increment that
11116       * follows */
11117       len += foldlen - 1;
11118      }
11119      else {
11120       *(s++) = (char) ender;
11121       maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
11122      }
11123     }
11124     else if (UTF) {
11125      const STRLEN unilen = reguni(pRExC_state, ender, s);
11126      if (unilen > 0) {
11127      s   += unilen;
11128      len += unilen;
11129      }
11130
11131      /* See comment just above for - 1 */
11132      len--;
11133     }
11134     else {
11135      REGC((char)ender, s++);
11136     }
11137
11138     if (next_is_quantifier) {
11139
11140      /* Here, the next input is a quantifier, and to get here,
11141      * the current character is the only one in the node.
11142      * Also, here <len> doesn't include the final byte for this
11143      * character */
11144      len++;
11145      goto loopdone;
11146     }
11147
11148    } /* End of loop through literal characters */
11149
11150    /* Here we have either exhausted the input or ran out of room in
11151    * the node.  (If we encountered a character that can't be in the
11152    * node, transfer is made directly to <loopdone>, and so we
11153    * wouldn't have fallen off the end of the loop.)  In the latter
11154    * case, we artificially have to split the node into two, because
11155    * we just don't have enough space to hold everything.  This
11156    * creates a problem if the final character participates in a
11157    * multi-character fold in the non-final position, as a match that
11158    * should have occurred won't, due to the way nodes are matched,
11159    * and our artificial boundary.  So back off until we find a non-
11160    * problematic character -- one that isn't at the beginning or
11161    * middle of such a fold.  (Either it doesn't participate in any
11162    * folds, or appears only in the final position of all the folds it
11163    * does participate in.)  A better solution with far fewer false
11164    * positives, and that would fill the nodes more completely, would
11165    * be to actually have available all the multi-character folds to
11166    * test against, and to back-off only far enough to be sure that
11167    * this node isn't ending with a partial one.  <upper_parse> is set
11168    * further below (if we need to reparse the node) to include just
11169    * up through that final non-problematic character that this code
11170    * identifies, so when it is set to less than the full node, we can
11171    * skip the rest of this */
11172    if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11173
11174     const STRLEN full_len = len;
11175
11176     assert(len >= MAX_NODE_STRING_SIZE);
11177
11178     /* Here, <s> points to the final byte of the final character.
11179     * Look backwards through the string until find a non-
11180     * problematic character */
11181
11182     if (! UTF) {
11183
11184      /* These two have no multi-char folds to non-UTF characters
11185      */
11186      if (ASCII_FOLD_RESTRICTED || LOC) {
11187       goto loopdone;
11188      }
11189
11190      while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11191      len = s - s0 + 1;
11192     }
11193     else {
11194      if (!  PL_NonL1NonFinalFold) {
11195       PL_NonL1NonFinalFold = _new_invlist_C_array(
11196           NonL1_Perl_Non_Final_Folds_invlist);
11197      }
11198
11199      /* Point to the first byte of the final character */
11200      s = (char *) utf8_hop((U8 *) s, -1);
11201
11202      while (s >= s0) {   /* Search backwards until find
11203           non-problematic char */
11204       if (UTF8_IS_INVARIANT(*s)) {
11205
11206        /* There are no ascii characters that participate
11207        * in multi-char folds under /aa.  In EBCDIC, the
11208        * non-ascii invariants are all control characters,
11209        * so don't ever participate in any folds. */
11210        if (ASCII_FOLD_RESTRICTED
11211         || ! IS_NON_FINAL_FOLD(*s))
11212        {
11213         break;
11214        }
11215       }
11216       else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11217
11218        /* No Latin1 characters participate in multi-char
11219        * folds under /l */
11220        if (LOC
11221         || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
11222                 *s, *(s+1))))
11223        {
11224         break;
11225        }
11226       }
11227       else if (! _invlist_contains_cp(
11228           PL_NonL1NonFinalFold,
11229           valid_utf8_to_uvchr((U8 *) s, NULL)))
11230       {
11231        break;
11232       }
11233
11234       /* Here, the current character is problematic in that
11235       * it does occur in the non-final position of some
11236       * fold, so try the character before it, but have to
11237       * special case the very first byte in the string, so
11238       * we don't read outside the string */
11239       s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11240      } /* End of loop backwards through the string */
11241
11242      /* If there were only problematic characters in the string,
11243      * <s> will point to before s0, in which case the length
11244      * should be 0, otherwise include the length of the
11245      * non-problematic character just found */
11246      len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11247     }
11248
11249     /* Here, have found the final character, if any, that is
11250     * non-problematic as far as ending the node without splitting
11251     * it across a potential multi-char fold.  <len> contains the
11252     * number of bytes in the node up-to and including that
11253     * character, or is 0 if there is no such character, meaning
11254     * the whole node contains only problematic characters.  In
11255     * this case, give up and just take the node as-is.  We can't
11256     * do any better */
11257     if (len == 0) {
11258      len = full_len;
11259     } else {
11260
11261      /* Here, the node does contain some characters that aren't
11262      * problematic.  If one such is the final character in the
11263      * node, we are done */
11264      if (len == full_len) {
11265       goto loopdone;
11266      }
11267      else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11268
11269       /* If the final character is problematic, but the
11270       * penultimate is not, back-off that last character to
11271       * later start a new node with it */
11272       p = oldp;
11273       goto loopdone;
11274      }
11275
11276      /* Here, the final non-problematic character is earlier
11277      * in the input than the penultimate character.  What we do
11278      * is reparse from the beginning, going up only as far as
11279      * this final ok one, thus guaranteeing that the node ends
11280      * in an acceptable character.  The reason we reparse is
11281      * that we know how far in the character is, but we don't
11282      * know how to correlate its position with the input parse.
11283      * An alternate implementation would be to build that
11284      * correlation as we go along during the original parse,
11285      * but that would entail extra work for every node, whereas
11286      * this code gets executed only when the string is too
11287      * large for the node, and the final two characters are
11288      * problematic, an infrequent occurrence.  Yet another
11289      * possible strategy would be to save the tail of the
11290      * string, and the next time regatom is called, initialize
11291      * with that.  The problem with this is that unless you
11292      * back off one more character, you won't be guaranteed
11293      * regatom will get called again, unless regbranch,
11294      * regpiece ... are also changed.  If you do back off that
11295      * extra character, so that there is input guaranteed to
11296      * force calling regatom, you can't handle the case where
11297      * just the first character in the node is acceptable.  I
11298      * (khw) decided to try this method which doesn't have that
11299      * pitfall; if performance issues are found, we can do a
11300      * combination of the current approach plus that one */
11301      upper_parse = len;
11302      len = 0;
11303      s = s0;
11304      goto reparse;
11305     }
11306    }   /* End of verifying node ends with an appropriate char */
11307
11308   loopdone:   /* Jumped to when encounters something that shouldn't be in
11309      the node */
11310
11311    /* If 'maybe_exact' is still set here, means there are no
11312    * code points in the node that participate in folds */
11313    if (FOLD && maybe_exact) {
11314     OP(ret) = EXACT;
11315    }
11316
11317    /* I (khw) don't know if you can get here with zero length, but the
11318    * old code handled this situation by creating a zero-length EXACT
11319    * node.  Might as well be NOTHING instead */
11320    if (len == 0) {
11321     OP(ret) = NOTHING;
11322    }
11323    else{
11324     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11325    }
11326
11327    RExC_parse = p - 1;
11328    Set_Node_Cur_Length(ret); /* MJD */
11329    nextchar(pRExC_state);
11330    {
11331     /* len is STRLEN which is unsigned, need to copy to signed */
11332     IV iv = len;
11333     if (iv < 0)
11334      vFAIL("Internal disaster");
11335    }
11336
11337   } /* End of label 'defchar:' */
11338   break;
11339  } /* End of giant switch on input character */
11340
11341  return(ret);
11342 }
11343
11344 STATIC char *
11345 S_regwhite( RExC_state_t *pRExC_state, char *p )
11346 {
11347  const char *e = RExC_end;
11348
11349  PERL_ARGS_ASSERT_REGWHITE;
11350
11351  while (p < e) {
11352   if (isSPACE(*p))
11353    ++p;
11354   else if (*p == '#') {
11355    bool ended = 0;
11356    do {
11357     if (*p++ == '\n') {
11358      ended = 1;
11359      break;
11360     }
11361    } while (p < e);
11362    if (!ended)
11363     RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11364   }
11365   else
11366    break;
11367  }
11368  return p;
11369 }
11370
11371 STATIC char *
11372 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11373 {
11374  /* Returns the next non-pattern-white space, non-comment character (the
11375  * latter only if 'recognize_comment is true) in the string p, which is
11376  * ended by RExC_end.  If there is no line break ending a comment,
11377  * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11378  const char *e = RExC_end;
11379
11380  PERL_ARGS_ASSERT_REGPATWS;
11381
11382  while (p < e) {
11383   STRLEN len;
11384   if ((len = is_PATWS_safe(p, e, UTF))) {
11385    p += len;
11386   }
11387   else if (recognize_comment && *p == '#') {
11388    bool ended = 0;
11389    do {
11390     p++;
11391     if (is_LNBREAK_safe(p, e, UTF)) {
11392      ended = 1;
11393      break;
11394     }
11395    } while (p < e);
11396    if (!ended)
11397     RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11398   }
11399   else
11400    break;
11401  }
11402  return p;
11403 }
11404
11405 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11406    Character classes ([:foo:]) can also be negated ([:^foo:]).
11407    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11408    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11409    but trigger failures because they are currently unimplemented. */
11410
11411 #define POSIXCC_DONE(c)   ((c) == ':')
11412 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11413 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11414
11415 PERL_STATIC_INLINE I32
11416 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
11417 {
11418  dVAR;
11419  I32 namedclass = OOB_NAMEDCLASS;
11420
11421  PERL_ARGS_ASSERT_REGPPOSIXCC;
11422
11423  if (value == '[' && RExC_parse + 1 < RExC_end &&
11424   /* I smell either [: or [= or [. -- POSIX has been here, right? */
11425   POSIXCC(UCHARAT(RExC_parse)))
11426  {
11427   const char c = UCHARAT(RExC_parse);
11428   char* const s = RExC_parse++;
11429
11430   while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11431    RExC_parse++;
11432   if (RExC_parse == RExC_end) {
11433    if (strict) {
11434
11435     /* Try to give a better location for the error (than the end of
11436     * the string) by looking for the matching ']' */
11437     RExC_parse = s;
11438     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11439      RExC_parse++;
11440     }
11441     vFAIL2("Unmatched '%c' in POSIX class", c);
11442    }
11443    /* Grandfather lone [:, [=, [. */
11444    RExC_parse = s;
11445   }
11446   else {
11447    const char* const t = RExC_parse++; /* skip over the c */
11448    assert(*t == c);
11449
11450    if (UCHARAT(RExC_parse) == ']') {
11451     const char *posixcc = s + 1;
11452     RExC_parse++; /* skip over the ending ] */
11453
11454     if (*s == ':') {
11455      const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11456      const I32 skip = t - posixcc;
11457
11458      /* Initially switch on the length of the name.  */
11459      switch (skip) {
11460      case 4:
11461       if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11462               this is the Perl \w
11463               */
11464        namedclass = ANYOF_WORDCHAR;
11465       break;
11466      case 5:
11467       /* Names all of length 5.  */
11468       /* alnum alpha ascii blank cntrl digit graph lower
11469       print punct space upper  */
11470       /* Offset 4 gives the best switch position.  */
11471       switch (posixcc[4]) {
11472       case 'a':
11473        if (memEQ(posixcc, "alph", 4)) /* alpha */
11474         namedclass = ANYOF_ALPHA;
11475        break;
11476       case 'e':
11477        if (memEQ(posixcc, "spac", 4)) /* space */
11478         namedclass = ANYOF_PSXSPC;
11479        break;
11480       case 'h':
11481        if (memEQ(posixcc, "grap", 4)) /* graph */
11482         namedclass = ANYOF_GRAPH;
11483        break;
11484       case 'i':
11485        if (memEQ(posixcc, "asci", 4)) /* ascii */
11486         namedclass = ANYOF_ASCII;
11487        break;
11488       case 'k':
11489        if (memEQ(posixcc, "blan", 4)) /* blank */
11490         namedclass = ANYOF_BLANK;
11491        break;
11492       case 'l':
11493        if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11494         namedclass = ANYOF_CNTRL;
11495        break;
11496       case 'm':
11497        if (memEQ(posixcc, "alnu", 4)) /* alnum */
11498         namedclass = ANYOF_ALPHANUMERIC;
11499        break;
11500       case 'r':
11501        if (memEQ(posixcc, "lowe", 4)) /* lower */
11502         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11503        else if (memEQ(posixcc, "uppe", 4)) /* upper */
11504         namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11505        break;
11506       case 't':
11507        if (memEQ(posixcc, "digi", 4)) /* digit */
11508         namedclass = ANYOF_DIGIT;
11509        else if (memEQ(posixcc, "prin", 4)) /* print */
11510         namedclass = ANYOF_PRINT;
11511        else if (memEQ(posixcc, "punc", 4)) /* punct */
11512         namedclass = ANYOF_PUNCT;
11513        break;
11514       }
11515       break;
11516      case 6:
11517       if (memEQ(posixcc, "xdigit", 6))
11518        namedclass = ANYOF_XDIGIT;
11519       break;
11520      }
11521
11522      if (namedclass == OOB_NAMEDCLASS)
11523       Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11524          t - s - 1, s + 1);
11525
11526      /* The #defines are structured so each complement is +1 to
11527      * the normal one */
11528      if (complement) {
11529       namedclass++;
11530      }
11531      assert (posixcc[skip] == ':');
11532      assert (posixcc[skip+1] == ']');
11533     } else if (!SIZE_ONLY) {
11534      /* [[=foo=]] and [[.foo.]] are still future. */
11535
11536      /* adjust RExC_parse so the warning shows after
11537      the class closes */
11538      while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11539       RExC_parse++;
11540      vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11541     }
11542    } else {
11543     /* Maternal grandfather:
11544     * "[:" ending in ":" but not in ":]" */
11545     if (strict) {
11546      vFAIL("Unmatched '[' in POSIX class");
11547     }
11548
11549     /* Grandfather lone [:, [=, [. */
11550     RExC_parse = s;
11551    }
11552   }
11553  }
11554
11555  return namedclass;
11556 }
11557
11558 STATIC bool
11559 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
11560 {
11561  /* This applies some heuristics at the current parse position (which should
11562  * be at a '[') to see if what follows might be intended to be a [:posix:]
11563  * class.  It returns true if it really is a posix class, of course, but it
11564  * also can return true if it thinks that what was intended was a posix
11565  * class that didn't quite make it.
11566  *
11567  * It will return true for
11568  *      [:alphanumerics:
11569  *      [:alphanumerics]  (as long as the ] isn't followed immediately by a
11570  *                         ')' indicating the end of the (?[
11571  *      [:any garbage including %^&$ punctuation:]
11572  *
11573  * This is designed to be called only from S_handle_regex_sets; it could be
11574  * easily adapted to be called from the spot at the beginning of regclass()
11575  * that checks to see in a normal bracketed class if the surrounding []
11576  * have been omitted ([:word:] instead of [[:word:]]).  But doing so would
11577  * change long-standing behavior, so I (khw) didn't do that */
11578  char* p = RExC_parse + 1;
11579  char first_char = *p;
11580
11581  PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
11582
11583  assert(*(p - 1) == '[');
11584
11585  if (! POSIXCC(first_char)) {
11586   return FALSE;
11587  }
11588
11589  p++;
11590  while (p < RExC_end && isWORDCHAR(*p)) p++;
11591
11592  if (p >= RExC_end) {
11593   return FALSE;
11594  }
11595
11596  if (p - RExC_parse > 2    /* Got at least 1 word character */
11597   && (*p == first_char
11598    || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11599  {
11600   return TRUE;
11601  }
11602
11603  p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11604
11605  return (p
11606    && p - RExC_parse > 2 /* [:] evaluates to colon;
11607          [::] is a bad posix class. */
11608    && first_char == *(p - 1));
11609 }
11610
11611 STATIC regnode *
11612 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
11613     char * const oregcomp_parse)
11614 {
11615  /* Handle the (?[...]) construct to do set operations */
11616
11617  U8 curchar;
11618  UV start, end; /* End points of code point ranges */
11619  SV* result_string;
11620  char *save_end, *save_parse;
11621  SV* final;
11622  STRLEN len;
11623  regnode* node;
11624  AV* stack;
11625  const bool save_fold = FOLD;
11626
11627  GET_RE_DEBUG_FLAGS_DECL;
11628
11629  PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
11630
11631  if (LOC) {
11632   vFAIL("(?[...]) not valid in locale");
11633  }
11634  RExC_uni_semantics = 1;
11635
11636  /* This will return only an ANYOF regnode, or (unlikely) something smaller
11637  * (such as EXACT).  Thus we can skip most everything if just sizing.  We
11638  * call regclass to handle '[]' so as to not have to reinvent its parsing
11639  * rules here (throwing away the size it computes each time).  And, we exit
11640  * upon an unescaped ']' that isn't one ending a regclass.  To do both
11641  * these things, we need to realize that something preceded by a backslash
11642  * is escaped, so we have to keep track of backslashes */
11643  if (SIZE_ONLY) {
11644
11645   Perl_ck_warner_d(aTHX_
11646    packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11647    "The regex_sets feature is experimental" REPORT_LOCATION,
11648    (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
11649
11650   while (RExC_parse < RExC_end) {
11651    SV* current = NULL;
11652    RExC_parse = regpatws(pRExC_state, RExC_parse,
11653         TRUE); /* means recognize comments */
11654    switch (*RExC_parse) {
11655     default:
11656      break;
11657     case '\\':
11658      /* Skip the next byte (which could cause us to end up in
11659      * the middle of a UTF-8 character, but since none of those
11660      * are confusable with anything we currently handle in this
11661      * switch (invariants all), it's safe.  We'll just hit the
11662      * default: case next time and keep on incrementing until
11663      * we find one of the invariants we do handle. */
11664      RExC_parse++;
11665      break;
11666     case '[':
11667     {
11668      /* If this looks like it is a [:posix:] class, leave the
11669      * parse pointer at the '[' to fool regclass() into
11670      * thinking it is part of a '[[:posix:]]'.  That function
11671      * will use strict checking to force a syntax error if it
11672      * doesn't work out to a legitimate class */
11673      bool is_posix_class
11674          = could_it_be_a_POSIX_class(pRExC_state);
11675      if (! is_posix_class) {
11676       RExC_parse++;
11677      }
11678
11679      /* regclass() can only return RESTART_UTF8 if multi-char
11680      folds are allowed.  */
11681      if (!regclass(pRExC_state, flagp,depth+1,
11682         is_posix_class, /* parse the whole char
11683              class only if not a
11684              posix class */
11685         FALSE, /* don't allow multi-char folds */
11686         TRUE, /* silence non-portable warnings. */
11687         &current))
11688       FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11689        *flagp);
11690
11691      /* function call leaves parse pointing to the ']', except
11692      * if we faked it */
11693      if (is_posix_class) {
11694       RExC_parse--;
11695      }
11696
11697      SvREFCNT_dec(current);   /* In case it returned something */
11698      break;
11699     }
11700
11701     case ']':
11702      RExC_parse++;
11703      if (RExC_parse < RExC_end
11704       && *RExC_parse == ')')
11705      {
11706       node = reganode(pRExC_state, ANYOF, 0);
11707       RExC_size += ANYOF_SKIP;
11708       nextchar(pRExC_state);
11709       Set_Node_Length(node,
11710         RExC_parse - oregcomp_parse + 1); /* MJD */
11711       return node;
11712      }
11713      goto no_close;
11714    }
11715    RExC_parse++;
11716   }
11717
11718   no_close:
11719   FAIL("Syntax error in (?[...])");
11720  }
11721
11722  /* Pass 2 only after this.  Everything in this construct is a
11723  * metacharacter.  Operands begin with either a '\' (for an escape
11724  * sequence), or a '[' for a bracketed character class.  Any other
11725  * character should be an operator, or parenthesis for grouping.  Both
11726  * types of operands are handled by calling regclass() to parse them.  It
11727  * is called with a parameter to indicate to return the computed inversion
11728  * list.  The parsing here is implemented via a stack.  Each entry on the
11729  * stack is a single character representing one of the operators, or the
11730  * '('; or else a pointer to an operand inversion list. */
11731
11732 #define IS_OPERAND(a)  (! SvIOK(a))
11733
11734  /* The stack starts empty.  It is a syntax error if the first thing parsed
11735  * is a binary operator; everything else is pushed on the stack.  When an
11736  * operand is parsed, the top of the stack is examined.  If it is a binary
11737  * operator, the item before it should be an operand, and both are replaced
11738  * by the result of doing that operation on the new operand and the one on
11739  * the stack.   Thus a sequence of binary operands is reduced to a single
11740  * one before the next one is parsed.
11741  *
11742  * A unary operator may immediately follow a binary in the input, for
11743  * example
11744  *      [a] + ! [b]
11745  * When an operand is parsed and the top of the stack is a unary operator,
11746  * the operation is performed, and then the stack is rechecked to see if
11747  * this new operand is part of a binary operation; if so, it is handled as
11748  * above.
11749  *
11750  * A '(' is simply pushed on the stack; it is valid only if the stack is
11751  * empty, or the top element of the stack is an operator or another '('
11752  * (for which the parenthesized expression will become an operand).  By the
11753  * time the corresponding ')' is parsed everything in between should have
11754  * been parsed and evaluated to a single operand (or else is a syntax
11755  * error), and is handled as a regular operand */
11756
11757  sv_2mortal((SV *)(stack = newAV()));
11758
11759  while (RExC_parse < RExC_end) {
11760   I32 top_index = av_tindex(stack);
11761   SV** top_ptr;
11762   SV* current = NULL;
11763
11764   /* Skip white space */
11765   RExC_parse = regpatws(pRExC_state, RExC_parse,
11766         TRUE); /* means recognize comments */
11767   if (RExC_parse >= RExC_end) {
11768    Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
11769   }
11770   if ((curchar = UCHARAT(RExC_parse)) == ']') {
11771    break;
11772   }
11773
11774   switch (curchar) {
11775
11776    case '?':
11777     if (av_tindex(stack) >= 0   /* This makes sure that we can
11778            safely subtract 1 from
11779            RExC_parse in the next clause.
11780            If we have something on the
11781            stack, we have parsed something
11782            */
11783      && UCHARAT(RExC_parse - 1) == '('
11784      && RExC_parse < RExC_end)
11785     {
11786      /* If is a '(?', could be an embedded '(?flags:(?[...])'.
11787      * This happens when we have some thing like
11788      *
11789      *   my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
11790      *   ...
11791      *   qr/(?[ \p{Digit} & $thai_or_lao ])/;
11792      *
11793      * Here we would be handling the interpolated
11794      * '$thai_or_lao'.  We handle this by a recursive call to
11795      * ourselves which returns the inversion list the
11796      * interpolated expression evaluates to.  We use the flags
11797      * from the interpolated pattern. */
11798      U32 save_flags = RExC_flags;
11799      const char * const save_parse = ++RExC_parse;
11800
11801      parse_lparen_question_flags(pRExC_state);
11802
11803      if (RExC_parse == save_parse  /* Makes sure there was at
11804              least one flag (or this
11805              embedding wasn't compiled)
11806             */
11807       || RExC_parse >= RExC_end - 4
11808       || UCHARAT(RExC_parse) != ':'
11809       || UCHARAT(++RExC_parse) != '('
11810       || UCHARAT(++RExC_parse) != '?'
11811       || UCHARAT(++RExC_parse) != '[')
11812      {
11813
11814       /* In combination with the above, this moves the
11815       * pointer to the point just after the first erroneous
11816       * character (or if there are no flags, to where they
11817       * should have been) */
11818       if (RExC_parse >= RExC_end - 4) {
11819        RExC_parse = RExC_end;
11820       }
11821       else if (RExC_parse != save_parse) {
11822        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11823       }
11824       vFAIL("Expecting '(?flags:(?[...'");
11825      }
11826      RExC_parse++;
11827      (void) handle_regex_sets(pRExC_state, &current, flagp,
11828              depth+1, oregcomp_parse);
11829
11830      /* Here, 'current' contains the embedded expression's
11831      * inversion list, and RExC_parse points to the trailing
11832      * ']'; the next character should be the ')' which will be
11833      * paired with the '(' that has been put on the stack, so
11834      * the whole embedded expression reduces to '(operand)' */
11835      RExC_parse++;
11836
11837      RExC_flags = save_flags;
11838      goto handle_operand;
11839     }
11840     /* FALL THROUGH */
11841
11842    default:
11843     RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11844     vFAIL("Unexpected character");
11845
11846    case '\\':
11847     /* regclass() can only return RESTART_UTF8 if multi-char
11848     folds are allowed.  */
11849     if (!regclass(pRExC_state, flagp,depth+1,
11850        TRUE, /* means parse just the next thing */
11851        FALSE, /* don't allow multi-char folds */
11852        FALSE, /* don't silence non-portable warnings.  */
11853        &current))
11854      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11855       *flagp);
11856     /* regclass() will return with parsing just the \ sequence,
11857     * leaving the parse pointer at the next thing to parse */
11858     RExC_parse--;
11859     goto handle_operand;
11860
11861    case '[':   /* Is a bracketed character class */
11862    {
11863     bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
11864
11865     if (! is_posix_class) {
11866      RExC_parse++;
11867     }
11868
11869     /* regclass() can only return RESTART_UTF8 if multi-char
11870     folds are allowed.  */
11871     if(!regclass(pRExC_state, flagp,depth+1,
11872        is_posix_class, /* parse the whole char class
11873             only if not a posix class */
11874        FALSE, /* don't allow multi-char folds */
11875        FALSE, /* don't silence non-portable warnings.  */
11876        &current))
11877      FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
11878       *flagp);
11879     /* function call leaves parse pointing to the ']', except if we
11880     * faked it */
11881     if (is_posix_class) {
11882      RExC_parse--;
11883     }
11884
11885     goto handle_operand;
11886    }
11887
11888    case '&':
11889    case '|':
11890    case '+':
11891    case '-':
11892    case '^':
11893     if (top_index < 0
11894      || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
11895      || ! IS_OPERAND(*top_ptr))
11896     {
11897      RExC_parse++;
11898      vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
11899     }
11900     av_push(stack, newSVuv(curchar));
11901     break;
11902
11903    case '!':
11904     av_push(stack, newSVuv(curchar));
11905     break;
11906
11907    case '(':
11908     if (top_index >= 0) {
11909      top_ptr = av_fetch(stack, top_index, FALSE);
11910      assert(top_ptr);
11911      if (IS_OPERAND(*top_ptr)) {
11912       RExC_parse++;
11913       vFAIL("Unexpected '(' with no preceding operator");
11914      }
11915     }
11916     av_push(stack, newSVuv(curchar));
11917     break;
11918
11919    case ')':
11920    {
11921     SV* lparen;
11922     if (top_index < 1
11923      || ! (current = av_pop(stack))
11924      || ! IS_OPERAND(current)
11925      || ! (lparen = av_pop(stack))
11926      || IS_OPERAND(lparen)
11927      || SvUV(lparen) != '(')
11928     {
11929      SvREFCNT_dec(current);
11930      RExC_parse++;
11931      vFAIL("Unexpected ')'");
11932     }
11933     top_index -= 2;
11934     SvREFCNT_dec_NN(lparen);
11935
11936     /* FALL THROUGH */
11937    }
11938
11939    handle_operand:
11940
11941     /* Here, we have an operand to process, in 'current' */
11942
11943     if (top_index < 0) {    /* Just push if stack is empty */
11944      av_push(stack, current);
11945     }
11946     else {
11947      SV* top = av_pop(stack);
11948      SV *prev = NULL;
11949      char current_operator;
11950
11951      if (IS_OPERAND(top)) {
11952       SvREFCNT_dec_NN(top);
11953       SvREFCNT_dec_NN(current);
11954       vFAIL("Operand with no preceding operator");
11955      }
11956      current_operator = (char) SvUV(top);
11957      switch (current_operator) {
11958       case '(':   /* Push the '(' back on followed by the new
11959          operand */
11960        av_push(stack, top);
11961        av_push(stack, current);
11962        SvREFCNT_inc(top);  /* Counters the '_dec' done
11963             just after the 'break', so
11964             it doesn't get wrongly freed
11965             */
11966        break;
11967
11968       case '!':
11969        _invlist_invert(current);
11970
11971        /* Unlike binary operators, the top of the stack,
11972        * now that this unary one has been popped off, may
11973        * legally be an operator, and we now have operand
11974        * for it. */
11975        top_index--;
11976        SvREFCNT_dec_NN(top);
11977        goto handle_operand;
11978
11979       case '&':
11980        prev = av_pop(stack);
11981        _invlist_intersection(prev,
11982             current,
11983             &current);
11984        av_push(stack, current);
11985        break;
11986
11987       case '|':
11988       case '+':
11989        prev = av_pop(stack);
11990        _invlist_union(prev, current, &current);
11991        av_push(stack, current);
11992        break;
11993
11994       case '-':
11995        prev = av_pop(stack);;
11996        _invlist_subtract(prev, current, &current);
11997        av_push(stack, current);
11998        break;
11999
12000       case '^':   /* The union minus the intersection */
12001       {
12002        SV* i = NULL;
12003        SV* u = NULL;
12004        SV* element;
12005
12006        prev = av_pop(stack);
12007        _invlist_union(prev, current, &u);
12008        _invlist_intersection(prev, current, &i);
12009        /* _invlist_subtract will overwrite current
12010         without freeing what it already contains */
12011        element = current;
12012        _invlist_subtract(u, i, &current);
12013        av_push(stack, current);
12014        SvREFCNT_dec_NN(i);
12015        SvREFCNT_dec_NN(u);
12016        SvREFCNT_dec_NN(element);
12017        break;
12018       }
12019
12020       default:
12021        Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
12022     }
12023     SvREFCNT_dec_NN(top);
12024     SvREFCNT_dec(prev);
12025    }
12026   }
12027
12028   RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12029  }
12030
12031  if (av_tindex(stack) < 0   /* Was empty */
12032   || ((final = av_pop(stack)) == NULL)
12033   || ! IS_OPERAND(final)
12034   || av_tindex(stack) >= 0)  /* More left on stack */
12035  {
12036   vFAIL("Incomplete expression within '(?[ ])'");
12037  }
12038
12039  /* Here, 'final' is the resultant inversion list from evaluating the
12040  * expression.  Return it if so requested */
12041  if (return_invlist) {
12042   *return_invlist = final;
12043   return END;
12044  }
12045
12046  /* Otherwise generate a resultant node, based on 'final'.  regclass() is
12047  * expecting a string of ranges and individual code points */
12048  invlist_iterinit(final);
12049  result_string = newSVpvs("");
12050  while (invlist_iternext(final, &start, &end)) {
12051   if (start == end) {
12052    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
12053   }
12054   else {
12055    Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
12056              start,          end);
12057   }
12058  }
12059
12060  save_parse = RExC_parse;
12061  RExC_parse = SvPV(result_string, len);
12062  save_end = RExC_end;
12063  RExC_end = RExC_parse + len;
12064
12065  /* We turn off folding around the call, as the class we have constructed
12066  * already has all folding taken into consideration, and we don't want
12067  * regclass() to add to that */
12068  RExC_flags &= ~RXf_PMf_FOLD;
12069  /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
12070  */
12071  node = regclass(pRExC_state, flagp,depth+1,
12072      FALSE, /* means parse the whole char class */
12073      FALSE, /* don't allow multi-char folds */
12074      TRUE, /* silence non-portable warnings.  The above may very
12075        well have generated non-portable code points, but
12076        they're valid on this machine */
12077      NULL);
12078  if (!node)
12079   FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
12080      PTR2UV(flagp));
12081  if (save_fold) {
12082   RExC_flags |= RXf_PMf_FOLD;
12083  }
12084  RExC_parse = save_parse + 1;
12085  RExC_end = save_end;
12086  SvREFCNT_dec_NN(final);
12087  SvREFCNT_dec_NN(result_string);
12088
12089  nextchar(pRExC_state);
12090  Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
12091  return node;
12092 }
12093 #undef IS_OPERAND
12094
12095 /* The names of properties whose definitions are not known at compile time are
12096  * stored in this SV, after a constant heading.  So if the length has been
12097  * changed since initialization, then there is a run-time definition. */
12098 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12099
12100 STATIC regnode *
12101 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12102     const bool stop_at_1,  /* Just parse the next thing, don't
12103           look for a full character class */
12104     bool allow_multi_folds,
12105     const bool silence_non_portable,   /* Don't output warnings
12106              about too large
12107              characters */
12108     SV** ret_invlist)  /* Return an inversion list, not a node */
12109 {
12110  /* parse a bracketed class specification.  Most of these will produce an
12111  * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12112  * EXACTFish node; [[:ascii:]], a POSIXA node; etc.  It is more complex
12113  * under /i with multi-character folds: it will be rewritten following the
12114  * paradigm of this example, where the <multi-fold>s are characters which
12115  * fold to multiple character sequences:
12116  *      /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12117  * gets effectively rewritten as:
12118  *      /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12119  * reg() gets called (recursively) on the rewritten version, and this
12120  * function will return what it constructs.  (Actually the <multi-fold>s
12121  * aren't physically removed from the [abcdefghi], it's just that they are
12122  * ignored in the recursion by means of a flag:
12123  * <RExC_in_multi_char_class>.)
12124  *
12125  * ANYOF nodes contain a bit map for the first 256 characters, with the
12126  * corresponding bit set if that character is in the list.  For characters
12127  * above 255, a range list or swash is used.  There are extra bits for \w,
12128  * etc. in locale ANYOFs, as what these match is not determinable at
12129  * compile time
12130  *
12131  * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12132  * to be restarted.  This can only happen if ret_invlist is non-NULL.
12133  */
12134
12135  dVAR;
12136  UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12137  IV range = 0;
12138  UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12139  regnode *ret;
12140  STRLEN numlen;
12141  IV namedclass = OOB_NAMEDCLASS;
12142  char *rangebegin = NULL;
12143  bool need_class = 0;
12144  SV *listsv = NULL;
12145  STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12146          than just initialized.  */
12147  SV* properties = NULL;    /* Code points that match \p{} \P{} */
12148  SV* posixes = NULL;     /* Code points that match classes like, [:word:],
12149        extended beyond the Latin1 range */
12150  UV element_count = 0;   /* Number of distinct elements in the class.
12151        Optimizations may be possible if this is tiny */
12152  AV * multi_char_matches = NULL; /* Code points that fold to more than one
12153          character; used under /i */
12154  UV n;
12155  char * stop_ptr = RExC_end;    /* where to stop parsing */
12156  const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12157             space? */
12158  const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12159
12160  /* Unicode properties are stored in a swash; this holds the current one
12161  * being parsed.  If this swash is the only above-latin1 component of the
12162  * character class, an optimization is to pass it directly on to the
12163  * execution engine.  Otherwise, it is set to NULL to indicate that there
12164  * are other things in the class that have to be dealt with at execution
12165  * time */
12166  SV* swash = NULL;  /* Code points that match \p{} \P{} */
12167
12168  /* Set if a component of this character class is user-defined; just passed
12169  * on to the engine */
12170  bool has_user_defined_property = FALSE;
12171
12172  /* inversion list of code points this node matches only when the target
12173  * string is in UTF-8.  (Because is under /d) */
12174  SV* depends_list = NULL;
12175
12176  /* inversion list of code points this node matches.  For much of the
12177  * function, it includes only those that match regardless of the utf8ness
12178  * of the target string */
12179  SV* cp_list = NULL;
12180
12181 #ifdef EBCDIC
12182  /* In a range, counts how many 0-2 of the ends of it came from literals,
12183  * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
12184  UV literal_endpoint = 0;
12185 #endif
12186  bool invert = FALSE;    /* Is this class to be complemented */
12187
12188  /* Is there any thing like \W or [:^digit:] that matches above the legal
12189  * Unicode range? */
12190  bool runtime_posix_matches_above_Unicode = FALSE;
12191
12192  regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12193   case we need to change the emitted regop to an EXACT. */
12194  const char * orig_parse = RExC_parse;
12195  const I32 orig_size = RExC_size;
12196  GET_RE_DEBUG_FLAGS_DECL;
12197
12198  PERL_ARGS_ASSERT_REGCLASS;
12199 #ifndef DEBUGGING
12200  PERL_UNUSED_ARG(depth);
12201 #endif
12202
12203  DEBUG_PARSE("clas");
12204
12205  /* Assume we are going to generate an ANYOF node. */
12206  ret = reganode(pRExC_state, ANYOF, 0);
12207
12208  if (SIZE_ONLY) {
12209   RExC_size += ANYOF_SKIP;
12210   listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12211  }
12212  else {
12213   ANYOF_FLAGS(ret) = 0;
12214
12215   RExC_emit += ANYOF_SKIP;
12216   if (LOC) {
12217    ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12218   }
12219   listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12220   initial_listsv_len = SvCUR(listsv);
12221   SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated.  */
12222  }
12223
12224  if (skip_white) {
12225   RExC_parse = regpatws(pRExC_state, RExC_parse,
12226        FALSE /* means don't recognize comments */);
12227  }
12228
12229  if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
12230   RExC_parse++;
12231   invert = TRUE;
12232   allow_multi_folds = FALSE;
12233   RExC_naughty++;
12234   if (skip_white) {
12235    RExC_parse = regpatws(pRExC_state, RExC_parse,
12236         FALSE /* means don't recognize comments */);
12237   }
12238  }
12239
12240  /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12241  if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12242   const char *s = RExC_parse;
12243   const char  c = *s++;
12244
12245   while (isWORDCHAR(*s))
12246    s++;
12247   if (*s && c == *s && s[1] == ']') {
12248    SAVEFREESV(RExC_rx_sv);
12249    ckWARN3reg(s+2,
12250      "POSIX syntax [%c %c] belongs inside character classes",
12251      c, c);
12252    (void)ReREFCNT_inc(RExC_rx_sv);
12253   }
12254  }
12255
12256  /* If the caller wants us to just parse a single element, accomplish this
12257  * by faking the loop ending condition */
12258  if (stop_at_1 && RExC_end > RExC_parse) {
12259   stop_ptr = RExC_parse + 1;
12260  }
12261
12262  /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12263  if (UCHARAT(RExC_parse) == ']')
12264   goto charclassloop;
12265
12266 parseit:
12267  while (1) {
12268   if  (RExC_parse >= stop_ptr) {
12269    break;
12270   }
12271
12272   if (skip_white) {
12273    RExC_parse = regpatws(pRExC_state, RExC_parse,
12274         FALSE /* means don't recognize comments */);
12275   }
12276
12277   if  (UCHARAT(RExC_parse) == ']') {
12278    break;
12279   }
12280
12281  charclassloop:
12282
12283   namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12284   save_value = value;
12285   save_prevvalue = prevvalue;
12286
12287   if (!range) {
12288    rangebegin = RExC_parse;
12289    element_count++;
12290   }
12291   if (UTF) {
12292    value = utf8n_to_uvchr((U8*)RExC_parse,
12293         RExC_end - RExC_parse,
12294         &numlen, UTF8_ALLOW_DEFAULT);
12295    RExC_parse += numlen;
12296   }
12297   else
12298    value = UCHARAT(RExC_parse++);
12299
12300   if (value == '['
12301    && RExC_parse < RExC_end
12302    && POSIXCC(UCHARAT(RExC_parse)))
12303   {
12304    namedclass = regpposixcc(pRExC_state, value, strict);
12305   }
12306   else if (value == '\\') {
12307    if (UTF) {
12308     value = utf8n_to_uvchr((U8*)RExC_parse,
12309         RExC_end - RExC_parse,
12310         &numlen, UTF8_ALLOW_DEFAULT);
12311     RExC_parse += numlen;
12312    }
12313    else
12314     value = UCHARAT(RExC_parse++);
12315
12316    /* Some compilers cannot handle switching on 64-bit integer
12317    * values, therefore value cannot be an UV.  Yes, this will
12318    * be a problem later if we want switch on Unicode.
12319    * A similar issue a little bit later when switching on
12320    * namedclass. --jhi */
12321
12322    /* If the \ is escaping white space when white space is being
12323    * skipped, it means that that white space is wanted literally, and
12324    * is already in 'value'.  Otherwise, need to translate the escape
12325    * into what it signifies. */
12326    if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12327
12328    case 'w': namedclass = ANYOF_WORDCHAR; break;
12329    case 'W': namedclass = ANYOF_NWORDCHAR; break;
12330    case 's': namedclass = ANYOF_SPACE; break;
12331    case 'S': namedclass = ANYOF_NSPACE; break;
12332    case 'd': namedclass = ANYOF_DIGIT; break;
12333    case 'D': namedclass = ANYOF_NDIGIT; break;
12334    case 'v': namedclass = ANYOF_VERTWS; break;
12335    case 'V': namedclass = ANYOF_NVERTWS; break;
12336    case 'h': namedclass = ANYOF_HORIZWS; break;
12337    case 'H': namedclass = ANYOF_NHORIZWS; break;
12338    case 'N':  /* Handle \N{NAME} in class */
12339     {
12340      /* We only pay attention to the first char of
12341      multichar strings being returned. I kinda wonder
12342      if this makes sense as it does change the behaviour
12343      from earlier versions, OTOH that behaviour was broken
12344      as well. */
12345      if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12346          TRUE, /* => charclass */
12347          strict))
12348      {
12349       if (*flagp & RESTART_UTF8)
12350        FAIL("panic: grok_bslash_N set RESTART_UTF8");
12351       goto parseit;
12352      }
12353     }
12354     break;
12355    case 'p':
12356    case 'P':
12357     {
12358     char *e;
12359
12360     /* We will handle any undefined properties ourselves */
12361     U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12362
12363     if (RExC_parse >= RExC_end)
12364      vFAIL2("Empty \\%c{}", (U8)value);
12365     if (*RExC_parse == '{') {
12366      const U8 c = (U8)value;
12367      e = strchr(RExC_parse++, '}');
12368      if (!e)
12369       vFAIL2("Missing right brace on \\%c{}", c);
12370      while (isSPACE(UCHARAT(RExC_parse)))
12371       RExC_parse++;
12372      if (e == RExC_parse)
12373       vFAIL2("Empty \\%c{}", c);
12374      n = e - RExC_parse;
12375      while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12376       n--;
12377     }
12378     else {
12379      e = RExC_parse;
12380      n = 1;
12381     }
12382     if (!SIZE_ONLY) {
12383      SV* invlist;
12384      char* name;
12385
12386      if (UCHARAT(RExC_parse) == '^') {
12387       RExC_parse++;
12388       n--;
12389       /* toggle.  (The rhs xor gets the single bit that
12390       * differs between P and p; the other xor inverts just
12391       * that bit) */
12392       value ^= 'P' ^ 'p';
12393
12394       while (isSPACE(UCHARAT(RExC_parse))) {
12395        RExC_parse++;
12396        n--;
12397       }
12398      }
12399      /* Try to get the definition of the property into
12400      * <invlist>.  If /i is in effect, the effective property
12401      * will have its name be <__NAME_i>.  The design is
12402      * discussed in commit
12403      * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12404      Newx(name, n + sizeof("_i__\n"), char);
12405
12406      sprintf(name, "%s%.*s%s\n",
12407          (FOLD) ? "__" : "",
12408          (int)n,
12409          RExC_parse,
12410          (FOLD) ? "_i" : ""
12411      );
12412
12413      /* Look up the property name, and get its swash and
12414      * inversion list, if the property is found  */
12415      if (swash) {
12416       SvREFCNT_dec_NN(swash);
12417      }
12418      swash = _core_swash_init("utf8", name, &PL_sv_undef,
12419            1, /* binary */
12420            0, /* not tr/// */
12421            NULL, /* No inversion list */
12422            &swash_init_flags
12423            );
12424      if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12425       if (swash) {
12426        SvREFCNT_dec_NN(swash);
12427        swash = NULL;
12428       }
12429
12430       /* Here didn't find it.  It could be a user-defined
12431       * property that will be available at run-time.  If we
12432       * accept only compile-time properties, is an error;
12433       * otherwise add it to the list for run-time look up */
12434       if (ret_invlist) {
12435        RExC_parse = e + 1;
12436        vFAIL3("Property '%.*s' is unknown", (int) n, name);
12437       }
12438       Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
12439           (value == 'p' ? '+' : '!'),
12440           name);
12441       has_user_defined_property = TRUE;
12442
12443       /* We don't know yet, so have to assume that the
12444       * property could match something in the Latin1 range,
12445       * hence something that isn't utf8.  Note that this
12446       * would cause things in <depends_list> to match
12447       * inappropriately, except that any \p{}, including
12448       * this one forces Unicode semantics, which means there
12449       * is <no depends_list> */
12450       ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12451      }
12452      else {
12453
12454       /* Here, did get the swash and its inversion list.  If
12455       * the swash is from a user-defined property, then this
12456       * whole character class should be regarded as such */
12457       has_user_defined_property =
12458          (swash_init_flags
12459          & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12460
12461       /* Invert if asking for the complement */
12462       if (value == 'P') {
12463        _invlist_union_complement_2nd(properties,
12464               invlist,
12465               &properties);
12466
12467        /* The swash can't be used as-is, because we've
12468        * inverted things; delay removing it to here after
12469        * have copied its invlist above */
12470        SvREFCNT_dec_NN(swash);
12471        swash = NULL;
12472       }
12473       else {
12474        _invlist_union(properties, invlist, &properties);
12475       }
12476      }
12477      Safefree(name);
12478     }
12479     RExC_parse = e + 1;
12480     namedclass = ANYOF_UNIPROP;  /* no official name, but it's
12481             named */
12482
12483     /* \p means they want Unicode semantics */
12484     RExC_uni_semantics = 1;
12485     }
12486     break;
12487    case 'n': value = '\n';   break;
12488    case 'r': value = '\r';   break;
12489    case 't': value = '\t';   break;
12490    case 'f': value = '\f';   break;
12491    case 'b': value = '\b';   break;
12492    case 'e': value = ASCII_TO_NATIVE('\033');break;
12493    case 'a': value = ASCII_TO_NATIVE('\007');break;
12494    case 'o':
12495     RExC_parse--; /* function expects to be pointed at the 'o' */
12496     {
12497      const char* error_msg;
12498      bool valid = grok_bslash_o(&RExC_parse,
12499            &value,
12500            &error_msg,
12501            SIZE_ONLY,   /* warnings in pass
12502                1 only */
12503            strict,
12504            silence_non_portable,
12505            UTF);
12506      if (! valid) {
12507       vFAIL(error_msg);
12508      }
12509     }
12510     if (PL_encoding && value < 0x100) {
12511      goto recode_encoding;
12512     }
12513     break;
12514    case 'x':
12515     RExC_parse--; /* function expects to be pointed at the 'x' */
12516     {
12517      const char* error_msg;
12518      bool valid = grok_bslash_x(&RExC_parse,
12519            &value,
12520            &error_msg,
12521            TRUE, /* Output warnings */
12522            strict,
12523            silence_non_portable,
12524            UTF);
12525      if (! valid) {
12526       vFAIL(error_msg);
12527      }
12528     }
12529     if (PL_encoding && value < 0x100)
12530      goto recode_encoding;
12531     break;
12532    case 'c':
12533     value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12534     break;
12535    case '0': case '1': case '2': case '3': case '4':
12536    case '5': case '6': case '7':
12537     {
12538      /* Take 1-3 octal digits */
12539      I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12540      numlen = (strict) ? 4 : 3;
12541      value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12542      RExC_parse += numlen;
12543      if (numlen != 3) {
12544       if (strict) {
12545        RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12546        vFAIL("Need exactly 3 octal digits");
12547       }
12548       else if (! SIZE_ONLY /* like \08, \178 */
12549         && numlen < 3
12550         && RExC_parse < RExC_end
12551         && isDIGIT(*RExC_parse)
12552         && ckWARN(WARN_REGEXP))
12553       {
12554        SAVEFREESV(RExC_rx_sv);
12555        reg_warn_non_literal_string(
12556         RExC_parse + 1,
12557         form_short_octal_warning(RExC_parse, numlen));
12558        (void)ReREFCNT_inc(RExC_rx_sv);
12559       }
12560      }
12561      if (PL_encoding && value < 0x100)
12562       goto recode_encoding;
12563      break;
12564     }
12565    recode_encoding:
12566     if (! RExC_override_recoding) {
12567      SV* enc = PL_encoding;
12568      value = reg_recode((const char)(U8)value, &enc);
12569      if (!enc) {
12570       if (strict) {
12571        vFAIL("Invalid escape in the specified encoding");
12572       }
12573       else if (SIZE_ONLY) {
12574        ckWARNreg(RExC_parse,
12575         "Invalid escape in the specified encoding");
12576       }
12577      }
12578      break;
12579     }
12580    default:
12581     /* Allow \_ to not give an error */
12582     if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12583      if (strict) {
12584       vFAIL2("Unrecognized escape \\%c in character class",
12585        (int)value);
12586      }
12587      else {
12588       SAVEFREESV(RExC_rx_sv);
12589       ckWARN2reg(RExC_parse,
12590        "Unrecognized escape \\%c in character class passed through",
12591        (int)value);
12592       (void)ReREFCNT_inc(RExC_rx_sv);
12593      }
12594     }
12595     break;
12596    }   /* End of switch on char following backslash */
12597   } /* end of handling backslash escape sequences */
12598 #ifdef EBCDIC
12599   else
12600    literal_endpoint++;
12601 #endif
12602
12603   /* Here, we have the current token in 'value' */
12604
12605   /* What matches in a locale is not known until runtime.  This includes
12606   * what the Posix classes (like \w, [:space:]) match.  Room must be
12607   * reserved (one time per class) to store such classes, either if Perl
12608   * is compiled so that locale nodes always should have this space, or
12609   * if there is such class info to be stored.  The space will contain a
12610   * bit for each named class that is to be matched against.  This isn't
12611   * needed for \p{} and pseudo-classes, as they are not affected by
12612   * locale, and hence are dealt with separately */
12613   if (LOC
12614    && ! need_class
12615    && (ANYOF_LOCALE == ANYOF_CLASS
12616     || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12617   {
12618    need_class = 1;
12619    if (SIZE_ONLY) {
12620     RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12621    }
12622    else {
12623     RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12624     ANYOF_CLASS_ZERO(ret);
12625    }
12626    ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12627   }
12628
12629   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12630
12631    /* a bad range like a-\d, a-[:digit:].  The '-' is taken as a
12632    * literal, as is the character that began the false range, i.e.
12633    * the 'a' in the examples */
12634    if (range) {
12635     if (!SIZE_ONLY) {
12636      const int w = (RExC_parse >= rangebegin)
12637         ? RExC_parse - rangebegin
12638         : 0;
12639      if (strict) {
12640       vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
12641      }
12642      else {
12643       SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12644       ckWARN4reg(RExC_parse,
12645         "False [] range \"%*.*s\"",
12646         w, w, rangebegin);
12647       (void)ReREFCNT_inc(RExC_rx_sv);
12648       cp_list = add_cp_to_invlist(cp_list, '-');
12649       cp_list = add_cp_to_invlist(cp_list, prevvalue);
12650      }
12651     }
12652
12653     range = 0; /* this was not a true range */
12654     element_count += 2; /* So counts for three values */
12655    }
12656
12657    if (! SIZE_ONLY) {
12658     U8 classnum = namedclass_to_classnum(namedclass);
12659     if (namedclass >= ANYOF_MAX) {  /* If a special class */
12660      if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12661
12662       /* Here, should be \h, \H, \v, or \V.  Neither /d nor
12663       * /l make a difference in what these match.  There
12664       * would be problems if these characters had folds
12665       * other than themselves, as cp_list is subject to
12666       * folding. */
12667       if (classnum != _CC_VERTSPACE) {
12668        assert(   namedclass == ANYOF_HORIZWS
12669         || namedclass == ANYOF_NHORIZWS);
12670
12671        /* It turns out that \h is just a synonym for
12672        * XPosixBlank */
12673        classnum = _CC_BLANK;
12674       }
12675
12676       _invlist_union_maybe_complement_2nd(
12677         cp_list,
12678         PL_XPosix_ptrs[classnum],
12679         cBOOL(namedclass % 2), /* Complement if odd
12680               (NHORIZWS, NVERTWS)
12681               */
12682         &cp_list);
12683      }
12684     }
12685     else if (classnum == _CC_ASCII) {
12686 #ifdef HAS_ISASCII
12687      if (LOC) {
12688       ANYOF_CLASS_SET(ret, namedclass);
12689      }
12690      else
12691 #endif  /* Not isascii(); just use the hard-coded definition for it */
12692      {
12693       _invlist_union_maybe_complement_2nd(
12694         posixes,
12695         PL_ASCII,
12696         cBOOL(namedclass % 2), /* Complement if odd
12697               (NASCII) */
12698         &posixes);
12699
12700       /* The code points 128-255 added above will be
12701       * subtracted out below under /d, so the flag needs to
12702       * be set */
12703       if (namedclass == ANYOF_NASCII && DEPENDS_SEMANTICS) {
12704        ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12705       }
12706      }
12707     }
12708     else {  /* Garden variety class */
12709
12710      /* The ascii range inversion list */
12711      SV* ascii_source = PL_Posix_ptrs[classnum];
12712
12713      /* The full Latin1 range inversion list */
12714      SV* l1_source = PL_L1Posix_ptrs[classnum];
12715
12716      /* This code is structured into two major clauses.  The
12717      * first is for classes whose complete definitions may not
12718      * already be known.  It not, the Latin1 definition
12719      * (guaranteed to already known) is used plus code is
12720      * generated to load the rest at run-time (only if needed).
12721      * If the complete definition is known, it drops down to
12722      * the second clause, where the complete definition is
12723      * known */
12724
12725      if (classnum < _FIRST_NON_SWASH_CC) {
12726
12727       /* Here, the class has a swash, which may or not
12728       * already be loaded */
12729
12730       /* The name of the property to use to match the full
12731       * eXtended Unicode range swash for this character
12732       * class */
12733       const char *Xname = swash_property_names[classnum];
12734
12735       /* If returning the inversion list, we can't defer
12736       * getting this until runtime */
12737       if (ret_invlist && !  PL_utf8_swash_ptrs[classnum]) {
12738        PL_utf8_swash_ptrs[classnum] =
12739         _core_swash_init("utf8", Xname, &PL_sv_undef,
12740            1, /* binary */
12741            0, /* not tr/// */
12742            NULL, /* No inversion list */
12743            NULL  /* No flags */
12744            );
12745        assert(PL_utf8_swash_ptrs[classnum]);
12746       }
12747       if ( !  PL_utf8_swash_ptrs[classnum]) {
12748        if (namedclass % 2 == 0) { /* A non-complemented
12749               class */
12750         /* If not /a matching, there are code points we
12751         * don't know at compile time.  Arrange for the
12752         * unknown matches to be loaded at run-time, if
12753         * needed */
12754         if (! AT_LEAST_ASCII_RESTRICTED) {
12755          Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12756                 Xname);
12757         }
12758         if (LOC) {  /* Under locale, set run-time
12759            lookup */
12760          ANYOF_CLASS_SET(ret, namedclass);
12761         }
12762         else {
12763          /* Add the current class's code points to
12764          * the running total */
12765          _invlist_union(posixes,
12766             (AT_LEAST_ASCII_RESTRICTED)
12767               ? ascii_source
12768               : l1_source,
12769             &posixes);
12770         }
12771        }
12772        else {  /* A complemented class */
12773         if (AT_LEAST_ASCII_RESTRICTED) {
12774          /* Under /a should match everything above
12775          * ASCII, plus the complement of the set's
12776          * ASCII matches */
12777          _invlist_union_complement_2nd(posixes,
12778                 ascii_source,
12779                 &posixes);
12780         }
12781         else {
12782          /* Arrange for the unknown matches to be
12783          * loaded at run-time, if needed */
12784          Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12785                 Xname);
12786          runtime_posix_matches_above_Unicode = TRUE;
12787          if (LOC) {
12788           ANYOF_CLASS_SET(ret, namedclass);
12789          }
12790          else {
12791
12792           /* We want to match everything in
12793           * Latin1, except those things that
12794           * l1_source matches */
12795           SV* scratch_list = NULL;
12796           _invlist_subtract(PL_Latin1, l1_source,
12797               &scratch_list);
12798
12799           /* Add the list from this class to the
12800           * running total */
12801           if (! posixes) {
12802            posixes = scratch_list;
12803           }
12804           else {
12805            _invlist_union(posixes,
12806               scratch_list,
12807               &posixes);
12808            SvREFCNT_dec_NN(scratch_list);
12809           }
12810           if (DEPENDS_SEMANTICS) {
12811            ANYOF_FLAGS(ret)
12812             |= ANYOF_NON_UTF8_LATIN1_ALL;
12813           }
12814          }
12815         }
12816        }
12817        goto namedclass_done;
12818       }
12819
12820       /* Here, there is a swash loaded for the class.  If no
12821       * inversion list for it yet, get it */
12822       if (! PL_XPosix_ptrs[classnum]) {
12823        PL_XPosix_ptrs[classnum]
12824        = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12825       }
12826      }
12827
12828      /* Here there is an inversion list already loaded for the
12829      * entire class */
12830
12831      if (namedclass % 2 == 0) {  /* A non-complemented class,
12832             like ANYOF_PUNCT */
12833       if (! LOC) {
12834        /* For non-locale, just add it to any existing list
12835        * */
12836        _invlist_union(posixes,
12837           (AT_LEAST_ASCII_RESTRICTED)
12838            ? ascii_source
12839            : PL_XPosix_ptrs[classnum],
12840           &posixes);
12841       }
12842       else {  /* Locale */
12843        SV* scratch_list = NULL;
12844
12845        /* For above Latin1 code points, we use the full
12846        * Unicode range */
12847        _invlist_intersection(PL_AboveLatin1,
12848             PL_XPosix_ptrs[classnum],
12849             &scratch_list);
12850        /* And set the output to it, adding instead if
12851        * there already is an output.  Checking if
12852        * 'posixes' is NULL first saves an extra clone.
12853        * Its reference count will be decremented at the
12854        * next union, etc, or if this is the only
12855        * instance, at the end of the routine */
12856        if (! posixes) {
12857         posixes = scratch_list;
12858        }
12859        else {
12860         _invlist_union(posixes, scratch_list, &posixes);
12861         SvREFCNT_dec_NN(scratch_list);
12862        }
12863
12864 #ifndef HAS_ISBLANK
12865        if (namedclass != ANYOF_BLANK) {
12866 #endif
12867         /* Set this class in the node for runtime
12868         * matching */
12869         ANYOF_CLASS_SET(ret, namedclass);
12870 #ifndef HAS_ISBLANK
12871        }
12872        else {
12873         /* No isblank(), use the hard-coded ASCII-range
12874         * blanks, adding them to the running total. */
12875
12876         _invlist_union(posixes, ascii_source, &posixes);
12877        }
12878 #endif
12879       }
12880      }
12881      else {  /* A complemented class, like ANYOF_NPUNCT */
12882       if (! LOC) {
12883        _invlist_union_complement_2nd(
12884             posixes,
12885             (AT_LEAST_ASCII_RESTRICTED)
12886              ? ascii_source
12887              : PL_XPosix_ptrs[classnum],
12888             &posixes);
12889        /* Under /d, everything in the upper half of the
12890        * Latin1 range matches this complement */
12891        if (DEPENDS_SEMANTICS) {
12892         ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12893        }
12894       }
12895       else {  /* Locale */
12896        SV* scratch_list = NULL;
12897        _invlist_subtract(PL_AboveLatin1,
12898            PL_XPosix_ptrs[classnum],
12899            &scratch_list);
12900        if (! posixes) {
12901         posixes = scratch_list;
12902        }
12903        else {
12904         _invlist_union(posixes, scratch_list, &posixes);
12905         SvREFCNT_dec_NN(scratch_list);
12906        }
12907 #ifndef HAS_ISBLANK
12908        if (namedclass != ANYOF_NBLANK) {
12909 #endif
12910         ANYOF_CLASS_SET(ret, namedclass);
12911 #ifndef HAS_ISBLANK
12912        }
12913        else {
12914         /* Get the list of all code points in Latin1
12915         * that are not ASCII blanks, and add them to
12916         * the running total */
12917         _invlist_subtract(PL_Latin1, ascii_source,
12918             &scratch_list);
12919         _invlist_union(posixes, scratch_list, &posixes);
12920         SvREFCNT_dec_NN(scratch_list);
12921        }
12922 #endif
12923       }
12924      }
12925     }
12926    namedclass_done:
12927     continue;   /* Go get next character */
12928    }
12929   } /* end of namedclass \blah */
12930
12931   /* Here, we have a single value.  If 'range' is set, it is the ending
12932   * of a range--check its validity.  Later, we will handle each
12933   * individual code point in the range.  If 'range' isn't set, this
12934   * could be the beginning of a range, so check for that by looking
12935   * ahead to see if the next real character to be processed is the range
12936   * indicator--the minus sign */
12937
12938   if (skip_white) {
12939    RExC_parse = regpatws(pRExC_state, RExC_parse,
12940         FALSE /* means don't recognize comments */);
12941   }
12942
12943   if (range) {
12944    if (prevvalue > value) /* b-a */ {
12945     const int w = RExC_parse - rangebegin;
12946     Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12947     range = 0; /* not a valid range */
12948    }
12949   }
12950   else {
12951    prevvalue = value; /* save the beginning of the potential range */
12952    if (! stop_at_1     /* Can't be a range if parsing just one thing */
12953     && *RExC_parse == '-')
12954    {
12955     char* next_char_ptr = RExC_parse + 1;
12956     if (skip_white) {   /* Get the next real char after the '-' */
12957      next_char_ptr = regpatws(pRExC_state,
12958            RExC_parse + 1,
12959            FALSE); /* means don't recognize
12960               comments */
12961     }
12962
12963     /* If the '-' is at the end of the class (just before the ']',
12964     * it is a literal minus; otherwise it is a range */
12965     if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
12966      RExC_parse = next_char_ptr;
12967
12968      /* a bad range like \w-, [:word:]- ? */
12969      if (namedclass > OOB_NAMEDCLASS) {
12970       if (strict || ckWARN(WARN_REGEXP)) {
12971        const int w =
12972         RExC_parse >= rangebegin ?
12973         RExC_parse - rangebegin : 0;
12974        if (strict) {
12975         vFAIL4("False [] range \"%*.*s\"",
12976          w, w, rangebegin);
12977        }
12978        else {
12979         vWARN4(RExC_parse,
12980          "False [] range \"%*.*s\"",
12981          w, w, rangebegin);
12982        }
12983       }
12984       if (!SIZE_ONLY) {
12985        cp_list = add_cp_to_invlist(cp_list, '-');
12986       }
12987       element_count++;
12988      } else
12989       range = 1; /* yeah, it's a range! */
12990      continue; /* but do it the next time */
12991     }
12992    }
12993   }
12994
12995   /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12996   * if not */
12997
12998   /* non-Latin1 code point implies unicode semantics.  Must be set in
12999   * pass1 so is there for the whole of pass 2 */
13000   if (value > 255) {
13001    RExC_uni_semantics = 1;
13002   }
13003
13004   /* Ready to process either the single value, or the completed range.
13005   * For single-valued non-inverted ranges, we consider the possibility
13006   * of multi-char folds.  (We made a conscious decision to not do this
13007   * for the other cases because it can often lead to non-intuitive
13008   * results.  For example, you have the peculiar case that:
13009   *  "s s" =~ /^[^\xDF]+$/i => Y
13010   *  "ss"  =~ /^[^\xDF]+$/i => N
13011   *
13012   * See [perl #89750] */
13013   if (FOLD && allow_multi_folds && value == prevvalue) {
13014    if (value == LATIN_SMALL_LETTER_SHARP_S
13015     || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13016               value)))
13017    {
13018     /* Here <value> is indeed a multi-char fold.  Get what it is */
13019
13020     U8 foldbuf[UTF8_MAXBYTES_CASE];
13021     STRLEN foldlen;
13022
13023     UV folded = _to_uni_fold_flags(
13024         value,
13025         foldbuf,
13026         &foldlen,
13027         FOLD_FLAGS_FULL
13028         | ((LOC) ?  FOLD_FLAGS_LOCALE
13029            : (ASCII_FOLD_RESTRICTED)
13030            ? FOLD_FLAGS_NOMIX_ASCII
13031            : 0)
13032         );
13033
13034     /* Here, <folded> should be the first character of the
13035     * multi-char fold of <value>, with <foldbuf> containing the
13036     * whole thing.  But, if this fold is not allowed (because of
13037     * the flags), <fold> will be the same as <value>, and should
13038     * be processed like any other character, so skip the special
13039     * handling */
13040     if (folded != value) {
13041
13042      /* Skip if we are recursed, currently parsing the class
13043      * again.  Otherwise add this character to the list of
13044      * multi-char folds. */
13045      if (! RExC_in_multi_char_class) {
13046       AV** this_array_ptr;
13047       AV* this_array;
13048       STRLEN cp_count = utf8_length(foldbuf,
13049              foldbuf + foldlen);
13050       SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13051
13052       Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13053
13054
13055       if (! multi_char_matches) {
13056        multi_char_matches = newAV();
13057       }
13058
13059       /* <multi_char_matches> is actually an array of arrays.
13060       * There will be one or two top-level elements: [2],
13061       * and/or [3].  The [2] element is an array, each
13062       * element thereof is a character which folds to two
13063       * characters; likewise for [3].  (Unicode guarantees a
13064       * maximum of 3 characters in any fold.)  When we
13065       * rewrite the character class below, we will do so
13066       * such that the longest folds are written first, so
13067       * that it prefers the longest matching strings first.
13068       * This is done even if it turns out that any
13069       * quantifier is non-greedy, out of programmer
13070       * laziness.  Tom Christiansen has agreed that this is
13071       * ok.  This makes the test for the ligature 'ffi' come
13072       * before the test for 'ff' */
13073       if (av_exists(multi_char_matches, cp_count)) {
13074        this_array_ptr = (AV**) av_fetch(multi_char_matches,
13075                cp_count, FALSE);
13076        this_array = *this_array_ptr;
13077       }
13078       else {
13079        this_array = newAV();
13080        av_store(multi_char_matches, cp_count,
13081          (SV*) this_array);
13082       }
13083       av_push(this_array, multi_fold);
13084      }
13085
13086      /* This element should not be processed further in this
13087      * class */
13088      element_count--;
13089      value = save_value;
13090      prevvalue = save_prevvalue;
13091      continue;
13092     }
13093    }
13094   }
13095
13096   /* Deal with this element of the class */
13097   if (! SIZE_ONLY) {
13098 #ifndef EBCDIC
13099    cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
13100 #else
13101    SV* this_range = _new_invlist(1);
13102    _append_range_to_invlist(this_range, prevvalue, value);
13103
13104    /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
13105    * If this range was specified using something like 'i-j', we want
13106    * to include only the 'i' and the 'j', and not anything in
13107    * between, so exclude non-ASCII, non-alphabetics from it.
13108    * However, if the range was specified with something like
13109    * [\x89-\x91] or [\x89-j], all code points within it should be
13110    * included.  literal_endpoint==2 means both ends of the range used
13111    * a literal character, not \x{foo} */
13112    if (literal_endpoint == 2
13113     && (prevvalue >= 'a' && value <= 'z')
13114      || (prevvalue >= 'A' && value <= 'Z'))
13115    {
13116     _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13117          &this_range);
13118    }
13119    _invlist_union(cp_list, this_range, &cp_list);
13120    literal_endpoint = 0;
13121 #endif
13122   }
13123
13124   range = 0; /* this range (if it was one) is done now */
13125  } /* End of loop through all the text within the brackets */
13126
13127  /* If anything in the class expands to more than one character, we have to
13128  * deal with them by building up a substitute parse string, and recursively
13129  * calling reg() on it, instead of proceeding */
13130  if (multi_char_matches) {
13131   SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13132   I32 cp_count;
13133   STRLEN len;
13134   char *save_end = RExC_end;
13135   char *save_parse = RExC_parse;
13136   bool first_time = TRUE;     /* First multi-char occurrence doesn't get
13137          a "|" */
13138   I32 reg_flags;
13139
13140   assert(! invert);
13141 #if 0   /* Have decided not to deal with multi-char folds in inverted classes,
13142   because too confusing */
13143   if (invert) {
13144    sv_catpv(substitute_parse, "(?:");
13145   }
13146 #endif
13147
13148   /* Look at the longest folds first */
13149   for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13150
13151    if (av_exists(multi_char_matches, cp_count)) {
13152     AV** this_array_ptr;
13153     SV* this_sequence;
13154
13155     this_array_ptr = (AV**) av_fetch(multi_char_matches,
13156             cp_count, FALSE);
13157     while ((this_sequence = av_pop(*this_array_ptr)) !=
13158                 &PL_sv_undef)
13159     {
13160      if (! first_time) {
13161       sv_catpv(substitute_parse, "|");
13162      }
13163      first_time = FALSE;
13164
13165      sv_catpv(substitute_parse, SvPVX(this_sequence));
13166     }
13167    }
13168   }
13169
13170   /* If the character class contains anything else besides these
13171   * multi-character folds, have to include it in recursive parsing */
13172   if (element_count) {
13173    sv_catpv(substitute_parse, "|[");
13174    sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13175    sv_catpv(substitute_parse, "]");
13176   }
13177
13178   sv_catpv(substitute_parse, ")");
13179 #if 0
13180   if (invert) {
13181    /* This is a way to get the parse to skip forward a whole named
13182    * sequence instead of matching the 2nd character when it fails the
13183    * first */
13184    sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13185   }
13186 #endif
13187
13188   RExC_parse = SvPV(substitute_parse, len);
13189   RExC_end = RExC_parse + len;
13190   RExC_in_multi_char_class = 1;
13191   RExC_emit = (regnode *)orig_emit;
13192
13193   ret = reg(pRExC_state, 1, &reg_flags, depth+1);
13194
13195   *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13196
13197   RExC_parse = save_parse;
13198   RExC_end = save_end;
13199   RExC_in_multi_char_class = 0;
13200   SvREFCNT_dec_NN(multi_char_matches);
13201   return ret;
13202  }
13203
13204  /* If the character class contains only a single element, it may be
13205  * optimizable into another node type which is smaller and runs faster.
13206  * Check if this is the case for this class */
13207  if (element_count == 1 && ! ret_invlist) {
13208   U8 op = END;
13209   U8 arg = 0;
13210
13211   if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
13212            [:digit:] or \p{foo} */
13213
13214    /* All named classes are mapped into POSIXish nodes, with its FLAG
13215    * argument giving which class it is */
13216    switch ((I32)namedclass) {
13217     case ANYOF_UNIPROP:
13218      break;
13219
13220     /* These don't depend on the charset modifiers.  They always
13221     * match under /u rules */
13222     case ANYOF_NHORIZWS:
13223     case ANYOF_HORIZWS:
13224      namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13225      /* FALLTHROUGH */
13226
13227     case ANYOF_NVERTWS:
13228     case ANYOF_VERTWS:
13229      op = POSIXU;
13230      goto join_posix;
13231
13232     /* The actual POSIXish node for all the rest depends on the
13233     * charset modifier.  The ones in the first set depend only on
13234     * ASCII or, if available on this platform, locale */
13235     case ANYOF_ASCII:
13236     case ANYOF_NASCII:
13237 #ifdef HAS_ISASCII
13238      op = (LOC) ? POSIXL : POSIXA;
13239 #else
13240      op = POSIXA;
13241 #endif
13242      goto join_posix;
13243
13244     case ANYOF_NCASED:
13245     case ANYOF_LOWER:
13246     case ANYOF_NLOWER:
13247     case ANYOF_UPPER:
13248     case ANYOF_NUPPER:
13249      /* under /a could be alpha */
13250      if (FOLD) {
13251       if (ASCII_RESTRICTED) {
13252        namedclass = ANYOF_ALPHA + (namedclass % 2);
13253       }
13254       else if (! LOC) {
13255        break;
13256       }
13257      }
13258      /* FALLTHROUGH */
13259
13260     /* The rest have more possibilities depending on the charset.
13261     * We take advantage of the enum ordering of the charset
13262     * modifiers to get the exact node type, */
13263     default:
13264      op = POSIXD + get_regex_charset(RExC_flags);
13265      if (op > POSIXA) { /* /aa is same as /a */
13266       op = POSIXA;
13267      }
13268 #ifndef HAS_ISBLANK
13269      if (op == POSIXL
13270       && (namedclass == ANYOF_BLANK
13271        || namedclass == ANYOF_NBLANK))
13272      {
13273       op = POSIXA;
13274      }
13275 #endif
13276
13277     join_posix:
13278      /* The odd numbered ones are the complements of the
13279      * next-lower even number one */
13280      if (namedclass % 2 == 1) {
13281       invert = ! invert;
13282       namedclass--;
13283      }
13284      arg = namedclass_to_classnum(namedclass);
13285      break;
13286    }
13287   }
13288   else if (value == prevvalue) {
13289
13290    /* Here, the class consists of just a single code point */
13291
13292    if (invert) {
13293     if (! LOC && value == '\n') {
13294      op = REG_ANY; /* Optimize [^\n] */
13295      *flagp |= HASWIDTH|SIMPLE;
13296      RExC_naughty++;
13297     }
13298    }
13299    else if (value < 256 || UTF) {
13300
13301     /* Optimize a single value into an EXACTish node, but not if it
13302     * would require converting the pattern to UTF-8. */
13303     op = compute_EXACTish(pRExC_state);
13304    }
13305   } /* Otherwise is a range */
13306   else if (! LOC) {   /* locale could vary these */
13307    if (prevvalue == '0') {
13308     if (value == '9') {
13309      arg = _CC_DIGIT;
13310      op = POSIXA;
13311     }
13312    }
13313   }
13314
13315   /* Here, we have changed <op> away from its initial value iff we found
13316   * an optimization */
13317   if (op != END) {
13318
13319    /* Throw away this ANYOF regnode, and emit the calculated one,
13320    * which should correspond to the beginning, not current, state of
13321    * the parse */
13322    const char * cur_parse = RExC_parse;
13323    RExC_parse = (char *)orig_parse;
13324    if ( SIZE_ONLY) {
13325     if (! LOC) {
13326
13327      /* To get locale nodes to not use the full ANYOF size would
13328      * require moving the code above that writes the portions
13329      * of it that aren't in other nodes to after this point.
13330      * e.g.  ANYOF_CLASS_SET */
13331      RExC_size = orig_size;
13332     }
13333    }
13334    else {
13335     RExC_emit = (regnode *)orig_emit;
13336     if (PL_regkind[op] == POSIXD) {
13337      if (invert) {
13338       op += NPOSIXD - POSIXD;
13339      }
13340     }
13341    }
13342
13343    ret = reg_node(pRExC_state, op);
13344
13345    if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13346     if (! SIZE_ONLY) {
13347      FLAGS(ret) = arg;
13348     }
13349     *flagp |= HASWIDTH|SIMPLE;
13350    }
13351    else if (PL_regkind[op] == EXACT) {
13352     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13353    }
13354
13355    RExC_parse = (char *) cur_parse;
13356
13357    SvREFCNT_dec(posixes);
13358    SvREFCNT_dec(cp_list);
13359    return ret;
13360   }
13361  }
13362
13363  if (SIZE_ONLY)
13364   return ret;
13365  /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13366
13367  /* If folding, we calculate all characters that could fold to or from the
13368  * ones already on the list */
13369  if (FOLD && cp_list) {
13370   UV start, end; /* End points of code point ranges */
13371
13372   SV* fold_intersection = NULL;
13373
13374   /* If the highest code point is within Latin1, we can use the
13375   * compiled-in Alphas list, and not have to go out to disk.  This
13376   * yields two false positives, the masculine and feminine ordinal
13377   * indicators, which are weeded out below using the
13378   * IS_IN_SOME_FOLD_L1() macro */
13379   if (invlist_highest(cp_list) < 256) {
13380    _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13381               &fold_intersection);
13382   }
13383   else {
13384
13385    /* Here, there are non-Latin1 code points, so we will have to go
13386    * fetch the list of all the characters that participate in folds
13387    */
13388    if (! PL_utf8_foldable) {
13389     SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13390          &PL_sv_undef, 1, 0);
13391     PL_utf8_foldable = _get_swash_invlist(swash);
13392     SvREFCNT_dec_NN(swash);
13393    }
13394
13395    /* This is a hash that for a particular fold gives all characters
13396    * that are involved in it */
13397    if (! PL_utf8_foldclosures) {
13398
13399     /* If we were unable to find any folds, then we likely won't be
13400     * able to find the closures.  So just create an empty list.
13401     * Folding will effectively be restricted to the non-Unicode
13402     * rules hard-coded into Perl.  (This case happens legitimately
13403     * during compilation of Perl itself before the Unicode tables
13404     * are generated) */
13405     if (_invlist_len(PL_utf8_foldable) == 0) {
13406      PL_utf8_foldclosures = newHV();
13407     }
13408     else {
13409      /* If the folds haven't been read in, call a fold function
13410      * to force that */
13411      if (! PL_utf8_tofold) {
13412       U8 dummy[UTF8_MAXBYTES+1];
13413
13414       /* This string is just a short named one above \xff */
13415       to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13416       assert(PL_utf8_tofold); /* Verify that worked */
13417      }
13418      PL_utf8_foldclosures =
13419          _swash_inversion_hash(PL_utf8_tofold);
13420     }
13421    }
13422
13423    /* Only the characters in this class that participate in folds need
13424    * be checked.  Get the intersection of this class and all the
13425    * possible characters that are foldable.  This can quickly narrow
13426    * down a large class */
13427    _invlist_intersection(PL_utf8_foldable, cp_list,
13428         &fold_intersection);
13429   }
13430
13431   /* Now look at the foldable characters in this class individually */
13432   invlist_iterinit(fold_intersection);
13433   while (invlist_iternext(fold_intersection, &start, &end)) {
13434    UV j;
13435
13436    /* Locale folding for Latin1 characters is deferred until runtime */
13437    if (LOC && start < 256) {
13438     start = 256;
13439    }
13440
13441    /* Look at every character in the range */
13442    for (j = start; j <= end; j++) {
13443
13444     U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13445     STRLEN foldlen;
13446     SV** listp;
13447
13448     if (j < 256) {
13449
13450      /* We have the latin1 folding rules hard-coded here so that
13451      * an innocent-looking character class, like /[ks]/i won't
13452      * have to go out to disk to find the possible matches.
13453      * XXX It would be better to generate these via regen, in
13454      * case a new version of the Unicode standard adds new
13455      * mappings, though that is not really likely, and may be
13456      * caught by the default: case of the switch below. */
13457
13458      if (IS_IN_SOME_FOLD_L1(j)) {
13459
13460       /* ASCII is always matched; non-ASCII is matched only
13461       * under Unicode rules */
13462       if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13463        cp_list =
13464         add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13465       }
13466       else {
13467        depends_list =
13468        add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13469       }
13470      }
13471
13472      if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13473       && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13474      {
13475       /* Certain Latin1 characters have matches outside
13476       * Latin1.  To get here, <j> is one of those
13477       * characters.   None of these matches is valid for
13478       * ASCII characters under /aa, which is why the 'if'
13479       * just above excludes those.  These matches only
13480       * happen when the target string is utf8.  The code
13481       * below adds the single fold closures for <j> to the
13482       * inversion list. */
13483       switch (j) {
13484        case 'k':
13485        case 'K':
13486         cp_list =
13487          add_cp_to_invlist(cp_list, KELVIN_SIGN);
13488         break;
13489        case 's':
13490        case 'S':
13491         cp_list = add_cp_to_invlist(cp_list,
13492              LATIN_SMALL_LETTER_LONG_S);
13493         break;
13494        case MICRO_SIGN:
13495         cp_list = add_cp_to_invlist(cp_list,
13496              GREEK_CAPITAL_LETTER_MU);
13497         cp_list = add_cp_to_invlist(cp_list,
13498              GREEK_SMALL_LETTER_MU);
13499         break;
13500        case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13501        case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13502         cp_list =
13503          add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13504         break;
13505        case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13506         cp_list = add_cp_to_invlist(cp_list,
13507           LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13508         break;
13509        case LATIN_SMALL_LETTER_SHARP_S:
13510         cp_list = add_cp_to_invlist(cp_list,
13511             LATIN_CAPITAL_LETTER_SHARP_S);
13512         break;
13513        case 'F': case 'f':
13514        case 'I': case 'i':
13515        case 'L': case 'l':
13516        case 'T': case 't':
13517        case 'A': case 'a':
13518        case 'H': case 'h':
13519        case 'J': case 'j':
13520        case 'N': case 'n':
13521        case 'W': case 'w':
13522        case 'Y': case 'y':
13523         /* These all are targets of multi-character
13524         * folds from code points that require UTF8 to
13525         * express, so they can't match unless the
13526         * target string is in UTF-8, so no action here
13527         * is necessary, as regexec.c properly handles
13528         * the general case for UTF-8 matching and
13529         * multi-char folds */
13530         break;
13531        default:
13532         /* Use deprecated warning to increase the
13533         * chances of this being output */
13534         ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13535         break;
13536       }
13537      }
13538      continue;
13539     }
13540
13541     /* Here is an above Latin1 character.  We don't have the rules
13542     * hard-coded for it.  First, get its fold.  This is the simple
13543     * fold, as the multi-character folds have been handled earlier
13544     * and separated out */
13545     _to_uni_fold_flags(j, foldbuf, &foldlen,
13546            ((LOC)
13547            ? FOLD_FLAGS_LOCALE
13548            : (ASCII_FOLD_RESTRICTED)
13549             ? FOLD_FLAGS_NOMIX_ASCII
13550             : 0));
13551
13552     /* Single character fold of above Latin1.  Add everything in
13553     * its fold closure to the list that this node should match.
13554     * The fold closures data structure is a hash with the keys
13555     * being the UTF-8 of every character that is folded to, like
13556     * 'k', and the values each an array of all code points that
13557     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
13558     * Multi-character folds are not included */
13559     if ((listp = hv_fetch(PL_utf8_foldclosures,
13560          (char *) foldbuf, foldlen, FALSE)))
13561     {
13562      AV* list = (AV*) *listp;
13563      IV k;
13564      for (k = 0; k <= av_len(list); k++) {
13565       SV** c_p = av_fetch(list, k, FALSE);
13566       UV c;
13567       if (c_p == NULL) {
13568        Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13569       }
13570       c = SvUV(*c_p);
13571
13572       /* /aa doesn't allow folds between ASCII and non-; /l
13573       * doesn't allow them between above and below 256 */
13574       if ((ASCII_FOLD_RESTRICTED
13575         && (isASCII(c) != isASCII(j)))
13576        || (LOC && ((c < 256) != (j < 256))))
13577       {
13578        continue;
13579       }
13580
13581       /* Folds involving non-ascii Latin1 characters
13582       * under /d are added to a separate list */
13583       if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13584       {
13585        cp_list = add_cp_to_invlist(cp_list, c);
13586       }
13587       else {
13588       depends_list = add_cp_to_invlist(depends_list, c);
13589       }
13590      }
13591     }
13592    }
13593   }
13594   SvREFCNT_dec_NN(fold_intersection);
13595  }
13596
13597  /* And combine the result (if any) with any inversion list from posix
13598  * classes.  The lists are kept separate up to now because we don't want to
13599  * fold the classes (folding of those is automatically handled by the swash
13600  * fetching code) */
13601  if (posixes) {
13602   if (! DEPENDS_SEMANTICS) {
13603    if (cp_list) {
13604     _invlist_union(cp_list, posixes, &cp_list);
13605     SvREFCNT_dec_NN(posixes);
13606    }
13607    else {
13608     cp_list = posixes;
13609    }
13610   }
13611   else {
13612    /* Under /d, we put into a separate list the Latin1 things that
13613    * match only when the target string is utf8 */
13614    SV* nonascii_but_latin1_properties = NULL;
13615    _invlist_intersection(posixes, PL_Latin1,
13616         &nonascii_but_latin1_properties);
13617    _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13618        &nonascii_but_latin1_properties);
13619    _invlist_subtract(posixes, nonascii_but_latin1_properties,
13620        &posixes);
13621    if (cp_list) {
13622     _invlist_union(cp_list, posixes, &cp_list);
13623     SvREFCNT_dec_NN(posixes);
13624    }
13625    else {
13626     cp_list = posixes;
13627    }
13628
13629    if (depends_list) {
13630     _invlist_union(depends_list, nonascii_but_latin1_properties,
13631        &depends_list);
13632     SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13633    }
13634    else {
13635     depends_list = nonascii_but_latin1_properties;
13636    }
13637   }
13638  }
13639
13640  /* And combine the result (if any) with any inversion list from properties.
13641  * The lists are kept separate up to now so that we can distinguish the two
13642  * in regards to matching above-Unicode.  A run-time warning is generated
13643  * if a Unicode property is matched against a non-Unicode code point. But,
13644  * we allow user-defined properties to match anything, without any warning,
13645  * and we also suppress the warning if there is a portion of the character
13646  * class that isn't a Unicode property, and which matches above Unicode, \W
13647  * or [\x{110000}] for example.
13648  * (Note that in this case, unlike the Posix one above, there is no
13649  * <depends_list>, because having a Unicode property forces Unicode
13650  * semantics */
13651  if (properties) {
13652   bool warn_super = ! has_user_defined_property;
13653   if (cp_list) {
13654
13655    /* If it matters to the final outcome, see if a non-property
13656    * component of the class matches above Unicode.  If so, the
13657    * warning gets suppressed.  This is true even if just a single
13658    * such code point is specified, as though not strictly correct if
13659    * another such code point is matched against, the fact that they
13660    * are using above-Unicode code points indicates they should know
13661    * the issues involved */
13662    if (warn_super) {
13663     bool non_prop_matches_above_Unicode =
13664        runtime_posix_matches_above_Unicode
13665        | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13666     if (invert) {
13667      non_prop_matches_above_Unicode =
13668            !  non_prop_matches_above_Unicode;
13669     }
13670     warn_super = ! non_prop_matches_above_Unicode;
13671    }
13672
13673    _invlist_union(properties, cp_list, &cp_list);
13674    SvREFCNT_dec_NN(properties);
13675   }
13676   else {
13677    cp_list = properties;
13678   }
13679
13680   if (warn_super) {
13681    OP(ret) = ANYOF_WARN_SUPER;
13682   }
13683  }
13684
13685  /* Here, we have calculated what code points should be in the character
13686  * class.
13687  *
13688  * Now we can see about various optimizations.  Fold calculation (which we
13689  * did above) needs to take place before inversion.  Otherwise /[^k]/i
13690  * would invert to include K, which under /i would match k, which it
13691  * shouldn't.  Therefore we can't invert folded locale now, as it won't be
13692  * folded until runtime */
13693
13694  /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13695  * at compile time.  Besides not inverting folded locale now, we can't
13696  * invert if there are things such as \w, which aren't known until runtime
13697  * */
13698  if (invert
13699   && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13700   && ! depends_list
13701   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13702  {
13703   _invlist_invert(cp_list);
13704
13705   /* Any swash can't be used as-is, because we've inverted things */
13706   if (swash) {
13707    SvREFCNT_dec_NN(swash);
13708    swash = NULL;
13709   }
13710
13711   /* Clear the invert flag since have just done it here */
13712   invert = FALSE;
13713  }
13714
13715  if (ret_invlist) {
13716   *ret_invlist = cp_list;
13717   SvREFCNT_dec(swash);
13718
13719   /* Discard the generated node */
13720   if (SIZE_ONLY) {
13721    RExC_size = orig_size;
13722   }
13723   else {
13724    RExC_emit = orig_emit;
13725   }
13726   return orig_emit;
13727  }
13728
13729  /* If we didn't do folding, it's because some information isn't available
13730  * until runtime; set the run-time fold flag for these.  (We don't have to
13731  * worry about properties folding, as that is taken care of by the swash
13732  * fetching) */
13733  if (FOLD && LOC)
13734  {
13735  ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13736  }
13737
13738  /* Some character classes are equivalent to other nodes.  Such nodes take
13739  * up less room and generally fewer operations to execute than ANYOF nodes.
13740  * Above, we checked for and optimized into some such equivalents for
13741  * certain common classes that are easy to test.  Getting to this point in
13742  * the code means that the class didn't get optimized there.  Since this
13743  * code is only executed in Pass 2, it is too late to save space--it has
13744  * been allocated in Pass 1, and currently isn't given back.  But turning
13745  * things into an EXACTish node can allow the optimizer to join it to any
13746  * adjacent such nodes.  And if the class is equivalent to things like /./,
13747  * expensive run-time swashes can be avoided.  Now that we have more
13748  * complete information, we can find things necessarily missed by the
13749  * earlier code.  I (khw) am not sure how much to look for here.  It would
13750  * be easy, but perhaps too slow, to check any candidates against all the
13751  * node types they could possibly match using _invlistEQ(). */
13752
13753  if (cp_list
13754   && ! invert
13755   && ! depends_list
13756   && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13757   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13758  {
13759   UV start, end;
13760   U8 op = END;  /* The optimzation node-type */
13761   const char * cur_parse= RExC_parse;
13762
13763   invlist_iterinit(cp_list);
13764   if (! invlist_iternext(cp_list, &start, &end)) {
13765
13766    /* Here, the list is empty.  This happens, for example, when a
13767    * Unicode property is the only thing in the character class, and
13768    * it doesn't match anything.  (perluniprops.pod notes such
13769    * properties) */
13770    op = OPFAIL;
13771    *flagp |= HASWIDTH|SIMPLE;
13772   }
13773   else if (start == end) {    /* The range is a single code point */
13774    if (! invlist_iternext(cp_list, &start, &end)
13775
13776      /* Don't do this optimization if it would require changing
13777      * the pattern to UTF-8 */
13778     && (start < 256 || UTF))
13779    {
13780     /* Here, the list contains a single code point.  Can optimize
13781     * into an EXACT node */
13782
13783     value = start;
13784
13785     if (! FOLD) {
13786      op = EXACT;
13787     }
13788     else if (LOC) {
13789
13790      /* A locale node under folding with one code point can be
13791      * an EXACTFL, as its fold won't be calculated until
13792      * runtime */
13793      op = EXACTFL;
13794     }
13795     else {
13796
13797      /* Here, we are generally folding, but there is only one
13798      * code point to match.  If we have to, we use an EXACT
13799      * node, but it would be better for joining with adjacent
13800      * nodes in the optimization pass if we used the same
13801      * EXACTFish node that any such are likely to be.  We can
13802      * do this iff the code point doesn't participate in any
13803      * folds.  For example, an EXACTF of a colon is the same as
13804      * an EXACT one, since nothing folds to or from a colon. */
13805      if (value < 256) {
13806       if (IS_IN_SOME_FOLD_L1(value)) {
13807        op = EXACT;
13808       }
13809      }
13810      else {
13811       if (! PL_utf8_foldable) {
13812        SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13813             &PL_sv_undef, 1, 0);
13814        PL_utf8_foldable = _get_swash_invlist(swash);
13815        SvREFCNT_dec_NN(swash);
13816       }
13817       if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13818        op = EXACT;
13819       }
13820      }
13821
13822      /* If we haven't found the node type, above, it means we
13823      * can use the prevailing one */
13824      if (op == END) {
13825       op = compute_EXACTish(pRExC_state);
13826      }
13827     }
13828    }
13829   }
13830   else if (start == 0) {
13831    if (end == UV_MAX) {
13832     op = SANY;
13833     *flagp |= HASWIDTH|SIMPLE;
13834     RExC_naughty++;
13835    }
13836    else if (end == '\n' - 1
13837      && invlist_iternext(cp_list, &start, &end)
13838      && start == '\n' + 1 && end == UV_MAX)
13839    {
13840     op = REG_ANY;
13841     *flagp |= HASWIDTH|SIMPLE;
13842     RExC_naughty++;
13843    }
13844   }
13845   invlist_iterfinish(cp_list);
13846
13847   if (op != END) {
13848    RExC_parse = (char *)orig_parse;
13849    RExC_emit = (regnode *)orig_emit;
13850
13851    ret = reg_node(pRExC_state, op);
13852
13853    RExC_parse = (char *)cur_parse;
13854
13855    if (PL_regkind[op] == EXACT) {
13856     alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13857    }
13858
13859    SvREFCNT_dec_NN(cp_list);
13860    return ret;
13861   }
13862  }
13863
13864  /* Here, <cp_list> contains all the code points we can determine at
13865  * compile time that match under all conditions.  Go through it, and
13866  * for things that belong in the bitmap, put them there, and delete from
13867  * <cp_list>.  While we are at it, see if everything above 255 is in the
13868  * list, and if so, set a flag to speed up execution */
13869  ANYOF_BITMAP_ZERO(ret);
13870  if (cp_list) {
13871
13872   /* This gets set if we actually need to modify things */
13873   bool change_invlist = FALSE;
13874
13875   UV start, end;
13876
13877   /* Start looking through <cp_list> */
13878   invlist_iterinit(cp_list);
13879   while (invlist_iternext(cp_list, &start, &end)) {
13880    UV high;
13881    int i;
13882
13883    if (end == UV_MAX && start <= 256) {
13884     ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13885    }
13886
13887    /* Quit if are above what we should change */
13888    if (start > 255) {
13889     break;
13890    }
13891
13892    change_invlist = TRUE;
13893
13894    /* Set all the bits in the range, up to the max that we are doing */
13895    high = (end < 255) ? end : 255;
13896    for (i = start; i <= (int) high; i++) {
13897     if (! ANYOF_BITMAP_TEST(ret, i)) {
13898      ANYOF_BITMAP_SET(ret, i);
13899      prevvalue = value;
13900      value = i;
13901     }
13902    }
13903   }
13904   invlist_iterfinish(cp_list);
13905
13906   /* Done with loop; remove any code points that are in the bitmap from
13907   * <cp_list> */
13908   if (change_invlist) {
13909    _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13910   }
13911
13912   /* If have completely emptied it, remove it completely */
13913   if (_invlist_len(cp_list) == 0) {
13914    SvREFCNT_dec_NN(cp_list);
13915    cp_list = NULL;
13916   }
13917  }
13918
13919  if (invert) {
13920   ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13921  }
13922
13923  /* Here, the bitmap has been populated with all the Latin1 code points that
13924  * always match.  Can now add to the overall list those that match only
13925  * when the target string is UTF-8 (<depends_list>). */
13926  if (depends_list) {
13927   if (cp_list) {
13928    _invlist_union(cp_list, depends_list, &cp_list);
13929    SvREFCNT_dec_NN(depends_list);
13930   }
13931   else {
13932    cp_list = depends_list;
13933   }
13934  }
13935
13936  /* If there is a swash and more than one element, we can't use the swash in
13937  * the optimization below. */
13938  if (swash && element_count > 1) {
13939   SvREFCNT_dec_NN(swash);
13940   swash = NULL;
13941  }
13942
13943  if (! cp_list
13944   && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13945  {
13946   ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13947  }
13948  else {
13949   /* av[0] stores the character class description in its textual form:
13950   *       used later (regexec.c:Perl_regclass_swash()) to initialize the
13951   *       appropriate swash, and is also useful for dumping the regnode.
13952   * av[1] if NULL, is a placeholder to later contain the swash computed
13953   *       from av[0].  But if no further computation need be done, the
13954   *       swash is stored there now.
13955   * av[2] stores the cp_list inversion list for use in addition or
13956   *       instead of av[0]; used only if av[1] is NULL
13957   * av[3] is set if any component of the class is from a user-defined
13958   *       property; used only if av[1] is NULL */
13959   AV * const av = newAV();
13960   SV *rv;
13961
13962   av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13963       ? SvREFCNT_inc(listsv) : &PL_sv_undef);
13964   if (swash) {
13965    av_store(av, 1, swash);
13966    SvREFCNT_dec_NN(cp_list);
13967   }
13968   else {
13969    av_store(av, 1, NULL);
13970    if (cp_list) {
13971     av_store(av, 2, cp_list);
13972     av_store(av, 3, newSVuv(has_user_defined_property));
13973    }
13974   }
13975
13976   rv = newRV_noinc(MUTABLE_SV(av));
13977   n = add_data(pRExC_state, 1, "s");
13978   RExC_rxi->data->data[n] = (void*)rv;
13979   ARG_SET(ret, n);
13980  }
13981
13982  *flagp |= HASWIDTH|SIMPLE;
13983  return ret;
13984 }
13985 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13986
13987
13988 /* reg_skipcomment()
13989
13990    Absorbs an /x style # comments from the input stream.
13991    Returns true if there is more text remaining in the stream.
13992    Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13993    terminates the pattern without including a newline.
13994
13995    Note its the callers responsibility to ensure that we are
13996    actually in /x mode
13997
13998 */
13999
14000 STATIC bool
14001 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14002 {
14003  bool ended = 0;
14004
14005  PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14006
14007  while (RExC_parse < RExC_end)
14008   if (*RExC_parse++ == '\n') {
14009    ended = 1;
14010    break;
14011   }
14012  if (!ended) {
14013   /* we ran off the end of the pattern without ending
14014   the comment, so we have to add an \n when wrapping */
14015   RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
14016   return 0;
14017  } else
14018   return 1;
14019 }
14020
14021 /* nextchar()
14022
14023    Advances the parse position, and optionally absorbs
14024    "whitespace" from the inputstream.
14025
14026    Without /x "whitespace" means (?#...) style comments only,
14027    with /x this means (?#...) and # comments and whitespace proper.
14028
14029    Returns the RExC_parse point from BEFORE the scan occurs.
14030
14031    This is the /x friendly way of saying RExC_parse++.
14032 */
14033
14034 STATIC char*
14035 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14036 {
14037  char* const retval = RExC_parse++;
14038
14039  PERL_ARGS_ASSERT_NEXTCHAR;
14040
14041  for (;;) {
14042   if (RExC_end - RExC_parse >= 3
14043    && *RExC_parse == '('
14044    && RExC_parse[1] == '?'
14045    && RExC_parse[2] == '#')
14046   {
14047    while (*RExC_parse != ')') {
14048     if (RExC_parse == RExC_end)
14049      FAIL("Sequence (?#... not terminated");
14050     RExC_parse++;
14051    }
14052    RExC_parse++;
14053    continue;
14054   }
14055   if (RExC_flags & RXf_PMf_EXTENDED) {
14056    if (isSPACE(*RExC_parse)) {
14057     RExC_parse++;
14058     continue;
14059    }
14060    else if (*RExC_parse == '#') {
14061     if ( reg_skipcomment( pRExC_state ) )
14062      continue;
14063    }
14064   }
14065   return retval;
14066  }
14067 }
14068
14069 /*
14070 - reg_node - emit a node
14071 */
14072 STATIC regnode *   /* Location. */
14073 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
14074 {
14075  dVAR;
14076  regnode *ptr;
14077  regnode * const ret = RExC_emit;
14078  GET_RE_DEBUG_FLAGS_DECL;
14079
14080  PERL_ARGS_ASSERT_REG_NODE;
14081
14082  if (SIZE_ONLY) {
14083   SIZE_ALIGN(RExC_size);
14084   RExC_size += 1;
14085   return(ret);
14086  }
14087  if (RExC_emit >= RExC_emit_bound)
14088   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14089     op, RExC_emit, RExC_emit_bound);
14090
14091  NODE_ALIGN_FILL(ret);
14092  ptr = ret;
14093  FILL_ADVANCE_NODE(ptr, op);
14094  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
14095 #ifdef RE_TRACK_PATTERN_OFFSETS
14096  if (RExC_offsets) {         /* MJD */
14097   MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
14098    "reg_node", __LINE__,
14099    PL_reg_name[op],
14100    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
14101     ? "Overwriting end of array!\n" : "OK",
14102    (UV)(RExC_emit - RExC_emit_start),
14103    (UV)(RExC_parse - RExC_start),
14104    (UV)RExC_offsets[0]));
14105   Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14106  }
14107 #endif
14108  RExC_emit = ptr;
14109  return(ret);
14110 }
14111
14112 /*
14113 - reganode - emit a node with an argument
14114 */
14115 STATIC regnode *   /* Location. */
14116 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14117 {
14118  dVAR;
14119  regnode *ptr;
14120  regnode * const ret = RExC_emit;
14121  GET_RE_DEBUG_FLAGS_DECL;
14122
14123  PERL_ARGS_ASSERT_REGANODE;
14124
14125  if (SIZE_ONLY) {
14126   SIZE_ALIGN(RExC_size);
14127   RExC_size += 2;
14128   /*
14129   We can't do this:
14130
14131   assert(2==regarglen[op]+1);
14132
14133   Anything larger than this has to allocate the extra amount.
14134   If we changed this to be:
14135
14136   RExC_size += (1 + regarglen[op]);
14137
14138   then it wouldn't matter. Its not clear what side effect
14139   might come from that so its not done so far.
14140   -- dmq
14141   */
14142   return(ret);
14143  }
14144  if (RExC_emit >= RExC_emit_bound)
14145   Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14146     op, RExC_emit, RExC_emit_bound);
14147
14148  NODE_ALIGN_FILL(ret);
14149  ptr = ret;
14150  FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14151  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
14152 #ifdef RE_TRACK_PATTERN_OFFSETS
14153  if (RExC_offsets) {         /* MJD */
14154   MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14155    "reganode",
14156    __LINE__,
14157    PL_reg_name[op],
14158    (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14159    "Overwriting end of array!\n" : "OK",
14160    (UV)(RExC_emit - RExC_emit_start),
14161    (UV)(RExC_parse - RExC_start),
14162    (UV)RExC_offsets[0]));
14163   Set_Cur_Node_Offset;
14164  }
14165 #endif
14166  RExC_emit = ptr;
14167  return(ret);
14168 }
14169
14170 /*
14171 - reguni - emit (if appropriate) a Unicode character
14172 */
14173 STATIC STRLEN
14174 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14175 {
14176  dVAR;
14177
14178  PERL_ARGS_ASSERT_REGUNI;
14179
14180  return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14181 }
14182
14183 /*
14184 - reginsert - insert an operator in front of already-emitted operand
14185 *
14186 * Means relocating the operand.
14187 */
14188 STATIC void
14189 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14190 {
14191  dVAR;
14192  regnode *src;
14193  regnode *dst;
14194  regnode *place;
14195  const int offset = regarglen[(U8)op];
14196  const int size = NODE_STEP_REGNODE + offset;
14197  GET_RE_DEBUG_FLAGS_DECL;
14198
14199  PERL_ARGS_ASSERT_REGINSERT;
14200  PERL_UNUSED_ARG(depth);
14201 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14202  DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14203  if (SIZE_ONLY) {
14204   RExC_size += size;
14205   return;
14206  }
14207
14208  src = RExC_emit;
14209  RExC_emit += size;
14210  dst = RExC_emit;
14211  if (RExC_open_parens) {
14212   int paren;
14213   /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14214   for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14215    if ( RExC_open_parens[paren] >= opnd ) {
14216     /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14217     RExC_open_parens[paren] += size;
14218    } else {
14219     /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14220    }
14221    if ( RExC_close_parens[paren] >= opnd ) {
14222     /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14223     RExC_close_parens[paren] += size;
14224    } else {
14225     /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14226    }
14227   }
14228  }
14229
14230  while (src > opnd) {
14231   StructCopy(--src, --dst, regnode);
14232 #ifdef RE_TRACK_PATTERN_OFFSETS
14233   if (RExC_offsets) {     /* MJD 20010112 */
14234    MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14235     "reg_insert",
14236     __LINE__,
14237     PL_reg_name[op],
14238     (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14239      ? "Overwriting end of array!\n" : "OK",
14240     (UV)(src - RExC_emit_start),
14241     (UV)(dst - RExC_emit_start),
14242     (UV)RExC_offsets[0]));
14243    Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14244    Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14245   }
14246 #endif
14247  }
14248
14249
14250  place = opnd;  /* Op node, where operand used to be. */
14251 #ifdef RE_TRACK_PATTERN_OFFSETS
14252  if (RExC_offsets) {         /* MJD */
14253   MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14254    "reginsert",
14255    __LINE__,
14256    PL_reg_name[op],
14257    (UV)(place - RExC_emit_start) > RExC_offsets[0]
14258    ? "Overwriting end of array!\n" : "OK",
14259    (UV)(place - RExC_emit_start),
14260    (UV)(RExC_parse - RExC_start),
14261    (UV)RExC_offsets[0]));
14262   Set_Node_Offset(place, RExC_parse);
14263   Set_Node_Length(place, 1);
14264  }
14265 #endif
14266  src = NEXTOPER(place);
14267  FILL_ADVANCE_NODE(place, op);
14268  REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
14269  Zero(src, offset, regnode);
14270 }
14271
14272 /*
14273 - regtail - set the next-pointer at the end of a node chain of p to val.
14274 - SEE ALSO: regtail_study
14275 */
14276 /* TODO: All three parms should be const */
14277 STATIC void
14278 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14279 {
14280  dVAR;
14281  regnode *scan;
14282  GET_RE_DEBUG_FLAGS_DECL;
14283
14284  PERL_ARGS_ASSERT_REGTAIL;
14285 #ifndef DEBUGGING
14286  PERL_UNUSED_ARG(depth);
14287 #endif
14288
14289  if (SIZE_ONLY)
14290   return;
14291
14292  /* Find last node. */
14293  scan = p;
14294  for (;;) {
14295   regnode * const temp = regnext(scan);
14296   DEBUG_PARSE_r({
14297    SV * const mysv=sv_newmortal();
14298    DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14299    regprop(RExC_rx, mysv, scan);
14300    PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14301     SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14302      (temp == NULL ? "->" : ""),
14303      (temp == NULL ? PL_reg_name[OP(val)] : "")
14304    );
14305   });
14306   if (temp == NULL)
14307    break;
14308   scan = temp;
14309  }
14310
14311  if (reg_off_by_arg[OP(scan)]) {
14312   ARG_SET(scan, val - scan);
14313  }
14314  else {
14315   NEXT_OFF(scan) = val - scan;
14316  }
14317 }
14318
14319 #ifdef DEBUGGING
14320 /*
14321 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14322 - Look for optimizable sequences at the same time.
14323 - currently only looks for EXACT chains.
14324
14325 This is experimental code. The idea is to use this routine to perform
14326 in place optimizations on branches and groups as they are constructed,
14327 with the long term intention of removing optimization from study_chunk so
14328 that it is purely analytical.
14329
14330 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14331 to control which is which.
14332
14333 */
14334 /* TODO: All four parms should be const */
14335
14336 STATIC U8
14337 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14338 {
14339  dVAR;
14340  regnode *scan;
14341  U8 exact = PSEUDO;
14342 #ifdef EXPERIMENTAL_INPLACESCAN
14343  I32 min = 0;
14344 #endif
14345  GET_RE_DEBUG_FLAGS_DECL;
14346
14347  PERL_ARGS_ASSERT_REGTAIL_STUDY;
14348
14349
14350  if (SIZE_ONLY)
14351   return exact;
14352
14353  /* Find last node. */
14354
14355  scan = p;
14356  for (;;) {
14357   regnode * const temp = regnext(scan);
14358 #ifdef EXPERIMENTAL_INPLACESCAN
14359   if (PL_regkind[OP(scan)] == EXACT) {
14360    bool has_exactf_sharp_s; /* Unexamined in this routine */
14361    if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14362     return EXACT;
14363   }
14364 #endif
14365   if ( exact ) {
14366    switch (OP(scan)) {
14367     case EXACT:
14368     case EXACTF:
14369     case EXACTFA:
14370     case EXACTFU:
14371     case EXACTFU_SS:
14372     case EXACTFU_TRICKYFOLD:
14373     case EXACTFL:
14374       if( exact == PSEUDO )
14375        exact= OP(scan);
14376       else if ( exact != OP(scan) )
14377        exact= 0;
14378     case NOTHING:
14379      break;
14380     default:
14381      exact= 0;
14382    }
14383   }
14384   DEBUG_PARSE_r({
14385    SV * const mysv=sv_newmortal();
14386    DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14387    regprop(RExC_rx, mysv, scan);
14388    PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14389     SvPV_nolen_const(mysv),
14390     REG_NODE_NUM(scan),
14391     PL_reg_name[exact]);
14392   });
14393   if (temp == NULL)
14394    break;
14395   scan = temp;
14396  }
14397  DEBUG_PARSE_r({
14398   SV * const mysv_val=sv_newmortal();
14399   DEBUG_PARSE_MSG("");
14400   regprop(RExC_rx, mysv_val, val);
14401   PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14402      SvPV_nolen_const(mysv_val),
14403      (IV)REG_NODE_NUM(val),
14404      (IV)(val - scan)
14405   );
14406  });
14407  if (reg_off_by_arg[OP(scan)]) {
14408   ARG_SET(scan, val - scan);
14409  }
14410  else {
14411   NEXT_OFF(scan) = val - scan;
14412  }
14413
14414  return exact;
14415 }
14416 #endif
14417
14418 /*
14419  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14420  */
14421 #ifdef DEBUGGING
14422 static void
14423 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14424 {
14425  int bit;
14426  int set=0;
14427  regex_charset cs;
14428
14429  for (bit=0; bit<32; bit++) {
14430   if (flags & (1<<bit)) {
14431    if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
14432     continue;
14433    }
14434    if (!set++ && lead)
14435     PerlIO_printf(Perl_debug_log, "%s",lead);
14436    PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14437   }
14438  }
14439  if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14440    if (!set++ && lead) {
14441     PerlIO_printf(Perl_debug_log, "%s",lead);
14442    }
14443    switch (cs) {
14444     case REGEX_UNICODE_CHARSET:
14445      PerlIO_printf(Perl_debug_log, "UNICODE");
14446      break;
14447     case REGEX_LOCALE_CHARSET:
14448      PerlIO_printf(Perl_debug_log, "LOCALE");
14449      break;
14450     case REGEX_ASCII_RESTRICTED_CHARSET:
14451      PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14452      break;
14453     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14454      PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14455      break;
14456     default:
14457      PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14458      break;
14459    }
14460  }
14461  if (lead)  {
14462   if (set)
14463    PerlIO_printf(Perl_debug_log, "\n");
14464   else
14465    PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14466  }
14467 }
14468 #endif
14469
14470 void
14471 Perl_regdump(pTHX_ const regexp *r)
14472 {
14473 #ifdef DEBUGGING
14474  dVAR;
14475  SV * const sv = sv_newmortal();
14476  SV *dsv= sv_newmortal();
14477  RXi_GET_DECL(r,ri);
14478  GET_RE_DEBUG_FLAGS_DECL;
14479
14480  PERL_ARGS_ASSERT_REGDUMP;
14481
14482  (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14483
14484  /* Header fields of interest. */
14485  if (r->anchored_substr) {
14486   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14487    RE_SV_DUMPLEN(r->anchored_substr), 30);
14488   PerlIO_printf(Perl_debug_log,
14489      "anchored %s%s at %"IVdf" ",
14490      s, RE_SV_TAIL(r->anchored_substr),
14491      (IV)r->anchored_offset);
14492  } else if (r->anchored_utf8) {
14493   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14494    RE_SV_DUMPLEN(r->anchored_utf8), 30);
14495   PerlIO_printf(Perl_debug_log,
14496      "anchored utf8 %s%s at %"IVdf" ",
14497      s, RE_SV_TAIL(r->anchored_utf8),
14498      (IV)r->anchored_offset);
14499  }
14500  if (r->float_substr) {
14501   RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14502    RE_SV_DUMPLEN(r->float_substr), 30);
14503   PerlIO_printf(Perl_debug_log,
14504      "floating %s%s at %"IVdf"..%"UVuf" ",
14505      s, RE_SV_TAIL(r->float_substr),
14506      (IV)r->float_min_offset, (UV)r->float_max_offset);
14507  } else if (r->float_utf8) {
14508   RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14509    RE_SV_DUMPLEN(r->float_utf8), 30);
14510   PerlIO_printf(Perl_debug_log,
14511      "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14512      s, RE_SV_TAIL(r->float_utf8),
14513      (IV)r->float_min_offset, (UV)r->float_max_offset);
14514  }
14515  if (r->check_substr || r->check_utf8)
14516   PerlIO_printf(Perl_debug_log,
14517      (const char *)
14518      (r->check_substr == r->float_substr
14519      && r->check_utf8 == r->float_utf8
14520      ? "(checking floating" : "(checking anchored"));
14521  if (r->extflags & RXf_NOSCAN)
14522   PerlIO_printf(Perl_debug_log, " noscan");
14523  if (r->extflags & RXf_CHECK_ALL)
14524   PerlIO_printf(Perl_debug_log, " isall");
14525  if (r->check_substr || r->check_utf8)
14526   PerlIO_printf(Perl_debug_log, ") ");
14527
14528  if (ri->regstclass) {
14529   regprop(r, sv, ri->regstclass);
14530   PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14531  }
14532  if (r->extflags & RXf_ANCH) {
14533   PerlIO_printf(Perl_debug_log, "anchored");
14534   if (r->extflags & RXf_ANCH_BOL)
14535    PerlIO_printf(Perl_debug_log, "(BOL)");
14536   if (r->extflags & RXf_ANCH_MBOL)
14537    PerlIO_printf(Perl_debug_log, "(MBOL)");
14538   if (r->extflags & RXf_ANCH_SBOL)
14539    PerlIO_printf(Perl_debug_log, "(SBOL)");
14540   if (r->extflags & RXf_ANCH_GPOS)
14541    PerlIO_printf(Perl_debug_log, "(GPOS)");
14542   PerlIO_putc(Perl_debug_log, ' ');
14543  }
14544  if (r->extflags & RXf_GPOS_SEEN)
14545   PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14546  if (r->intflags & PREGf_SKIP)
14547   PerlIO_printf(Perl_debug_log, "plus ");
14548  if (r->intflags & PREGf_IMPLICIT)
14549   PerlIO_printf(Perl_debug_log, "implicit ");
14550  PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14551  if (r->extflags & RXf_EVAL_SEEN)
14552   PerlIO_printf(Perl_debug_log, "with eval ");
14553  PerlIO_printf(Perl_debug_log, "\n");
14554  DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
14555 #else
14556  PERL_ARGS_ASSERT_REGDUMP;
14557  PERL_UNUSED_CONTEXT;
14558  PERL_UNUSED_ARG(r);
14559 #endif /* DEBUGGING */
14560 }
14561
14562 /*
14563 - regprop - printable representation of opcode
14564 */
14565 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
14566 STMT_START { \
14567   if (do_sep) {                           \
14568    Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
14569    if (flags & ANYOF_INVERT)           \
14570     /*make sure the invert info is in each */ \
14571     sv_catpvs(sv, "^");             \
14572    do_sep = 0;                         \
14573   }                                       \
14574 } STMT_END
14575
14576 void
14577 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14578 {
14579 #ifdef DEBUGGING
14580  dVAR;
14581  int k;
14582
14583  /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14584  static const char * const anyofs[] = {
14585 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14586  || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6                   \
14587  || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9            \
14588  || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12               \
14589  || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15               \
14590  || _CC_VERTSPACE != 16
14591   #error Need to adjust order of anyofs[]
14592 #endif
14593   "[\\w]",
14594   "[\\W]",
14595   "[\\d]",
14596   "[\\D]",
14597   "[:alpha:]",
14598   "[:^alpha:]",
14599   "[:lower:]",
14600   "[:^lower:]",
14601   "[:upper:]",
14602   "[:^upper:]",
14603   "[:punct:]",
14604   "[:^punct:]",
14605   "[:print:]",
14606   "[:^print:]",
14607   "[:alnum:]",
14608   "[:^alnum:]",
14609   "[:graph:]",
14610   "[:^graph:]",
14611   "[:cased:]",
14612   "[:^cased:]",
14613   "[\\s]",
14614   "[\\S]",
14615   "[:blank:]",
14616   "[:^blank:]",
14617   "[:xdigit:]",
14618   "[:^xdigit:]",
14619   "[:space:]",
14620   "[:^space:]",
14621   "[:cntrl:]",
14622   "[:^cntrl:]",
14623   "[:ascii:]",
14624   "[:^ascii:]",
14625   "[\\v]",
14626   "[\\V]"
14627  };
14628  RXi_GET_DECL(prog,progi);
14629  GET_RE_DEBUG_FLAGS_DECL;
14630
14631  PERL_ARGS_ASSERT_REGPROP;
14632
14633  sv_setpvs(sv, "");
14634
14635  if (OP(o) > REGNODE_MAX)  /* regnode.type is unsigned */
14636   /* It would be nice to FAIL() here, but this may be called from
14637   regexec.c, and it would be hard to supply pRExC_state. */
14638   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14639  sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14640
14641  k = PL_regkind[OP(o)];
14642
14643  if (k == EXACT) {
14644   sv_catpvs(sv, " ");
14645   /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14646   * is a crude hack but it may be the best for now since
14647   * we have no flag "this EXACTish node was UTF-8"
14648   * --jhi */
14649   pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14650     PERL_PV_ESCAPE_UNI_DETECT |
14651     PERL_PV_ESCAPE_NONASCII   |
14652     PERL_PV_PRETTY_ELLIPSES   |
14653     PERL_PV_PRETTY_LTGT       |
14654     PERL_PV_PRETTY_NOCLEAR
14655     );
14656  } else if (k == TRIE) {
14657   /* print the details of the trie in dumpuntil instead, as
14658   * progi->data isn't available here */
14659   const char op = OP(o);
14660   const U32 n = ARG(o);
14661   const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14662    (reg_ac_data *)progi->data->data[n] :
14663    NULL;
14664   const reg_trie_data * const trie
14665    = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14666
14667   Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14668   DEBUG_TRIE_COMPILE_r(
14669    Perl_sv_catpvf(aTHX_ sv,
14670     "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
14671     (UV)trie->startstate,
14672     (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14673     (UV)trie->wordcount,
14674     (UV)trie->minlen,
14675     (UV)trie->maxlen,
14676     (UV)TRIE_CHARCOUNT(trie),
14677     (UV)trie->uniquecharcount
14678    )
14679   );
14680   if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14681    int i;
14682    int rangestart = -1;
14683    U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
14684    sv_catpvs(sv, "[");
14685    for (i = 0; i <= 256; i++) {
14686     if (i < 256 && BITMAP_TEST(bitmap,i)) {
14687      if (rangestart == -1)
14688       rangestart = i;
14689     } else if (rangestart != -1) {
14690      if (i <= rangestart + 3)
14691       for (; rangestart < i; rangestart++)
14692        put_byte(sv, rangestart);
14693      else {
14694       put_byte(sv, rangestart);
14695       sv_catpvs(sv, "-");
14696       put_byte(sv, i - 1);
14697      }
14698      rangestart = -1;
14699     }
14700    }
14701    sv_catpvs(sv, "]");
14702   }
14703
14704  } else if (k == CURLY) {
14705   if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14706    Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14707   Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14708  }
14709  else if (k == WHILEM && o->flags)   /* Ordinal/of */
14710   Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14711  else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14712   Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
14713   if ( RXp_PAREN_NAMES(prog) ) {
14714    if ( k != REF || (OP(o) < NREF)) {
14715     AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14716     SV **name= av_fetch(list, ARG(o), 0 );
14717     if (name)
14718      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14719    }
14720    else {
14721     AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14722     SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14723     I32 *nums=(I32*)SvPVX(sv_dat);
14724     SV **name= av_fetch(list, nums[0], 0 );
14725     I32 n;
14726     if (name) {
14727      for ( n=0; n<SvIVX(sv_dat); n++ ) {
14728       Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14729          (n ? "," : ""), (IV)nums[n]);
14730      }
14731      Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14732     }
14733    }
14734   }
14735  } else if (k == GOSUB)
14736   Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14737  else if (k == VERB) {
14738   if (!o->flags)
14739    Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14740       SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14741  } else if (k == LOGICAL)
14742   Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
14743  else if (k == ANYOF) {
14744   int i, rangestart = -1;
14745   const U8 flags = ANYOF_FLAGS(o);
14746   int do_sep = 0;
14747
14748
14749   if (flags & ANYOF_LOCALE)
14750    sv_catpvs(sv, "{loc}");
14751   if (flags & ANYOF_LOC_FOLD)
14752    sv_catpvs(sv, "{i}");
14753   Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14754   if (flags & ANYOF_INVERT)
14755    sv_catpvs(sv, "^");
14756
14757   /* output what the standard cp 0-255 bitmap matches */
14758   for (i = 0; i <= 256; i++) {
14759    if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
14760     if (rangestart == -1)
14761      rangestart = i;
14762    } else if (rangestart != -1) {
14763     if (i <= rangestart + 3)
14764      for (; rangestart < i; rangestart++)
14765       put_byte(sv, rangestart);
14766     else {
14767      put_byte(sv, rangestart);
14768      sv_catpvs(sv, "-");
14769      put_byte(sv, i - 1);
14770     }
14771     do_sep = 1;
14772     rangestart = -1;
14773    }
14774   }
14775
14776   EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14777   /* output any special charclass tests (used entirely under use locale) */
14778   if (ANYOF_CLASS_TEST_ANY_SET(o))
14779    for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
14780     if (ANYOF_CLASS_TEST(o,i)) {
14781      sv_catpv(sv, anyofs[i]);
14782      do_sep = 1;
14783     }
14784
14785   EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14786
14787   if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14788    sv_catpvs(sv, "{non-utf8-latin1-all}");
14789   }
14790
14791   /* output information about the unicode matching */
14792   if (flags & ANYOF_UNICODE_ALL)
14793    sv_catpvs(sv, "{unicode_all}");
14794   else if (ANYOF_NONBITMAP(o))
14795    sv_catpvs(sv, "{unicode}");
14796   if (flags & ANYOF_NONBITMAP_NON_UTF8)
14797    sv_catpvs(sv, "{outside bitmap}");
14798
14799   if (ANYOF_NONBITMAP(o)) {
14800    SV *lv; /* Set if there is something outside the bit map */
14801    SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14802    bool byte_output = FALSE;   /* If something in the bitmap has been
14803           output */
14804
14805    if (lv && lv != &PL_sv_undef) {
14806     if (sw) {
14807      U8 s[UTF8_MAXBYTES_CASE+1];
14808
14809      for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
14810       uvchr_to_utf8(s, i);
14811
14812       if (i < 256
14813        && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
14814                things already
14815                output as part
14816                of the bitmap */
14817        && swash_fetch(sw, s, TRUE))
14818       {
14819        if (rangestart == -1)
14820         rangestart = i;
14821       } else if (rangestart != -1) {
14822        byte_output = TRUE;
14823        if (i <= rangestart + 3)
14824         for (; rangestart < i; rangestart++) {
14825          put_byte(sv, rangestart);
14826         }
14827        else {
14828         put_byte(sv, rangestart);
14829         sv_catpvs(sv, "-");
14830         put_byte(sv, i-1);
14831        }
14832        rangestart = -1;
14833       }
14834      }
14835     }
14836
14837     {
14838      char *s = savesvpv(lv);
14839      char * const origs = s;
14840
14841      while (*s && *s != '\n')
14842       s++;
14843
14844      if (*s == '\n') {
14845       const char * const t = ++s;
14846
14847       if (byte_output) {
14848        sv_catpvs(sv, " ");
14849       }
14850
14851       while (*s) {
14852        if (*s == '\n') {
14853
14854         /* Truncate very long output */
14855         if (s - origs > 256) {
14856          Perl_sv_catpvf(aTHX_ sv,
14857             "%.*s...",
14858             (int) (s - origs - 1),
14859             t);
14860          goto out_dump;
14861         }
14862         *s = ' ';
14863        }
14864        else if (*s == '\t') {
14865         *s = '-';
14866        }
14867        s++;
14868       }
14869       if (s[-1] == ' ')
14870        s[-1] = 0;
14871
14872       sv_catpv(sv, t);
14873      }
14874
14875     out_dump:
14876
14877      Safefree(origs);
14878     }
14879     SvREFCNT_dec_NN(lv);
14880    }
14881   }
14882
14883   Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14884  }
14885  else if (k == POSIXD || k == NPOSIXD) {
14886   U8 index = FLAGS(o) * 2;
14887   if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14888    Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14889   }
14890   else {
14891    sv_catpv(sv, anyofs[index]);
14892   }
14893  }
14894  else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14895   Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14896 #else
14897  PERL_UNUSED_CONTEXT;
14898  PERL_UNUSED_ARG(sv);
14899  PERL_UNUSED_ARG(o);
14900  PERL_UNUSED_ARG(prog);
14901 #endif /* DEBUGGING */
14902 }
14903
14904 SV *
14905 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14906 {    /* Assume that RE_INTUIT is set */
14907  dVAR;
14908  struct regexp *const prog = ReANY(r);
14909  GET_RE_DEBUG_FLAGS_DECL;
14910
14911  PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14912  PERL_UNUSED_CONTEXT;
14913
14914  DEBUG_COMPILE_r(
14915   {
14916    const char * const s = SvPV_nolen_const(prog->check_substr
14917      ? prog->check_substr : prog->check_utf8);
14918
14919    if (!PL_colorset) reginitcolors();
14920    PerlIO_printf(Perl_debug_log,
14921      "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14922      PL_colors[4],
14923      prog->check_substr ? "" : "utf8 ",
14924      PL_colors[5],PL_colors[0],
14925      s,
14926      PL_colors[1],
14927      (strlen(s) > 60 ? "..." : ""));
14928   } );
14929
14930  return prog->check_substr ? prog->check_substr : prog->check_utf8;
14931 }
14932
14933 /*
14934    pregfree()
14935
14936    handles refcounting and freeing the perl core regexp structure. When
14937    it is necessary to actually free the structure the first thing it
14938    does is call the 'free' method of the regexp_engine associated to
14939    the regexp, allowing the handling of the void *pprivate; member
14940    first. (This routine is not overridable by extensions, which is why
14941    the extensions free is called first.)
14942
14943    See regdupe and regdupe_internal if you change anything here.
14944 */
14945 #ifndef PERL_IN_XSUB_RE
14946 void
14947 Perl_pregfree(pTHX_ REGEXP *r)
14948 {
14949  SvREFCNT_dec(r);
14950 }
14951
14952 void
14953 Perl_pregfree2(pTHX_ REGEXP *rx)
14954 {
14955  dVAR;
14956  struct regexp *const r = ReANY(rx);
14957  GET_RE_DEBUG_FLAGS_DECL;
14958
14959  PERL_ARGS_ASSERT_PREGFREE2;
14960
14961  if (r->mother_re) {
14962   ReREFCNT_dec(r->mother_re);
14963  } else {
14964   CALLREGFREE_PVT(rx); /* free the private data */
14965   SvREFCNT_dec(RXp_PAREN_NAMES(r));
14966   Safefree(r->xpv_len_u.xpvlenu_pv);
14967  }
14968  if (r->substrs) {
14969   SvREFCNT_dec(r->anchored_substr);
14970   SvREFCNT_dec(r->anchored_utf8);
14971   SvREFCNT_dec(r->float_substr);
14972   SvREFCNT_dec(r->float_utf8);
14973   Safefree(r->substrs);
14974  }
14975  RX_MATCH_COPY_FREE(rx);
14976 #ifdef PERL_ANY_COW
14977  SvREFCNT_dec(r->saved_copy);
14978 #endif
14979  Safefree(r->offs);
14980  SvREFCNT_dec(r->qr_anoncv);
14981  rx->sv_u.svu_rx = 0;
14982 }
14983
14984 /*  reg_temp_copy()
14985
14986  This is a hacky workaround to the structural issue of match results
14987  being stored in the regexp structure which is in turn stored in
14988  PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14989  could be PL_curpm in multiple contexts, and could require multiple
14990  result sets being associated with the pattern simultaneously, such
14991  as when doing a recursive match with (??{$qr})
14992
14993  The solution is to make a lightweight copy of the regexp structure
14994  when a qr// is returned from the code executed by (??{$qr}) this
14995  lightweight copy doesn't actually own any of its data except for
14996  the starp/end and the actual regexp structure itself.
14997
14998 */
14999
15000
15001 REGEXP *
15002 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15003 {
15004  struct regexp *ret;
15005  struct regexp *const r = ReANY(rx);
15006  const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15007
15008  PERL_ARGS_ASSERT_REG_TEMP_COPY;
15009
15010  if (!ret_x)
15011   ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
15012  else {
15013   SvOK_off((SV *)ret_x);
15014   if (islv) {
15015    /* For PVLVs, SvANY points to the xpvlv body while sv_u points
15016    to the regexp.  (For SVt_REGEXPs, sv_upgrade has already
15017    made both spots point to the same regexp body.) */
15018    REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
15019    assert(!SvPVX(ret_x));
15020    ret_x->sv_u.svu_rx = temp->sv_any;
15021    temp->sv_any = NULL;
15022    SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
15023    SvREFCNT_dec_NN(temp);
15024    /* SvCUR still resides in the xpvlv struct, so the regexp copy-
15025    ing below will not set it. */
15026    SvCUR_set(ret_x, SvCUR(rx));
15027   }
15028  }
15029  /* This ensures that SvTHINKFIRST(sv) is true, and hence that
15030  sv_force_normal(sv) is called.  */
15031  SvFAKE_on(ret_x);
15032  ret = ReANY(ret_x);
15033
15034  SvFLAGS(ret_x) |= SvUTF8(rx);
15035  /* We share the same string buffer as the original regexp, on which we
15036  hold a reference count, incremented when mother_re is set below.
15037  The string pointer is copied here, being part of the regexp struct.
15038  */
15039  memcpy(&(ret->xpv_cur), &(r->xpv_cur),
15040   sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
15041  if (r->offs) {
15042   const I32 npar = r->nparens+1;
15043   Newx(ret->offs, npar, regexp_paren_pair);
15044   Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15045  }
15046  if (r->substrs) {
15047   Newx(ret->substrs, 1, struct reg_substr_data);
15048   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15049
15050   SvREFCNT_inc_void(ret->anchored_substr);
15051   SvREFCNT_inc_void(ret->anchored_utf8);
15052   SvREFCNT_inc_void(ret->float_substr);
15053   SvREFCNT_inc_void(ret->float_utf8);
15054
15055   /* check_substr and check_utf8, if non-NULL, point to either their
15056   anchored or float namesakes, and don't hold a second reference.  */
15057  }
15058  RX_MATCH_COPIED_off(ret_x);
15059 #ifdef PERL_ANY_COW
15060  ret->saved_copy = NULL;
15061 #endif
15062  ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
15063  SvREFCNT_inc_void(ret->qr_anoncv);
15064
15065  return ret_x;
15066 }
15067 #endif
15068
15069 /* regfree_internal()
15070
15071    Free the private data in a regexp. This is overloadable by
15072    extensions. Perl takes care of the regexp structure in pregfree(),
15073    this covers the *pprivate pointer which technically perl doesn't
15074    know about, however of course we have to handle the
15075    regexp_internal structure when no extension is in use.
15076
15077    Note this is called before freeing anything in the regexp
15078    structure.
15079  */
15080
15081 void
15082 Perl_regfree_internal(pTHX_ REGEXP * const rx)
15083 {
15084  dVAR;
15085  struct regexp *const r = ReANY(rx);
15086  RXi_GET_DECL(r,ri);
15087  GET_RE_DEBUG_FLAGS_DECL;
15088
15089  PERL_ARGS_ASSERT_REGFREE_INTERNAL;
15090
15091  DEBUG_COMPILE_r({
15092   if (!PL_colorset)
15093    reginitcolors();
15094   {
15095    SV *dsv= sv_newmortal();
15096    RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
15097     dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
15098    PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
15099     PL_colors[4],PL_colors[5],s);
15100   }
15101  });
15102 #ifdef RE_TRACK_PATTERN_OFFSETS
15103  if (ri->u.offsets)
15104   Safefree(ri->u.offsets);             /* 20010421 MJD */
15105 #endif
15106  if (ri->code_blocks) {
15107   int n;
15108   for (n = 0; n < ri->num_code_blocks; n++)
15109    SvREFCNT_dec(ri->code_blocks[n].src_regex);
15110   Safefree(ri->code_blocks);
15111  }
15112
15113  if (ri->data) {
15114   int n = ri->data->count;
15115
15116   while (--n >= 0) {
15117   /* If you add a ->what type here, update the comment in regcomp.h */
15118    switch (ri->data->what[n]) {
15119    case 'a':
15120    case 'r':
15121    case 's':
15122    case 'S':
15123    case 'u':
15124     SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15125     break;
15126    case 'f':
15127     Safefree(ri->data->data[n]);
15128     break;
15129    case 'l':
15130    case 'L':
15131     break;
15132    case 'T':
15133     { /* Aho Corasick add-on structure for a trie node.
15134      Used in stclass optimization only */
15135      U32 refcount;
15136      reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15137      OP_REFCNT_LOCK;
15138      refcount = --aho->refcount;
15139      OP_REFCNT_UNLOCK;
15140      if ( !refcount ) {
15141       PerlMemShared_free(aho->states);
15142       PerlMemShared_free(aho->fail);
15143       /* do this last!!!! */
15144       PerlMemShared_free(ri->data->data[n]);
15145       PerlMemShared_free(ri->regstclass);
15146      }
15147     }
15148     break;
15149    case 't':
15150     {
15151      /* trie structure. */
15152      U32 refcount;
15153      reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15154      OP_REFCNT_LOCK;
15155      refcount = --trie->refcount;
15156      OP_REFCNT_UNLOCK;
15157      if ( !refcount ) {
15158       PerlMemShared_free(trie->charmap);
15159       PerlMemShared_free(trie->states);
15160       PerlMemShared_free(trie->trans);
15161       if (trie->bitmap)
15162        PerlMemShared_free(trie->bitmap);
15163       if (trie->jump)
15164        PerlMemShared_free(trie->jump);
15165       PerlMemShared_free(trie->wordinfo);
15166       /* do this last!!!! */
15167       PerlMemShared_free(ri->data->data[n]);
15168      }
15169     }
15170     break;
15171    default:
15172     Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15173    }
15174   }
15175   Safefree(ri->data->what);
15176   Safefree(ri->data);
15177  }
15178
15179  Safefree(ri);
15180 }
15181
15182 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15183 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15184 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
15185
15186 /*
15187    re_dup - duplicate a regexp.
15188
15189    This routine is expected to clone a given regexp structure. It is only
15190    compiled under USE_ITHREADS.
15191
15192    After all of the core data stored in struct regexp is duplicated
15193    the regexp_engine.dupe method is used to copy any private data
15194    stored in the *pprivate pointer. This allows extensions to handle
15195    any duplication it needs to do.
15196
15197    See pregfree() and regfree_internal() if you change anything here.
15198 */
15199 #if defined(USE_ITHREADS)
15200 #ifndef PERL_IN_XSUB_RE
15201 void
15202 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15203 {
15204  dVAR;
15205  I32 npar;
15206  const struct regexp *r = ReANY(sstr);
15207  struct regexp *ret = ReANY(dstr);
15208
15209  PERL_ARGS_ASSERT_RE_DUP_GUTS;
15210
15211  npar = r->nparens+1;
15212  Newx(ret->offs, npar, regexp_paren_pair);
15213  Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15214
15215  if (ret->substrs) {
15216   /* Do it this way to avoid reading from *r after the StructCopy().
15217   That way, if any of the sv_dup_inc()s dislodge *r from the L1
15218   cache, it doesn't matter.  */
15219   const bool anchored = r->check_substr
15220    ? r->check_substr == r->anchored_substr
15221    : r->check_utf8 == r->anchored_utf8;
15222   Newx(ret->substrs, 1, struct reg_substr_data);
15223   StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15224
15225   ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15226   ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15227   ret->float_substr = sv_dup_inc(ret->float_substr, param);
15228   ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15229
15230   /* check_substr and check_utf8, if non-NULL, point to either their
15231   anchored or float namesakes, and don't hold a second reference.  */
15232
15233   if (ret->check_substr) {
15234    if (anchored) {
15235     assert(r->check_utf8 == r->anchored_utf8);
15236     ret->check_substr = ret->anchored_substr;
15237     ret->check_utf8 = ret->anchored_utf8;
15238    } else {
15239     assert(r->check_substr == r->float_substr);
15240     assert(r->check_utf8 == r->float_utf8);
15241     ret->check_substr = ret->float_substr;
15242     ret->check_utf8 = ret->float_utf8;
15243    }
15244   } else if (ret->check_utf8) {
15245    if (anchored) {
15246     ret->check_utf8 = ret->anchored_utf8;
15247    } else {
15248     ret->check_utf8 = ret->float_utf8;
15249    }
15250   }
15251  }
15252
15253  RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15254  ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15255
15256  if (ret->pprivate)
15257   RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15258
15259  if (RX_MATCH_COPIED(dstr))
15260   ret->subbeg  = SAVEPVN(ret->subbeg, ret->sublen);
15261  else
15262   ret->subbeg = NULL;
15263 #ifdef PERL_ANY_COW
15264  ret->saved_copy = NULL;
15265 #endif
15266
15267  /* Whether mother_re be set or no, we need to copy the string.  We
15268  cannot refrain from copying it when the storage points directly to
15269  our mother regexp, because that's
15270    1: a buffer in a different thread
15271    2: something we no longer hold a reference on
15272    so we need to copy it locally.  */
15273  RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15274  ret->mother_re   = NULL;
15275  ret->gofs = 0;
15276 }
15277 #endif /* PERL_IN_XSUB_RE */
15278
15279 /*
15280    regdupe_internal()
15281
15282    This is the internal complement to regdupe() which is used to copy
15283    the structure pointed to by the *pprivate pointer in the regexp.
15284    This is the core version of the extension overridable cloning hook.
15285    The regexp structure being duplicated will be copied by perl prior
15286    to this and will be provided as the regexp *r argument, however
15287    with the /old/ structures pprivate pointer value. Thus this routine
15288    may override any copying normally done by perl.
15289
15290    It returns a pointer to the new regexp_internal structure.
15291 */
15292
15293 void *
15294 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15295 {
15296  dVAR;
15297  struct regexp *const r = ReANY(rx);
15298  regexp_internal *reti;
15299  int len;
15300  RXi_GET_DECL(r,ri);
15301
15302  PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15303
15304  len = ProgLen(ri);
15305
15306  Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15307  Copy(ri->program, reti->program, len+1, regnode);
15308
15309  reti->num_code_blocks = ri->num_code_blocks;
15310  if (ri->code_blocks) {
15311   int n;
15312   Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15313     struct reg_code_block);
15314   Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15315     struct reg_code_block);
15316   for (n = 0; n < ri->num_code_blocks; n++)
15317    reti->code_blocks[n].src_regex = (REGEXP*)
15318      sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15319  }
15320  else
15321   reti->code_blocks = NULL;
15322
15323  reti->regstclass = NULL;
15324
15325  if (ri->data) {
15326   struct reg_data *d;
15327   const int count = ri->data->count;
15328   int i;
15329
15330   Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15331     char, struct reg_data);
15332   Newx(d->what, count, U8);
15333
15334   d->count = count;
15335   for (i = 0; i < count; i++) {
15336    d->what[i] = ri->data->what[i];
15337    switch (d->what[i]) {
15338     /* see also regcomp.h and regfree_internal() */
15339    case 'a': /* actually an AV, but the dup function is identical.  */
15340    case 'r':
15341    case 's':
15342    case 'S':
15343    case 'u': /* actually an HV, but the dup function is identical.  */
15344     d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15345     break;
15346    case 'f':
15347     /* This is cheating. */
15348     Newx(d->data[i], 1, struct regnode_charclass_class);
15349     StructCopy(ri->data->data[i], d->data[i],
15350        struct regnode_charclass_class);
15351     reti->regstclass = (regnode*)d->data[i];
15352     break;
15353    case 'T':
15354     /* Trie stclasses are readonly and can thus be shared
15355     * without duplication. We free the stclass in pregfree
15356     * when the corresponding reg_ac_data struct is freed.
15357     */
15358     reti->regstclass= ri->regstclass;
15359     /* Fall through */
15360    case 't':
15361     OP_REFCNT_LOCK;
15362     ((reg_trie_data*)ri->data->data[i])->refcount++;
15363     OP_REFCNT_UNLOCK;
15364     /* Fall through */
15365    case 'l':
15366    case 'L':
15367     d->data[i] = ri->data->data[i];
15368     break;
15369    default:
15370     Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15371    }
15372   }
15373
15374   reti->data = d;
15375  }
15376  else
15377   reti->data = NULL;
15378
15379  reti->name_list_idx = ri->name_list_idx;
15380
15381 #ifdef RE_TRACK_PATTERN_OFFSETS
15382  if (ri->u.offsets) {
15383   Newx(reti->u.offsets, 2*len+1, U32);
15384   Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15385  }
15386 #else
15387  SetProgLen(reti,len);
15388 #endif
15389
15390  return (void*)reti;
15391 }
15392
15393 #endif    /* USE_ITHREADS */
15394
15395 #ifndef PERL_IN_XSUB_RE
15396
15397 /*
15398  - regnext - dig the "next" pointer out of a node
15399  */
15400 regnode *
15401 Perl_regnext(pTHX_ regnode *p)
15402 {
15403  dVAR;
15404  I32 offset;
15405
15406  if (!p)
15407   return(NULL);
15408
15409  if (OP(p) > REGNODE_MAX) {  /* regnode.type is unsigned */
15410   Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15411  }
15412
15413  offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15414  if (offset == 0)
15415   return(NULL);
15416
15417  return(p+offset);
15418 }
15419 #endif
15420
15421 STATIC void
15422 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
15423 {
15424  va_list args;
15425  STRLEN l1 = strlen(pat1);
15426  STRLEN l2 = strlen(pat2);
15427  char buf[512];
15428  SV *msv;
15429  const char *message;
15430
15431  PERL_ARGS_ASSERT_RE_CROAK2;
15432
15433  if (l1 > 510)
15434   l1 = 510;
15435  if (l1 + l2 > 510)
15436   l2 = 510 - l1;
15437  Copy(pat1, buf, l1 , char);
15438  Copy(pat2, buf + l1, l2 , char);
15439  buf[l1 + l2] = '\n';
15440  buf[l1 + l2 + 1] = '\0';
15441 #ifdef I_STDARG
15442  /* ANSI variant takes additional second argument */
15443  va_start(args, pat2);
15444 #else
15445  va_start(args);
15446 #endif
15447  msv = vmess(buf, &args);
15448  va_end(args);
15449  message = SvPV_const(msv,l1);
15450  if (l1 > 512)
15451   l1 = 512;
15452  Copy(message, buf, l1 , char);
15453  buf[l1-1] = '\0';   /* Overwrite \n */
15454  Perl_croak(aTHX_ "%s", buf);
15455 }
15456
15457 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
15458
15459 #ifndef PERL_IN_XSUB_RE
15460 void
15461 Perl_save_re_context(pTHX)
15462 {
15463  dVAR;
15464
15465  struct re_save_state *state;
15466
15467  SAVEVPTR(PL_curcop);
15468  SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
15469
15470  state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
15471  PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
15472  SSPUSHUV(SAVEt_RE_STATE);
15473
15474  Copy(&PL_reg_state, state, 1, struct re_save_state);
15475
15476  PL_reg_oldsaved = NULL;
15477  PL_reg_oldsavedlen = 0;
15478  PL_reg_oldsavedoffset = 0;
15479  PL_reg_oldsavedcoffset = 0;
15480  PL_reg_maxiter = 0;
15481  PL_reg_leftiter = 0;
15482  PL_reg_poscache = NULL;
15483  PL_reg_poscache_size = 0;
15484 #ifdef PERL_ANY_COW
15485  PL_nrs = NULL;
15486 #endif
15487
15488  /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15489  if (PL_curpm) {
15490   const REGEXP * const rx = PM_GETRE(PL_curpm);
15491   if (rx) {
15492    U32 i;
15493    for (i = 1; i <= RX_NPARENS(rx); i++) {
15494     char digits[TYPE_CHARS(long)];
15495     const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15496     GV *const *const gvp
15497      = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15498
15499     if (gvp) {
15500      GV * const gv = *gvp;
15501      if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15502       save_scalar(gv);
15503     }
15504    }
15505   }
15506  }
15507 }
15508 #endif
15509
15510 #ifdef DEBUGGING
15511
15512 STATIC void
15513 S_put_byte(pTHX_ SV *sv, int c)
15514 {
15515  PERL_ARGS_ASSERT_PUT_BYTE;
15516
15517  /* Our definition of isPRINT() ignores locales, so only bytes that are
15518  not part of UTF-8 are considered printable. I assume that the same
15519  holds for UTF-EBCDIC.
15520  Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15521  which Wikipedia says:
15522
15523  EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15524  ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15525  identical, to the ASCII delete (DEL) or rubout control character. ...
15526  it is typically mapped to hexadecimal code 9F, in order to provide a
15527  unique character mapping in both directions)
15528
15529  So the old condition can be simplified to !isPRINT(c)  */
15530  if (!isPRINT(c)) {
15531   if (c < 256) {
15532    Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
15533   }
15534   else {
15535    Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15536   }
15537  }
15538  else {
15539   const char string = c;
15540   if (c == '-' || c == ']' || c == '\\' || c == '^')
15541    sv_catpvs(sv, "\\");
15542   sv_catpvn(sv, &string, 1);
15543  }
15544 }
15545
15546
15547 #define CLEAR_OPTSTART \
15548  if (optstart) STMT_START { \
15549    DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
15550    optstart=NULL; \
15551  } STMT_END
15552
15553 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
15554
15555 STATIC const regnode *
15556 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15557    const regnode *last, const regnode *plast,
15558    SV* sv, I32 indent, U32 depth)
15559 {
15560  dVAR;
15561  U8 op = PSEUDO; /* Arbitrary non-END op. */
15562  const regnode *next;
15563  const regnode *optstart= NULL;
15564
15565  RXi_GET_DECL(r,ri);
15566  GET_RE_DEBUG_FLAGS_DECL;
15567
15568  PERL_ARGS_ASSERT_DUMPUNTIL;
15569
15570 #ifdef DEBUG_DUMPUNTIL
15571  PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15572   last ? last-start : 0,plast ? plast-start : 0);
15573 #endif
15574
15575  if (plast && plast < last)
15576   last= plast;
15577
15578  while (PL_regkind[op] != END && (!last || node < last)) {
15579   /* While that wasn't END last time... */
15580   NODE_ALIGN(node);
15581   op = OP(node);
15582   if (op == CLOSE || op == WHILEM)
15583    indent--;
15584   next = regnext((regnode *)node);
15585
15586   /* Where, what. */
15587   if (OP(node) == OPTIMIZED) {
15588    if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15589     optstart = node;
15590    else
15591     goto after_print;
15592   } else
15593    CLEAR_OPTSTART;
15594
15595   regprop(r, sv, node);
15596   PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15597      (int)(2*indent + 1), "", SvPVX_const(sv));
15598
15599   if (OP(node) != OPTIMIZED) {
15600    if (next == NULL)  /* Next ptr. */
15601     PerlIO_printf(Perl_debug_log, " (0)");
15602    else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15603     PerlIO_printf(Perl_debug_log, " (FAIL)");
15604    else
15605     PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15606    (void)PerlIO_putc(Perl_debug_log, '\n');
15607   }
15608
15609  after_print:
15610   if (PL_regkind[(U8)op] == BRANCHJ) {
15611    assert(next);
15612    {
15613     const regnode *nnode = (OP(next) == LONGJMP
15614          ? regnext((regnode *)next)
15615          : next);
15616     if (last && nnode > last)
15617      nnode = last;
15618     DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15619    }
15620   }
15621   else if (PL_regkind[(U8)op] == BRANCH) {
15622    assert(next);
15623    DUMPUNTIL(NEXTOPER(node), next);
15624   }
15625   else if ( PL_regkind[(U8)op]  == TRIE ) {
15626    const regnode *this_trie = node;
15627    const char op = OP(node);
15628    const U32 n = ARG(node);
15629    const reg_ac_data * const ac = op>=AHOCORASICK ?
15630    (reg_ac_data *)ri->data->data[n] :
15631    NULL;
15632    const reg_trie_data * const trie =
15633     (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
15634 #ifdef DEBUGGING
15635    AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15636 #endif
15637    const regnode *nextbranch= NULL;
15638    I32 word_idx;
15639    sv_setpvs(sv, "");
15640    for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15641     SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15642
15643     PerlIO_printf(Perl_debug_log, "%*s%s ",
15644     (int)(2*(indent+3)), "",
15645      elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15646        PL_colors[0], PL_colors[1],
15647        (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15648        PERL_PV_PRETTY_ELLIPSES    |
15649        PERL_PV_PRETTY_LTGT
15650        )
15651        : "???"
15652     );
15653     if (trie->jump) {
15654      U16 dist= trie->jump[word_idx+1];
15655      PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15656         (UV)((dist ? this_trie + dist : next) - start));
15657      if (dist) {
15658       if (!nextbranch)
15659        nextbranch= this_trie + trie->jump[0];
15660       DUMPUNTIL(this_trie + dist, nextbranch);
15661      }
15662      if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15663       nextbranch= regnext((regnode *)nextbranch);
15664     } else {
15665      PerlIO_printf(Perl_debug_log, "\n");
15666     }
15667    }
15668    if (last && next > last)
15669     node= last;
15670    else
15671     node= next;
15672   }
15673   else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
15674    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15675      NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15676   }
15677   else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15678    assert(next);
15679    DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15680   }
15681   else if ( op == PLUS || op == STAR) {
15682    DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15683   }
15684   else if (PL_regkind[(U8)op] == ANYOF) {
15685    /* arglen 1 + class block */
15686    node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15687      ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15688    node = NEXTOPER(node);
15689   }
15690   else if (PL_regkind[(U8)op] == EXACT) {
15691    /* Literal string, where present. */
15692    node += NODE_SZ_STR(node) - 1;
15693    node = NEXTOPER(node);
15694   }
15695   else {
15696    node = NEXTOPER(node);
15697    node += regarglen[(U8)op];
15698   }
15699   if (op == CURLYX || op == OPEN)
15700    indent++;
15701  }
15702  CLEAR_OPTSTART;
15703 #ifdef DEBUG_DUMPUNTIL
15704  PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15705 #endif
15706  return node;
15707 }
15708
15709 #endif /* DEBUGGING */
15710
15711 /*
15712  * Local variables:
15713  * c-indentation-style: bsd
15714  * c-basic-offset: 4
15715  * indent-tabs-mode: nil
15716  * End:
15717  *
15718  * ex: set ts=8 sts=4 sw=4 et:
15719  */